In [1]:
import os
import shutil

import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join
import json 

import plotly
import plotly.express as px
import plotly.io as pio

# featureCounts

In [2]:
# Identifier of the ATAC-seq experiment processed by this notebook.
experiment_id = "NS-23.0061"

# Resolve the experiment's input directories first ("~" expands to the user's home directory).
metadata_subdir = os.path.expanduser(
    "~/fht.samba.data/experiments/ATACseq/{}/analysis/metadata/".format(experiment_id)
)
PEAKS_dir = os.path.expanduser(
    "~/fht.samba.data/experiments/ATACseq/{}/alignment/macs2/".format(experiment_id)
)
BAM_DIR = os.path.expanduser(
    "~/fht.samba.data/experiments/ATACseq/{}/alignment/bam/".format(experiment_id)
)

# Report whether each directory exists; a missing directory is only reported, not treated as an error.
print('metadata_subdir exists:{}\n'.format(os.path.exists(metadata_subdir)))
print('PEAKS_dir exists:{}\n'.format(os.path.exists(PEAKS_dir)))
print('BAM_DIR exists:{}\n'.format(os.path.exists(BAM_DIR)))

metadata_subdir exists:True

PEAKS_dir exists:True

BAM_DIR exists:True



In [3]:
def create_dir(name_dir):
    """Create an empty directory at ``name_dir``, replacing any existing one.

    WARNING: if ``name_dir`` already exists it is deleted recursively,
    together with everything inside it, before the new directory is created.

    Parameters
    ----------
    name_dir : str
        Path of the directory to (re)create. Missing parent directories are
        created as well.

    Returns
    -------
    str
        ``name_dir`` unchanged, so the result can be assigned directly.
    """
    if os.path.exists(name_dir):
        shutil.rmtree(name_dir)
    # makedirs instead of mkdir: a nested path whose parents do not exist yet
    # would otherwise fail with FileNotFoundError.
    os.makedirs(name_dir)
    return name_dir
    
# Start from an empty output directory: create_dir removes any existing ../All_samples/ first.
All_samples_dir = create_dir('../All_samples/')
print(All_samples_dir)

../All_samples/


In [None]:
def read_metadata_files(metadata_subdir, experiment_id):
    """Load the metadata table of an experiment from ``metadata_subdir``.

    The metadata file is the first regular file (in directory-listing order)
    whose name contains both ``'_metadata_r'`` and ``experiment_id``. It is
    read as a tab-separated table with its first column used as the index.

    Parameters
    ----------
    metadata_subdir : str
        Directory that holds the metadata files; a trailing path separator
        is optional.
    experiment_id : str
        Experiment identifier that must appear in the metadata file name.

    Returns
    -------
    pandas.DataFrame
        The parsed metadata table.

    Raises
    ------
    FileNotFoundError
        If no file in ``metadata_subdir`` matches the naming pattern.
    """
    metadata_files = [f for f in listdir(metadata_subdir) if isfile(join(metadata_subdir, f))]
    print('Files in metadata subdirectory:\n{}\n'.format(metadata_files))

    candidates = [f for f in metadata_files if '_metadata_r' in f and experiment_id in f]
    if not candidates:
        # Fail with an explicit message instead of an opaque IndexError on [0].
        raise FileNotFoundError(
            "No file containing '_metadata_r' and '{}' found in {}".format(experiment_id, metadata_subdir)
        )
    input_metadata_dir = candidates[0]

    # join() instead of string concatenation, so the directory does not need a trailing separator.
    metadata_df = pd.read_table(join(metadata_subdir, input_metadata_dir), delimiter = '\t',  index_col = 0)
    print('input_metadata_dir: {} - shape: {} \n'.format(input_metadata_dir, metadata_df.shape))
    return metadata_df


# Metadata table of the configured experiment.
metadata_df = read_metadata_files(
    metadata_subdir=metadata_subdir,
    experiment_id=experiment_id,
)

# featureCounts on the merged narrowPeak peaks of all samples (for the PCA plot)

In [5]:
def all_sample_featureCounts(metadata_df):
    sample_list = list(metadata_df.index)
    print("sample_list: \n{}\n".format(sample_list))

    # 1) filter chromosomes that are not part of the chromosome set (should be added to previous step) and combined test+grp and negative_cntrl_grp
    # write path of each narrowPeak file
    narrowPeak_list = [PEAKS_dir + group + '_peaks.narrowPeak' for group in sample_list]
    print("narrowPeak_list: \n{}\n".format(narrowPeak_list))
    for narrowPeak in narrowPeak_list:
        cmd_cat = """grep -h '^chr' {narrowPeak} >> {All_samples_dir}All_samples.narrowPeak""".format(narrowPeak=narrowPeak, All_samples_dir=All_samples_dir)
        !{cmd_cat}

    # 2) sort by chromosome positions (chr, start)
    !sort -k1,1V -k2,2n -k3,3n {All_samples_dir}All_samples.narrowPeak > {All_samples_dir}All_samples.sorted.narrowPeak

    # 3) bedtools merge
    !bedtools merge -d 50 -i  {All_samples_dir}All_samples.sorted.narrowPeak > {All_samples_dir}All_samples.merged.narrowPeak

    # 6) add header and convert merged.bed file to saf file for each sample
    cmd_saf = """awk 'BEGIN{{FS=OFS="\t"; print "GeneID\tchr\tstart\tend\tstrand"}} {{print $1":"$2"-"$3, $1, $2+1, $3, "."}}'  {All_samples_dir}All_samples.merged.narrowPeak > {All_samples_dir}All_samples.merged.saf""".format(All_samples_dir=All_samples_dir)
    ! {cmd_saf}

    # 7) Call featureCounts on saf file for each sample
    bam_list = [BAM_DIR + group + '.small.chrsorted.BAM' for group in sample_list]
    print("\nbam_list: \n{}\n".format(bam_list))
    bam_list_bash = '\t'.join(bam_list)
    !featureCounts -T 48 -a {All_samples_dir}All_samples.merged.saf -F SAF  -o {All_samples_dir}All_samples.orig.txt {bam_list_bash}

    # 8) Remove header and reformat header for DPA logFC computation
    !sed -i '1,2d' {All_samples_dir}All_samples.orig.txt
    
    # 9) reformat header for DPA logFC computation
    sample_list_bash = '\t'.join(sample_list)
    sample_number = [1,2,3,4] + list(np.arange(7, 7+ len(sample_list), 1))
    sample_number_str = [str(i) for i in sample_number]
    sample_number_bash = '$' + ', $'.join(sample_number_str)
    cmd_header = """awk 'BEGIN{{FS=OFS="\t"; print "peak_id\tchr\tstart\tend\t{sample_list_bash}"}} {{print {sample_number_bash}}}' {All_samples_dir}All_samples.orig.txt > {All_samples_dir}All_samples.txt""".format(sample_number_bash=sample_number_bash, sample_list_bash=sample_list_bash, All_samples_dir=All_samples_dir)
    ! {cmd_header}
    
    !head {All_samples_dir}All_samples.txt

all_sample_featureCounts(metadata_df)

sample_list: 
['SRR5876158', 'SRR5876159', 'SRR5876160', 'SRR5876161', 'SRR5876162', 'SRR5876163', 'SRR5876164', 'SRR5876165', 'SRR5876661', 'SRR5876662', 'SRR5876663', 'SRR5876664']

narrowPeak_list: 
['/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/alignment/macs2/SRR5876158_peaks.narrowPeak', '/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/alignment/macs2/SRR5876159_peaks.narrowPeak', '/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/alignment/macs2/SRR5876160_peaks.narrowPeak', '/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/alignment/macs2/SRR5876161_peaks.narrowPeak', '/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/alignment/macs2/SRR5876162_peaks.narrowPeak', '/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/alignment/macs2/SRR5876163_peaks.narrowPeak', '/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/alignment/macs2/SRR5876164_peaks.narrowPeak', '/home/fuzan/fht.samba.data/experiments/ATACseq/NS-23.0061/ali