In [1]:
import os
import shutil

import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join

In [2]:
# Experiment identifier; it is interpolated into the metadata path below and
# later used to select files by name.
experiment_id = "NS-23.0061"

# Per-experiment metadata directory under the user's home directory
# ("~" is expanded by os.path.expanduser).
metadata_subdir = os.path.expanduser("~/fht.samba.data/experiments/ATACseq/{}/analysis/metadata/".format(experiment_id))
print('metadata_subdir exists:{}\n'.format(os.path.exists(metadata_subdir)))

# Directory holding the DPA result tables; the path is relative to the
# notebook's working directory.
dpa_data_dir ='../dpa_data/'
print('dpa_data_dir exists:{}\n'.format(os.path.exists(dpa_data_dir)))

metadata_subdir exists:True

dpa_data_dir exists:True



In [3]:
def create_dir(name_dir):
    """Create an empty directory at ``name_dir`` and return the path.

    WARNING: destructive. If ``name_dir`` already exists, it is removed
    together with everything inside it before being recreated.

    Parameters
    ----------
    name_dir : str
        Path of the directory to (re)create. Missing parent directories
        are created as well.

    Returns
    -------
    str
        ``name_dir``, unchanged, so the call can be used in an assignment.
    """
    if os.path.exists(name_dir):
        shutil.rmtree(name_dir)
    # makedirs (rather than mkdir) so a nested path whose parents do not
    # exist yet does not fail with FileNotFoundError.
    os.makedirs(name_dir)
    return name_dir

In [4]:
# NOTE: create_dir() deletes an existing '../beds/' directory (and its
# contents) before recreating it, so re-running this cell discards any BED
# files written earlier. split_DPA_df writes its output under bed_dir.
bed_dir = create_dir('../beds/')  
bed_dir

'../beds/'

In [5]:
def read_contrast(metadata_subdir, experiment_id):
    """Load the contrasts table of an experiment from its metadata directory.

    The contrasts file is the first regular file in ``metadata_subdir`` whose
    name contains both ``'_contrasts_r'`` and ``experiment_id``. It is read as
    a tab-separated table with the first column as the index.

    Parameters
    ----------
    metadata_subdir : str
        Directory containing the metadata files (with or without a trailing
        path separator).
    experiment_id : str
        Experiment identifier that must appear in the contrasts file name.

    Returns
    -------
    pandas.DataFrame
        The contrasts table.

    Raises
    ------
    IndexError
        If no file in ``metadata_subdir`` matches.
    """
    metadata_files = [f for f in listdir(metadata_subdir) if isfile(join(metadata_subdir, f))]
    print('Files in metadata subdirectory:\n{}\n'.format(metadata_files))

    matches = [f for f in metadata_files if '_contrasts_r' in f and experiment_id in f]
    if not matches:
        # Same exception type as the former bare ``[0]`` lookup, but with a
        # message that says what was being searched for.
        raise IndexError(
            "No '*_contrasts_r*' file for experiment '{}' found in {}".format(
                experiment_id, metadata_subdir))
    input_orig_contrast_dir = matches[0]

    # join() instead of string concatenation so the path is valid even when
    # metadata_subdir has no trailing separator.
    contrasts_df = pd.read_table(join(metadata_subdir, input_orig_contrast_dir), sep='\t', index_col=0)
    print('input_orig_contrast_dir: {} - shape: {} \n'.format(input_orig_contrast_dir, contrasts_df.shape))

    return contrasts_df

# Load the contrasts table for this experiment; the bare expression on the
# last line displays it as the cell output.
contrasts_df = read_contrast(metadata_subdir, experiment_id)
contrasts_df

Files in metadata subdirectory:
['ATACseq_ARID1AKO_ARID1BKD.csv', 'ATACseq_ARID1AKO_ARID1BKD.txt', 'NS-23.0061_metadata_r12x34.txt', 'ATACseq_ARID1AKO_ARID1BKD.xlsx', 'NS-23.0061_group_dict.json', 'NS-23.0061_contrast_dict.json', 'NS-23.0061_contrasts_r6x3.txt']

input_orig_contrast_dir: NS-23.0061_contrasts_r6x3.txt - shape: (6, 3) 



Unnamed: 0_level_0,test_group,negative_ctrl_grp,file_friendly_name
contrast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ARID1BKD_HCT116 - WT_HCT116,ARID1BKD_HCT116,WT_HCT116,ARID1BKD_HCT116_WT_HCT116
ARID1AKO_HCT116 - WT_HCT116,ARID1AKO_HCT116,WT_HCT116,ARID1AKO_HCT116_WT_HCT116
ARID1AKO_ARID1BKD_HCT116 - WT_HCT116,ARID1AKO_ARID1BKD_HCT116,WT_HCT116,ARID1AKO_ARID1BKD_HCT116_WT_HCT116
ARID1AKO_ARID1BKD_HCT116 - ARID1AKO_HCT116,ARID1AKO_ARID1BKD_HCT116,ARID1AKO_HCT116,ARID1AKO_ARID1BKD_HCT116_ARID1AKO_HCT116
ARID1AKO_ARID1BKD_HCT116 - ARID1BKD_HCT116,ARID1AKO_ARID1BKD_HCT116,ARID1BKD_HCT116,ARID1AKO_ARID1BKD_HCT116_ARID1BKD_HCT116
ARID1BKD_TOV21G - WT_TOV21G,ARID1BKD_TOV21G,WT_TOV21G,ARID1BKD_TOV21G_WT_TOV21G


In [6]:
# contrasts_df = contrasts_df.loc[contrasts_df.index.str.contains('KMS20|KO52'), : ]
# contrasts_df

In [7]:
def filter_DPA(dpa_data_dir, DPA_file, min_neg_log10_pval=1.3, min_abs_logfc=0.5):
    """Split a DPA result table into up, down and unchanged peak coordinates.

    The table is read as tab-separated text with the first column as index and
    must provide the columns ``P_Value``, ``logFC``, ``chr``, ``start`` and
    ``end`` (``chr`` may alternatively be the index column).

    A peak is *up* when ``-log10(P_Value) > min_neg_log10_pval`` and
    ``logFC >= min_abs_logfc``, *down* when it passes the same p-value cut and
    ``logFC <= -min_abs_logfc``, and *unchanged* otherwise. Rows with a
    missing ``P_Value`` or ``logFC`` can be neither up nor down and are
    therefore reported as unchanged instead of being dropped.

    Parameters
    ----------
    dpa_data_dir : str
        Directory containing the DPA file (trailing separator optional).
    DPA_file : str
        File name of the DPA table inside ``dpa_data_dir``.
    min_neg_log10_pval : float, optional
        Exclusive lower bound on ``-log10(P_Value)``. The default 1.3
        corresponds to roughly ``P_Value < 0.05``.
    min_abs_logfc : float, optional
        Inclusive lower bound on ``|logFC|`` for a peak to count as changed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(peak_df_up, peak_df_down, peak_df_unchanged)``, each holding only
        the columns ``chr``, ``start`` and ``end``.
    """
    coord_cols = ['chr', 'start', 'end']

    peak_df = pd.read_csv(os.path.join(dpa_data_dir, DPA_file), sep="\t", index_col=0)
    peak_df['-log10(pval)'] = -np.log10(peak_df['P_Value'])

    # Comparisons against NaN evaluate to False, so rows with missing values
    # are never flagged as up or down.
    significant = peak_df['-log10(pval)'] > min_neg_log10_pval
    is_up = significant & (peak_df['logFC'] >= min_abs_logfc)
    is_down = significant & (peak_df['logFC'] <= -min_abs_logfc)
    # Complement of up/down, so every row lands in one of the three tables.
    is_unchanged = ~(is_up | is_down)

    peak_df_up = peak_df[is_up].reset_index()[coord_cols]
    peak_df_down = peak_df[is_down].reset_index()[coord_cols]
    peak_df_unchanged = peak_df[is_unchanged].reset_index()[coord_cols]

    n_missing = int(peak_df[['P_Value', 'logFC']].isna().any(axis=1).sum())
    if n_missing:
        print('Note: {} row(s) with missing P_Value/logFC were assigned to unchanged'.format(n_missing))

    locs = peak_df_unchanged.shape[0] + peak_df_up.shape[0] + peak_df_down.shape[0] == peak_df.shape[0]
    print('down: {}, unchanged: {}, up: {}'.format(peak_df_down.shape[0], peak_df_unchanged.shape[0], peak_df_up.shape[0]))
    print('Check that the number of rows of the df is equal to the sum of the up, down and unchanged df :{}\n'.format(locs))

    return peak_df_up, peak_df_down, peak_df_unchanged

In [8]:
def split_DPA_df(experiment_id, contrasts_df, dpa_data_dir, out_dir=None):
    """Write up/down/unchanged BED files for every contrast.

    For each value in ``contrasts_df['file_friendly_name']`` the matching DPA
    table in ``dpa_data_dir`` is located, split with ``filter_DPA`` and the
    three resulting coordinate tables are saved as headerless, tab-separated
    files named ``ATAC_seq_<contrast>_{up,down,unchanged}.bed``.

    Parameters
    ----------
    experiment_id : str
        Experiment identifier that must appear in the DPA file name.
    contrasts_df : pandas.DataFrame
        Table with a ``file_friendly_name`` column, one row per contrast.
    dpa_data_dir : str
        Directory containing the DPA tables.
    out_dir : str, optional
        Directory the BED files are written to. Defaults to the
        notebook-level ``bed_dir`` variable, which was previously the only
        possible destination.

    Raises
    ------
    IndexError
        If no DPA file matches a contrast.
    """
    if out_dir is None:
        # Backward-compatible default: fall back to the global set by the
        # notebook's bed_dir cell.
        out_dir = bed_dir

    dpa_data_files = [f for f in listdir(dpa_data_dir) if isfile(join(dpa_data_dir, f))]
    print("dpa_data_files:  {}".format(dpa_data_files))

    for contrast in contrasts_df['file_friendly_name']:

        print("\ncontrast:  {}\n".format(contrast))

        candidates = [f for f in dpa_data_files if experiment_id in f and contrast in f]
        if not candidates:
            raise IndexError(
                "No DPA file for experiment '{}' and contrast '{}' found in {}".format(
                    experiment_id, contrast, dpa_data_dir))

        # Substring matching alone is ambiguous: one contrast name can be the
        # tail of another (e.g. 'ARID1BKD_HCT116_WT_HCT116' also occurs inside
        # the 'ARID1AKO_ARID1BKD_HCT116_WT_HCT116' file name), so which file
        # came first depended on directory listing order. Prefer a file named
        # '<experiment_id>_<contrast>_DPA...' (the naming seen in the DPA
        # directory listing) and only fall back to the first substring match
        # when no such file exists.
        exact_prefix = "{}_{}_DPA".format(experiment_id, contrast)
        exact_matches = [f for f in candidates if f.startswith(exact_prefix)]
        DPA_file = (exact_matches or candidates)[0]
        print("DPA_file:  {}".format(DPA_file))

        peak_df_up, peak_df_down, peak_df_unchanged = filter_DPA(dpa_data_dir, DPA_file)

        # Save each table as ATAC_seq_<contrast>_<direction>.bed in out_dir
        peaks_by_direction = (
            ("up", peak_df_up),
            ("down", peak_df_down),
            ("unchanged", peak_df_unchanged),
        )
        for direction, peaks in peaks_by_direction:
            bed_path = join(out_dir, "ATAC_seq_{}_{}.bed".format(contrast, direction))
            peaks.to_csv(bed_path, sep="\t", header=False, index=False)
    
# Split every contrast's DPA table and write the three BED files per contrast
# (up / down / unchanged) under bed_dir.
split_DPA_df(experiment_id, contrasts_df, dpa_data_dir)

dpa_data_files:  ['NS-23.0061_ARID1BKD_TOV21G_WT_TOV21G_DPA_r64683x12.txt', 'NS-23.0061_ARID1AKO_ARID1BKD_HCT116_ARID1BKD_HCT116_DPA_r80370x12.txt', 'NS-23.0061_ARID1AKO_HCT116_WT_HCT116_DPA_r74739x12.txt', 'NS-23.0061_ARID1AKO_ARID1BKD_HCT116_ARID1AKO_HCT116_DPA_r70760x12.txt', 'NS-23.0061_ARID1BKD_HCT116_WT_HCT116_DPA_r76523x12.txt', 'NS-23.0061_ARID1AKO_ARID1BKD_HCT116_WT_HCT116_DPA_r75906x12.txt', 'NS-23.0061_differential_peak_area_counts_r3x6.txt']

contrast:  ARID1BKD_HCT116_WT_HCT116

DPA_file:  NS-23.0061_ARID1BKD_HCT116_WT_HCT116_DPA_r76523x12.txt
down: 3100, unchanged: 70466, up: 2957
Check that the number of rows of the df is equal to the sum of the up, down and unchanged df :True


contrast:  ARID1AKO_HCT116_WT_HCT116

DPA_file:  NS-23.0061_ARID1AKO_HCT116_WT_HCT116_DPA_r74739x12.txt
down: 12537, unchanged: 54214, up: 7988
Check that the number of rows of the df is equal to the sum of the up, down and unchanged df :True


contrast:  ARID1AKO_ARID1BKD_HCT116_WT_HCT116

DPA_f