# Collect Juicer HiC Statistics
- **Author** - Frank Grenn
- **Date Started** - January 2020
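- **Quick Description:** Simple script that collects alignment and contact statistics from each sample's Juicer output (`aligned/inter_30.txt`, plus the number of lines in `aligned/inter_30_loops/merged_loops.bedpe`) and writes them to a single CSV file so the samples can be compared.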

In [None]:
import pandas as pd
import os


In [None]:
# Directory that holds one Juicer output sub-directory per sample.
RESULTS_DIR = "/path/to/juicer/sample/directories"


In [None]:
# Collect the name of every sub-directory of RESULTS_DIR, leaving out the
# "all_chr_run" directory; plain files are ignored.
sample_directories = []
for entry in os.listdir(RESULTS_DIR):
    if entry == "all_chr_run":
        continue
    if os.path.isdir(os.path.join(RESULTS_DIR, entry)):
        sample_directories.append(entry)

# Quick sanity check of what was found.
print(len(sample_directories))
print(sample_directories)

In [None]:
# Output columns, in the order they appear in the final table.
result_columns = [
    'Sample Name',
    'Sequenced Read Pairs',
    # alignment and duplicate statistics
    'Alignable (Normal+Chimeric Paired)',
    'Percent Alignable (Normal+Chimeric Paired)',
    'Unique Reads',
    'Percent Unique Reads',
    'PCR Duplicates',
    'Percent PCR Duplicates',
    'Optical Duplicates',
    'Percent Optical Duplicates',
    # contact statistics: a count followed by two percentages each
    'Hi-C Contacts',
    'Percent Hi-C Contacts of Sequence Read Pairs',
    'Percent Hi-C Contacts of Unique Reads',
    'Inter-chromosomal Contacts',
    'Percent Inter-chromosomal Contacts of Sequenced Read Pairs',
    'Percent Inter-chromosomal Contacts of Unique Reads',
    'Intra-chromosomal Contacts',
    'Percent Intra-chromosomal Contacts of Sequenced Read Pairs',
    'Percent Intra-chromosomal Contacts of Unique Reads',
    "Short Range (<20Kb) Contacts",
    "Percent Short Range (<20Kb) Contacts of Sequenced Read Pairs",
    "Percent Short Range (<20Kb) Contacts of Unique Reads",
    "Long Range (>20Kb) Contacts",
    "Percent Long Range (>20Kb) Contacts of Sequenced Read Pairs",
    "Percent Long Range (>20Kb) Contacts of Unique Reads",
    # number of lines in the merged loop list
    'Loop Count',
]

# Empty table with the columns above; rows are added per sample later.
results_df = pd.DataFrame(columns=result_columns)

In [None]:
# Translation table that deletes the "(", "%" and ")" characters, turning a
# token such as "(70.00%" or "80.00%)" into "70.00".
_PERCENT_CHARS = str.maketrans("", "", "(%)")

# One entry per row of inter_30.txt that is reported:
#   (label in the stats file,
#    output column for the count            -> 1st whitespace token of the value,
#    output column for the first percentage  -> 2nd token, or None to skip,
#    output column for the second percentage -> 4th token, or None to skip)
_STAT_COLUMNS = (
    ("Sequenced Read Pairs", 'Sequenced Read Pairs', None, None),
    ("Alignable (Normal+Chimeric Paired)", 'Alignable (Normal+Chimeric Paired)',
     'Percent Alignable (Normal+Chimeric Paired)', None),
    ("Unique Reads", 'Unique Reads', 'Percent Unique Reads', None),
    ("PCR Duplicates", 'PCR Duplicates', 'Percent PCR Duplicates', None),
    ("Optical Duplicates", 'Optical Duplicates', 'Percent Optical Duplicates', None),
    ("Hi-C Contacts", 'Hi-C Contacts',
     'Percent Hi-C Contacts of Sequence Read Pairs',
     'Percent Hi-C Contacts of Unique Reads'),
    ("Inter-chromosomal", 'Inter-chromosomal Contacts',
     'Percent Inter-chromosomal Contacts of Sequenced Read Pairs',
     'Percent Inter-chromosomal Contacts of Unique Reads'),
    ("Intra-chromosomal", 'Intra-chromosomal Contacts',
     'Percent Intra-chromosomal Contacts of Sequenced Read Pairs',
     'Percent Intra-chromosomal Contacts of Unique Reads'),
    ("Short Range (<20Kb)", 'Short Range (<20Kb) Contacts',
     'Percent Short Range (<20Kb) Contacts of Sequenced Read Pairs',
     'Percent Short Range (<20Kb) Contacts of Unique Reads'),
    ("Long Range (>20Kb)", 'Long Range (>20Kb) Contacts',
     'Percent Long Range (>20Kb) Contacts of Sequenced Read Pairs',
     'Percent Long Range (>20Kb) Contacts of Unique Reads'),
)


def _strip_percent(token):
    """Return *token* with every "(", "%" and ")" character removed."""
    return token.translate(_PERCENT_CHARS)


def _collect_sample_row(sample, aligned_dir):
    """Build the results row for one sample.

    Reads ``inter_30.txt`` from *aligned_dir* as a two-column table split on
    ":" and pulls out the rows listed in ``_STAT_COLUMNS``, then counts the
    lines of ``inter_30_loops/merged_loops.bedpe``.

    Returns a dict keyed by output column name. All statistics are kept as
    the strings found in the file; only the loop count is an int.

    Raises IndexError if an expected label or token is missing from the stats
    file, and FileNotFoundError if either input file does not exist.
    """
    stats = pd.read_csv(os.path.join(aligned_dir, "inter_30.txt"),
                        sep=":", header=None, names=["Name", "Data"])

    row = {'Sample Name': sample}
    for label, count_col, first_pct_col, second_pct_col in _STAT_COLUMNS:
        # Look the row up and split it once, instead of once per extracted field.
        tokens = stats.loc[stats["Name"] == label, "Data"].iloc[0].split()
        row[count_col] = tokens[0]
        if first_pct_col is not None:
            row[first_pct_col] = _strip_percent(tokens[1])
        if second_pct_col is not None:
            # tokens[2] is skipped; the second percentage is the 4th token.
            row[second_pct_col] = _strip_percent(tokens[3])

    # Use a context manager so the file handle is closed for every sample.
    # NOTE(review): this counts every line of the file, so any header or
    # comment lines in merged_loops.bedpe are included - confirm that is
    # the intended loop count.
    loops_path = os.path.join(aligned_dir, "inter_30_loops", "merged_loops.bedpe")
    with open(loops_path) as loops_file:
        row['Loop Count'] = sum(1 for _ in loops_file)

    return row


# Gather one row per sample that has an "aligned" folder; samples without one
# are skipped.
sample_rows = []
for sample in sample_directories:
    aligned_dir = os.path.join(RESULTS_DIR, sample, "aligned")
    if not os.path.isdir(aligned_dir):
        continue
    sample_rows.append(_collect_sample_row(sample, aligned_dir))

# DataFrame.append was removed in pandas 2.0, so the rows are collected in a
# list and added to results_df in one step.
if sample_rows:
    new_rows = pd.DataFrame(sample_rows)
    if results_df.empty:
        # Nothing to concatenate with yet; just adopt the declared column order.
        results_df = new_rows.reindex(columns=results_df.columns)
    else:
        results_df = pd.concat([results_df, new_rows], ignore_index=True)
results_df.head()

In [None]:
# Write the combined per-sample table to a single CSV file. `index` is a
# boolean option, so pass False explicitly (rather than None) to leave the
# DataFrame's row index out of the output.
results_df.to_csv("/path/for/output/results_chrM_remove.csv", index=False)