# Use the Compare Tool from Juicer for Identifying Unique Loops Between Samples
This looks at the loops in two sample bedpe files and marks each loop as one of the following:  
- A\*/B\* - unique loops in first/second file that overlap with no loops in second/first file  
- A/B - loops in first/second file that overlap with loops in second/first file, but are not identical to any loops in second/first file.
    - This depends on the value of the threshold radius option (-m). If the threshold is set to 0, none of these are assigned: the tool only looks for exactly matching loops, and every other loop is marked as unique (A\*/B\*). 
- Common - loops that are identical in the first and second file  

### NOTE
This may give unexpected results when the -m option is set to a large radius. For example, when a file is compared against itself, some loops may be assigned as unique, even though all loops should be common in that case.

In [None]:
import pandas as pd
import os

In [None]:
# Radius (in bp) handed to the -m option of the juicer compare-lists tool.
# A value above zero allows non-identical loops to be counted as overlapping
# when they are close enough to each other (ex: 25000).
THRESHOLD = 25000  # None

# Name fragment added to generated files, depending on the threshold value.
fname_threshold = "default" if THRESHOLD is None else THRESHOLD

In [None]:
# Directory containing all the sample folders; each sample folder should be
# named after its sample.
JUICER_DIR = "/path/to/juicer"

# Working directories, all located under <JUICER_DIR>/overlap_analysis.
# NOTE(review): only some of these are created by the mkdir cell that follows;
# the others are presumably created elsewhere - confirm before running.
ANALYSIS_DIR = f"{JUICER_DIR}/overlap_analysis"
SAMPLES_DIR = f"{ANALYSIS_DIR}/samples"
SAMPLES_NH_DIR = f"{ANALYSIS_DIR}/samples_no_header"
SCRIPT_DIR = f"{ANALYSIS_DIR}/scripts"
OVERLAP_DIR = f"{ANALYSIS_DIR}/overlap"
SHUFFLE_DIR = f"{ANALYSIS_DIR}/shuffle"
MISC_DIR = f"{ANALYSIS_DIR}/misc"
COMPARE_DIR = f"{ANALYSIS_DIR}/compare_lists"
RESULTS_DIR = f"{ANALYSIS_DIR}/results"

# Results for this run go in a folder tagged with the threshold name.
OUT_DIR = RESULTS_DIR + "/compare_lists_" + str(fname_threshold)

In [None]:
# Create the output directories used by this notebook.
# os.makedirs with exist_ok=True lets the cell be re-run without errors and
# also creates any missing parent directory.
for _out_dir in (COMPARE_DIR, RESULTS_DIR, OUT_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Sample names: the sub-directories of JUICER_DIR whose name contains 'HICS',
# in sorted order.
SAMPLES = sorted(
    entry
    for entry in os.listdir(JUICER_DIR)
    if 'HICS' in entry and os.path.isdir(os.path.join(JUICER_DIR, entry))
)
print(len(SAMPLES))
print(SAMPLES)


In [None]:
# Write one juicer_tools "compare" command per sample pair to the swarm file.
# Each sample is paired with itself and with every sample after it in SAMPLES,
# so a pair is only written in one direction.

# If no threshold radius was assigned, leave -m out so the tool uses whatever
# its default is; otherwise pass the radius explicitly.
radius_opt = "" if THRESHOLD is None else f"-m {THRESHOLD} "

# The with-statement closes the file on exit; no explicit close() is needed.
with open(SCRIPT_DIR+"/compare_lists.swarm","w") as swarm_file:
    for i, sample1 in enumerate(SAMPLES):
        for sample2 in SAMPLES[i:]:
            swarm_file.write(
                f"/path/to/juicer_tools compare 0 hg38 {radius_opt}"
                f"{SAMPLES_DIR}/{sample1}.bedpe {SAMPLES_DIR}/{sample2}.bedpe "
                f"{COMPARE_DIR}/{sample1}_{sample2}_{fname_threshold}_compare_loop_list.bedpe \n"
            )

In [None]:
# Show the swarm submission command for the generated swarm file
# (it is only printed here, not executed).
swarm_command = f"swarm -f {SCRIPT_DIR}/compare_lists.swarm -g 50 -t 10 --module=juicer --sbatch '--mail-type=ALL' --time=24:00:00"
print(swarm_command)

## collect data

In [None]:


# One square table per metric, addressed as table[sample_a][sample_b], with
# every cell starting out as None.
#   *_c tables hold loop counts, *_p tables hold percentages.
common_c, common_p, unique_c, unique_p, similar_c, similar_p = (
    {outer: dict.fromkeys(SAMPLES) for outer in SAMPLES}
    for _ in range(6)
)


In [None]:


# Fill the count/percent tables from the juicer compare output of every sample
# pair, then write each table to a CSV file in OUT_DIR.


def _percent(count, total):
    """Return count as a percentage of total (NaN when total is 0)."""
    return count / total * 100 if total else float("nan")


# Read each sample's loop count once up front rather than once per pair.
loop_counts = {
    sample: len(pd.read_csv(f"{SAMPLES_DIR}/{sample}.bedpe", sep='\t').index)
    for sample in SAMPLES
}

for i, sample1 in enumerate(SAMPLES):
    for sample2 in SAMPLES[i:]:
        sample1_loop_count = loop_counts[sample1]
        sample2_loop_count = loop_counts[sample2]

        file = pd.read_csv(f"{COMPARE_DIR}/{sample1}_{sample2}_{fname_threshold}_compare_loop_list.bedpe",sep='\t')

        # Rows that are exact duplicates necessarily share the same
        # parent_list label, so de-duplicating once and tallying per label
        # gives the same numbers as filtering per label and then de-duplicating.
        label_counts = file.drop_duplicates()['parent_list'].value_counts()

        # unique loops to sample1
        As = int(label_counts.get('A*', 0))
        # similar loops to sample1
        A = int(label_counts.get('A', 0))
        # unique loops to sample2
        Bs = int(label_counts.get('B*', 0))
        # similar loops to sample2
        B = int(label_counts.get('B', 0))
        # common loops to both samples
        Common = int(label_counts.get('Common', 0))

        # NOTE: when sample1 == sample2 both assignments of a pair hit the same
        # cell, so the second (sample2/B) value is the one that is kept.
        unique_c[sample1][sample2] = As
        unique_c[sample2][sample1] = Bs

        unique_p[sample1][sample2] = _percent(As, sample1_loop_count)
        unique_p[sample2][sample1] = _percent(Bs, sample2_loop_count)

        similar_c[sample1][sample2] = A
        similar_c[sample2][sample1] = B

        similar_p[sample1][sample2] = _percent(A, sample1_loop_count)
        similar_p[sample2][sample1] = _percent(B, sample2_loop_count)

        common_c[sample1][sample2] = Common
        common_c[sample2][sample1] = Common

        common_p[sample1][sample2] = _percent(Common, sample1_loop_count)
        common_p[sample2][sample1] = _percent(Common, sample2_loop_count)


# Turn each table into a DataFrame (outer dict keys become the columns) and
# save it; the DataFrames are kept for display in the following cells.
uc = pd.DataFrame(data = unique_c)
uc.to_csv(f"{OUT_DIR}/compare_loop_list_{fname_threshold}_unique_counts.csv")
up = pd.DataFrame(data = unique_p)
up.to_csv(f"{OUT_DIR}/compare_loop_list_{fname_threshold}_unique_percents.csv")

sc = pd.DataFrame(data = similar_c)
sc.to_csv(f"{OUT_DIR}/compare_loop_list_{fname_threshold}_similar_counts.csv")
sp = pd.DataFrame(data = similar_p)
sp.to_csv(f"{OUT_DIR}/compare_loop_list_{fname_threshold}_similar_percents.csv")

cc = pd.DataFrame(data = common_c)
cc.to_csv(f"{OUT_DIR}/compare_loop_list_{fname_threshold}_common_counts.csv")
cp = pd.DataFrame(data = common_p)
cp.to_csv(f"{OUT_DIR}/compare_loop_list_{fname_threshold}_common_percents.csv")
        

In [None]:
# display the table of unique-loop counts (built from unique_c)
uc

In [None]:
# display the table of similar-loop percentages (built from similar_p)
sp

In [None]:
# display the table of common-loop counts (built from common_c)
cc

## Validate

In [None]:
# Load the compare result file for one sample pair so it can be spot-checked.
compare_path = f"{COMPARE_DIR}/HICS_CS25i_FBn_d25_S6_HICS_CS25i_d0_S9_{fname_threshold}_compare_loop_list.bedpe"
comp = pd.read_csv(compare_path, sep='\t')
print(comp.head())

In [None]:
# Load the loop list of the first validation sample.
# The sample bedpe files are read with sep='\t' in the collection step of this
# notebook; use the same separator here so the columns are parsed consistently
# instead of each line ending up in a single column.
sample1 = pd.read_csv(f"{SAMPLES_DIR}/HICS_CS25i_FBn_d25_S6.bedpe", sep='\t')
print(sample1.shape)

In [None]:
# Load the loop list of the second validation sample.
# The sample bedpe files are read with sep='\t' in the collection step of this
# notebook; use the same separator here so the columns are parsed consistently
# instead of each line ending up in a single column.
sample2 = pd.read_csv(f"{SAMPLES_DIR}/HICS_CS25i_d0_S9.bedpe", sep='\t')
print(sample2.shape)

In [None]:
# Rows labelled 'Common': loops listed as common between sample1 and sample2.
is_common = comp['parent_list'] == 'Common'
compC = comp.loc[is_common]
print(compC.shape)
print(compC.head())

In [None]:
# Rows labelled 'A': sample1 loops that were within the threshold radius of a
# sample2 loop but not identical to one.
is_similar_a = comp['parent_list'] == 'A'
compA = comp.loc[is_similar_a]
print(compA.shape)
print(compA.head())

In [None]:
# Rows labelled 'A*': loops completely unique to sample1.
is_unique_a = comp['parent_list'] == 'A*'
compAs = comp.loc[is_unique_a]
print(compAs.shape)
print(compAs.head())

In [None]:
# Rows labelled 'B': sample2 loops that were within the threshold radius of a
# sample1 loop but not identical to one.
is_similar_b = comp['parent_list'] == 'B'
compB = comp.loc[is_similar_b]
print(compB.shape)
print(compB.head())

In [None]:
# Rows labelled 'B*': loops completely unique to sample2.
is_unique_b = comp['parent_list'] == 'B*'
compBs = comp.loc[is_unique_b]
print(compBs.shape)
print(compBs.head())

#### merge between the subset dataframes to check they are unique or belong to the right sample

In [None]:

# Inner-join the 'A' and 'A*' subsets on the loop coordinates; the printed
# shapes show how many loops appear under both labels.
loop_coords = ['chr1','x1','x2','chr2','y1','y2']
df = compA.merge(compAs, how='inner', on=loop_coords)
print(compA.shape)
print(compAs.shape)
print(df.shape)

In [None]:

# Inner-join the 'A' and 'B' subsets on the loop coordinates; the printed
# shapes show how many loops appear under both labels.
loop_coords = ['chr1','x1','x2','chr2','y1','y2']
df = compA.merge(compB, how='inner', on=loop_coords)
print(compA.shape)
print(compB.shape)
print(df.shape)

In [None]:

# Inner-join the 'A' and 'Common' subsets on the loop coordinates; the printed
# shapes show how many loops appear under both labels.
loop_coords = ['chr1','x1','x2','chr2','y1','y2']
df = compA.merge(compC, how='inner', on=loop_coords)
print(compA.shape)
print(compC.shape)
print(df.shape)

### compare bedtools overlap implementation vs juicer compare list tool

In [None]:
# Load the header-less, tab-separated overlap file for the S6 vs S9 pair
# from OVERLAP_DIR.
s1_s2_overlap_path = f"{OVERLAP_DIR}/HICS_CS25i_FBn_d25_S6_HICS_CS25i_d0_S9_overlap.txt"
bed_s1_s2_overlap = pd.read_csv(s1_s2_overlap_path, sep='\t', header=None)

print(bed_s1_s2_overlap.shape)
print(bed_s1_s2_overlap.head())

In [None]:
# Keep the first six columns of the overlap table as loop coordinates, strip
# the 'chr' text from the chromosome names and drop duplicate rows, so the
# result can be merged with the compare-list subsets on the same key columns.
# .copy() makes bed_s1 an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning on a slice of the overlap table.
bed_s1 = bed_s1_s2_overlap.iloc[:,0:6].copy()
bed_s1.columns = ['chr1','x1','x2','chr2','y1','y2']
for chrom_col in ('chr1', 'chr2'):
    # regex=False: plain substring replacement, independent of the pandas
    # version's default for the regex argument.
    bed_s1[chrom_col] = bed_s1[chrom_col].str.replace('chr', '', regex=False)
bed_s1 = bed_s1.drop_duplicates()
print(bed_s1.shape)
print(bed_s1.head())

In [None]:
# Load the header-less, tab-separated overlap file for the S9 vs S6 pair
# from OVERLAP_DIR.
s2_s1_overlap_path = f"{OVERLAP_DIR}/HICS_CS25i_d0_S9_HICS_CS25i_FBn_d25_S6_overlap.txt"
bed_s2_s1_overlap = pd.read_csv(s2_s1_overlap_path, sep='\t', header=None)

print(bed_s2_s1_overlap.shape)
print(bed_s2_s1_overlap.head())

In [None]:
# Keep the first six columns of the overlap table as loop coordinates, strip
# the 'chr' text from the chromosome names and drop duplicate rows, so the
# result can be merged with the compare-list subsets on the same key columns.
# .copy() makes bed_s2 an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning on a slice of the overlap table.
bed_s2 = bed_s2_s1_overlap.iloc[:,0:6].copy()
bed_s2.columns = ['chr1','x1','x2','chr2','y1','y2']
for chrom_col in ('chr1', 'chr2'):
    # regex=False: plain substring replacement, independent of the pandas
    # version's default for the regex argument.
    bed_s2[chrom_col] = bed_s2[chrom_col].str.replace('chr', '', regex=False)
bed_s2 = bed_s2.drop_duplicates()
print(bed_s2.shape)
print(bed_s2.head())

In [None]:
# Check how many loops are identical/common between the two bedtools
# overlap tables.
loop_coords = ['chr1','x1','x2','chr2','y1','y2']
mer = bed_s1.merge(bed_s2, how='inner', on=loop_coords)
print(mer.shape)
print(mer.head())

In [None]:
# Compare-list loops unique to sample2 (B*) joined against the bedtools
# overlap loops for sample2.
print(compBs.shape)
print(bed_s2.shape)

loop_coords = ['chr1','x1','x2','chr2','y1','y2']
mer = compBs.merge(bed_s2, how='inner', on=loop_coords)
print(mer.shape)
print(mer.head())

^ So the two methods count different loops as unique.

In [None]:
# Compare-list common loops joined against the bedtools overlap loops
# for sample2.
print(compC.shape)
print(bed_s2.shape)

loop_coords = ['chr1','x1','x2','chr2','y1','y2']
mer = compC.merge(bed_s2, how='inner', on=loop_coords)
print(mer.shape)
print(mer.head())

^ So both methods count the same loops as identical.

In [None]:
# Compare-list loops similar for sample2 (B) joined against the bedtools
# overlap loops for sample2.
print(compB.shape)
print(bed_s2.shape)

loop_coords = ['chr1','x1','x2','chr2','y1','y2']
mer = compB.merge(bed_s2, how='inner', on=loop_coords)
print(mer.shape)
print(mer.head())

^ So the two methods count different loops as similar.