# Compare cLoop Generated Loop Data Overlap Between Samples

In [8]:
#directory containing all the sample folders. each sample folder should be named after the sample
SAMPLES_DIR="/path/with/sample/folders"
#samples we want to compare overlap between in the samples directory
SAMPLES=['35233','35234','35235','35236','35237','35238','35239','35240']
#path to the conversion script
LOOP2BEDPE_SCRIPT="/path/to/loop2bedpe.py"


#the single sample to convert if only using one
SAMPLE="35236"
#1 to only include significant loops in the conversion, 0 for everything
SIG=1 

## (1) Convert cLoops .loop file to .bedpe format

### (a) for all

In [9]:
def formatcmds(this_sample):
    this_cmd = 'python {SCRIPT} {PATH}/{SAMPLE}/{SAMPLE}.loop {PATH}/{SAMPLE} {SIG} \
    '
    return(this_cmd.format(SCRIPT=LOOP2BEDPE_SCRIPT, PATH=SAMPLES_DIR, SAMPLE=this_sample, SIG=SIG))

In [56]:
cmds = [formatcmds(sample_id) for sample_id in SAMPLES]

script_file = "{}/all_loop2bedpe.sh".format(SAMPLES_DIR)

with open(script_file, 'w') as file_handler:
    file_handler.write("#!/bin/bash\n")
    for this_cmd in cmds:
        file_handler.write("{}\n".format(this_cmd))

In [57]:
print("sbatch --mem=20g --cpus-per-task=2 {}/all_loop2bedpe.sh".format(SAMPLES_DIR))

sbatch --mem=20g --cpus-per-task=2 /data/CARD/HICtemp/no_chrM/cLoops/all_loop2bedpe.sh


### (b) for one

In [3]:
print("module load python")
print("cd {}/{}".format(SAMPLES_DIR,SAMPLE))
print("python {} {}/{}/{}.loop {} 1".format(LOOP2BEDPE_SCRIPT, SAMPLES_DIR,SAMPLE, SAMPLE, SAMPLES_DIR))

module load python
cd /data/CARD/HICtemp/no_chrM/cLoops/35236
python /data/LNG/Frank/HiC_project/loop2bedpe.py /data/CARD/HICtemp/no_chrM/cLoops/35236/35236.loop /data/CARD/HICtemp/no_chrM/cLoops 1


## (2) Use bedtools to intersect the bedpe loop files

### (a) for all

In [58]:
import os
for sample in SAMPLES:
    os.mkdir(SAMPLES_DIR+'/'+sample+'/overlap')

In [66]:
for sample in SAMPLES:
    with open("{}/{}/overlap/run_overlap.sh".format(SAMPLES_DIR,sample), 'w') as file_handler:
        file_handler.write("#!/bin/bash\n\
        module load bedtools\n")
        for comp_sample in SAMPLES:
            if(sample!=comp_sample):
                file_handler.write("bedtools pairtopair -a {}/{}/loops.bedpe -b {}/{}/loops.bedpe -type both > \
{}/{}/overlap/{}_overlap_{}.txt\n".format(SAMPLES_DIR,sample, SAMPLES_DIR, comp_sample, SAMPLES_DIR, sample, sample, comp_sample) )
    os.chmod("{}/{}/overlap/run_overlap.sh".format(SAMPLES_DIR,sample),0o777)
        

In [63]:
with open("{}/all_overlap.swarm".format(SAMPLES_DIR),'w') as file_handler:
    for sample in SAMPLES:
        file_handler.write("{}/{}/overlap/run_overlap.sh\n".format(SAMPLES_DIR, sample))

In [64]:
print("swarm -f {}/all_overlap.swarm".format(SAMPLES_DIR))

swarm -f /data/CARD/HICtemp/no_chrM/cLoops/all_overlap.swarm


#### organize the data

##### get line counts

In [15]:
import pandas as pd
all_data={}
for sample in SAMPLES:
    comp_data={}
    for comp_sample in SAMPLES:
        if sample!=comp_sample:
            data=pd.read_csv("{}/{}/overlap/{}_overlap_{}.txt".format(SAMPLES_DIR,sample,sample,comp_sample),sep="\t")
            comp_data[comp_sample]=len(data.index)
        if sample==comp_sample:
            comp_data[comp_sample]=None
    all_data[sample]=comp_data
    
lines_df = pd.DataFrame(data=all_data)
print(lines_df)

lines_df.to_csv("{}/cLoop_overlap_count.csv".format(SAMPLES_DIR),sep=',')


        35233   35234   35235   35236   35237   35238   35239   35240
35233     NaN  1634.0  1961.0  2062.0  2097.0  2005.0  2129.0  2175.0
35234  1634.0     NaN  1288.0  1299.0  1616.0  1558.0  1525.0  1432.0
35235  1961.0  1288.0     NaN  2463.0  1829.0  1824.0  1884.0  2214.0
35236  2062.0  1299.0  2463.0     NaN  1890.0  1973.0  2014.0  2641.0
35237  2097.0  1616.0  1829.0  1890.0     NaN  1987.0  2173.0  2165.0
35238  2005.0  1558.0  1824.0  1973.0  1987.0     NaN  2073.0  2009.0
35239  2129.0  1525.0  1884.0  2014.0  2173.0  2073.0     NaN  2275.0
35240  2175.0  1432.0  2214.0  2641.0  2165.0  2009.0  2275.0     NaN


##### get percentages  

(overlaps between `row sample` and `col sample`) / (number of `col sample` loops)

In [14]:
import pandas as pd
all_data={}
for sample in SAMPLES:
    sample_lines = len(pd.read_csv("{}/{}/loops.bedpe".format(SAMPLES_DIR, sample),sep="\t").index)
    
    comp_data={}
    
    for comp_sample in SAMPLES:
        if sample!=comp_sample:
            data=pd.read_csv("{}/{}/overlap/{}_overlap_{}.txt".format(SAMPLES_DIR,sample,sample,comp_sample),sep="\t")
            overlap_lines=len(data.index)
            
            comp_data[comp_sample]=overlap_lines/sample_lines*100
            string = "%.9f" % comp_data[comp_sample]
            print(str(overlap_lines)+" loops overlap between "+ sample +" and " + comp_sample+ "/"+ str(sample_lines) +" "+ sample +"loops ="+ string + "%")
        if sample==comp_sample:
            comp_data[comp_sample]=None
    all_data[sample]=comp_data
    
percent_df = pd.DataFrame(data=all_data)
print(percent_df)

percent_df.to_csv("{}/cLoop_overlap_percent.csv".format(SAMPLES_DIR),sep=',')

1634 loops overlap between 35233 and 35234/3422 35233loops =47.749853887%
1961 loops overlap between 35233 and 35235/3422 35233loops =57.305669199%
2062 loops overlap between 35233 and 35236/3422 35233loops =60.257159556%
2097 loops overlap between 35233 and 35237/3422 35233loops =61.279953244%
2005 loops overlap between 35233 and 35238/3422 35233loops =58.591466978%
2129 loops overlap between 35233 and 35239/3422 35233loops =62.215078901%
2175 loops overlap between 35233 and 35240/3422 35233loops =63.559322034%
1634 loops overlap between 35234 and 35233/2518 35234loops =64.892772041%
1288 loops overlap between 35234 and 35235/2518 35234loops =51.151707705%
1299 loops overlap between 35234 and 35236/2518 35234loops =51.588562351%
1616 loops overlap between 35234 and 35237/2518 35234loops =64.177918983%
1558 loops overlap between 35234 and 35238/2518 35234loops =61.874503574%
1525 loops overlap between 35234 and 35239/2518 35234loops =60.563939635%
1432 loops overlap between 35234 and 3

### (b) for one

In [5]:
SAMPLE1="35235"
SAMPLE2="35236"

In [6]:
print("module load bedtools")
print("bedtools pairtopair -a {}/{}/loops.bedpe -b {}/{}/loops.bedpe -type both > {}/{}/overlap/{}_overlap_{}.txt".format(SAMPLES_DIR,SAMPLE1, SAMPLES_DIR, SAMPLE2, SAMPLES_DIR,SAMPLE1,SAMPLE1,SAMPLE2))


module load bedtools
bedtools pairtopair -a /data/CARD/HICtemp/no_chrM/cLoops/35235/loops.bedpe -b /data/CARD/HICtemp/no_chrM/cLoops/35236/loops.bedpe -type both > /data/CARD/HICtemp/no_chrM/cLoops/35235/overlap/35235_overlap_35236.txt
