In [22]:
#load packages
import pandas as pd
import os
from functions import get_closest_TADs

### Map conserved DCIs to "best TADs" from ClusterTAD

In [23]:
#read the tab-separated table of conserved DCIs, then show its shape and first rows
dci_table_path = '../output/conserved_DCIs_thr7.txt'
conserved_DCIs = pd.read_csv(dci_table_path, sep='\t')
print(conserved_DCIs.shape)
conserved_DCIs.head()

(344, 7)


Unnamed: 0,Chr,Start,End,DCI_Score,Coordinate,ClosestTAD,Distance
0,chr1,1320000,1360000,8.578795,"(1320000, 0)","(1320000, 0)",0.0
1,chr1,4080000,4120000,8.7492,"(4080000, 0)","(4080000, 0)",0.0
2,chr1,4120000,4160000,8.304266,"(4120000, 0)","(4120000, 0)",0.0
3,chr1,4280000,4320000,7.157846,"(4280000, 0)","(4280000, 0)",0.0
4,chr1,4320000,4360000,7.490269,"(4320000, 0)","(4320000, 0)",0.0


In [24]:
#define function to concat all bestTADs
def concat_bestTADs(path):
    """Concatenate the per-chromosome "best TAD" tables found in a directory.

    Each regular, non-hidden file in ``path`` is read as a tab-separated
    table and tagged with a ``chr`` column derived from its file name: the
    text after the last ``_`` and before the next ``.``
    (e.g. ``something_chr1.txt`` -> ``chr1``).

    Parameters
    ----------
    path : str
        Directory holding one table per chromosome. A trailing path
        separator is optional.

    Returns
    -------
    pandas.DataFrame
        All tables stacked in sorted file-name order, each keeping its own
        row index. An empty DataFrame is returned when no file is found.
    """
    frames = []
    #os.listdir returns entries in arbitrary order; sort for reproducible output
    for tad_file in sorted(os.listdir(path)):
        full_path = os.path.join(path, tad_file)
        #skip sub-directories and hidden entries (.ipynb_checkpoints, .DS_Store, ...)
        if tad_file.startswith('.') or not os.path.isfile(full_path):
            continue
        current_chr = pd.read_table(full_path, sep='\t')
        #get chr
        current_chr['chr'] = tad_file.split('_')[-1].split('.')[0]
        frames.append(current_chr)
    #pd.concat raises on an empty list, so keep the empty-directory result explicit
    if not frames:
        return pd.DataFrame()
    #concatenate once instead of growing a DataFrame inside the loop
    return pd.concat(frames)

In [25]:
#build one concatenated best-TAD table per PGCLC replicate directory
#NOTE(review): pgclc1 is read from CD38P_3 and pgclc3 from CD38P_1 - confirm the reversed numbering is intended
replicate_dirs = (
    '../data/clusterTAD/CD38P_3/',
    '../data/clusterTAD/CD38P_2/',
    '../data/clusterTAD/CD38P_1/',
)
pgclc1, pgclc2, pgclc3 = [concat_bestTADs(tad_dir) for tad_dir in replicate_dirs]

In [26]:
#give every replicate the same Chr/Start/End column names and drop all other columns
column_map = {'chr':'Chr', 'from.cord':'Start', 'to.cord':'End'}
interval_columns = ['Chr','Start','End']
pgclc1 = pgclc1.rename(columns=column_map)[interval_columns]
pgclc2 = pgclc2.rename(columns=column_map)[interval_columns]
pgclc3 = pgclc3.rename(columns=column_map)[interval_columns]

In [28]:
#for each replicate, find the TADs closest to the conserved DCIs (also close to HML2 seqs)
autosomes = ['chr{}'.format(number) for number in range(1, 23)]
chromosomes = autosomes + ['chrX','chrY']

print("Conserved DCIs close to PGCLC1 TADs")
closeTADs_pgclc1 = get_closest_TADs(conserved_DCIs, pgclc1, chromosomes, threshold=None)

print("Conserved DCIs close to PGCLC2 TADs")
closeTADs_pgclc2 = get_closest_TADs(conserved_DCIs, pgclc2, chromosomes, threshold=None)

print("Conserved DCIs close to PGCLC3 TADs")
closeTADs_pgclc3 = get_closest_TADs(conserved_DCIs, pgclc3, chromosomes, threshold=None)

Conserved DCIs close to PGCLC1 TADs
Not enough TADs found for chromosome chr22
Not enough TADs found for chromosome chrX
Not enough TADs found for chromosome chrY
Conserved DCIs close to PGCLC2 TADs
Not enough TADs found for chromosome chr22
Not enough TADs found for chromosome chrX
Not enough TADs found for chromosome chrY
Conserved DCIs close to PGCLC3 TADs
Not enough TADs found for chromosome chr22
Not enough TADs found for chromosome chrX
Not enough TADs found for chromosome chrY


In [29]:
#remove the DCI_Score column and add a TAD_Size column (End - Start)
#No in-place mutation, and errors='ignore' on the drop: the original in-place
#drop raised KeyError when this cell was re-run on the same kernel, because the
#column had already been removed by the first run.
closeTADs_pgclc1 = (
    closeTADs_pgclc1
    .drop(columns=['DCI_Score'], errors='ignore')
    .assign(TAD_Size=lambda tads: tads['End'] - tads['Start'])
)
closeTADs_pgclc2 = (
    closeTADs_pgclc2
    .drop(columns=['DCI_Score'], errors='ignore')
    .assign(TAD_Size=lambda tads: tads['End'] - tads['Start'])
)
closeTADs_pgclc3 = (
    closeTADs_pgclc3
    .drop(columns=['DCI_Score'], errors='ignore')
    .assign(TAD_Size=lambda tads: tads['End'] - tads['Start'])
)

In [31]:
#show the shape and the first ten rows of each replicate's table
for replicate_tads in (closeTADs_pgclc1, closeTADs_pgclc2, closeTADs_pgclc3):
    print(replicate_tads.shape)
    display(replicate_tads.head(10))

(153, 7)


Unnamed: 0,Chr,Start,End,Coordinate,ClosestTAD,Distance,TAD_Size
0,chr1,720000,2520000,"(720000, 0)","(1320000, 0)",600000.0,1800000
1,chr1,4000000,4720000,"(4000000, 0)","(4080000, 0)",80000.0,720000
2,chr1,4760000,6640000,"(4760000, 0)","(4720000, 0)",40000.0,1880000
3,chr1,6680000,7440000,"(6680000, 0)","(5480000, 0)",1200000.0,760000
4,chr1,18880000,21120000,"(18880000, 0)","(19320000, 0)",440000.0,2240000
5,chr1,48400000,49640000,"(48400000, 0)","(48440000, 0)",40000.0,1240000
6,chr1,49680000,63120000,"(49680000, 0)","(49720000, 0)",40000.0,13440000
7,chr1,63160000,64560000,"(63160000, 0)","(52040000, 0)",11120000.0,1400000
8,chr1,109000000,115280000,"(109000000, 0)","(110160000, 0)",1160000.0,6280000
9,chr1,115320000,116000000,"(115320000, 0)","(111440000, 0)",3880000.0,680000


(147, 7)


Unnamed: 0,Chr,Start,End,Coordinate,ClosestTAD,Distance,TAD_Size
0,chr1,720000,2520000,"(720000, 0)","(1320000, 0)",600000.0,1800000
1,chr1,4000000,4600000,"(4000000, 0)","(4080000, 0)",80000.0,600000
2,chr1,4640000,6760000,"(4640000, 0)","(4600000, 0)",40000.0,2120000
3,chr1,6800000,7200000,"(6800000, 0)","(5480000, 0)",1320000.0,400000
4,chr1,18880000,21120000,"(18880000, 0)","(19320000, 0)",440000.0,2240000
5,chr1,48720000,49080000,"(48720000, 0)","(48440000, 0)",280000.0,360000
6,chr1,49120000,63960000,"(49120000, 0)","(49440000, 0)",320000.0,14840000
7,chr1,64000000,64160000,"(64000000, 0)","(52040000, 0)",11960000.0,160000
8,chr1,109000000,115280000,"(109000000, 0)","(110160000, 0)",1160000.0,6280000
9,chr1,115320000,116000000,"(115320000, 0)","(111440000, 0)",3880000.0,680000


(151, 7)


Unnamed: 0,Chr,Start,End,Coordinate,ClosestTAD,Distance,TAD_Size
0,chr1,720000,2520000,"(720000, 0)","(1320000, 0)",600000.0,1800000
1,chr1,4000000,4680000,"(4000000, 0)","(4080000, 0)",80000.0,680000
2,chr1,4720000,6640000,"(4720000, 0)","(4720000, 0)",0.0,1920000
3,chr1,6680000,7440000,"(6680000, 0)","(5480000, 0)",1200000.0,760000
4,chr1,19520000,21080000,"(19520000, 0)","(19320000, 0)",200000.0,1560000
5,chr1,48600000,49120000,"(48600000, 0)","(48440000, 0)",160000.0,520000
6,chr1,49160000,62160000,"(49160000, 0)","(49440000, 0)",280000.0,13000000
7,chr1,62200000,65880000,"(62200000, 0)","(52040000, 0)",10160000.0,3680000
8,chr1,109000000,115240000,"(109000000, 0)","(110160000, 0)",1160000.0,6240000
9,chr1,115280000,116000000,"(115280000, 0)","(111440000, 0)",3840000.0,720000


In [32]:
#keep only the TADs whose Distance to a conserved DCI is at most 1Mb
max_distance = 1000000
closeTADs_pgclc1_sub = closeTADs_pgclc1.loc[closeTADs_pgclc1['Distance'] <= max_distance]
closeTADs_pgclc2_sub = closeTADs_pgclc2.loc[closeTADs_pgclc2['Distance'] <= max_distance]
closeTADs_pgclc3_sub = closeTADs_pgclc3.loc[closeTADs_pgclc3['Distance'] <= max_distance]

In [33]:
#show the shape of each filtered table and its ten rows with the smallest Distance
for filtered_tads in (closeTADs_pgclc1_sub, closeTADs_pgclc2_sub, closeTADs_pgclc3_sub):
    print(filtered_tads.shape)
    nearest_first = filtered_tads.sort_values(by='Distance')
    display(nearest_first.head(10))

(96, 7)


Unnamed: 0,Chr,Start,End,Coordinate,ClosestTAD,Distance,TAD_Size
108,chr11,1200000,3240000,"(1200000, 0)","(1200000, 0)",0.0,2040000
76,chr7,68720000,69320000,"(68720000, 0)","(68720000, 0)",0.0,600000
20,chr2,85600000,87040000,"(85600000, 0)","(85600000, 0)",0.0,1440000
123,chr12,128440000,128840000,"(128440000, 0)","(128440000, 0)",0.0,400000
70,chr7,56360000,56960000,"(56360000, 0)","(56320000, 0)",40000.0,600000
2,chr1,4760000,6640000,"(4760000, 0)","(4720000, 0)",40000.0,1880000
98,chr10,37640000,38800000,"(37640000, 0)","(37680000, 0)",40000.0,1160000
5,chr1,48400000,49640000,"(48400000, 0)","(48440000, 0)",40000.0,1240000
6,chr1,49680000,63120000,"(49680000, 0)","(49720000, 0)",40000.0,13440000
72,chr7,57400000,58000000,"(57400000, 0)","(57360000, 0)",40000.0,600000


(85, 7)


Unnamed: 0,Chr,Start,End,Coordinate,ClosestTAD,Distance,TAD_Size
137,chr18,13040000,13760000,"(13040000, 0)","(13040000, 0)",0.0,720000
103,chr11,1200000,3240000,"(1200000, 0)","(1200000, 0)",0.0,2040000
88,chr9,16200000,17560000,"(16200000, 0)","(16200000, 0)",0.0,1360000
72,chr7,68080000,69120000,"(68080000, 0)","(68080000, 0)",0.0,1040000
118,chr12,128440000,130520000,"(128440000, 0)","(128440000, 0)",0.0,2080000
119,chr13,19160000,39320000,"(19160000, 0)","(19160000, 0)",0.0,20160000
78,chr8,43120000,43800000,"(43120000, 0)","(43120000, 0)",0.0,680000
84,chr8,122600000,126760000,"(122600000, 0)","(122640000, 0)",40000.0,4160000
2,chr1,4640000,6760000,"(4640000, 0)","(4600000, 0)",40000.0,2120000
141,chr19,23920000,24560000,"(23920000, 0)","(23880000, 0)",40000.0,640000


(94, 7)


Unnamed: 0,Chr,Start,End,Coordinate,ClosestTAD,Distance,TAD_Size
2,chr1,4720000,6640000,"(4720000, 0)","(4720000, 0)",0.0,1920000
76,chr7,68720000,69400000,"(68720000, 0)","(68720000, 0)",0.0,680000
71,chr7,56320000,56840000,"(56320000, 0)","(56320000, 0)",0.0,520000
106,chr11,1200000,3240000,"(1200000, 0)","(1200000, 0)",0.0,2040000
121,chr12,128440000,128640000,"(128440000, 0)","(128440000, 0)",0.0,200000
122,chr12,128680000,130520000,"(128680000, 0)","(128720000, 0)",40000.0,1840000
72,chr7,57320000,57440000,"(57320000, 0)","(57360000, 0)",40000.0,120000
38,chr3,48680000,50320000,"(48680000, 0)","(48640000, 0)",40000.0,1640000
96,chr10,37640000,38800000,"(37640000, 0)","(37680000, 0)",40000.0,1160000
80,chr8,4240000,5000000,"(4240000, 0)","(4320000, 0)",80000.0,760000


In [34]:
#write the filtered TAD tables to tab-separated files, one per replicate
#NOTE(review): every to_csv call below is commented out, so this cell currently writes nothing;
#presumably disabled to avoid overwriting existing outputs - confirm before re-enabling
# closeTADs_pgclc1_sub.to_csv('../output/circa/pgclc1_TADs_close_to_DCIs.txt', 
#                             header=True, index=None, sep='\t')
# closeTADs_pgclc2_sub.to_csv('../output/circa/pgclc2_TADs_close_to_DCIs.txt', 
#                             header=True, index=None, sep='\t')
# closeTADs_pgclc3_sub.to_csv('../output/circa/pgclc3_TADs_close_to_DCIs.txt', 
#                             header=True, index=None, sep='\t')