### This notebook generates a new masterlist in which each DHS is assigned "ownership" to one or more genes (comma-separated).

### The model used to assign ownership of DHSs to genes was chosen through a parameter search that optimizes "completeness" and "uniqueness", i.e. assigning as many DHSs as possible with the lowest degeneracy of DHS-gene ownership.

### The winning set of parameters was "model 5".

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated FIMO motif-cluster table.
# NOTE(review): later cells compare/sum FimoDF.values, so every column is presumably numeric — confirm.
fimo_table_path = '20619_FIMOtable_clusters.txt'
FimoDF = pd.read_csv(fimo_table_path, sep='\t')


In [3]:
# For every row of FimoDF, count how many columns hold a value greater than zero.
# (Rows are presumably DHSs and columns motif clusters, going by the variable names.)
motif_present_mask = FimoDF.values > 0
num_motifclusters_present_per_DHS = motif_present_mask.sum(axis=1)

In [4]:
# For every row of FimoDF, add up the values across all columns.
num_motifclusters_total_per_DHS = FimoDF.values.sum(axis=1)

In [6]:
# Load the headerless, tab-separated DHS master list.
# The first four columns are named chr/start/end/id; the remaining seven are labelled ML4..ML10.
ml_columns = ['chr', 'start', 'end', 'id'] + ['ML' + str(k) for k in range(4, 11)]
ML = pd.read_csv(
    'masterlist_DHSs_733samples_WM20180608_all_indexIDs.txt',
    sep='\t',
    header=None,
    names=ml_columns,
    dtype=str,  # everything is kept as text; numeric columns are cast explicitly where needed
)


In [15]:
# Preview the first rows of the master list.
# NOTE: the saved output already contains the derived columns (middle, mean_signal, ...)
# because this cell was executed (In [15]) after the cell that adds them (In [7]);
# on a fresh top-to-bottom run only the 11 raw columns exist at this point.
ML.head()

Unnamed: 0,chr,start,end,id,ML4,ML5,ML6,ML7,ML8,ML9,ML10,middle,mean_signal,num_motifcats_present,num_motif_total,kbp
0,chr1,16140,16200,1.10011,0.129388,1,1,60,16170,16170,16170,16170.0,0.129388,9,13,0.06
1,chr1,51868,52040,1.10021,0.0800339,1,1,172,51970,51970,51970,51954.0,0.080034,3,18,0.172
2,chr1,57280,57354,1.10025,1.093002,4,4,74,57350,57350,57350,57317.0,0.273251,1,3,0.074
3,chr1,66370,66482,1.10027,1.4697246,8,8,112,66430,66410,66430,66426.0,0.183716,0,0,0.112
4,chr1,79100,79231,1.1003,0.226098,2,2,131,79150,79150,79150,79165.5,0.113049,4,5,0.131


In [7]:
# Add derived per-DHS columns to the master list (ML is modified in place).
# start/end were read as strings, so cast them to integers once and reuse.
start_bp = ML['start'].values.astype(int)
end_bp = ML['end'].values.astype(int)

ML['middle'] = (start_bp + end_bp) / 2                      # midpoint of the interval
ML['mean_signal'] = ML['ML4'].values.astype(float) / ML['ML5'].values.astype(float)
# Motif counts are attached positionally, so FimoDF must list rows in the same order as ML.
ML['num_motifcats_present'] = num_motifclusters_present_per_DHS
ML['num_motif_total'] = num_motifclusters_total_per_DHS
ML['kbp'] = (end_bp - start_bp) / 1e3                       # interval width in kb

# Chromosomes processed by the rest of the notebook: chr1..chr22, chrX, chrY.
chr_list = ['chr' + str(n) for n in range(1, 23)]
chr_list = chr_list + ['chrX', 'chrY']

In [10]:
# Read one gene/DHS ownership table per chromosome (the 'set5' files), tag every row
# with the chromosome it came from, and stack all tables into a single frame.
ownedDHS_DF_ar = []
for the_chr in chr_list:
    finname = ''.join(['81519_fiducial/81519_fine_fiducialgene_DHS_inds_set5_', the_chr, '.csv'])
    tempDF = pd.read_csv(finname, sep='\t')
    # one chromosome label per row of this table
    tempDF['chrom'] = [the_chr for _ in range(tempDF.shape[0])]
    ownedDHS_DF_ar.append(tempDF)

ownedDHS_DF = pd.concat(ownedDHS_DF_ar, axis=0)


In [11]:
# Preview the combined ownership table: one row per (gene, DHSind) pair plus its chromosome.
ownedDHS_DF.head()

Unnamed: 0,gene,DHSind,chrom
0,SCYL3,205642,chr1
1,SCYL3,205644,chr1
2,SCYL3,205645,chr1
3,SCYL3,205651,chr1
4,SCYL3,205653,chr1


### Now I will convert these to ML coordinates
How? I think I'll just do it very inefficiently.

In [12]:
# Restrict the ownership table to chrY, used as a small test case for the conversion.
ownedDHS_DFY = ownedDHS_DF.loc[ownedDHS_DF['chrom'] == 'chrY']

In [14]:
# Number of (gene, DHS) ownership rows on chrY.
ownedDHS_DFY.shape

(746, 3)

In [16]:
# Boolean row mask selecting the chrY entries of the master list.
MLcutY = ML['chr'].eq('chrY')

In [18]:
# Size of the chrY slice of the master list (rows = DHSs on chrY).
ML[MLcutY].shape

(7761, 16)

In [22]:
# Display the permutation of row positions that would sort the chrY DHSind values.
# The full array is printed, so the output is long.
np.argsort(ownedDHS_DFY['DHSind'].values)

array([611, 612, 613, 614, 615, 309, 310, 311,  34, 312,  35,  36, 313,
        37, 314, 315,  38, 316, 317,  39, 318,  40, 319,  41, 320,  42,
        43, 321,  44,  45, 322, 323,  46,  47, 324, 325,  48, 326,  49,
       327,  50,  51, 328,  52,  53, 329, 330,  54, 331,  55,  56, 332,
       333,  57, 334,  58, 335,  59,  60, 336,  61, 337,  62, 338, 339,
        63, 340, 341,  64, 342,  65, 343,  66,  67, 344,  68, 345, 346,
        69, 347,  70, 348, 349,  71, 350,  72, 351,  73, 352,  74, 353,
        75, 354,  76,  77, 355,  78, 356,  79,  80,  81,  82,  83,  84,
        85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
       529, 530, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
       183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
       196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
       209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
       222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 23

In [63]:
# Map each chrY DHS index that has at least one owner to a comma-separated string of
# the owning gene names, in the order the rows appear in ownedDHS_DFY.
# The two columns are iterated directly instead of building a Series per row with
# .iloc, which gives the same dict far more cheaply.
# Note: the following cell rebuilds ind_dict from scratch, so this sparse version only
# matters when this cell is run on its own.
ind_dict = {}
for dhs_ind, gene in zip(ownedDHS_DFY['DHSind'].tolist(), ownedDHS_DFY['gene'].tolist()):
    if dhs_ind in ind_dict:
        ind_dict[dhs_ind] += ','+gene
    else:
        ind_dict[dhs_ind] = gene

In [68]:
# Build ind_dict with one entry per chrY row position of ML:
# '' when no gene owns that DHS, otherwise the owning gene names joined by commas
# in the order the rows appear in ownedDHS_DFY.
n_dhs_chrY = ML[MLcutY].shape[0]
ind_dict = {i: '' for i in range(n_dhs_chrY)}

# Iterate the two columns directly instead of building a Series per row with .iloc.
for dhs_ind, gene in zip(ownedDHS_DFY['DHSind'].tolist(), ownedDHS_DFY['gene'].tolist()):
    # A DHSind outside 0..n_dhs_chrY-1 raises KeyError here, which flags a mismatch
    # between the ownership table and the master list.
    if len(ind_dict[dhs_ind]) > 0:
        ind_dict[dhs_ind] += ','+gene
    else:
        ind_dict[dhs_ind] = gene


In [69]:
# Display the whole mapping; it has one entry per chrY DHS position, so the output is very long.
ind_dict

{0: '',
 1: '',
 2: '',
 3: '',
 4: '',
 5: '',
 6: 'SRY',
 7: 'SRY',
 8: 'SRY',
 9: 'SRY',
 10: 'SRY',
 11: '',
 12: '',
 13: '',
 14: '',
 15: '',
 16: '',
 17: '',
 18: '',
 19: '',
 20: '',
 21: '',
 22: '',
 23: '',
 24: '',
 25: '',
 26: '',
 27: '',
 28: '',
 29: '',
 30: '',
 31: '',
 32: '',
 33: '',
 34: '',
 35: '',
 36: '',
 37: '',
 38: '',
 39: '',
 40: '',
 41: '',
 42: '',
 43: '',
 44: '',
 45: '',
 46: '',
 47: '',
 48: '',
 49: '',
 50: 'RPS4Y1',
 51: '',
 52: '',
 53: '',
 54: 'RPS4Y1',
 55: 'RPS4Y1',
 56: '',
 57: '',
 58: '',
 59: '',
 60: '',
 61: '',
 62: '',
 63: '',
 64: '',
 65: 'ZFY,RPS4Y1',
 66: 'ZFY',
 67: '',
 68: '',
 69: 'ZFY,RPS4Y1',
 70: '',
 71: 'ZFY,RPS4Y1',
 72: 'RPS4Y1',
 73: 'ZFY,RPS4Y1',
 74: 'ZFY,RPS4Y1',
 75: '',
 76: 'ZFY,RPS4Y1',
 77: 'ZFY,RPS4Y1',
 78: 'ZFY,RPS4Y1',
 79: 'ZFY',
 80: 'ZFY,RPS4Y1',
 81: 'ZFY,RPS4Y1',
 82: '',
 83: '',
 84: 'ZFY,RPS4Y1',
 85: '',
 86: '',
 87: '',
 88: '',
 89: '',
 90: '',
 91: 'ZFY,RPS4Y1',
 92: '',
 93: '',

In [70]:
# Turn the dict into a Series and take positions 0..n-1 in order,
# giving one gene string per chrY row of ML.
pd.Series(ind_dict).iloc[np.arange(ML[MLcutY].shape[0])]

0          
1          
2          
3          
4          
5          
6       SRY
7       SRY
8       SRY
9       SRY
10      SRY
11         
12         
13         
14         
15         
16         
17         
18         
19         
20         
21         
22         
23         
24         
25         
26         
27         
28         
29         
       ... 
7731       
7732       
7733       
7734       
7735       
7736       
7737       
7738       
7739       
7740       
7741       
7742       
7743       
7744       
7745       
7746       
7747       
7748       
7749       
7750       
7751       
7752       
7753       
7754       
7755       
7756       
7757       
7758       
7759       
7760       
Length: 7761, dtype: object

In [72]:
# Spot-check entries 5..8 of the position-ordered gene strings as a plain array.
pd.Series(ind_dict).iloc[np.arange(ML[MLcutY].shape[0])].values[5:9]

array(['', 'SRY', 'SRY', 'SRY'], dtype=object)

In [56]:
# Build mldomains: one comma-separated gene string per chrY row of ML ('' when the
# DHS has no owner), and print the positions of DHSs owned by more than one gene.
#
# This cell was originally run (In [56]) when ind_dict held a list of genes per DHS.
# The cells above now store comma-joined strings, so iterating over a value would walk
# its characters ('SRY' -> 'S,R,Y') and len(...) > 1 would fire for every named DHS.
# Both value shapes are handled here so the cell is correct on a top-to-bottom run.
n_dhs_chrY = ML[MLcutY].shape[0]  # computed once; the original re-filtered ML on every iteration
mldomains = []
for i in range(n_dhs_chrY):
    owners = ind_dict.get(i, '')
    if isinstance(owners, str):
        # string form: '' means no owner, otherwise genes are comma-separated
        owners = owners.split(',') if owners else []
    genestr = ','.join(owners)
    mldomains.append(genestr)
    if len(owners) > 1:
        print(i)

65
69
71
73
74
76
77
78
80
81
84
91
94
96
97
99
100
105
106
108
110
112
113
116
118
120
122
123
124
125
126
127
133
136
138
139
142
143
144
147
148
149
2764
2765
2766
2767
2768
2769
7311
7312
7313
7401
7402
7403
7404
7405
7406
7407
7411


In [39]:
# Sanity check: mldomains should hold one entry per chrY row of ML (compare with the shape shown next).
len(mldomains)

7761

In [40]:
# Row count of the chrY slice of ML, for comparison with len(mldomains).
ML[MLcutY].shape

(7761, 16)

In [41]:
# Work on an independent copy of the chrY rows so ML itself is left unchanged.
sting = ML.loc[MLcutY].copy()

In [42]:
# Attach the gene strings as a new column; the assignment is positional, so
# mldomains must be in the same row order as sting.
sting['genes'] = mldomains

In [57]:
# Inspect rows 50..74 of the annotated chrY table.
sting.iloc[50:75]


Unnamed: 0,chr,start,end,id,ML4,ML5,ML6,ML7,ML8,ML9,ML10,middle,mean_signal,num_motifcats_present,num_motif_total,kbp,genes
3584187,chrY,2839700,2839880,Y.14447,2.43262,2,2,180,2839790,2839790,2839790,2839790.0,1.21631,3,5,0.18,RPS4Y1
3584188,chrY,2840860,2841080,Y.14448,10.2764497,12,12,220,2840970,2840950,2841048,2840970.0,0.856371,11,37,0.22,
3584189,chrY,2841052,2841340,Y.14449,0.9984909,3,3,288,2841170,2841151,2841246,2841196.0,0.33283,7,7,0.288,
3584190,chrY,2841280,2841385,Y.1445,1.281596,5,5,105,2841330,2841309,2841350,2841332.5,0.256319,4,4,0.105,
3584191,chrY,2841380,2841540,Y.14451,4.00007,1,1,160,2841470,2841470,2841470,2841460.0,4.00007,7,20,0.16,RPS4Y1
3584192,chrY,2841497,2841666,Y.14452,657.104254,352,352,169,2841590,2841550,2841650,2841581.5,1.866773,3,10,0.169,RPS4Y1
3584193,chrY,2841540,2841880,Y.14453,4.778257,4,4,340,2841670,2841670,2841670,2841710.0,1.194564,12,35,0.34,
3584194,chrY,2841700,2841884,Y.14455,30.729998,37,37,184,2841790,2841690,2841872,2841792.0,0.83054,12,33,0.184,
3584195,chrY,2841986,2842100,Y.14456,1.356315,4,4,114,2842060,2842032,2842088,2842043.0,0.339079,8,11,0.114,
3584196,chrY,2842046,2842150,Y.14458,0.125105,1,1,104,2842150,2842150,2842150,2842098.0,0.125105,5,6,0.104,


In [73]:
# Annotate the master list with owning genes, one chromosome at a time.
#
# For every chromosome in chr_list:
#   1. take the ownership rows (gene, DHSind) for that chromosome;
#   2. build ind_dict: row position within the chromosome's slice of ML -> comma-separated
#      owning genes ('' when the DHS has no owner);
#   3. copy the chromosome's ML rows and add the strings as a 'genes' column.
# The per-chromosome tables are collected in stinglist.
#
# Compared with the original loop, ML is filtered once per chromosome instead of three
# times, and the ownership columns are iterated directly instead of building a Series
# per row with .iloc. The resulting tables and the printed progress are unchanged.
stinglist = []
for the_chr in chr_list:
    print ('doing ',the_chr)
    ownedDHS_DFchr = ownedDHS_DF[ownedDHS_DF.chrom == the_chr].copy()

    print('doing dict')

    MLcutchr = ML['chr'] == the_chr
    sting = ML[MLcutchr].copy()
    n_dhs_chr = sting.shape[0]

    # DHSind is used as the 0-based row position inside this chromosome's slice of ML.
    ind_dict = {i: '' for i in range(n_dhs_chr)}
    for dhs_ind, gene in zip(ownedDHS_DFchr['DHSind'].tolist(), ownedDHS_DFchr['gene'].tolist()):
        # A DHSind outside 0..n_dhs_chr-1 raises KeyError, exposing a mismatch
        # between the ownership table and the master list.
        if len(ind_dict[dhs_ind]) > 0:
            ind_dict[dhs_ind] += ','+gene
        else:
            ind_dict[dhs_ind] = gene
    print('doing ML locs')
    # Gene strings in row order, matching sting positionally.
    mldomains = np.array([ind_dict[i] for i in range(n_dhs_chr)], dtype=object)
    sting['genes'] = mldomains
    stinglist.append(sting)



doing  chr1
doing dict
doing ML locs
doing  chr2
doing dict
doing ML locs
doing  chr3
doing dict
doing ML locs
doing  chr4
doing dict
doing ML locs
doing  chr5
doing dict
doing ML locs
doing  chr6
doing dict
doing ML locs
doing  chr7
doing dict
doing ML locs
doing  chr8
doing dict
doing ML locs
doing  chr9
doing dict
doing ML locs
doing  chr10
doing dict
doing ML locs
doing  chr11
doing dict
doing ML locs
doing  chr12
doing dict
doing ML locs
doing  chr13
doing dict
doing ML locs
doing  chr14
doing dict
doing ML locs
doing  chr15
doing dict
doing ML locs
doing  chr16
doing dict
doing ML locs
doing  chr17
doing dict
doing ML locs
doing  chr18
doing dict
doing ML locs
doing  chr19
doing dict
doing ML locs
doing  chr20
doing dict
doing ML locs
doing  chr21
doing dict
doing ML locs
doing  chr22
doing dict
doing ML locs
doing  chrX
doing dict
doing ML locs
doing  chrY
doing dict
doing ML locs


In [74]:
# Stack the per-chromosome annotated tables back into a single master list.
# NOTE(review): only chromosomes in chr_list were processed, so any ML rows on other
# chromosomes/contigs would be absent from modML — confirm none exist.
modML = pd.concat(stinglist)

In [76]:
# Preview the first rows of the annotated master list (note the new 'genes' column).
modML.head()

Unnamed: 0,chr,start,end,id,ML4,ML5,ML6,ML7,ML8,ML9,ML10,middle,mean_signal,num_motifcats_present,num_motif_total,kbp,genes
0,chr1,16140,16200,1.10011,0.129388,1,1,60,16170,16170,16170,16170.0,0.129388,9,13,0.06,
1,chr1,51868,52040,1.10021,0.0800339,1,1,172,51970,51970,51970,51954.0,0.080034,3,18,0.172,
2,chr1,57280,57354,1.10025,1.093002,4,4,74,57350,57350,57350,57317.0,0.273251,1,3,0.074,
3,chr1,66370,66482,1.10027,1.4697246,8,8,112,66430,66410,66430,66426.0,0.183716,0,0,0.112,OR4F5
4,chr1,79100,79231,1.1003,0.226098,2,2,131,79150,79150,79150,79165.5,0.113049,4,5,0.131,


In [78]:
# Count how many DHSs carry each gene string; the '' entry counts DHSs with no owning gene.
modML.genes.value_counts()

                                     2495389
RBFOX1                                  1242
PTPRN2                                  1149
CNTNAP2                                 1073
AUTS2                                   1006
RUNX1                                    913
ZFHX3                                    907
ASIC2                                    893
KAZN                                     878
NRXN3                                    826
DLG2                                     817
DLGAP2                                   815
PTPRT                                    813
PTPRD                                    806
MACROD2                                  800
OPCML                                    799
DAB1                                     791
MAGI2                                    777
ANKS1B                                   768
CAMTA1                                   751
WWOX                                     746
PDE4D                                    744
RAD51B    

In [79]:
# Total number of (gene, DHS) ownership rows loaded across all chromosomes.
ownedDHS_DF.shape

(1455279, 3)

In [82]:
# Preview the last rows of the annotated master list (end of chrY).
modML.tail()

Unnamed: 0,chr,start,end,id,ML4,ML5,ML6,ML7,ML8,ML9,ML10,middle,mean_signal,num_motifcats_present,num_motif_total,kbp,genes
3591893,chrY,56882540,56882719,Y.994281,0.0380792,1,1,179,56882610,56882610,56882610,56882629.5,0.038079,6,8,0.179,
3591894,chrY,56882864,56882980,Y.994286,0.115489,1,1,116,56882930,56882930,56882930,56882922.0,0.115489,10,31,0.116,
3591895,chrY,56883733,56883960,Y.994292,2.456885,5,5,227,56883830,56883742,56883870,56883846.5,0.491377,15,22,0.227,
3591896,chrY,56884440,56884580,Y.994297,0.0537589,1,1,140,56884510,56884510,56884510,56884510.0,0.053759,7,7,0.14,
3591897,chrY,56885400,56885520,Y.99435,0.0627187,1,1,120,56885430,56885430,56885430,56885460.0,0.062719,3,6,0.12,


In [83]:
# Write the annotated master list with a header row and without the index column.
# Note the file is tab-separated despite its '.csv' extension.
modML.to_csv('90419_masterlist_with_genes.csv', sep='\t', index=False)


In [84]:
# Write the same table again, tab-separated with no header row or index column, under a .bed name.
# DHSs without an owning gene have an empty last field.
modML.to_csv('90419_masterlist_with_genes.bed', sep='\t', index=False, header=False)
