# Mount my Google Drive and decompress gz

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import gzip
import shutil
with gzip.open('/content/drive/MyDrive/CosmicGenomeScreensMutantExport.tsv.gz', 'rb') as f_in:
    with open('DECOMPcosmic.tsv', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
  
print('Done!')


Done!


# Load Cosmic and merge to its filtered index

In [1]:
import pandas as pd

In [2]:
Cosmic_cols = pd.read_csv('CosmicGenomeScreensMutantExport.tsv', sep='\t', usecols=[1,20]) # 1'Accession Number', 20 'Mutation AA'
Cosmic_cols.shape
# (44398535, 1)

(46212382, 2)

In [3]:
Cosmic_cols.head()

Unnamed: 0,Accession Number,Mutation AA
0,ENST00000354590.7,p.S315Y
1,ENST00000354590.7,p.?
2,ENST00000354590.7,p.Q26*
3,ENST00000354590.7,p.A106V
4,ENST00000354590.7,p.T363P


In [4]:
index_Cosmic = pd.read_csv('Filtered Cosmic Index.csv')
index_Cosmic.set_index(['index'], drop=False, append=False, inplace=True, verify_integrity=True) 
index_Cosmic.shape
# (9539276, 1)

(10378441, 1)

In [5]:
index_Cosmic

Unnamed: 0_level_0,index
index,Unnamed: 1_level_1
0,0
3,3
4,4
10,10
11,11
...,...
46212358,46212358
46212365,46212365
46212372,46212372
46212375,46212375


In [6]:
df_Cosmic = Cosmic_cols.merge(index_Cosmic, how='inner', left_index=True, right_index=True, validate='1:1' )
df_Cosmic.shape
# (9539276, 2)

(10378441, 3)

In [7]:
df_Cosmic

Unnamed: 0,Accession Number,Mutation AA,index
0,ENST00000354590.7,p.S315Y,0
3,ENST00000354590.7,p.A106V,3
4,ENST00000354590.7,p.T363P,4
10,ENST00000539214.5,p.P268S,10
11,ENST00000539214.5,p.W40C,11
...,...,...,...
46212358,ENST00000642797.1,p.D501E,46212358
46212365,ENST00000646465.1,p.H1486Q,46212365
46212372,ENST00000321919.13,p.L287I,46212372
46212375,ENST00000506092.6,p.P1016L,46212375


# Add Uniprot ID to cosmic

In [8]:
mappit = pd.read_csv('ID_mapping.txt', sep='\t', header=0, usecols=[2,3,4,])
# Mapping data from BioMart.

In [9]:
mappit.head(5)

Unnamed: 0,UniProtKB isoform ID,UniProtKB/Swiss-Prot ID,Transcript stable ID version
0,,P03886,ENST00000361390.2
1,,P03891,ENST00000361453.3
2,,P00395,ENST00000361624.2
3,,P00403,ENST00000361739.1
4,,P03928,ENST00000361851.1


In [10]:
mappit.shape
# (116677, 6)

(116677, 3)

In [11]:
# Take only the entries with no alternative isoforms- so only 
# keep cannonical versions of the prot.
# mappit = mappit[mappit['UniProtKB isoform ID'].isnull()]

In [12]:
# mappit.shape
# (83632, 6)

In [13]:
# Keep only transcripts that correspond to Unique protein IDs- bear in mind more than 
# one transcript might encode for the same prot (think silent mutations...)
mappit.dropna(axis = 0, subset = ['UniProtKB/Swiss-Prot ID'], inplace = True)

In [14]:
mappit.shape
# (14,433,  6)
# (47,478,  3) Using non canonical.

(47478, 3)

In [15]:
mappit = mappit.rename(columns={'Transcript stable ID version': 'trans_version'})
df_Cosmic = df_Cosmic.rename(columns={'Accession Number': 'trans_version'})
# Homogenize column name to use for merging both df's on it.

In [16]:
df_Cosmic = df_Cosmic.merge(mappit, how='inner', on='trans_version', validate='m:m' )
# New, merged df.

# Load res phspsites and join/clean

In [17]:
phosphodata = pd.read_csv('resolution filtered pdb pfam mapped phosphodata.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [18]:
print(phosphodata.columns, df_Cosmic.columns)

Index(['UP_POS', 'Modification_name', 'PFAM_NAME', 'PDB ID', 'uniprot_acc',
       'pfamA_acc', 'seq_start', 'seq_end', 'Resolution (Å)', 'Domain size'],
      dtype='object') Index(['trans_version', 'Mutation AA', 'index', 'UniProtKB isoform ID',
       'UniProtKB/Swiss-Prot ID'],
      dtype='object')


In [19]:
print(phosphodata.shape, df_Cosmic.shape)

(3010568, 10) (4504491, 5)


In [20]:
df_Cosmic

Unnamed: 0,trans_version,Mutation AA,index,UniProtKB isoform ID,UniProtKB/Swiss-Prot ID
0,ENST00000354590.7,p.S315Y,0,,Q96K62
1,ENST00000354590.7,p.A106V,3,,Q96K62
2,ENST00000354590.7,p.T363P,4,,Q96K62
3,ENST00000354590.7,p.R70S,42,,Q96K62
4,ENST00000354590.7,p.E254K,43,,Q96K62
...,...,...,...,...,...
4504486,ENST00000637303.1,p.M79I,40240094,,A0A1B0GUC4
4504487,ENST00000380742.8,p.Q252K,39816614,Q16637-2,Q16637
4504488,ENST00000518075.1,p.K7N,43356033,Q9HD64-2,Q9HD64
4504489,ENST00000382287.5,p.T3M,43933298,,O14599


In [24]:
phosphodata

Unnamed: 0,UP_POS,Modification_name,PFAM_NAME,PDB ID,uniprot_acc,pfamA_acc,seq_start,seq_end,Resolution (Å),Domain size
0,39,PhosphoS,14-3-3,2bq0,P31946,PF00244,11,231,2.5,220
1,39,PhosphoS,14-3-3,2bq0,P31946,PF00244,11,231,2.5,220
2,39,PhosphoS,14-3-3,2c23,P31946,PF00244,11,231,2.65,220
3,39,PhosphoS,14-3-3,4dnk,P31946,PF00244,11,231,2.2,220
4,39,PhosphoS,14-3-3,4dnk,P31946,PF00244,11,231,2.2,220
...,...,...,...,...,...,...,...,...,...,...
3010563,135,PhosphoS,Myb_DNA-binding,2yum,Q8IYH5,PF00249,652,703,,51
3010564,260,PhosphoS,Myb_DNA-binding,2yum,Q8IYH5,PF00249,652,703,,51
3010565,391,PhosphoS,Myb_DNA-binding,2yum,Q8IYH5,PF00249,652,703,,51
3010566,606,PhosphoS,Myb_DNA-binding,2yum,Q8IYH5,PF00249,652,703,,51


In [21]:
# def to_int(x):
#     try:
#         return int(x[3:-1])
#     except:
#         return(int(x[4:-1]))

def to_int(x):
  return int(x[3:-1])

In [22]:
def format_selenocysteine(x):
    if x[2:5] != ('Sec'):
        return x
    elif x[2:5] == ('Sec'):
        return x.replace(x[2:5] , 'U')

df_Cosmic['Mutation AA'] = df_Cosmic['Mutation AA'].apply(format_selenocysteine)

In [27]:
def preprocess(pfam_phpsites):
    
    pfam_phpsites = pfam_phpsites.rename(columns={'uniprot_acc': 'Acc ID'})
    df = df_Cosmic.merge(pfam_phpsites, how='inner', on='Acc ID', sort=False, indicator=False, copy=False, validate='m:m' )
    df['Mutation AA_sliced'] = df['Mutation AA'].apply(lambda x: to_int(x))
    df['Distance'] = df['UP_POS'] - df['Mutation AA_sliced']
    df = df[df['Distance'] < 6]
    df = df[df['Distance'] > -6]
    df.to_csv("final.csv", mode="a", header=False, index=False)

reader = pd.read_csv("resolution filtered pdb pfam mapped phosphodata.csv", chunksize=1000)
df_Cosmic = df_Cosmic.rename(columns={'UniProtKB/Swiss-Prot ID': 'Acc ID'})
i = 0
for r in reader:
    if i%20==0:
        print(i)
    i += 1
    preprocess(r) 

0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460
1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000


In [58]:
#Mutation AA, ACC number
#takes long time to run
def domain_mapping(Cosmic,pfam_phpsites):
    
    Cosmic = Cosmic.rename(columns={'UniProtKB/Swiss-Prot ID': 'Acc ID'})
    pfam_phpsites = pfam_phpsites.rename(columns={'uniprot_acc': 'Acc ID'})
    
    n = 100  #chunk row size
    list_df = [Cosmic[i:i+n] for i in range(0,Cosmic.shape[0],n)]
    for idx,df in enumerate(list_df):
        df = df.merge(pfam_phpsites, how='inner', on='Acc ID', sort=False, indicator=False, copy=False, validate='m:m' )
        df['Mutation AA_sliced'] = df['Mutation AA'].apply(lambda x: to_int(x))
        df['Distance'] = df['UP_POS'] - df['Mutation AA_sliced']
        df = df[df['Distance'] < 6]
        df = df[df['Distance'] > -6]
        list_df[idx] = df
        
    return pd.concat(list_df)

In [59]:
cosmic = domain_mapping(df_Cosmic,phosphodata)

MemoryError: Unable to allocate 4.27 GiB for an array with shape (4, 143429040) and data type int64

In [None]:
print(cosmic.columns, cosmic.shape)
# (248587, 16)

In [None]:
cosmic.sort_values('Resolution (Å)', ascending=True).drop_duplicates(['UP_POS','uniprot_acc','pfamA_acc'],keep='first')

In [None]:
cosmic.to_csv('/content/drive/My Drive/almost final domains dataset.csv', header=True, index=False)

In [None]:
# resolutions_unique = cosmic.sort_values('Resolution (Å)', ascending=True).drop_duplicates(['UP_POS','uniprot_acc','pfamA_acc'],keep='first')

In [None]:
# cosmic = cosmic[['index', 'UniProtKB isoform ID',
#        'Acc ID', 'UP_POS', 'Modification_name', 'PFAM_NAME', 'PDB ID',
#        'pfamA_acc', 'seq_start', 'seq_end', 'Resolution (Å)', 'Domain size',
#        'Distance']]

In [None]:
# cosmic.head(50)

In [None]:
# def check_domain_in_boundary(df):
#     return df[df.apply(lambda x: (x['UP_POS'] >= x['seq_start']) & (x['UP_POS']<= x['seq_end']), axis=1)]

# inside_domain = check_domain_in_boundary(cosmic)

In [None]:
# inside_domain.shape

In [None]:
# prot = cosmic[cosmic['Acc ID'] ==  'O75874']

In [1]:
import pandas as pd

In [2]:
final = pd.read_csv('final.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
cols = ['Transcript','Mutation AA','index','uniprot_isoform','uniprotID','position_scop3p','php_relation_AA','PFAM_name','PFAM_ID','pfamA_acc','seq_start','seq_end','Resolution (Å)','Domain size','position_cosmic','distance']

In [5]:
final.columns = cols

In [6]:
final.head()

Unnamed: 0,Transcript,Mutation AA,index,uniprot_isoform,uniprotID,position_scop3p,php_relation_AA,PFAM_name,PFAM_ID,pfamA_acc,seq_start,seq_end,Resolution (Å),Domain size,position_cosmic,distance
0,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,3ual,PF00244,10,232,1.8,222,34,4
1,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,3ubw,PF00244,10,232,1.9,222,34,4
2,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,6eih,PF00244,10,232,2.7,222,34,4
3,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,7c8e,PF00244,10,232,3.16,222,34,4
4,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,7c8e,PF00244,10,232,3.16,222,34,4


In [7]:
#final.to_csv('final.csv')

In [None]:
final[['Resolution (Å)']] = final[['Resolution (Å)']].fillna(100)

In [20]:
tmp = []
for i,v in enumerate(final['Resolution (Å)']):
    tmp.append(float(str(v).split(',')[0]))
    

In [21]:
final['Resolution (Å)'] = tmp

In [22]:
final

Unnamed: 0,Transcript,Mutation AA,index,uniprot_isoform,uniprotID,position_scop3p,php_relation_AA,PFAM_name,PFAM_ID,pfamA_acc,seq_start,seq_end,Resolution (Å),Domain size,position_cosmic,distance
0,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,3ual,PF00244,10,232,1.80,222,34,4
1,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,3ubw,PF00244,10,232,1.90,222,34,4
2,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,6eih,PF00244,10,232,2.70,222,34,4
3,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,7c8e,PF00244,10,232,3.16,222,34,4
4,ENST00000571732.5,p.R34G,746719,P62258-2,P62258,38,PhosphoT,14-3-3,7c8e,PF00244,10,232,3.16,222,34,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38421722,ENST00000554173.1,p.S61G,38311653,,Q9UK55,61,PhosphoS,Serpin,3h5c,PF00079,77,441,3.26,364,61,0
38421723,ENST00000554173.1,p.E57K,44485870,,Q9UK55,56,PhosphoS,Serpin,3f1s,PF00079,77,441,2.30,364,57,-1
38421724,ENST00000554173.1,p.E57K,44485870,,Q9UK55,56,PhosphoS,Serpin,3h5c,PF00079,77,441,3.26,364,57,-1
38421725,ENST00000554173.1,p.E57K,44485870,,Q9UK55,61,PhosphoS,Serpin,3f1s,PF00079,77,441,2.30,364,57,4


In [24]:
final.sort_values('Resolution (Å)', ascending=True).drop_duplicates(['position_scop3p','uniprotID','pfamA_acc'],keep='first')

Unnamed: 0,Transcript,Mutation AA,index,uniprot_isoform,uniprotID,position_scop3p,php_relation_AA,PFAM_name,PFAM_ID,pfamA_acc,seq_start,seq_end,Resolution (Å),Domain size,position_cosmic,distance
5745503,ENST00000447182.6,p.M91T,2614700,O00560-2,O00560,88,PhosphoS,PDZ,1r6j,PF00595,198,270,0.73,72,91,-3
5740906,ENST00000447182.6,p.H41Q,35580882,O00560-2,O00560,46,PhosphoY,PDZ,1r6j,PF00595,198,270,0.73,72,41,5
5747388,ENST00000413219.6,p.M92T,30431617,O00560-1,O00560,91,PhosphoY,PDZ,1r6j,PF00595,114,191,0.73,77,92,-1
5746074,ENST00000424270.6,p.R61C,9628125,O00560-3,O00560,56,PhosphoY,PDZ,1r6j,PF00595,114,191,0.73,77,61,-5
1760490,ENST00000366844.7,p.K483N,39486804,Q8N8S7-1,Q8N8S7,487,PhosphoT,WH1,7a5m,PF00568,1,108,0.78,107,483,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1736401,ENST00000236957.9,p.A82V,17262534,,P24534,79,PhosphoY,EF1_GNE,1b64,PF00736,141,225,100.00,84,82,-3
1736333,ENST00000392221.5,p.L123V,39503914,,P24534,128,PhosphoS,EF1_GNE,1b64,PF00736,141,225,100.00,84,123,5
1736331,ENST00000392221.5,p.S112N,39271865,,P24534,112,PhosphoS,EF1_GNE,1b64,PF00736,141,225,100.00,84,112,0
1736328,ENST00000392221.5,p.G105V,34334397,,P24534,106,PhosphoS,EF1_GNE,1b64,PF00736,141,225,100.00,84,105,1


In [26]:
data = pd.read_csv('original canonical only merged dataframe.csv')

In [27]:
data

Unnamed: 0,Mutation AA,index,ACC_ID,UP_POS,Modification_name,sliced AA,Distance
0,p.P359L,1057,Q9UQF2,355,PhosphoS,359,-4
1,p.S895F,1591,Q69YQ0,893,PhosphoS,895,-2
2,p.S45P,1758,P35222,45,PhosphoS,45,0
3,p.T338I,2369,Q9H0H5,342,PhosphoT,338,4
4,p.S45P,3130,P35222,45,PhosphoS,45,0
...,...,...,...,...,...,...,...
57836,p.S33P,44393309,P35222,30,PhosphoY,33,-3
57837,p.P1746S,44393579,Q14686,1749,PhosphoT,1746,3
57838,p.P1746S,44393579,Q14686,1751,PhosphoS,1746,5
57839,p.E382K,44396386,P08172,380,PhosphoS,382,-2
