# Mount my Google Drive and decompress gz

In [None]:
# Colab helper: mounts Google Drive at /content/drive; the input files read
# further down live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import gzip
import shutil
# Stream-decompress the gzipped COSMIC export from Drive into DECOMPcosmic.tsv
# in the current working directory.
compressed_path = '/content/drive/MyDrive/CosmicGenomeScreensMutantExport.tsv.gz'
with gzip.open(compressed_path, 'rb') as f_in, open('DECOMPcosmic.tsv', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

print('Done!')


Done!


# COSMIC DF

### Load tsv as pandas dataframe (minimal columns)

In [2]:
import pandas as pd

In [3]:
# df_Cosmic = pd.read_csv('/content/DECOMPcosmic.tsv', sep='\t', usecols=[30], chunksize=100)
# sample = df_Cosmic.get_chunk()
# sample = sample.loc[sample['Mutation somatic status'].str.contains('somatic', case=False, na=False)]
# sample.head(40)

# Load only the columns needed for filtering. They are selected by NAME rather
# than by position: the notes in this notebook show the somatic-status column at
# index 30 for one export and 29 for another, so positional indices silently
# pick the wrong columns when the export layout changes.
# NOTE(review): the decompression cell writes 'DECOMPcosmic.tsv', while this
# cell reads 'CosmicGenomeScreensMutantExport.tsv' -- confirm which file is
# meant to be present in the working directory.
COSMIC_COLUMNS = [
    'Primary site',
    'Primary histology',
    'Mutation AA',
    'Mutation Description',
    'Mutation somatic status',
]
df_Cosmic = pd.read_csv('CosmicGenomeScreensMutantExport.tsv', sep='\t', usecols=COSMIC_COLUMNS)
# df_Cosmic['Mutation somatic status'].tail(20)
# df_Cosmic['Mutation AA'].head(35)

In [4]:
# Turn the current row labels into a regular 'index' column so every record
# keeps its original row number through the filters below (this is the column
# written out by the to_csv cell later on).
# Caveat: not idempotent -- running this cell a second time adds another
# index column instead of leaving the frame unchanged.
df_Cosmic = df_Cosmic.reset_index()

In [5]:
print(df_Cosmic.shape, df_Cosmic.columns)

#(44,398,535  ,    7) Index([0'Gene name', 1'Accession Number', 7'Primary site', 11'Primary histology',
      #  20'Mutation AA', 21'Mutation Description', 30'Mutation somatic status'],
      # dtype='object')

(46212382, 6) Index(['index', 'Primary site', 'Primary histology', 'Mutation AA',
       'Mutation Description', 'Mutation somatic status'],
      dtype='object')


In [7]:
# Keep only mutations whose reference residue -- the character right after the
# "p." prefix, e.g. the S in "p.S315Y" -- is S, T or Y.
# The vectorised string accessor replaces a per-row Python lambda: it is much
# faster on a frame of this size and treats missing or too-short values as
# "no match" instead of raising.
df_Cosmic = df_Cosmic[df_Cosmic['Mutation AA'].str[2].isin(['S', 'T', 'Y'])]

In [8]:
# Display the rows whose mutant residue (last character of 'Mutation AA') is
# NOT S, T or Y. Vectorised so that missing values do not raise; such rows are
# simply shown as "not S/T/Y".
df_Cosmic[~df_Cosmic['Mutation AA'].str[-1].isin(['S', 'T', 'Y'])]

Unnamed: 0,index,Primary site,Primary histology,Mutation AA,Mutation Description,Mutation somatic status
4,4,stomach,carcinoma,p.T363P,Substitution - Missense,Confirmed somatic variant
54,54,liver,carcinoma,p.Y71C,Substitution - Missense,Confirmed somatic variant
76,76,oesophagus,carcinoma,p.S742F,Substitution - Missense,Confirmed somatic variant
118,118,lung,carcinoma,p.S1364C,Substitution - Missense,Confirmed somatic variant
120,120,biliary_tract,carcinoma,p.S942R,Substitution - Missense,Confirmed somatic variant
...,...,...,...,...,...,...
46212234,46212234,kidney,carcinoma,p.T4198A,Substitution - Missense,Confirmed somatic variant
46212240,46212240,skin,carcinoma,p.S829F,Substitution - Missense,Confirmed somatic variant
46212296,46212296,skin,carcinoma,p.S772L,Substitution - Missense,Confirmed somatic variant
46212337,46212337,large_intestine,carcinoma,p.S37F,Substitution - Missense,Confirmed somatic variant


### Filter the dataset

Use only missense (substitution) mutations; filter out the rest.

In [6]:
# Restrict the frame to missense substitutions: case-insensitive substring
# match on the description, with missing descriptions treated as non-matching.
is_missense = df_Cosmic['Mutation Description'].str.contains('missense', case=False, na=False)
df_Cosmic = df_Cosmic.loc[is_missense]
# Shape noted from an earlier run: (10,726,238, 7)

In [6]:
df_Cosmic.head()

Unnamed: 0,index,Primary site,Primary histology,Mutation AA,Mutation Description,Mutation somatic status
0,0,endometrium,carcinoma,p.S315Y,Substitution - Missense,Confirmed somatic variant
3,3,endometrium,carcinoma,p.A106V,Substitution - Missense,Confirmed somatic variant
4,4,stomach,carcinoma,p.T363P,Substitution - Missense,Confirmed somatic variant
10,10,thyroid,carcinoma,p.P268S,Substitution - Missense,Confirmed somatic variant
11,11,skin,malignant_melanoma,p.W40C,Substitution - Missense,Confirmed somatic variant


Use only somatic mutations, filter out the rest





In [7]:
# Keep rows whose somatic-status text mentions "somatic" (case-insensitive);
# rows with a missing status are dropped.
is_somatic = df_Cosmic['Mutation somatic status'].str.contains('somatic', case=False, na=False)
df_Cosmic = df_Cosmic.loc[is_somatic]
df_Cosmic.shape
# Shape noted from an earlier run: (9,540,587, 7)

(10381425, 6)

Delete NS rows

In [8]:
# Remove records whose primary site, and then whose primary histology,
# contains the marker 'NS'.
site_known = ~df_Cosmic['Primary site'].str.contains('NS')
df_Cosmic = df_Cosmic.loc[site_known]
histology_known = ~df_Cosmic['Primary histology'].str.contains('NS')
df_Cosmic = df_Cosmic.loc[histology_known]
df_Cosmic.shape

# Shape noted from an earlier run: (9382202, 5)

(10219901, 6)

Clean out some oddball mutations (delins, Sec{selenocysteine})

Deletion-insertions (delins) are cases where two or more consecutive amino acids are replaced by other ones, without this being a frameshift or a conversion. What looks like "ec" in these annotations is really "Sec" (selenocysteine, the 21st amino acid). "fs" denotes a frameshift.

In [9]:
# Drop protein-level annotations that are not plain single-residue
# substitutions. A row is removed if 'Mutation AA' contains ANY of the markers
# below. The markers are joined into one regex alternation so the column is
# scanned once rather than six times; the set of rows removed is the same as
# with six consecutive filters.
ODDBALL_MARKERS = ['delins', 'Sec', 'fs', 'de', 'ins', 'du']
is_oddball = df_Cosmic['Mutation AA'].str.contains('|'.join(ODDBALL_MARKERS))
df_Cosmic = df_Cosmic.loc[~is_oddball]
# df_Cosmic = df_Cosmic[df_Cosmic['Mutation AA'].apply(lambda x: x[2] in ['S','T','Y'])]

df_Cosmic.shape
# Shapes noted from earlier runs:
# (9539287, 7) before filtering out the fs's
# (9380961, 5) after these myriad filters
# (1339030, 5) only STY amino acids and after all these filters
# (1339030, 5) only STY amino acids and after all these filters

(10216983, 6)

### Generate a column with the current index, to use as index for further merging and duplicate deletion.
When merging 2 dataframes on a column which isn't the index, the index is reset... but keeping it will be useful for adding data (from yet unused columns) on the fly.

In [14]:
#df_Cosmic['indice'] = df_Cosmic.index

In [14]:
# Sanity check: row count per mutation description. After the filters above,
# only 'Substitution - Missense' is expected to remain.
df_Cosmic.groupby('Mutation Description').size()

Mutation Description
Substitution - Missense    10216983
dtype: int64

In [10]:
# Write only the preserved 'index' column of the surviving rows (with a header,
# without the pandas row labels) -- presumably so other columns can be
# re-attached later by original row number.
# NOTE(review): the file name says "STY Filtered", but the S/T/Y line in the
# oddball-cleaning cell is commented out; confirm the separate S/T/Y filter
# cell was run before this one.
df_Cosmic.to_csv('STY Filtered Cosmic Index.csv',columns=['index'], header=True, index=False)

# Add columns on index!

Load csv with desired columns, merge on index to the pre-filtered Cosmic df.

In [None]:
# mets_Cosmic = pd.read_csv('/content/DECOMPcosmic.tsv', sep='\t', usecols=[34]) #tumour origin (metastasis, primary)
# mets_Cosmic.head(10)

In [None]:
# col_Cosmic = pd.read_csv('/content/DECOMPcosmic.tsv', sep='\t', usecols=[0]) # gene name
# col_Cosmic.head(10)
# df_Cosmic.columns

Index(['Primary site', 'Primary histology', 'Mutation AA',
       'Mutation Description', 'Mutation somatic status', 'indice',
       'Accession Number', 'Gene name'],
      dtype='object')

In [None]:
# enhanced_Cosmic = df_Cosmic.merge(mets_Cosmic, how='inner', left_index=True, right_index=True, validate='1:1' )
# enhanced_Cosmic.shape
# (9540587, 8)
# enhanced_Cosmic.head(3)


# df_Cosmic = df_Cosmic.merge(col_Cosmic, how='inner', left_index=True, right_index=True, validate='1:1' )
# df_Cosmic.shape

(9380985, 8)

# Alternate phosphodata with Pfam domain info

In [None]:
# Load the PDB-to-Pfam mapping (tab-separated). header=1 takes the second line
# of the file as the column header. Per the column listing noted in the next
# cell, positions 4, 5, 10, 11, 12 are PFAM_ACCESSION, PFAM_NAME,
# UNIPROT_ACCESSION, UNP_START and UNP_END.
domain_mapper = pd.read_csv('/content/drive/MyDrive/pdb_pfam_mapping.txt', sep='\t', header=1, usecols=[4,5,10,11,12])

domain_mapper.shape
# (720721, 5) original size, limited columns

(720721, 5)

In [None]:
# domain_mapper.columns
# Index([0'PDB', 1'CHAIN', 2'PDB_START', 3'PDB_END', 4'PFAM_ACCESSION', 5'PFAM_NAME',
#        6'AUTH_PDBRES_START', 7'AUTH_PDBRES_START_INS_CODE', 8'AUTH_PDBRES_END',
#        9'AUTH_PDBRES_END_INS_CODE', 10'UNIPROT_ACCESSION', 11'UNP_START',
#        12'UNP_END'],
#       dtype='object')

# 4,5,10,11,12 - used columns

In [None]:
# Collapse the mapping to one row per Pfam accession (first occurrence wins).
# Author's rationale: families appear several times in the file, but they are
# derived from the UniProt sequence, which is unique.
# NOTE(review): de-duplicating on PFAM_ACCESSION alone keeps a single UniProt
# entry per family -- confirm that is intended rather than one row per
# (PFAM_ACCESSION, UNIPROT_ACCESSION) pair.

domain_mapper = domain_mapper.drop_duplicates(subset=['PFAM_ACCESSION'], keep='first', ignore_index=False)

In [None]:
domain_mapper.shape
# (9305, 5) unique domains / pfam accession numbers

(9305, 5)

In [None]:
# Read the phosphosite table and the ID-mapping table (both tab-separated);
# this section goes on to work with isoform rows only.
scop3p = pd.read_csv("/content/drive/MyDrive/Scop3P_confident_Psites.txt", sep='\t')
mappit = pd.read_csv('/content/drive/MyDrive/ID_mapping.txt', sep='\t', header=0, usecols=[2, 3, 4])

In [None]:
# mappit.columns

In [None]:
# Keep only the mapping rows that carry a UniProtKB isoform ID.
has_isoform = mappit['UniProtKB isoform ID'].notna()
mappit = mappit[has_isoform]
# mappit.dropna(axis = 0, subset = ['UniProtKB/Swiss-Prot ID'], inplace = True)

In [None]:
mappit.shape
# (116677, 3) complete dataset
# (33045, 3) only not nulls, ie only the rows of isoforms
# (47478, 3) both isoforms and pure cannonical

(33045, 3)

In [None]:
# Give the Swiss-Prot ID column the name 'ACC_ID', which is the key used in the
# merge with scop3p in the next cell.
mappit = mappit.rename(columns={'UniProtKB/Swiss-Prot ID': 'ACC_ID'})

In [None]:
# Inner-join phosphosites with the ID mapping on the shared accession.
# validate='m:m' states that many-to-many matches are expected; pandas performs
# no check for that mode.
phospho_muts = pd.merge(scop3p, mappit, how='inner', on='ACC_ID', validate='m:m')

In [None]:
phospho_muts.shape
# (190959, 5) includes only isoforms
# (229419, 5) both isoforms and pure cannonical

(190959, 5)

In [None]:
# Prepare the domain mapper for merging with the phosphodata: its UniProt
# accession column is renamed to 'ACC_ID', the key used by the merge below.
domain_mapper = domain_mapper.rename(columns={'UNIPROT_ACCESSION': 'ACC_ID'})

In [None]:
# Attach Pfam domain rows to every phosphosite that shares the same accession
# (inner join; many-to-many matches are allowed and not checked).
pfam_merged = pd.merge(phospho_muts, domain_mapper, how='inner', on='ACC_ID', validate='m:m')

In [None]:
pfam_merged.shape
# (28872, 9) only isoforms used here
# (33063, 9) both isoforms and pure cannonical

(28872, 9)

In [None]:
# Keep only phosphosites located inside their mapped Pfam domain, i.e.
# UNP_START <= UP_POS <= UNP_END with both ends inclusive.
# Series.between does this comparison column-wise, replacing a Python lambda
# that was evaluated once per row.
inside_domain = pfam_merged['UP_POS'].between(pfam_merged['UNP_START'], pfam_merged['UNP_END'])
pfam_merged = pfam_merged[inside_domain]

In [None]:
pfam_merged.shape
# (2602, 9) phosphosites coincide within limits of domains of pfam. Using exclusively the isoforms

(2602, 9)

In [None]:
# Give both frames a common join column named 'merger' (the transcript ID).
# NOTE(review): the column list printed after loading df_Cosmic does not
# include 'Accession Number'. rename() silently ignores labels that are absent,
# so if that column was never loaded the merge on 'merger' below has no key on
# the COSMIC side -- confirm the column is added before this cell runs.
df_Cosmic = df_Cosmic.rename(columns={'Accession Number': 'merger'})
pfam_merged = pfam_merged.rename(columns={'Transcript stable ID version': 'merger'})

In [None]:
# Inner-join COSMIC mutations with the domain-annotated phosphosites on the
# shared transcript key. Arguments that only restated pandas defaults are
# omitted.
pfam_mergy = df_Cosmic.merge(pfam_merged, on='merger', how='inner', validate='m:m')

In [None]:
pfam_mergy.shape

(380564, 15)

# Phosphodata

Load SCOP3P data and mapping data from BioMart!



In [None]:
# Phosphosite data from SCOP3P (tab-separated text file on Drive).
# Note: this is the same file that the Pfam section above already loads into
# `scop3p`; running this cell simply rebinds the name to a fresh copy.
scop3p = pd.read_table("/content/drive/MyDrive/Scop3P_confident_Psites.txt")
# Phosphosite data from SCOP3P.

In [None]:
# scop3p.shape
# (81404, 3)

In [None]:
# Mapping data from BioMart: tab-separated, first line is the header, and only
# the columns at positions 2, 3 and 4 are kept.
mappit = pd.read_table('/content/drive/MyDrive/ID_mapping.txt', header=0, usecols=[2, 3, 4])

In [None]:
# mappit.shape
# (116677, 6)

In [None]:
# Keep only the mapping rows WITHOUT an isoform ID, i.e. the canonical version
# of each protein (entries with no alternative isoform recorded).
lacks_isoform = mappit['UniProtKB isoform ID'].isna()
mappit = mappit.loc[lacks_isoform]

In [None]:
# mappit.shape
# (83632, 6)

In [None]:
# Drop mapping rows that have no Swiss-Prot ID.
# Reassigning the result (instead of inplace=True) avoids mutating a frame that
# is itself a filtered slice of the loaded table, which is what raises pandas'
# SettingWithCopyWarning.
# Author's rationale: keep only transcripts that correspond to unique protein
# IDs -- more than one transcript might encode the same protein (think silent
# mutations...).
mappit = mappit.dropna(axis=0, subset=['UniProtKB/Swiss-Prot ID'])

In [None]:
mappit.shape
# (14433, 6)
# (47,478, 3) Using non canonical.

(14433, 3)

In [None]:
# Rename the Swiss-Prot ID column to 'ACC_ID' so it can serve as the join key
# in the merge with scop3p below.
mappit = mappit.rename(columns={'UniProtKB/Swiss-Prot ID': 'ACC_ID'})
# Homogenize column name to use for merging both df's on it.

In [None]:
# New, merged frame: phosphosites inner-joined with the canonical ID mapping on
# the shared accession (many-to-many matches allowed, not checked).
phospho_muts = pd.merge(scop3p, mappit, how='inner', on='ACC_ID', validate='m:m')

In [None]:
# phospho_muts.shape
# (3,8460, 8)

In [None]:
# NOTE(review): the column list printed after loading df_Cosmic does not show
# 'Accession Number'; rename() ignores absent labels, so confirm the column
# exists before relying on 'merger' in the merge below.
df_Cosmic = df_Cosmic.rename(columns={'Accession Number': 'merger'})
phospho_muts = phospho_muts.rename(columns={'Transcript stable ID version': 'merger'})
# Homogenize column names of Cosmic with the previously merged mapping+phosphosite data.
# The column to be used for merging corresponds to a unique transcript version.

# Merging COSMIC with Phosphodata

### Birth of 'Mergy' dataframe (originality and creativity over 9 thousand)

In [None]:
# Inner-join COSMIC mutations with the phosphosite/mapping frame on the shared
# transcript key. Arguments that only restated pandas defaults are omitted.
mergy = df_Cosmic.merge(phospho_muts, on='merger', how='inner', validate='m:m')

In [None]:
# mergy.head(100)

In [None]:
mergy.shape
# (2938179, 15)
# (37629939, 11) With non canonical transcripts.
# mergy.columns

(2892309, 11)

### Save Mergy

In [None]:
# mergy.shape
# (2938179, 14)... merged on canonical transcripts
# NOTE(review): this section is titled "Save Mergy", yet the frame written here
# is `pfam_merged`, and the file name ends in "merge.csv" while the load cell
# that follows reads "... mergy.csv". Confirm which frame and which file name
# are intended.
pfam_merged.to_csv('/content/drive/My Drive/non canon index preserved merge.csv', header=True, index=False)

# Load Mergy

In [None]:
import pandas as pd
# Reload a previously saved merge, keeping four columns by position. Per the
# column listing noted further down, positions 3, 6, 8, 9 are 'Mutation AA',
# 'indice', 'UP_POS' and 'Modification_name'.
# NOTE(review): no visible cell writes a file with this exact name (the save
# cell above writes "... merge.csv") -- confirm the file exists on Drive.
mergy = pd.read_csv('/content/drive/My Drive/non canon index preserved mergy.csv', usecols=[3,6,8,9])
# mergy.shape
# mergy.head(100)

# Concat non canon with canon

In [None]:
# Stack the two frames row-wise; columns present in only one of them are
# filled with NaN in the rows coming from the other.
frames = [mergy, pfam_merged]
result = pd.concat(frames)

In [None]:
result.head(19)

In [None]:
# result.set_index(['indice'], drop=True, append=False, inplace=True, verify_integrity=False) 
# result.drop_duplicates(subset= ['indice'], keep='first', inplace=True, ignore_index=False)

In [None]:
result.shape

(411766, 15)

In [None]:
# result.head(50)

Unnamed: 0,merger,Primary site,Primary histology,Mutation AA,Mutation Description,Mutation somatic status,indice,ACC_ID,UP_POS,Modification_name,UniProtKB isoform ID,PFAM_ACCESSION,PFAM_NAME,UNP_START,UNP_END
0,ENST00000256246.5,haematopoietic_and_lymphoid_tissue,lymphoid_neoplasm,p.S1064L,Substitution - Missense,Confirmed somatic variant,11.0,Q9BXT5,914,PhosphoS,,,,,
1,ENST00000256246.5,stomach,carcinoma,p.Q1135R,Substitution - Missense,Confirmed somatic variant,19093.0,Q9BXT5,914,PhosphoS,,,,,
2,ENST00000256246.5,thyroid,other,p.N70K,Substitution - Missense,Confirmed somatic variant,75217.0,Q9BXT5,914,PhosphoS,,,,,
3,ENST00000256246.5,large_intestine,carcinoma,p.K2108T,Substitution - Missense,Confirmed somatic variant,175393.0,Q9BXT5,914,PhosphoS,,,,,
4,ENST00000256246.5,skin,malignant_melanoma,p.P272S,Substitution - Missense,Confirmed somatic variant,214815.0,Q9BXT5,914,PhosphoS,,,,,
5,ENST00000256246.5,pancreas,carcinoma,p.T1968A,Substitution - Missense,Confirmed somatic variant,228669.0,Q9BXT5,914,PhosphoS,,,,,
6,ENST00000256246.5,liver,other,p.S630F,Substitution - Missense,Confirmed somatic variant,240892.0,Q9BXT5,914,PhosphoS,,,,,
7,ENST00000256246.5,liver,other,p.S1446A,Substitution - Missense,Confirmed somatic variant,280683.0,Q9BXT5,914,PhosphoS,,,,,
8,ENST00000256246.5,pancreas,carcinoma,p.L2001P,Substitution - Missense,Confirmed somatic variant,305987.0,Q9BXT5,914,PhosphoS,,,,,
9,ENST00000256246.5,prostate,carcinoma,p.Y1202N,Substitution - Missense,Confirmed somatic variant,339800.0,Q9BXT5,914,PhosphoS,,,,,


# Map the phosphodata properly.

#### Prepare a new column to generate position/coordinate window.

In [None]:
# mergy.columns

#Index([0'merger', 1'Primary site', 2'Primary histology', 3'Mutation AA',
      #  4'Mutation Description', 5'Mutation somatic status', 6'indice', 7'ACC_ID',
      #  8'UP_POS', 9'Modification_name', 10'UniProtKB isoform ID'],

      #  3, 6, 8, 9

In [None]:
# mergy.drop_duplicates(subset= ['indice'], keep='first', inplace=True, ignore_index=False)

In [None]:
# Remove annotations containing "Gfs" (for example p.D923Gfs*16).
has_gfs = mergy['Mutation AA'].str.contains("Gfs")
mergy = mergy.loc[~has_gfs]

# TODO: look up what p.D923Gfs*16 is

In [None]:
mergy.shape
# non canon (37629928, 6)
# non canon without repeats due to RAM limitation

(37629928, 4)

In [None]:
# Extract the residue position from 'Mutation AA' as an integer: the text
# between the three-character prefix (e.g. "p.S") and the final character
# (the mutant residue), so "p.S315Y" gives 315.
# assign() returns a new frame instead of setting a column on `mergy`, which at
# this point is a filtered slice -- setting a column on it is what triggers
# pandas' SettingWithCopyWarning.
mergy = mergy.assign(**{'sliced AA': mergy['Mutation AA'].str.slice(3, -1).astype(int)})

# This new column is used to check whether rows that match on transcript
# also match on position/coordinates.

In [None]:
mergy.shape
# (2938179, 16)

(2892309, 11)

#### Save to use fewer columns.

In [None]:
# Checkpoint: write the current `mergy` frame (including the new 'sliced AA'
# column) to Drive, with a header and without the pandas row labels.
mergy.to_csv('/content/drive/My Drive/devil.csv', header=True, index=False)

#### Load the devil!

In [None]:
# import pandas as pd
# mergy = pd.read_csv('/content/drive/My Drive/devil.csv')

# mergy.shape
# Index([0'Mutation AA', 1'indice', 2'UP_POS', 3'Modification_name', 4'sliced AA']

In [None]:
mergy 

Unnamed: 0,Mutation AA,indice,UP_POS,Modification_name,sliced AA
0,p.G394C,1,346,PhosphoT,394
1,p.G394C,1,402,PhosphoT,394
2,p.V205A,144319,346,PhosphoT,205
3,p.V205A,144319,402,PhosphoT,205
4,p.K87N,407161,346,PhosphoT,87
...,...,...,...,...,...
37629923,p.Q252K,38182123,175,PhosphoS,252
37629924,p.Q252K,38182123,180,PhosphoS,252
37629925,p.Q252K,38182123,187,PhosphoS,252
37629926,p.Q252K,38182123,192,PhosphoS,252


#### (Don't use) Exact position

The 2 lines in the block just underneath provide exact position matches, not a ±5 window position match. It also saves this state of the dataframe. A post-AA match was also generated!

In [None]:
# mergy_exact = mergy.loc[mergy['sliced AA'] == mergy['UP_POS']]
# mergy_exact.to_csv('/content/drive/My Drive/exact position matched phosphosites pre-AA match.csv', encoding='utf-8', index=False)
# mergy_exact.shape
# (8119, 16)

# Post AA
# (8030, 19)

#### Generate a ±5 AA window from the COSMIC AA position. Use it to match phosphosites provided from SCOP3P.

From here onwards, the resulting dataframe contains phosphosite matches within a window of ±5 amino acids around the position provided by the COSMIC dataframe.

In [None]:
def rango(x, half_width=5):
    """Return the integer positions in the window centred on ``x``.

    Parameters
    ----------
    x : int
        Centre position (the mutated residue position).
    half_width : int, default 5
        Number of positions included on each side of ``x``. The default
        reproduces the original fixed ±5 window.

    Returns
    -------
    list of int
        ``x - half_width`` through ``x + half_width``, both ends included.
    """
    # range() excludes its stop value, hence the + 1 to include the upper edge.
    return list(range(x - half_width, x + half_width + 1))


#mergy['window'] = mergy['sliced AA'].apply(rango)
# Generate a ±5 AA window from the COSMIC AA position. Use it to match phosphosites
# provided from SCOP3P.

In [None]:
# Keep only rows whose phosphosite position (UP_POS) lies within ±5 residues of
# the mutated position ('sliced AA'). For integer positions this is the same
# test as "UP_POS in [pos-5, ..., pos+5]", computed column-wise instead of
# building a Python list for every row.
# NOTE(review): the result is bound to `mergy`, but the amino-acid matching
# cells further down operate on `pfam_mergy`; also, in the visible cells
# 'sliced AA' is only ever created on `mergy`. Confirm which frame is intended.
WINDOW = 5
in_window = (pfam_mergy['UP_POS'] - pfam_mergy['sliced AA']).abs() <= WINDOW
mergy = pfam_mergy[in_window]

mergy.shape
# Shapes noted from earlier runs:
# (57841, 17)

# with window, no duplicates
# (9131, 10)

(4556, 16)

Save post window matched merged df (contains duplicates)

In [None]:
# mergy.to_csv('/content/drive/My Drive/best indexed matched phosphosites window pre-AA match.csv', encoding='utf-8', index=False)
# This state is the matched positions (using the ±5 Amino Acid window),
# but without the Amino Acid matching.


#### Work to be done- see if some amino acids from cosmic which are not
# phosphorylatable mutate into phospho-residues!

# Also, if phosphoresidues mutate into other phosphoresidues. In these cases,
# use both the window and the exact positions just to have all of the 
# data available I guess.

Use code underneath to drop duplicates if you want. Bear in mind they haven't been matched for AA.

In [None]:
# pfam_mergy.drop_duplicates(subset= ['indice'], keep='first', inplace=True, ignore_index=False)
# mergy.set_index(['indice'], drop=True, append=False, inplace=True, verify_integrity=False) 
# mergy.sort_index(inplace = True)
# print(mergy.shape, mergy.head(5))
# (48891, 10) 
# mergy.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
pfam_mergy.shape

(3782, 16)

#### Match Amino Acids

In [None]:
# Reference residue of each mutation: the character right after the "p."
# prefix of 'Mutation AA' (e.g. the S in "p.S315Y"). The string accessor is
# vectorised and yields NaN for missing values instead of raising.
pfam_mergy['modified AA'] = pfam_mergy['Mutation AA'].str[2]
# mergy['Modification_name_trial'] = [x[-1] for x in list(mergy['Modification_name'])]
# These helper columns, one from each source (COSMIC and the combined
# phosphosite+mapping data), are used to identify matching residues.

In [None]:
# Phosphorylated residue: the last character of 'Modification_name' (e.g. the
# S in "PhosphoS"). Vectorised; missing values become NaN instead of raising.
pfam_mergy['Modification_name_trial'] = pfam_mergy['Modification_name'].str[-1]

In [None]:
# Final match: keep rows where the mutated reference residue equals the
# phosphorylated residue (same transcript, same position window, same amino acid).
same_residue = pfam_mergy['modified AA'] == pfam_mergy['Modification_name_trial']
pfam_mergy = pfam_mergy.loc[same_residue]

pfam_mergy.shape
# Shapes noted from earlier runs:
# (10756, 18)

# using the COSMIC index to remove repeats:
# (9471, 15)

(449, 18)

Save window and AA matched merged df. Still has duplicates!

In [None]:
# mergy.to_csv('/content/drive/My Drive/best indexed matched phosphosites window post-AA match.csv', encoding='utf-8', index=False)
# Creates (one of the) last version of the datafram with matched transcripts,
# phosphosites (using a 5± AA window), and the identical amino acids. Cheers!

In [None]:
# code to store the csv in google drive :)
# from google.colab import drive
# drive.mount('drive')

#### Reinstate Cosmic's Index & Drop repeats!

In [None]:
# Keep the first row for every COSMIC record ('indice') and use 'indice' as the
# row index of `mergy`. Results are reassigned rather than applied in place:
# both frames are filtered slices, so in-place edits raise pandas'
# SettingWithCopyWarning.
# NOTE(review): duplicates are dropped from `pfam_mergy` while the index is set
# on `mergy` -- confirm both are meant to be touched here. No visible cell
# creates the 'indice' column (the line that would add it is commented out).
pfam_mergy = pfam_mergy.drop_duplicates(subset=['indice'], keep='first')
mergy = mergy.set_index(['indice'])
# mergy.sort_index(inplace = True)
print(pfam_mergy.shape, pfam_mergy.head(5))

(345, 18)                   merger     Primary site  ... modified AA Modification_name_trial
618    ENST00000396444.7             lung  ...           T                       T
886    ENST00000396444.7           cervix  ...           S                       S
25175  ENST00000371724.6             skin  ...           S                       S
43493  ENST00000441024.6  large_intestine  ...           T                       T
60309  ENST00000343225.4          stomach  ...           T                       T

[5 rows x 18 columns]


#### Save this if you want. It has no duplicates.

In [None]:
# mergy.to_csv('/content/drive/My Drive/non canon mergy no duplicates.csv', encoding='utf-8', index=False)

# U're stuck here

Still need to: eliminate non-synonymous mutations, incorporate selenocysteines, count metastases (mets), and consider which of the duplicates actually have to be kept.

In [None]:
# load mergy marched phospho window post aa match
# add the tumour origin column, as well as gene name, and primary histo and primary site

# from google.colab import drive
# drive.mount('drive')

# import pandas as pd
# icaro = pd.read_csv('/content/drive/My Drive/index preserved df.csv')
# icaro.shape
# icaro.head(3)

# icaro.drop_duplicates(subset= ['indice'], keep='first', inplace=True, ignore_index=False)
# icaro.set_index(['indice'], drop=True, append=False, inplace=True, verify_integrity=False) 
# icaro.sort_index(inplace = True)
# print(icaro.shape, icaro.head(5))
# icaro.head(5)

# col_Cosmic = pd.read_csv('/content/DECOMPcosmic.tsv', sep='\t', usecols=[0,7,11,34])
# col_Cosmic.head(10)

# enhanced_Cosmic = icaro.merge(col_Cosmic, how='inner', left_index=True, right_index=True, validate='1:1' )

(418723, 8)        Mutation AA  ... UniProtKB isoform ID
indice              ...                     
11        p.S1064L  ...                  NaN
84        p.P1057H  ...                  NaN
228        p.I251M  ...                  NaN
257        p.V410E  ...                  NaN
461        p.Q521R  ...                  NaN

[5 rows x 8 columns]


In [None]:
# Count records per tumour origin.
# NOTE(review): no visible cell adds a 'Tumour origin' column to `mergy` (the
# merge that would bring it in is commented out), so on a fresh kernel this
# lookup presumably fails -- confirm how the column is meant to be attached.
mergy['Tumour origin'].value_counts()

# Both sets: with 5 AA window
# pre AA match
# primary                                   26320
# metastasis                                 2933
# recurrent                                   580
# secondary                                    16

# post AA match
# primary                                   4818
# metastasis                                 607
# recurrent                                   89
# secondary                                    1

In [None]:
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 19)

In [None]:
# tm_origin_grp = mergy.groupby(['Primary site'])
# tm_origin_grp["Gene name"].apply(lambda x: x.value_counts()).head(150)#.nlargest(8))
# mergy.head(150)

In [None]:
# cantabria = mergy.loc[mergy['Primary site'].str.contains('liver')]

# cantabria.shape
# (972, 19)

# cantabria.head(150)
# cantabria['Gene name'].value_counts().head(100)

# COSMIC analysis

Find most frequently mutated genes (per tumor origin / per cancer type).

What are the most represented AA pairs? (per tumor origin / per cancer type).

In [None]:
import pandas as pd
# Note: from here on `df_Cosmic` no longer holds the raw COSMIC export; the name
# is rebound to the de-duplicated merged table saved on Drive.
df_Cosmic = pd.read_csv('/content/drive/My Drive/mergy no duplicates.csv')

In [None]:
df_Cosmic.shape

(9471, 15)

In [None]:
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 10)
# df_Cosmic = df_Cosmic.loc[~df_Cosmic['Primary site'].str.contains('NS')]
# df_Cosmic = df_Cosmic.loc[~df_Cosmic['Primary histology'].str.contains('NS')]
# grupo_sitio = df_Cosmic.groupby(['Primary site'])
# grupo_sitio['Gene name'].apply(lambda x: x.value_counts()).head(300)

In [None]:
# Homogenize to quantifiable format of substituted AAs
def standard_format(x):
    """Convert a protein-level annotation into a countable "X to Y" label.

    For a string whose second character is '.', such as ``'p.S315Y'``, the
    result is ``'<third character> to <last character>'`` (``'S to Y'``).

    Parameters
    ----------
    x : object
        Annotation value; normally a string, but may be a missing value.

    Returns
    -------
    str or None
        The formatted label, or ``None`` when ``x`` is not a string, is shorter
        than three characters, or does not have '.' as its second character
        (inputs that previously either raised or fell through to ``None``).
    """
    # Guard first: non-strings (e.g. NaN) and very short strings cannot be indexed safely.
    if not isinstance(x, str) or len(x) < 3 or x[1] != '.':
        return None
    return f'{x[2]} to {x[-1]}'



# Overwrite 'Mutation AA' with the "X to Y" label for every row.
# Caveat: not idempotent -- on a second run the values are already labels whose
# second character is not '.', so standard_format returns None for them.
df_Cosmic['Mutation AA'] = df_Cosmic['Mutation AA'].apply(standard_format)


In [None]:
# histograms for these groups to visualize frequencies
# //for the entire cosmic database
# //also for the phosphodata

## Groupies

In [None]:
# df_Cosmic['Primary site'].value_counts()

# top ten

# skin                                           1754674
# large_intestine                                1588714
# lung                                            899525
# stomach                                         697025
# endometrium                                     592289
# liver                                           535557
# upper_aerodigestive_tract                       399039
# breast                                          382435
# thyroid                                         360961
# urinary_tract                                   302655
# central_nervous_system                          295983
# haematopoietic_and_lymphoid_tissue              278589

In [None]:
# df_Cosmic['Primary histology'].value_counts().nlargest(5)

# top 5

# carcinoma                                           6917500
# malignant_melanoma                                  1386706
# glioma                                               281197
# other                                                276070
# lymphoid_neoplasm                                    184435

In [None]:
# Substitution counts for carcinoma records (the grouped site/histology
# export is kept below, commented out).
# Raise pandas' display limits so the long count tables print in full.
for option_name, limit in (('display.max_rows', 500), ('display.max_columns', 10)):
    pd.set_option(option_name, limit)

# grupo_sh = df_Cosmic.groupby(['Primary site', 'Primary histology'])
# first_graph = grupo_sh['Gene name'].value_counts().to_frame()
# first_graph = grupo_sh['Mutation AA'].apply(lambda x: x.value_counts().nlargest(3)).to_frame()
# first_graph.to_csv('/content/drive/My Drive/first graph.csv', encoding='utf-8', index=True)

# Frequency of each "X to Y" substitution among carcinoma records (top 500).
is_carcinoma = df_Cosmic['Primary histology'].eq('carcinoma')
grupo = df_Cosmic.loc[is_carcinoma]
grupo['Mutation AA'].value_counts().head(500)

S to F    1591
S to P    1447
S to L     580
S to Y     511
S to C     400
S to N     210
S to G     208
T to A     171
T to M     166
S to T     156
S to A     149
S to R     148
T to I     125
Y to C     119
S to I     109
T to P      64
T to S      50
Y to H      37
T to R      21
Y to S      19
T to K      16
T to N      15
Y to F      13
S to W      12
Y to N      12
Y to D       9
Name: Mutation AA, dtype: int64

In [None]:
# graphy = pd.read_csv('/content/drive/My Drive/first graph.csv')
# graphy

Unnamed: 0,Primary site,Primary histology,Mutation AA,Mutation AA.1
0,adrenal_gland,adrenal_cortical_adenoma,S to P,867
1,adrenal_gland,adrenal_cortical_adenoma,S to F,580
2,adrenal_gland,adrenal_cortical_adenoma,R to C,204
3,adrenal_gland,adrenal_cortical_adenoma,R to H,171
4,adrenal_gland,adrenal_cortical_adenoma,P to L,159
...,...,...,...,...
10626,urinary_tract,carcinoma,I to K,65
10627,urinary_tract,carcinoma,I to R,53
10628,urinary_tract,carcinoma,L to W,47
10629,urinary_tract,carcinoma,E to L,2


In [None]:
# GENERATED A NEW DF WITH GROUPED BY SITE/HISTOLOGY/AA MUTATION COUNTS

# grupo_sh = df_Cosmic.groupby(['Primary site', 'Primary histology'])
# second_graph = grupo_sh['Gene name'].value_counts().to_frame()
# second_graph.to_csv('/content/drive/My Drive/second graph.csv', encoding='utf-8', index=True)
# print('DONE!')

DONE!


In [None]:
# graphy_2 = pd.read_csv('/content/drive/My Drive/second graph.csv')
# graphy_2.head(4)

In [None]:
# GENERATED A NEW DF WITH GROUPED BY SITE/HISTOLOGY/AA MUTATION COUNTS

# grupo_sh = mergy.groupby(['Primary site', 'Primary histology'])
# third_graph = grupo_sh['ACC_ID'].value_counts().to_frame()
# third_graph.to_csv('/content/drive/My Drive/third graph.csv', encoding='utf-8', index=True)
# print('DONE!')

DONE!


In [None]:
# graphy_3 = pd.read_csv('/content/drive/My Drive/third graph.csv')
# graphy_3.head(4)

Unnamed: 0,Primary site,Primary histology,ACC_ID,ACC_ID.1
0,NS,NS,P13489,7
1,NS,carcinoma,Q99676,2
2,NS,malignant_melanoma,P35222,69
3,NS,malignant_melanoma,P24534,16


Distribution of primary site / primary histology.

In [None]:
# First, group data by site or type
# cancer_type_grp = df_Cosmic.groupby(['Primary histology'])
# tm_origin_grp = df_Cosmic.groupby(['Primary site']) 

# ?Second, show the other column as percentages
# tm_origin_grp['Primary histology'].value_counts(normalize=True)
# cancer_type_grp['Primary histology'].value_counts(normalize=True)

# Third, show total counts for the whole dataframe of sites and types
# df_Cosmic['Primary histology'].value_counts(normalize=True)
# df_Cosmic['Primary site'].value_counts(normalize=True)

How many SNPs with FATHMM > 0.5?

In [None]:
# Mutation AA == T,S,y
#  1) if cosmic originally had one of these three and remains the same
#  2) if cosmic originally had one of these three but changed into another one of them
#  3) if it didnt have one of those and turned into any of them
