In [1]:
import os
import pandas as pd 
import numpy as np
from IPython.display import clear_output

In [2]:
cwd0=os.getcwd()
cwd_interim=os.path.join(cwd0,'interim_files/')
cwd_refdata=os.path.join(cwd0,'reference_data_files/')

In [3]:
# Define Useful Functions 
setlen=lambda x:len(set(x)) # Calculate length of set of a list.

In [8]:
# ==BEGIN HERE== to skip re-importing raw genomic data.
df_mutfiles=pd.read_csv(os.path.join(cwd_interim,'df_mutfiles.zip'),sep='\t',dtype=str)
geneset=list(set(df_mutfiles.Hugo_Symbol.values))
histcodes=list(set(df_mutfiles.CODE.sort_values()))

### Perform Counts

In [10]:
from multiprocess import Pool, cpu_count

In [11]:
%%time
# Define Null Output Matrix: genes vs hist as Integer
df_Out=pd.DataFrame(0,columns=['All']+histcodes,index=geneset+['Total'],dtype=int)
# Construct the output matrix by iteratating over rows of reduced dataframe thereby covering each gene mutated for each sample
def fun_Outdf(inpvec):
    df_Out1=inpvec[1]
    inpdf=inpvec[0]
    for idx,row in inpdf.iterrows():
        igene=row.Hugo_Symbol
        icode=row.CODE
        df_Out1.loc[igene,icode]=df_Out1.loc[igene,icode]+1
        df_Out1.loc[igene,'All']=df_Out1.loc[igene,'All']+1
    return df_Out1

ncores=cpu_count()-2# number of cores the task can be split into
imarkers=[i*int(len(df_mutfiles)/ncores) for i in range(ncores+1)]
imarkers[-1]=len(df_mutfiles)
datalist=[[df_mutfiles[imarkers[i]:imarkers[i+1]],df_Out[:].copy(deep=True)] for i in range(ncores)]
if __name__ == '__main__': #I don't quite understand why this is necessary but it is a part of multiprocessing docs
    po=Pool(ncores) # invoke 5 pooled threads/processes. 
    list_resdf=list(po.map(fun_Outdf,datalist)) 
    po.close() 
    po.join()

for idf in list_resdf:
    df_Out=df_Out+idf

Wall time: 5min 18s


In [12]:
#Check correct counts: All these numbers are same if all mutations were counted once
df_Out.drop(columns='All').sum().sum(),df_Out['All'].sum(),len(df_mutfiles)

(2544571, 2544571, 2544571)

In [13]:
# This is an important point. We could choose to count the total number of samples however, then if the same sample showed up in two different studies, it will be counted as one. Instead, we consider the possibility that since some studies name samples simply by numbers, it is possible that two studies have similar name but different samples. So we count unique Study_ID+Sample_ID.
# This is consistent since we already removed redundant Patient IDs with the same histology in a previous step.

In [14]:
%%time
df_mutfiles['StidSid']=[df_mutfiles.loc[idx,'STUDY_ID']+'_'+df_mutfiles.loc[idx,'Tumor_Sample_Barcode'] for idx in df_mutfiles.index]
# Count number of cases 'Total' within each histology : Perform this action after filtering for curated and sequenced samples
TotalRow=[setlen(df_mutfiles.StidSid.values)]+[setlen(df_mutfiles.StidSid[df_mutfiles.CODE==ihist].values) for ihist in df_Out.columns[1:]]
df_Out.loc['Total']=TotalRow

Wall time: 1min 25s


In [15]:
for item in ['n/a','NAN','NA','na','nan']:
    if item in df_Out.index:
        df_Out=df_Out.drop(index=item)

In [16]:
df_Out=df_Out.drop(columns=df_Out.columns[df_Out.loc['Total']==0])

In [18]:
# To test genomics pipeline pre-gene renaming, the interim file df_out can be output by un-commenting this command.
df_Out.to_excel(os.path.join(cwd_interim,'df_Out_preGeneRename.xlsx'),index_label='Hugo_Symbol')

## PostProcessing: Remove redundant gene names and remove null rows

In [19]:
df_chDegen=pd.read_csv(os.path.join(cwd_interim,'Genelist_ManyChromosomes.xlsx'))# Only MARCH1 MARCH2 and SEPT15 have any real issues. This is very very likely due to excel errors someone made in the past by copying data incorrectly without realizing.
degenchlist=df_chDegen[['Hugo_Symbol','Chromosome Locations','Entries_per_Location']].applymap(lambda x:str(x).upper()).values

In [20]:
#Import alternate gene nomenclature file from cbioportal: https://docs.cbioportal.org/3.-cbioportal-maintenance/updating-gene-and-gene_alias-tables
#Homo_sapien.gene_info.gz ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz
dfGeneNames=pd.read_csv(os.path.join(cwd_refdata,'Homo_sapiens_gene_info_GM.txt'),sep='\t',dtype=str)
dfGeneNames=dfGeneNames.astype(str).applymap(lambda x:x.upper())
dfGeneNames.Synonyms=[str(row).split('|') for row in dfGeneNames.Synonyms.values]
listSynonyms=[elem for row in dfGeneNames.Synonyms.values for elem in row]
def FindGeneName(igene):
    retgene=np.nan
    if (igene in set(dfGeneNames.Symbol)) or (igene not in set(listSynonyms)) or (igene in [row[0] for row in degenchlist]):
        return igene
    else:
        retgene=dfGeneNames[[igene in row for row in dfGeneNames.Synonyms]].Symbol.values[0]
    # proceed to rename if the chromosome no is same.    
    chno_retgene=str(dfGeneNames[dfGeneNames.Symbol==retgene].chromosome.values[0])
    chno_igene=str(df_mutfiles[df_mutfiles.Hugo_Symbol==igene].Chromosome.values[0])# to cover simple renaming situations
    return retgene if ((chno_retgene==chno_igene) and (igene in geneset)) else igene

In [None]:
%%time
dfC=df_Out[:].copy(deep=True)
genesrenamed=[]
genesadded=[]
for igene in dfC.index:
    newgene=FindGeneName(igene)
    if (newgene != igene):
        if (newgene in dfC.index.values):
            dfC.loc[newgene]=dfC.loc[newgene].copy()+dfC.loc[igene].copy()
            dfC.drop(igene,inplace=True)
            genesadded=genesadded+[igene]
        else:
            dfC.loc[newgene]=dfC.loc[igene].copy()
            dfC.drop(igene,inplace=True)
            genesrenamed=genesrenamed+[igene]

In [None]:
dfCadd=df_Out.loc[genesadded]
dfCadd['Hugo_Symbol_parent']=[FindGeneName(igene) for igene in dfCadd.index]
dfCadd=dfCadd[[dfCadd.columns[-1]]+list(dfCadd.columns[:-1])]
dfCadd.to_excel(os.path.join(cwd_interim,'Genes_Added.xlsx'))

In [None]:
dfCren=df_Out.loc[genesrenamed]
dfCren['Hugo_Symbol_new']=[FindGeneName(igene) for igene in dfCren.index]
dfCren=dfCren[[dfCren.columns[-1]]+list(dfCren.columns[:-1])]
dfCren.to_excel(os.path.join(cwd_interim,'Genes_Renamed.xlsx'))

In [None]:
print('len(genesrenamed),len(genesadded): ',len(genesrenamed),len(genesadded))

In [None]:
idx1=list(dfC.index)
idx1=sorted(idx1)
idx1.remove('Total')
idx1=idx1+['Total']
dfC=dfC.loc[idx1]

In [None]:
colist=dfC.columns.sort_values()
colist=[colist[-1]]+list(colist[:-1])
dfC=dfC[colist]

In [None]:
dfC.to_csv(os.path.join(cwd0,'Genomics_Output_Processed.txt'),header=True,sep='\t',index_label='Hugo_Symbol')
print('Done. Results stored in Genomics_Output_Processed.txt')