In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import networkx as nx
import umap
from sklearn.decomposition import PCA

# Load the GeneID <-> HUGO symbol table and build one lookup dictionary per direction.
# If a key occurs on several rows, the last row wins in both dictionaries.
pdtemp = pd.read_csv('tests/geneid2hugo.tsv', sep='\t')
hugo2geneid = dict(zip(pdtemp['HUGO'], pdtemp['GeneID']))
geneid2hugo = dict(zip(pdtemp['GeneID'], pdtemp['HUGO']))

In [None]:
# score_type selects which merged score table is loaded: 'ORF' or 'CRISPR'.
score_type = 'ORF'

# Minimum absolute similarity for a gene pair to count as a "strong connection".
STRONG_SIM_THRESHOLD = 0.7


def _unique_genes(pairs):
    """Return the sorted, de-duplicated gene symbols found in GENE1 or GENE2 of `pairs`.

    Sorting makes the order independent of Python's per-session string hashing, so
    every list derived from it is identical after a kernel restart.
    """
    return sorted(set(pairs['GENE1']) | set(pairs['GENE2']))


def _with_prefix(genes, prefix):
    """Return the members of `genes` whose symbol starts with `prefix`, keeping their order."""
    return [g for g in genes if g.startswith(prefix)]


dfscores = pd.read_csv('data/'+score_type.lower()+'_scores_merged.tsv',sep='\t')
# `label` is the prefix of the similarity column names (e.g. label+'_SIM').
label = score_type
display(dfscores)

listofgenes = _unique_genes(dfscores)
print(len(listofgenes))

# NOTE(review): a plain 'OR' prefix match also accepts symbols such as ORC1 or ORAI1
# if they are present in the table - confirm that this is intended.
slc_genes = _with_prefix(listofgenes, 'SLC')
or_genes = _with_prefix(listofgenes, 'OR')
print('Number of OR genes',len(or_genes))
print('Number of SLC genes',len(slc_genes))
slcor_genes = slc_genes+or_genes

# Same gene lists, restricted to genes that take part in at least one strong connection.
dfscores_strong_connections = dfscores[dfscores['ABS_'+score_type+'_SIM']>STRONG_SIM_THRESHOLD]
listof_strongconnections = _unique_genes(dfscores_strong_connections)
slc_genes_strong = _with_prefix(listof_strongconnections, 'SLC')
or_genes_strong = _with_prefix(listof_strongconnections, 'OR')
print('Number of OR genes',len(or_genes_strong))
print('Number of SLC genes',len(slc_genes_strong))
slcor_genes_strong = slc_genes_strong+or_genes_strong

# One (GENE1, GENE2) tuple per strong-connection row, in table order.
orfsimilarities = list(zip(dfscores_strong_connections['GENE1'],
                           dfscores_strong_connections['GENE2']))
print('Number of strong connections=',len(orfsimilarities))


In [None]:
# Create clustergram based on MorphMap similarities

# Genes shown in the clustergram: SLC/OR genes that take part in at least one strong
# connection. Use `slcor_genes` instead to include every SLC/OR gene.
conns = slcor_genes_strong

genes1 = list(conns)
# Keep only the score rows whose two genes are both in the selection.
dfscores_subm = dfscores.loc[dfscores['GENE1'].isin(genes1) & dfscores['GENE2'].isin(genes1)]

# Column that is clustered and drawn.
# Alternatives: 'ABS_'+label+'_SIM', 'unsupervised_max', 'gene_bp'.
score_name = label+'_SIM'
# Column stored in the companion matrix `sc_matrix_annot`.
# Alternatives: 'gene_bp', 'ABS_'+label+'_SIM', label+'_SIM'.
score_name_annot = 'unsupervised_max'

# Symmetric lookups: every row is registered under both gene orders.
scores_dict = {}
scores_dict_annot = {}
for g1, g2, score, score_annot in zip(dfscores_subm['GENE1'], dfscores_subm['GENE2'],
                                      dfscores_subm[score_name], dfscores_subm[score_name_annot]):
    scores_dict[(g1, g2)] = scores_dict[(g2, g1)] = score
    scores_dict_annot[(g1, g2)] = scores_dict_annot[(g2, g1)] = score_annot

display(dfscores_subm)

# Square matrices in `genes1` order with 1.0 on the diagonal. A KeyError here means
# that a pair of selected genes has no row in `dfscores`.
n_genes = len(genes1)
sc_matrix = np.eye(n_genes)
sc_matrix_annot = np.eye(n_genes)
for i, gi in enumerate(genes1):
    for j, gj in enumerate(genes1):
        if i != j:
            sc_matrix[i, j] = scores_dict[(gi, gj)]
            sc_matrix_annot[i, j] = scores_dict_annot[(gi, gj)]

df = pd.DataFrame(sc_matrix, index=genes1, columns=genes1)
plt.rcParams['figure.dpi'] = 300
sns.set(font_scale = 0.8)
grid = sns.clustermap(df,vmin=-1,vmax=1,xticklabels=True,annot_kws={"size": 5},cmap='vlag')
# plt.title() would attach the title to matplotlib's current axes, which after
# clustermap is one of the grid's sub-axes; title the whole figure instead.
grid.figure.suptitle(score_name, y=1.02);

In [None]:
# Clustergram of a second score column over every SLC/OR gene in the score table.
conns = slcor_genes


genes1 = list(conns)
# Keep only the score rows whose two genes are both in the selection.
dfscores_subm = dfscores.loc[dfscores['GENE1'].isin(genes1) & dfscores['GENE2'].isin(genes1)]

# Column that is clustered and drawn.
# Alternatives: 'ABS_'+label+'_SIM', label+'_SIM', 'gene_mf', 'unsupervised_max'.
score_name = 'gene_bp'
# Column stored in the companion matrix `sc_matrix_annot`.
# Alternatives: 'gene_bp', 'ABS_'+label+'_SIM', label+'_SIM'.
score_name_annot = 'unsupervised_max'

# Symmetric lookups: every row is registered under both gene orders.
scores_dict = {}
scores_dict_annot = {}
for g1, g2, score, score_annot in zip(dfscores_subm['GENE1'], dfscores_subm['GENE2'],
                                      dfscores_subm[score_name], dfscores_subm[score_name_annot]):
    scores_dict[(g1, g2)] = scores_dict[(g2, g1)] = score
    scores_dict_annot[(g1, g2)] = scores_dict_annot[(g2, g1)] = score_annot

display(dfscores_subm)

# Square matrices in `genes1` order with 1.0 on the diagonal. A KeyError here means
# that a pair of selected genes has no row in `dfscores`.
n_genes = len(genes1)
sc_matrix = np.eye(n_genes)
sc_matrix_annot = np.eye(n_genes)
for i, gi in enumerate(genes1):
    for j, gj in enumerate(genes1):
        if i != j:
            sc_matrix[i, j] = scores_dict[(gi, gj)]
            sc_matrix_annot[i, j] = scores_dict_annot[(gi, gj)]

df = pd.DataFrame(sc_matrix, index=genes1, columns=genes1)
plt.rcParams['figure.dpi'] = 300
sns.set(font_scale = 0.8)
grid = sns.clustermap(df,vmin=-1,vmax=1,xticklabels=True,annot_kws={"size": 5},cmap='vlag')
# plt.title() would attach the title to matplotlib's current axes, which after
# clustermap is one of the grid's sub-axes; title the whole figure instead.
grid.figure.suptitle(score_name, y=1.02);

In [None]:
# make UMAP plot showing strong connections between SLC and OR genes

# Name of the trained model; it selects both the directory and the file that are read.
model_name = 'gene_bp'

func_similarity = pd.read_csv('trained_models/'+model_name+'/'+model_name+'_genefunction.tsv',sep='\t')
# The first column holds the gene identifiers; all remaining columns form the feature matrix.
Xf = func_similarity[func_similarity.columns[1:]].to_numpy()
geneids = np.array(func_similarity[func_similarity.columns[0]])

# Map every gene identifier to its row in Xf in a single pass. setdefault keeps the
# first row if an identifier occurs more than once.
geneids2index = {}
for i, gid in enumerate(geneids):
    geneids2index.setdefault(gid, i)



In [None]:
# UMAP is stochastic when random_state is left unset, so fix the seed: the 2-D layout
# (and every plot drawn from it) then stays the same after a kernel restart.
UMAP_RANDOM_STATE = 42
reducer = umap.UMAP(random_state=UMAP_RANDOM_STATE)
embedding = reducer.fit_transform(Xf)

In [None]:
# Identifiers in `geneids` are this prefix followed by the numeric GeneID.
GENE_PREFIX = 'Gene::'

# Sets give constant-time membership tests inside the loops below.
or_gene_set = set(or_genes)
slc_gene_set = set(slc_genes)
strong_pairs = set(orfsimilarities)

# Classify every embedded gene as OR, SLC or OTHER; OR/SLC genes get a large marker.
genetype = []
genesize = []
for g in geneids:
    # Genes without a HUGO symbol yield None, which falls through to OTHER.
    hugo = geneid2hugo.get(int(g[len(GENE_PREFIX):]))
    if hugo in or_gene_set:
        genetype.append('OR')
        genesize.append(100)
    elif hugo in slc_gene_set:
        genetype.append('SLC')
        genesize.append(100)
    else:
        genetype.append('OTHER')
        genesize.append(1)

df = pd.DataFrame(data={'UMAP1':embedding[:,0],'UMAP2':embedding[:,1],'GENE':genetype,'SIZE':genesize})
ax = sns.scatterplot(df,x='UMAP1',y='UMAP2',s=genesize,hue='GENE')

# Draw one segment per strong OR-SLC connection, for each gene order in which the pair
# is listed. Embedding rows are looked up only for connected pairs, so genes that have
# no connection never need to be present in `hugo2geneid` or `geneids2index`.
for gor in or_genes:
    for gslc in slc_genes:
        for pair in ((gor, gslc), (gslc, gor)):
            if pair not in strong_pairs:
                continue
            i_or = geneids2index[GENE_PREFIX+str(hugo2geneid[gor])]
            i_slc = geneids2index[GENE_PREFIX+str(hugo2geneid[gslc])]
            ax.plot([embedding[i_or,0],embedding[i_slc,0]],[embedding[i_or,1],embedding[i_slc,1]],'k-',alpha=0.3,linewidth=0.4)
ax.set_title(model_name+' space',fontsize=15);
