In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import networkx as nx
import random
import os

def sigmoid_norm(z):
    """Rescale ``z`` by its standard deviation and squash it with a logistic curve.

    The values are divided by ``np.std(z)``, passed through ``1/(1+exp(-x))``
    and then mapped linearly from the 0..1 range onto the -1..1 range, so an
    input of 0 maps to 0.
    """
    scaled = z/np.std(z)
    logistic = 1/(1 + np.exp(-scaled))
    return (logistic-0.5)*2.0

# Lookup tables between the 'HUGO' and 'GeneID' columns of the mapping file.
#pdtemp = pd.read_csv('../tests/geneid2hugo.tsv',sep='\t')
pdtemp = pd.read_csv('tests/biomart_hugogeneid.tsv',sep='\t')
# Zipping the two columns builds the same mappings as a row-by-row
# DataFrame.iterrows() pass (later duplicates still overwrite earlier ones)
# without creating one Series object per row.
hugo2geneid = dict(zip(pdtemp['HUGO'],pdtemp['GeneID']))
geneid2hugo = dict(zip(pdtemp['GeneID'],pdtemp['HUGO']))

# Plot defaults shared by the cells below.
plt.rcParams['figure.dpi'] = 300
hist_colormap = 'Blues'

In [None]:
# Select which similarity table to analyse: 'ORF' or 'CRISPR'.
# The notebook handles one table per run, so switch the value and re-run.
score_type = 'ORF'
# score_type = 'CRISPR'

# The merged score table lives under data/ and is named after the score type.
scores_path = 'data/'+score_type.lower()+'_scores_merged.tsv'
dfscores = pd.read_csv(scores_path,sep='\t')

# additional normalization,if needed
#scores = ['gene_mf','gene_bp','gene_pathway']
#for s in scores:
#    xd = np.array(dfscores[s])
#    std = np.std(xd)
#    xd_norm = sigmoid_norm(xd/(2*std))
#    dfscores[s] = xd_norm

#dfscores = pd.read_csv('/mnt/c/Datas/MorphMap/crispr_scores_merged.tsv',sep='\t')
#label = 'CRISPR'
# `label` is the prefix used to build column names such as label+'_SIM'.
label = score_type
display(dfscores)


Unnamed: 0,GENE1,GENE2,CRISPR_SIM,ABS_CRISPR_SIM,gene_mf__go,gene_bp__go,gene_pathway,unsupervised_max
0,PALS2,SEPTIN1,0.067,0.067,0.034,0.112,0.169,0.169
1,PALS2,MRPL58,-0.097,0.097,-0.109,0.132,0.200,0.200
2,PALS2,TAFAZZIN,0.061,0.061,0.081,0.129,0.245,0.245
3,PALS2,MARS1,-0.140,0.140,0.038,0.074,0.054,0.074
4,PALS2,SARS1,-0.166,0.166,0.079,0.089,0.092,0.092
...,...,...,...,...,...,...,...,...
15089266,SIX6,TGIF1,0.148,0.148,0.442,0.353,0.417,0.442
15089267,SIX6,SPTLC1,-0.016,0.016,-0.156,-0.269,-0.102,-0.102
15089268,SMARCA4,TGIF1,-0.037,0.037,0.856,0.575,0.711,0.856
15089269,SMARCA4,SPTLC1,0.012,0.012,0.286,0.352,0.052,0.352


In [4]:
# Show the (rows, columns) size of the loaded score table.
dfscores.shape

(23028291, 8)

In [None]:
# Create randomized (reshuffled) versions of dfscores.
#
# For every sample the gene names are permuted, and each pair (GENE1, GENE2)
# receives the ABS similarity of the pair formed by the permuted names. The
# functional score columns are left untouched, so the copies serve as a
# baseline in which similarity and functional scores are decoupled.

RESHUFFLE_SEED = 0       # change to draw a different, but still repeatable, baseline
number_of_samples = 5

# sorted() fixes the gene order; a bare set would be iterated in a hash-dependent
# order and make the permutation differ between kernel sessions even with a seed.
listofgenes = sorted(set(dfscores['GENE1']).union(dfscores['GENE2']))
print(len(listofgenes))

# These lookups do not depend on the permutation, so build them once.
gene1 = list(dfscores['GENE1'])
gene2 = list(dfscores['GENE2'])
scores = list(dfscores['ABS_'+label+'_SIM'])
pair_keys = {g1+'#'+g2:row for row,(g1,g2) in enumerate(tqdm(zip(gene1,gene2),total=len(gene1)))}

# Local generator: reproducible without touching the global `random` state.
reshuffle_rng = random.Random(RESHUFFLE_SEED)

scorevalues_reshuffled_samples = []
for sample_idx in range(number_of_samples):

    print('Sample ',(sample_idx+1),'/',number_of_samples)

    listofgenes_reshuffled = listofgenes.copy()
    reshuffle_rng.shuffle(listofgenes_reshuffled)
    reshuffle_map = dict(zip(listofgenes,listofgenes_reshuffled))

    scorevalues_reshuffled = []
    for g1,g2 in tqdm(zip(gene1,gene2),total=len(gene1)):
        new1 = reshuffle_map[g1]
        new2 = reshuffle_map[g2]
        # The table stores each pair in one orientation only, so try both.
        # A KeyError here means the permuted pair is absent from dfscores in
        # either orientation.
        row = pair_keys.get(new1+'#'+new2)
        if row is None:
            row = pair_keys[new2+'#'+new1]
        scorevalues_reshuffled.append(scores[row])

    dfscores_reshuffled = dfscores.copy()
    dfscores_reshuffled['ABS_'+label+'_SIM'] = scorevalues_reshuffled

    scorevalues_reshuffled_samples.append(dfscores_reshuffled)




In [None]:
# Histogram of 'gene_bp__go' for the gene pairs whose absolute similarity
# lies in the closed interval [val, val+0.3].
val = 0.8
abs_sim = dfscores['ABS_'+label+'_SIM']
in_window = (abs_sim>=val)&(abs_sim<=val+0.3)
scores = dfscores[in_window]
#sns.histplot(scores,x='gene_bp__go',bins=50,stat='probability')
plt.figure(figsize=(15,3))
sns.histplot(scores,x='gene_bp__go',bins=30,stat='probability')
plt.xlim(-1,1)

In [None]:
# unconditional density plots: 2D histogram of absolute similarity (x) against
# each functional score (y), one figure per score column.
fields = ['gene_mf','gene_bp','gene_pathway','unsupervised_max','weakly_supervised','strongly_supervised']
for field in fields:
    # The merged table does not always carry every score column (names differ
    # between versions of the file); skip absent ones instead of failing on
    # the first and never reaching the rest.
    if field not in dfscores.columns:
        print('Skipping',field,'- column not found in dfscores')
        continue
    #sns.scatterplot(dfscores,x='ORF_SIM',y=field,s=1,c='k')
    sns.histplot(dfscores,x='ABS_'+label+'_SIM',y=field,cmap=hist_colormap,bins=(100,100),alpha=1.0,legend=True,stat='density',cbar=True)
    plt.show()



In [None]:
# conditional density, both negative and positive values
#
# The similarity axis is cut into narrow, slightly overlapping strips. Each
# strip is drawn as its own 2D histogram with a single x-bin, so every strip
# is shaded independently of the others.

fields = ['gene_mf','gene_bp','gene_pathway','unsupervised_max','weakly_supervised','strongly_supervised']
stepsize = 0.015
orf_intervals = np.arange(-1,1,stepsize)
sim_column = label+'_SIM'
sim_values = dfscores[sim_column]
for field in fields:
    # Report a missing score column once instead of failing on every strip.
    if field not in dfscores.columns:
        print('Skipping',field,'- column not found in dfscores')
        continue
    #sns.scatterplot(dfscores,x='ORF_SIM',y=field,s=1,c='k')
    for x in orf_intervals:
        in_strip = (sim_values>=x-stepsize/10)&(sim_values<=x+stepsize*1.1)
        dfscores_int = dfscores.loc[in_strip]
        if len(dfscores_int)>20:
            try:
                sns.histplot(dfscores_int,x=sim_column,y=field,cmap=hist_colormap,bins=(1,50),alpha=1.0)
            except Exception as err:
                # Best effort: a strip that cannot be drawn is reported and
                # skipped. `Exception` (not a bare except) keeps Ctrl-C /
                # kernel interrupt working during this long loop.
                print('Could not draw strip at',x,'for',field,':',err)

    # No plt.legend() here: the strips carry no labels, so there is nothing to list.
    plt.show()

In [None]:
# conditional density, absolute values
#
# Same strip-by-strip rendering as for the signed similarity, but binned on
# the absolute similarity column.

#fields = ['gene_mf','gene_bp','gene_pathway','unsupervised_max','weakly_supervised','strongly_supervised']
#fields = ['gene_mf','gene_mf__go','gene_mf__go__enriched_0.30__QC','gene_mf__go__enriched_0.50__QC']
fields = ['gene_bp__go','gene_mf__go','gene_pathway']
#fields = ['weakly_supervised','strongly_supervised']
stepsize = 0.015
orf_intervals = np.arange(-1,1,stepsize)
abs_sim_column = 'ABS_'+label+'_SIM'
abs_sim_values = dfscores[abs_sim_column]
for field in fields:
    # Report a missing score column once instead of failing on every strip.
    if field not in dfscores.columns:
        print('Skipping',field,'- column not found in dfscores')
        continue
    #sns.scatterplot(dfscores,x='ORF_SIM',y=field,s=1,c='k')
    for x in orf_intervals:
        in_strip = (abs_sim_values>=x-stepsize/10)&(abs_sim_values<=x+stepsize*1.1)
        dfscores_int = dfscores.loc[in_strip]
        if len(dfscores_int)>10:
            try:
                sns.histplot(dfscores_int,x=abs_sim_column,y=field,cmap=hist_colormap,bins=(1,50),alpha=1.0,kde=True)
            except Exception as err:
                # Best effort: a strip that cannot be drawn is reported and
                # skipped. `Exception` (not a bare except) keeps Ctrl-C /
                # kernel interrupt working during this long loop.
                print('Could not draw strip at',x,'for',field,':',err)

    # No plt.legend() here: the strips carry no labels, so there is nothing to list.
    plt.show()

In [None]:
# Compute suitable explainability thresholds
#
# Collect the functional scores of the gene pairs whose absolute similarity
# lies in [x, x+0.1].
# Both inputs are set explicitly before use so the cell does not depend on
# whatever values earlier loops happened to leave in `x` and `field`.
x = 0
field = fields[-1]   # the score column the preceding loop over `fields` ends on
dfscores_int = dfscores.loc[(dfscores['ABS_'+label+'_SIM']>=x)&(dfscores['ABS_'+label+'_SIM']<=x+0.1)]
#for field in fields:
#    sns.histplot(dfscores_int,x=field,bins=50,alpha=0.4,legend=True)
#    plt.show()
#plt.legend()
scores = list(dfscores_int[field])

#plt.hist(scores,bins=100)

In [None]:
# Plot explained fraction of gene pairs as a function of MorphMap similarity
#
# A gene pair counts as "explained" by a functional score when that score is
# above a per-field threshold. The solid curves show, per similarity bin, the
# percentage of explained pairs; the dashed curves show the same quantity for
# the reshuffled tables (mean over samples, thin lines at +/- one std).

#colors  = ['r','g','b','m','y']
colors  = ['#ff7f00', '#4daf4a', '#377eb8']

#fields = ['gene_mf','gene_bp','gene_pathway','unsupervised_max','weakly_supervised','strongly_supervised']
#fields = ['gene_mf','gene_pathway','gene_bp']
#fields = ['gene_mf','gene_mf__go','gene_mf__go__enriched_0.30__QC','gene_mf__go__enriched_0.50__QC']

#fields = ['gene_mf','gene_pathway','gene_bp','unsupervised_max']
fields = ['gene_mf__go','gene_pathway','gene_bp__go']
#fields = ['gene_bp__go']
field_names = ['Gene - GO Molecular Function','Gene - Pathway','Gene - GO Biological Process']
#fields = ['weakly_supervised','strongly_supervised']
normalize_explained_fraction = False
stepsize = 0.05
# Reshuffled bins need more than this many pairs to be evaluated.
min_pairs_reshuffled = 15

# Column used to bin the observed curves. Replace with label+'_SIM' to see
# both the positive and the negative side of the distribution.
similarity_column = 'ABS_'+label+'_SIM'


def high_score_threshold(field):
    """Return the functional-score cut-off above which a pair counts as explained.

    The cut-off is 0.5, except for 'gene_mf__go' when score_type is not 'ORF',
    where it is 0.4. It is looked up per field so that the observed curve and
    its reshuffled baseline always use the same value.
    """
    if score_type!='ORF' and field=='gene_mf__go':
        return 0.4
    return 0.5


def explained_percentages(sim_scores,func_scores,bin_starts,bin_width,threshold,normalize=False,min_pairs=0):
    """Percentage of pairs with func_scores > threshold in each similarity bin.

    Parameters
    ----------
    sim_scores, func_scores : 1D arrays of equal length (one entry per pair).
    bin_starts : left edges of the bins; a bin is the open interval
        (start, start + bin_width).
    threshold : functional-score cut-off.
    normalize : if True, return the relative difference to the fraction
        observed over all pairs instead of the raw fraction.
    min_pairs : bins holding `min_pairs` pairs or fewer yield NaN.

    Returns
    -------
    list of float, one value (in percent) per entry of `bin_starts`.
    """
    expected__high_fraction = np.sum(func_scores>threshold)/func_scores.shape[0]
    print('Expected high fraction=',expected__high_fraction)
    percentages = []
    for start in bin_starts:
        sample = func_scores[(sim_scores>start)&(sim_scores<start+bin_width)]
        if sample.shape[0]>min_pairs:
            observed_high_fraction = np.sum(sample>threshold)/sample.shape[0]
            if normalize:
                value = (observed_high_fraction-expected__high_fraction)/expected__high_fraction
            else:
                value = observed_high_fraction
            percentages.append(value*100)
        else:
            # Too few pairs (or none): leave a gap rather than dividing by
            # zero or repeating a value computed for another bin.
            percentages.append(np.nan)
    return percentages


# Observed curves

orf_intervals = np.arange(-0.6,1,stepsize)
orf_scores = np.array(dfscores[similarity_column])
for k,field in enumerate(fields):
    func_scores = np.array(dfscores[field])
    observed_high_ratios = explained_percentages(
        orf_scores,func_scores,orf_intervals,stepsize,
        high_score_threshold(field),normalize=normalize_explained_fraction)
    # Colour and line style are passed as keywords: a hex colour cannot be
    # combined with a line style inside a matplotlib format string.
    plt.plot(orf_intervals,observed_high_ratios,color=colors[k],linestyle='-',linewidth=5,label=field_names[k])

plt.xlabel(label+' similarity')
if normalize_explained_fraction:
    plt.ylabel('Enrichment relative difference, %')
    plt.title('Explained normalized fraction')
else:
    plt.ylabel('Explained fraction, %')
    plt.title('Explained fraction of gene-gene links')

# Adding reshuffled baseline

orf_intervals = np.arange(0.0,1.0,stepsize)
high_ratios_dict = {field:[] for field in fields}

for sample_idx,dfscores_reshuffled in enumerate(scorevalues_reshuffled_samples):
    print('Sample',sample_idx)
    # Only the ABS similarity column is reshuffled, so the baseline is always
    # binned on it.
    reshuffled_scores = np.array(dfscores_reshuffled['ABS_'+label+'_SIM'])
    for field in fields:
        func_scores = np.array(dfscores_reshuffled[field])
        high_ratios_dict[field].append(explained_percentages(
            reshuffled_scores,func_scores,orf_intervals,stepsize,
            high_score_threshold(field),normalize=normalize_explained_fraction,
            min_pairs=min_pairs_reshuffled))

for k,field in enumerate(fields):
    if not high_ratios_dict[field]:
        # No reshuffled samples available: nothing to draw for the baseline.
        continue
    # Mask the NaN gaps so mean/std use only the samples that have a value;
    # bins that are NaN in every sample stay NaN and are left out of the line.
    ar = np.ma.masked_invalid(np.array(high_ratios_dict[field],dtype=float))
    baseline_mean = np.ma.filled(ar.mean(axis=0),np.nan)
    baseline_std = np.ma.filled(ar.std(axis=0),np.nan)
    plt.plot(orf_intervals,baseline_mean,color=colors[k],linestyle='--',linewidth=3,label=field_names[k]+' reshuffled')
    plt.plot(orf_intervals,baseline_mean+baseline_std,color=colors[k],linestyle='--',linewidth=0.5)
    plt.plot(orf_intervals,baseline_mean-baseline_std,color=colors[k],linestyle='--',linewidth=0.5)

plt.tight_layout()
plt.legend()
plt.savefig(f'{score_type}.jpeg', format='jpeg', bbox_inches='tight', dpi=1200)
plt.show()
plt.close()

In [None]:
# Plot unexplained fraction of gene pairs as a function of MorphMap similarity
#
# A gene pair counts as "unexplained" by a functional score when that score is
# below func_low_thresh. One curve per score column shows, per bin of the
# signed similarity in [0, 1), the percentage of unexplained pairs.

#fields = ['gene_mf','gene_bp','gene_pathway','unsupervised_max','weakly_supervised','strongly_supervised']
fields = ['gene_mf','gene_bp','gene_pathway','unsupervised_max']

normalize_unexplained_fraction = False
func_low_thresh = 0.5
stepsize = 0.05
orf_intervals = np.arange(0,1,stepsize)
# Column name is built from `label` so the cell works for both score types.
orf_scores = np.array(dfscores[label+'_SIM'])

for field in fields:
    # Skip score columns that this version of the table does not contain.
    if field not in dfscores.columns:
        print('Skipping',field,'- column not found in dfscores')
        continue
    func_scores = np.array(dfscores[field])
    expected__low_fraction = np.sum(func_scores<func_low_thresh)/func_scores.shape[0]
    print('Expected low fraction=',expected__low_fraction)
    observed_low_ratios = []
    for x in orf_intervals:
        sample = func_scores[(orf_scores>x)&(orf_scores<x+stepsize)]
        if sample.shape[0]==0:
            # Empty bin: leave a gap in the curve instead of dividing by zero.
            observed_low_ratios.append(np.nan)
            continue
        observed_low_fraction = np.sum(sample<func_low_thresh)/sample.shape[0]
        if normalize_unexplained_fraction:
            ratio = (observed_low_fraction-expected__low_fraction)/expected__low_fraction
        else:
            ratio = observed_low_fraction
        observed_low_ratios.append(ratio*100)
        #dfscores['INT'] = np.array(((orf_scores>x)&(orf_scores<x+stepsize)))
        #sns.displot(dfscores,x=field,hue='INT',kind='kde',common_norm=False)
        #plt.title('[{:2.2f},{:2.2f}]'.format(x,x+stepsize))
    plt.plot(orf_intervals,observed_low_ratios,'-',linewidth=5,label=field)

plt.xlabel(label+' similarity')
# The y-axis only shows a relative difference when normalization is on.
if normalize_unexplained_fraction:
    plt.ylabel('Enrichment relative difference, %')
    plt.title('Unexplained normalized fraction')
else:
    plt.ylabel('Unexplained fraction, %')
    plt.title('Unexplained fraction of gene-gene links')
plt.legend()
plt.show()

In [None]:
# Show the score table and the distribution of the absolute similarity.
display(dfscores)
# The column name follows `label`, so the cell works for both 'ORF' and 'CRISPR'.
sns.histplot(dfscores,x='ABS_'+label+'_SIM',bins=50)

In [None]:
# This cell is for comparing two functional embedding versions
#
# If a previous version of the merged score table exists next to the current
# one, reload the current table and add the previous functional scores to it
# as 'gene_mf.old', 'gene_bp.old' and 'gene_pathway.old'.

def attach_old_scores(df_new,df_old,score_columns):
    """Return df_new with the scores of df_old added as '<column>.old'.

    Rows are matched on the (GENE1, GENE2) pair exactly as stored, so a pair
    listed in the opposite orientation in df_old is not matched. Pairs absent
    from df_old get NaN. If df_old lists a pair more than once, its last
    occurrence is used. Row order and row count of df_new are preserved.
    """
    key_columns = ['GENE1','GENE2']
    old_subset = (
        df_old[key_columns+list(score_columns)]
        .drop_duplicates(subset=key_columns,keep='last')
        .rename(columns={column:column+'.old' for column in score_columns})
    )
    # A left join keeps every row of df_new; `validate` fails loudly if the
    # old keys were not unique after all, which would duplicate rows.
    return df_new.merge(old_subset,on=key_columns,how='left',validate='many_to_one')


old_scores_path = 'data/'+score_type.lower()+'_scores_merged.old'
if os.path.exists(old_scores_path):
    dfscores_old = pd.read_csv(old_scores_path,sep='\t')
    dfscores = pd.read_csv('data/'+score_type.lower()+'_scores_merged.tsv',sep='\t')
    dfscores = attach_old_scores(dfscores,dfscores_old,['gene_mf','gene_bp','gene_pathway'])

In [None]:
# This cell is for comparing two functional embedding versions
# One 2D histogram per score: current values on x, previous ('.old') values on y.
old_scores_path = 'data/'+score_type.lower()+'_scores_merged.old'
if os.path.exists(old_scores_path):
    for score_column in ('gene_mf','gene_bp','gene_pathway'):
        sns.histplot(dfscores,x=score_column,y=score_column+'.old',bins=[100,100],cmap=hist_colormap)
        plt.show()