In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from statannotations.Annotator import Annotator
from statannot import add_stat_annotation
from sklearn.metrics import adjusted_mutual_info_score as ANMI
from matplotlib.patches import Patch


import statannot
import json
import glob
import random
import nmi
import string

from pathlib import Path
from matplotlib.ticker import FormatStrFormatter
from helps import *

import warnings
warnings.filterwarnings('ignore')
plt.style.use('seaborn')
plt.rcParams['font.size'] = 15
plt.rcParams["xtick.labelsize"]=13
plt.rcParams["ytick.labelsize"]=13
plt.rcParams["axes.titlesize"]=15
plt.rcParams["figure.dpi"]=600
plt.rcParams["savefig.format"]="pdf"
plt.rcParams["savefig.bbox"]="tight"

# NMI

In [2]:
df=pd.read_csv("Results/Seurat/seurat_hsbm_metadata.txt", sep="\t")
print(df.shape)
df.head()

(3723, 26)


Unnamed: 0,orig.ident,nCount_mRNA,nFeature_mRNA,Group,mRNA_snn_res.0.5,seurat_clusters,seurat_clusters_mRNA,lncRNA_snn_res.0.5,seurat_clusters_lncRNA,mRNA.weight,...,hSBM.mRNA_2,hSBM.lncRNA_0,hSBM.lncRNA_1,hSBM.lncRNA_2,hSBM.mRNA.lncRNA_0,hSBM.mRNA.lncRNA_1,hSBM.mRNA.lncRNA_2,triSBM.mRNA.lncRNA_0,triSBM.mRNA.lncRNA_1,triSBM.mRNA.lncRNA_2
hs.LumR.CGTGAGCTCACATACG.1,SeuratProject,441.838876,351,LumR,7,2,7,2,2,0.739659,...,0,133,1,0,50,8,0,27,0,1
hs.LumR.AACTGGTTCAGCTGGC.1,SeuratProject,375.232822,194,LumR,7,3,7,3,3,0.585476,...,0,74,1,0,2,2,0,7,0,1
hs.Lum.CACAGGCCATTAACCG.1,SeuratProject,308.878196,254,Lum,4,1,4,1,1,0.528964,...,0,124,1,0,13,4,0,38,2,1
hs.Basal.TTTCCTCAGCCAGTTT.1,SeuratProject,361.110331,295,Basal,2,4,2,4,4,0.718288,...,1,39,5,1,33,10,1,17,3,0
hs.Basal.TCTATTGCAAGAAGAG.1,SeuratProject,328.966235,257,Basal,2,0,2,0,0,0.828428,...,1,39,5,1,16,10,1,17,3,0


In [None]:
print(df.columns)

In [5]:
couples = [("seurat_clusters_wnn","seurat_clusters_mRNA"), 
           ("seurat_clusters_wnn","seurat_clusters_lncRNA"),
           ("triSBM.mRNA.lncRNA_1","hSBM.mRNA.lncRNA_1"),
           ("triSBM.mRNA.lncRNA_1","hSBM.lncRNA_1")]
for coup in couples:
    NMI=np.around(nmi.compute_normalised_mutual_information(df[coup[0]], df[coup[1]]), decimals=3)
    print("NMI", coup[0], "vs", coup[1], "-->", NMI)
    print("ANMI", coup[0], "vs", coup[1], "-->", round(ANMI(df[coup[0]],  df[coup[1]]), 3))
    print("\n")

NMI seurat_clusters_wnn vs seurat_clusters_mRNA --> 0.855
ANMI seurat_clusters_wnn vs seurat_clusters_mRNA --> 0.855


NMI seurat_clusters_wnn vs seurat_clusters_lncRNA --> 0.57
ANMI seurat_clusters_wnn vs seurat_clusters_lncRNA --> 0.569


NMI triSBM.mRNA.lncRNA_1 vs hSBM.mRNA.lncRNA_1 --> 0.494
ANMI triSBM.mRNA.lncRNA_1 vs hSBM.mRNA.lncRNA_1 --> 0.492


NMI triSBM.mRNA.lncRNA_1 vs hSBM.lncRNA_1 --> 0.433
ANMI triSBM.mRNA.lncRNA_1 vs hSBM.lncRNA_1 --> 0.431




In [8]:
couples=[("seurat_clusters_mRNA","Group"),
           ("seurat_clusters_lncRNA","Group"),
           ("seurat_clusters_wnn","Group")]
for coup in couples:
    NMI=np.around(nmi.compute_normalised_mutual_information(df[coup[0]], df[coup[1]]),decimals=3)
    nmi_rand=0
    for k in range(1000):
        a=df[coup[0]].to_list()
        np.random.shuffle(a)
        nmi_rand+=nmi.compute_normalised_mutual_information(df[coup[1]],a)/1000
    print(coup[0], "vs", coup[1], "NMI:", NMI, "   NMI/NMI*", NMI/nmi_rand)

seurat_clusters_mRNA vs Group NMI: 0.784    NMI/NMI* 464.48469547160386
seurat_clusters_lncRNA vs Group NMI: 0.672    NMI/NMI* 614.1855921717237
seurat_clusters_wnn vs Group NMI: 0.78    NMI/NMI* 450.95479195259486


In [None]:
df.replace("LumR", "Lum-R", inplace=True)
df.replace("BasalR", "Basal-R", inplace=True)

In [None]:
data=pd.DataFrame(index=df.index)
data["mRNA weight"]=df["mRNA.weight"]
data["lncRNA weight"]=df["lncRNA.weight"]
data=data.melt()
data.columns=["Gene family","WNN weight"]
data.index=list(df.index)+list(df.index)
data["typehisto"]=[df.loc[ind]["Group"] for ind in data.index]
data

In [None]:
subtypes=list(set(data["typehisto"]))
fig, axs = plt.subplots(1,2, figsize=(14,6), dpi=600)

sns.boxplot(data=data, x="Gene family",y="WNN weight", ax=axs[0],
            palette=["green","violet"], saturation=0.65)
test_results = add_stat_annotation(data=data, x="Gene family",y="WNN weight", ax=axs[0],
                                   box_pairs=[("mRNA weight","lncRNA weight")],
                                   test='Mann-Whitney', text_format='simple',
                                   comparisons_correction=None,
                                   text_annot_custom=["Mann-Whitney U-Test p-value < 1$^{-10}$"],
                                   loc='inside', verbose=2)


sns.boxplot(data=data, x="Gene family",y="WNN weight", hue="typehisto", ax=axs[1],
           palette=dict(zip(subtypes, nmi.set_colors(subtypes))),  saturation=0.65)
pairs=[(("mRNA weight", "Basal"),   ("lncRNA weight", "Basal")),
       (("mRNA weight", "Basal-R"), ("lncRNA weight", "Basal-R")),
       (("mRNA weight", "Lum"),   ("lncRNA weight", "Lum")),
       (("mRNA weight", "Lum-R"),   ("lncRNA weight", "Lum-R"))]
test_results = add_stat_annotation(data=data, x="Gene family",y="WNN weight",
                                   hue="typehisto", ax=axs[1], box_pairs=pairs,
                                   test='Mann-Whitney', text_format='simple',
                                   comparisons_correction=None,
                                   text_annot_custom=["MW U p < 1$^{-10}$" for i in range(4)],
                                   loc='inside', verbose=1)


axs[0].set_ylabel("WNN weight", weight="bold", size=20)
axs[1].set_ylabel("WNN weight", weight="bold", size=20)

axs[0].set_xlabel("Gene family", weight="bold", size=20)
axs[1].set_xlabel("Gene family", weight="bold", size=20)

axs[0].set_xticks([0,0.25,0.5,0.75,1])
axs[1].set_yticks([0,0.25,0.5,0.75,1])

title=f"Fig_7"
plt.savefig(f"Results/Figures/{title}.png", dpi=600)
#plt.savefig(f"Results/Figures/{title}.pdf", dpi=600)
plt.show()