In [None]:
# Print an environment report through the watermark extension: the author tag
# and the installed versions of the packages listed after -p.
%load_ext watermark
%watermark -a Filippo_Valle -p numpy,pandas,seaborn,matplotlib,topicpy,sklearn,cloudpickle,plotly,nsbm -m -g -r -v

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as  sns
import matplotlib.pyplot as plt
from topicpy.hsbmpy import get_scores, get_scores_shuffled, normalise_score, add_score_lines, get_max_available_L
from topicpy.hsbmpy import get_cluster_given_l, get_fraction_sites, get_clustersinfo, plot_cluster_composition

# Clustering and scores

In [None]:
# Select the trisbm run stored under brca/ and open the cluster table of the
# level returned by get_max_available_L.
directory = "brca/"
algorithm = 'trisbm'
level = 1
L = get_max_available_L(directory, algorithm)

clusters_path = "%s/%s/%s_level_%d_clusters.csv"%(directory,algorithm,algorithm,L)
df_clusters = pd.read_csv(clusters_path, header=[0])
df_clusters.head()

In [None]:
# Sample annotations: discard columns and rows that are entirely empty.
df_files = pd.read_csv("%s/files.dat"%directory, index_col=[0], header=[0])
df_files = df_files.dropna(axis=1, how='all').dropna(axis=0, how='all')

# Every non-missing entry of the level-0 cluster table is used as a sample id.
level0_clusters = pd.read_csv("%s/%s/%s_level_0_clusters.csv"%(directory,algorithm,algorithm), header=[0])
samples = level0_clusters.astype(str).values.ravel()
samples = samples[samples!="nan"]

# Align the annotations with those samples and keep only fully annotated rows.
# NOTE(review): after dropna(how="any") no missing value is left, so the
# fillna("unknown") below has nothing to fill -- confirm whether how="all"
# was intended.
df_files = df_files.reindex(index=samples).dropna(how="any",axis=0).fillna("unknown")
labels = df_files.columns
print(labels)

In [None]:
# For each analysis name (also used in the output file name of the score
# figures) the list of `scores` entries to draw.
analyses = dict(
    normaltumor=["hsbm", "trisbm", "shuffle"],
    subtypes=["hsbm", "trisbm", "shuffle"],
    htetra=["hsbm", "trisbm", "tetrasbm", "shuffle"],
    regulatory=["hsbm", "trisbm", "mirdip", "tarbase", "shuffle"],
)

In [None]:
# Annotation columns the clusterings are scored against; this replaces any
# earlier value of `labels`.
labels = [
    "Subtype_Selected",
    'sample_submitter_id_type',
    'sample_submitter_id_mirna_type',
]

In [None]:
directory = "brca/"

# Scores for the "normaltumor" figure: both models are evaluated against
# labels[1].
score_label = labels[1]

scores = {}
scores['hsbm'] = get_scores(directory, labels, algorithm='topsbm', df_files=df_files, verbose=False)[score_label]
scores['trisbm'] = get_scores(directory, labels, algorithm='trisbm', df_files=df_files, verbose=False)[score_label]

# Shuffled baseline. It must be computed on the same label as the models it
# normalises; it was previously computed on labels[0] while the models above
# use labels[1].
scores['shuffle'] = get_scores_shuffled(directory, df_files, label=score_label, algorithm='topsbm')

# Average the "V" entry of 50 independent shuffles to smooth the baseline.
shuffle = [
    get_scores_shuffled(directory, df_files, label=score_label, algorithm='topsbm')["V"]
    for _ in range(50)
]
scores["shuffle"]["V"] = np.average(shuffle, axis=0)
normalise_score(scores, base_algorithm="shuffle")

In [None]:
analysis = "normaltumor"

# Single large panel with the normalised score curves selected for this analysis.
fig, ax = plt.subplots(figsize=(20,15))
add_score_lines(ax, scores, V="norm_V", labels=analyses[analysis])
ax.set_ylabel("NMI/NMI* score", fontsize=35)

# Axis limits: 10% above the largest value found in any entry of `scores`.
x_max = max(max(score["xl"]) for score in scores.values())
y_max = max(max(score["norm_V"]) for score in scores.values())
plt.xlim(1, x_max*1.1)
plt.ylim(0, y_max*1.1)
plt.show()
fig.savefig("%s/metric_scores_%s.pdf"%(directory,analysis))

In [None]:
directory = "brca/"

# Scores against labels[0] for both models, plus one shuffled baseline.
scores = {
    'hsbm': get_scores(directory, labels, algorithm='topsbm', df_files=df_files, verbose=False)[labels[0]],
    'trisbm': get_scores(directory, labels, algorithm='trisbm', df_files=df_files, verbose=False)[labels[0]],
    'shuffle': get_scores_shuffled(directory, df_files, label=labels[0], algorithm='topsbm'),
}

# Replace the baseline "V" entry by the average over 50 further shuffles.
shuffle = [
    get_scores_shuffled(directory, df_files, label=labels[0], algorithm='topsbm')["V"]
    for _ in range(50)
]
scores["shuffle"]["V"] = np.average(shuffle, axis=0)
normalise_score(scores, base_algorithm="shuffle")

In [None]:
analysis = "subtypes"

# Single large panel with the normalised score curves selected for this analysis.
fig, ax = plt.subplots(figsize=(20,15))
add_score_lines(ax, scores, V="norm_V", labels=analyses[analysis])
ax.set_ylabel("NMI/NMI* score", fontsize=35)

# Axis limits: 10% above the largest value found in any entry of `scores`.
x_max = max(max(score["xl"]) for score in scores.values())
y_max = max(max(score["norm_V"]) for score in scores.values())
plt.xlim(1, x_max*1.1)
plt.ylim(0, y_max*1.1)
plt.show()
fig.savefig("%s/metric_scores_%s.pdf"%(directory,analysis))

In [None]:
# Composition of the topsbm clusters at level 1 with respect to labels[0],
# using raw (normalise=False) values, followed by the composition plot.
# NOTE(review): what the topicpy helpers return or write to disk is not
# visible in this notebook -- check the topicpy documentation.
level = 1
algorithm = "topsbm"
cluster = get_cluster_given_l(level, directory,algorithm=algorithm)
fraction_sites = get_fraction_sites(cluster,df_files=df_files,label=labels[0], normalise=False)

clustersinfo = get_clustersinfo(cluster,fraction_sites)
plot_cluster_composition(fraction_sites,directory,level,label=labels[0], normalise=False,algorithm=algorithm)

In [None]:
# Same composition analysis as for topsbm, here for the trisbm clusters at
# level 0, again with respect to labels[0] and with normalise=False.
level = 0
algorithm = "trisbm"
cluster = get_cluster_given_l(level, directory,algorithm=algorithm)
fraction_sites = get_fraction_sites(cluster,df_files=df_files,label=labels[0], normalise=False)

clustersinfo = get_clustersinfo(cluster,fraction_sites)
plot_cluster_composition(fraction_sites,directory,level,label=labels[0], normalise=False,algorithm=algorithm)

In [None]:
directory = "brca/"

# Reference models stored directly under `directory`, scored against labels[0].
scores = {}
scores['hsbm'] = get_scores(directory, labels, algorithm='topsbm', df_files=df_files, verbose=False)[labels[0]]
scores['trisbm'] = get_scores(directory, labels, algorithm='trisbm', df_files=df_files, verbose=False)[labels[0]]

# Model variants kept in sub-folders of `directory`; all of them are read with
# algorithm='trisbm'. os.path.join builds every path the same way (the
# original mixed `directory+"name"` and `directory+"/name"`).
for variant in ("tetrasbm", "pentasbm", "mirdip", "tarbase"):
    scores[variant] = get_scores(
        os.path.join(directory, variant), labels, algorithm='trisbm', df_files=df_files, verbose=False
    )[labels[0]]

# Shuffled baseline; its "V" entry is replaced by the average of 50 shuffles.
scores['shuffle'] = get_scores_shuffled(directory, df_files, label=labels[0], algorithm='topsbm')
shuffle = [
    get_scores_shuffled(directory, df_files, label=labels[0], algorithm='topsbm')["V"]
    for _ in range(50)
]
scores["shuffle"]["V"] = np.average(shuffle, axis=0)
normalise_score(scores, base_algorithm="shuffle")

In [None]:
analysis = "regulatory"

# Single large panel with the normalised score curves selected for this analysis.
fig, ax = plt.subplots(figsize=(20,15))
add_score_lines(ax, scores, V="norm_V", labels=analyses[analysis])
ax.set_ylabel("NMI/NMI* score", fontsize=35)

# Axis limits: 10% above the largest value found in any entry of `scores`.
x_max = max(max(score["xl"]) for score in scores.values())
y_max = max(max(score["norm_V"]) for score in scores.values())
plt.xlim(1, x_max*1.1)
plt.ylim(0, y_max*1.1)
plt.show()
fig.savefig("%s/metric_scores_%s.pdf"%(directory,analysis))

In [None]:
analysis = "htetra"

# Drawn from the `scores` dict currently in memory; only the entries listed in
# analyses["htetra"] are passed to add_score_lines.
fig, ax = plt.subplots(figsize=(20,15))
add_score_lines(ax, scores, V="norm_V", labels=analyses[analysis])
ax.set_ylabel("NMI/NMI* score", fontsize=35)

# Axis limits: 10% above the largest value found in any entry of `scores`.
x_max = max(max(score["xl"]) for score in scores.values())
y_max = max(max(score["norm_V"]) for score in scores.values())
plt.xlim(1, x_max*1.1)
plt.ylim(0, y_max*1.1)
plt.show()
fig.savefig("%s/metric_scores_%s.pdf"%(directory,analysis))

In [None]:
import sys
sys.path.append("../hSBM_Topicmodel/")
from sbmtm import sbmtm
from nsbm import nsbm
import cloudpickle as pickle
import plotly.graph_objects as go

In [None]:
def _load_model(path):
    """Return the object unpickled from the file at ``path``."""
    # NOTE: unpickling can execute code stored in the file; load trusted files only.
    with open(path, "rb") as file:
        return pickle.load(file)


# Models fitted on the subtype dataset.
hsbm_sub = _load_model("brca_subtypes/topsbm/topsbm.pkl")
trisbm_sub = _load_model("brca_subtypes/trisbm/model.pkl")

# Models fitted on the brca dataset and its variants.
tri = _load_model("brca/trisbm/model.pkl")
tetrasbm = _load_model("brca/tetrasbm/trisbm/trisbm.pkl")
pentasbm = _load_model("brca/pentasbm/trisbm/trisbm.pkl")
tarbase = _load_model("brca/tarbase/trisbm/trisbm.pkl")

# The mirdip model (brca/mirdip/trisbm/trisbm.pkl) is not loaded here.

In [None]:
# Bar chart comparing get_mdl() divided by the number of graph edges across
# the fitted models. Bars and x tick labels come from the same list, so they
# cannot drift apart.
models = [
    ("hSBM", hsbm_sub),
    ("triSBM", tri),
    ("tetraSBM", tetrasbm),
    ("regulatory", tarbase),
]

fig = go.Figure()
fig.add_traces([
    go.Bar(x=[position], y=[model.get_mdl()/model.g.num_edges()], name=name)
    for position, (name, model) in enumerate(models)
])

titlefont = {
    "size": 30
}

tickfont = {
    "size":25
}

# Axis title fonts are set through `title.font`. The `titlefont` shortcut used
# previously is deprecated in plotly (and rejected by recent releases), while
# `title.font` is accepted by old and new versions alike.
layout = {
    "title":"Entropy",
    "xaxis":{
        "title": {"text": "Dataset", "font": titlefont},
        "tickmode": "array",
        "tickvals": list(range(len(models))),
        "ticktext": [name for name, _ in models],
        "tickfont": tickfont
    },
    "yaxis":{
        "title": {"text": "∑/E", "font": titlefont},
        "type":"log",
        "tickfont": tickfont,
    },
    "legend":{
        "font_size":35
    },
    "showlegend":False
}

fig.update_layout(layout)

fig.show()
fig.write_image("entropy_comparison.pdf", engine="kaleido")

In [None]:
import math

# get_mdl() per edge for the two subtype models, and their difference
# (hSBM minus triSBM).
hsbm_mdl_per_edge = hsbm_sub.get_mdl()/hsbm_sub.g.num_edges()
trisbm_mdl_per_edge = trisbm_sub.get_mdl()/trisbm_sub.g.num_edges()
mdl_gap = hsbm_mdl_per_edge-trisbm_mdl_per_edge

print(mdl_gap)
# Same gap multiplied by log10(e), i.e. expressed in base-10 units if
# get_mdl() is in natural-log units (assumption -- confirm).
print(mdl_gap*math.log10(math.exp(1)))

# Overlaps

In [None]:
# Switch the kernel's working directory to brca/: every relative path used in
# the cells below is resolved from there.
# NOTE(review): this cell is not re-runnable -- a second run without a kernel
# restart looks for brca/ inside brca/.
directory = "brca/"
os.chdir(directory)

## Clusters conservation

In [None]:
from sklearn.metrics import adjusted_mutual_info_score

In [None]:
# Cluster tables to compare: topsbm level 1 against trisbm level 0.
df_first = pd.read_csv("topsbm/topsbm_level_1_clusters.csv")
df_second = pd.read_csv("trisbm/trisbm_level_0_clusters.csv")

# Sanity check: every non-missing entry of the first table also occurs in the
# second one.
first_entries = [entry for entry in df_first.values.ravel() if str(entry)!="nan"]
second_entries = [entry for entry in df_second.values.ravel() if str(entry)!="nan"]
assert np.isin(first_entries, second_entries, invert=True).sum()==0

In [None]:
# All non-missing entries of the first table, flattened in row-major order.
samples = [entry for entry in df_first.values.ravel() if str(entry)!="nan"]

In [None]:
# For every sample collect the pair (block in df_first, block in df_second),
# where a block id is the second space-separated token of the name of the
# first column containing the sample.
partition = []
skipped_samples = []
for sample in samples:
    try:
        partition.append((
            df_first.columns[(df_first==sample).any()].values[0].split(" ")[1],
            df_second.columns[(df_second==sample).any()].values[0].split(" ")[1],
        ))
    except IndexError:
        # No column contains the sample, or the column name has no second
        # token. Only this case is skipped; any other error (including a
        # keyboard interrupt) is no longer swallowed.
        skipped_samples.append(sample)

if skipped_samples:
    print("Skipped %d samples without a block in both tables"%len(skipped_samples))

# Transpose the list of pairs into two aligned label sequences.
partition = list(zip(*partition))

In [None]:
# Adjusted mutual information between the two aligned label sequences built
# above (partition[0]: blocks from df_first, partition[1]: blocks from df_second).
adjusted_mutual_info_score(partition[0],partition[1])

In [None]:
# Hard-coded AMI table for the clusters: one row per hSBM level (4 rows) and
# one column per trisbm level (2 columns), matching the tick labels of the
# heat map drawn from it. The second column is all zeros.
# NOTE(review): the values look hand-entered -- confirm how they were obtained.
AMI_levels = [[ami, 0] for ami in (0.127, 0.31, 0.25, 0.005)]

In [None]:
# Heat map of the AMI table; row and column clustering are both disabled so
# the levels stay in their natural order.
trisbm_level_names = ["l%d"%l for l in range(2)]
hsbm_level_names = ["l%d"%l for l in range(4)]
cm = sns.clustermap(
    AMI_levels,
    xticklabels=trisbm_level_names,
    yticklabels=hsbm_level_names,
    row_cluster=False,
    col_cluster=False,
    vmax=1,
)

# Main panel: title, axis labels and ticks on the left-hand side.
ax = cm.ax_heatmap
fig = ax.get_figure()
ax.set_title("clusters", fontsize=35)
ax.set_ylabel("hSBM", fontsize=35, rotation=90)
ax.set_xlabel("trisbm", fontsize=35, rotation=0)
ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")
ax.tick_params(labelsize=35)

# Colour bar.
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("AMI", fontsize=30)

plt.tight_layout()
fig.savefig("cluster_overlap_levels.pdf")

In [None]:
# Zero-filled table: one row per column of df_first, one column per column of df_second.
df_cluster_overlap = pd.DataFrame(0., index=df_first.columns, columns=df_second.columns)

In [None]:
def get_overlap(x,y):
    """Return the number of elements of ``x`` that occur in ``y``, divided by ``len(y)``.

    The measure is not symmetric: the count runs over ``x`` while the
    denominator is the size of ``y``.
    """
    shared = np.isin(x, y).sum(dtype=float)
    return shared/float(len(y))

def get_pval(setA, setB, population_size=None):
    """Hypergeometric p-value of the overlap between ``setA`` and ``setB``.

    Returns the probability of observing at least as many shared elements as
    found, when ``len(setA)`` elements are drawn without replacement from a
    population of ``population_size`` elements of which ``len(setB)`` count
    as successes.

    Parameters
    ----------
    setA, setB : sequences of comparable items.
    population_size : int, optional
        Size of the population the sets are drawn from. Defaults to
        ``len(samples)``, the notebook-level list, as in the original
        implementation; pass it explicitly when comparing anything other
        than samples (e.g. ``len(genes)`` for topics).
    """
    # Imported here because the notebook's import cells never bring
    # `hypergeom` into scope (the original raised NameError).
    from scipy.stats import hypergeom

    if population_size is None:
        population_size = len(samples)
    x = np.isin(setA,setB).sum() # number of successes
    k = len(setB) # successes in pop
    N = len(setA) # sample size
    # sf(x-1) = P(X > x-1) = P(X >= x)
    return hypergeom.sf(x-1, population_size, k, N)

In [None]:
# Fill the table: entry (row, column) is get_overlap of the non-missing
# members of the two corresponding blocks.
for row in df_first.columns:
    row_members = df_first[row].dropna().values
    for column in df_second.columns:
        column_members = df_second[column].dropna().values
        df_cluster_overlap.at[row,column] = get_overlap(row_members, column_members)

In [None]:
# Heat map of the pairwise overlap table (rows: df_first blocks, columns:
# df_second blocks); row and column clustering are disabled to keep the
# original block order.
cm = sns.clustermap(df_cluster_overlap,
            xticklabels=df_second.columns,
            yticklabels=df_first.columns,
            row_cluster=False,
            col_cluster=False)

ax = cm.ax_heatmap
fig = ax.get_figure()
ax.set_ylabel("hSBM", fontsize=35, rotation=90)
ax.set_xlabel("trisbm", fontsize=35, rotation=0)

ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")
ax.tick_params(labelsize=35)

cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("%sample\nin common", fontsize=30)
# This figure shows the cluster overlap, so it is saved under a matching name
# (it used to be written to "topic_kl.pdf", a name unrelated to its content),
# in line with the other overlap figures of this notebook.
cm.savefig("cluster_overlap.pdf")

## Topic conservation

In [None]:
# Topic tables to compare: topsbm level 1 against trisbm level 0.
df_first = pd.read_csv("topsbm/topsbm_level_1_topics.csv")
df_second = pd.read_csv("trisbm/trisbm_level_0_topics.csv")

# Sanity check: every non-missing entry of the first table also occurs in the
# second one.
first_entries = [entry for entry in df_first.values.ravel() if str(entry)!="nan"]
second_entries = [entry for entry in df_second.values.ravel() if str(entry)!="nan"]
assert np.isin(first_entries, second_entries, invert=True).sum()==0

In [None]:
# All non-missing entries of the first topic table, flattened in row-major order.
genes = [entry for entry in df_first.values.ravel() if str(entry)!="nan"]

In [None]:
def _block_of(table, entry):
    """Return the second space-separated token of the name of the first column of ``table`` containing ``entry``."""
    return table.columns[(table==entry).any()].values[0].split(" ")[1]


# One (block in df_first, block in df_second) pair per gene, then transposed
# into two aligned label sequences.
partition = [(_block_of(df_first, gene), _block_of(df_second, gene)) for gene in genes]
partition = list(zip(*partition))

In [None]:
# Adjusted mutual information between the two aligned label sequences built
# above (partition[0]: blocks from df_first, partition[1]: blocks from df_second).
adjusted_mutual_info_score(partition[0], partition[1])

In [None]:
# Hard-coded AMI table for the topics, given column by column: one value per
# hSBM level (4 rows) for each of the two trisbm levels, matching the tick
# labels of the heat map drawn from it.
# NOTE(review): the values look hand-entered -- confirm how they were obtained.
ami_trisbm_l0 = (0.24, 0.25, 0.12, 0.08)
ami_trisbm_l1 = (0.099, 0.09, 0.09, 0.10)
AMI_levels = [list(row) for row in zip(ami_trisbm_l0, ami_trisbm_l1)]

In [None]:
# Heat map of the AMI table; row and column clustering are both disabled so
# the levels stay in their natural order.
trisbm_level_names = ["l%d"%l for l in range(2)]
hsbm_level_names = ["l%d"%l for l in range(4)]
cm = sns.clustermap(
    AMI_levels,
    xticklabels=trisbm_level_names,
    yticklabels=hsbm_level_names,
    row_cluster=False,
    col_cluster=False,
    vmax=1,
)

# Main panel: title, axis labels and ticks on the left-hand side.
ax = cm.ax_heatmap
fig = ax.get_figure()
ax.set_title("topics", fontsize=35)
ax.set_ylabel("hSBM", fontsize=35, rotation=90)
ax.set_xlabel("trisbm", fontsize=35, rotation=0)
ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")
ax.tick_params(labelsize=35)

# Colour bar.
cax = cm.ax_cbar
cax.tick_params(labelsize=30)
cax.set_title("AMI", fontsize=30)

plt.tight_layout()
fig.savefig("topic_overlap_levels.pdf")

In [None]:
# Zero-filled table: one row per column of df_first, one column per column of df_second.
df_cluster_overlap = pd.DataFrame(0., index=df_first.columns, columns=df_second.columns)

In [None]:
# Fill the table: entry (row, column) is get_overlap of the non-missing
# members of the two corresponding blocks.
for row in df_first.columns:
    row_members = df_first[row].dropna().values
    for column in df_second.columns:
        column_members = df_second[column].dropna().values
        df_cluster_overlap.at[row,column] = get_overlap(row_members, column_members)

In [None]:
# Heat map of the pairwise overlap table (rows: df_first blocks, columns:
# df_second blocks); row and column clustering are disabled to keep the
# original block order.
cm = sns.clustermap(df_cluster_overlap,
            xticklabels=df_second.columns,
            yticklabels=df_first.columns,
            row_cluster=False,
            col_cluster=False,
            vmax=1)

ax = cm.ax_heatmap
fig = ax.get_figure()
ax.set_ylabel("hSBM", fontsize=35, rotation=90)
ax.set_xlabel("trisbm", fontsize=35, rotation=0)

ax.yaxis.tick_left()
ax.yaxis.set_label_position("left")
ax.tick_params(labelsize=35)

cax = cm.ax_cbar
cax.tick_params(labelsize=30)
# The table plotted here is filled with get_overlap (a shared fraction), not
# with hypergeometric p-values, so the colour bar is labelled accordingly
# instead of "P_hypergeom".
cax.set_title("% gene in common", fontsize=30)
cm.savefig("topic_overlap.pdf")

# Survival
See [triSBM_survival](triSBM_survival.ipynb)

# GO contribution
See [go_contribution](go_contribution.ipynb)