In [None]:
# Load the watermark extension and print the author name, the installed versions
# of pandas and regex, and machine information, so the run environment is recorded.
%load_ext watermark
%watermark -a Filippo_Valle -p pandas,regex -m

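The input data are downloaded from the European Genome-phenome Archive (EGA) with the `pyega3` download client: https://github.com/EGA-archive/ega-download-client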

Normalised mRNA expression (Illumina HT12): `pyega3 -d fetch EGAF00000171401`

Normalised miRNA expression (Agilent ncRNA 60k): `pyega3 -d fetch EGAF00000245831`

In [None]:
import pandas as pd
import regex as re

In [None]:
# Normalised Illumina HT12 mRNA expression table, read as a space-separated text file.
# NOTE(review): no index_col is passed here; the row filter against df_tcga.index
# further down only works if the row labels end up as the index — confirm.
mrna_table = "EGAF00000171401/Illumina_HT12_normalised_mRNA_expression.xls"
df = pd.read_csv(mrna_table, sep=" ")
print(df.shape)
df.head()

In [None]:
# Normalised Agilent miRNA expression table (tab-separated): index the rows by the
# "miRNA" column, discard the "ProbeID" column and keep only the first row for
# each repeated miRNA label.
mirna_table = "EGAF00000245831/Agilent_ncRNA_60k_normalised_miRNA_expression.xls"
df_mirna = (
    pd.read_csv(mirna_table, sep="\t")
    .set_index("miRNA")
    .drop(columns="ProbeID")
)
is_first_occurrence = ~df_mirna.index.duplicated(keep="first")
df_mirna = df_mirna.loc[is_first_occurrence]
print(df_mirna.shape)
df_mirna.head()

In [None]:
# Restrict both tables to the columns they have in common, in the order those
# columns have in df, so the two frames share an identical column layout.
shared_columns = df.columns[df.columns.isin(df_mirna.columns)]
df = df.reindex(columns=shared_columns)
df_mirna = df_mirna.reindex(columns=df.columns)

In [None]:
# Reference table whose first column becomes the index; only that index is used
# below, to select which rows of df are kept.
df_tcga = pd.read_csv("../mainTable_hv.csv",index_col=0)

In [None]:
# Keep only the rows of df whose label also occurs in the index of df_tcga.
in_tcga = df.index.isin(df_tcga.index)
df = df.reindex(index=df.index[in_tcga])

In [None]:
#df_tcga = pd.read_csv("../mainTable_mirna.csv",index_col=0)
#df_mirna.reindex(index=df_mirna.index[df_mirna.index.isin(df_tcga.index.drop_duplicates())])

In [None]:
# Stack the rows of df (mRNA) on top of the rows of df_mirna (miRNA); both frames
# were given the same columns in an earlier cell.
df_all = pd.concat([df,df_mirna], axis=0)

## hSBM

In [None]:
import sys
sys.path.append("../../hSBM_Topicmodel/")
from sbmtm import sbmtm

In [None]:
# Build the hSBM model input from the mRNA table only (df, not df_all);
# missing values are replaced by 0 before the table is handed over.
hsbm = sbmtm()
hsbm.make_graph_from_BoW_df(df.fillna(0))

In [None]:
# Show the model's `g` attribute as the cell output (a quick sanity check).
hsbm.g

In [None]:
# Persist the hSBM graph via save_graph under the given file name
# (relative to the current working directory).
hsbm.save_graph("graph_metabric_hsbm.xml.gz")

## triSBM

In [None]:
from trisbm import trisbm

In [None]:
def _node_kind(name):
    """Return 1 when "ENSG" occurs in ``name``, otherwise 2."""
    return 1 if "ENSG" in name else 2


# Build the triSBM model input from the stacked table (missing values replaced
# by 0). The second argument maps a row label to kind 1 or 2 — presumably it is
# called by make_graph on every row label; confirm against trisbm.
trisbm_model = trisbm()
trisbm_model.make_graph(df_all.fillna(0), _node_kind)

In [None]:
# Persist the triSBM graph via save_graph under the given file name
# (relative to the current working directory).
trisbm_model.save_graph("graph_metabric_trisbm.xml.gz")

In [None]:
import numpy as np
# Count how many entries of hsbm.words also appear in trisbm_model.words.
np.isin(hsbm.words, trisbm_model.words).sum()

In [None]:
# Read the spreadsheet, skipping its first 4 rows; the resulting frame must
# contain a "sample" column, which is normalised and used as the index below.
df_files = pd.read_excel("../brca/41523_2021_345_MOESM2_ESM.xlsx", skiprows=4)
def parse_metabric_sample(sample):
    """Normalise a METABRIC-style sample identifier.

    Identifiers that start with ``MB.`` followed by four digits (for example
    ``MB.0001``) are rewritten with dashes in place of dots (``MB-0001``).
    Every other value is returned unchanged, including non-string values such
    as the NaN that pandas uses for empty cells.

    Parameters
    ----------
    sample : object
        Raw value taken from the ``sample`` column.

    Returns
    -------
    object
        The dash-separated identifier for matching strings, otherwise
        ``sample`` itself.
    """
    # Non-string cells cannot be matched against a pattern; pass them through
    # instead of raising TypeError.
    if not isinstance(sample, str):
        return sample
    # The dot is escaped so that only a literal "MB." prefix qualifies; an
    # unescaped "." would accept any character in that position.
    if re.match(r"MB\.[0-9]{4}", sample):
        sample = sample.replace(".", "-")
    return sample
# Normalise every identifier, then use the identifiers as the row index.
df_files["sample"] = [parse_metabric_sample(raw) for raw in df_files["sample"]]
df_files = df_files.set_index("sample")

In [None]:
# Reorder the metadata rows to follow the columns of df_all; labels without a
# metadata row get a new row, and every missing value (new or pre-existing)
# is replaced by the string "unknown".
df_files = df_files.reindex(index=df_all.columns).fillna("unknown")

In [None]:
# For each analysis name, the keys of the `scores` dict that are passed as
# labels= to add_score_lines when its figure is drawn.
analyses = {
    "subtypes": ["hsbm_bccs", "trisbm_bccs", "shuffle"],
}

In [None]:
# Annotation names passed to get_scores / get_fraction_sites together with
# df_files — presumably column names of df_files; confirm.
labels = ['BCS subtype', 'BCCS subtype']

### subtypes

In [None]:
from topicpy.hsbmpy import get_scores, get_scores_shuffled, add_score_lines, normalise_score, get_cluster_given_l, get_clustersinfo, get_fraction_sites, plot_cluster_composition
import matplotlib.pyplot as plt

In [None]:
directory = "./"
N_SHUFFLES = 50  # number of additional shuffled runs averaged into the baseline

# get_scores receives both labels, so one call per algorithm is enough; the
# result is then indexed by label.
# NOTE(review): assumes the returned mapping has one entry per label passed in
# (it is indexed with labels[1] for the *_bccs entries) — confirm for labels[0].
hsbm_scores = get_scores(directory, labels, algorithm='topsbm', df_files=df_files, verbose=False)
trisbm_scores = get_scores(directory, labels, algorithm='trisbm', df_files=df_files, verbose=False)

scores = {}
# *_bcs entries use labels[0] ('BCS subtype'), *_bccs entries use labels[1]
# ('BCCS subtype'), so each key matches the annotation it is named after.
scores['hsbm_bcs'] = hsbm_scores[labels[0]]
scores['trisbm_bcs'] = trisbm_scores[labels[0]]

scores['hsbm_bccs'] = hsbm_scores[labels[1]]
scores['trisbm_bccs'] = trisbm_scores[labels[1]]

# Shuffled baseline for labels[1]: one run provides the result structure, whose
# "V" entry is then replaced by the element-wise mean over N_SHUFFLES more runs.
# NOTE(review): the baseline is built for labels[1] only, yet normalise_score is
# applied to every entry of `scores`, including the *_bcs ones — confirm intended.
scores['shuffle'] = get_scores_shuffled(directory, df_files, label=labels[1], algorithm='topsbm')
shuffle = [
    get_scores_shuffled(directory, df_files, label=labels[1], algorithm='topsbm')["V"]
    for _ in range(N_SHUFFLES)
]
scores["shuffle"]["V"] = np.average(shuffle, axis=0)
normalise_score(scores, base_algorithm="shuffle")

In [None]:
analysis = "subtypes"

# One large figure with a single axes; the curves drawn are the `scores`
# entries listed for this analysis.
fig, ax = plt.subplots(figsize=(20, 15))
add_score_lines(ax, scores, V="norm_V", labels=analyses[analysis])
#ax.set_xscale('linear')
#ax.set_title("Tumor/Normal", fontsize=40)
ax.set_ylabel("NMI/NMI* score", fontsize=35)

# Axis limits leave 10% headroom above the largest value found in any entry of
# `scores` (not only the entries that are drawn).
x_upper = max(max(entry["xl"]) for entry in scores.values())
y_upper = max(max(entry["norm_V"]) for entry in scores.values())
plt.xlim(1, x_upper * 1.1)
plt.ylim(0, y_upper * 1.1)
plt.show()
fig.savefig("%s/metric_scores_%s.pdf" % (directory, analysis))

In [None]:
# Level-0 clusters of the "topsbm" run, broken down by labels[0].
algorithm = "topsbm"
level = 0
label = labels[0]

cluster = get_cluster_given_l(level, ".", algorithm=algorithm)
fraction_sites = get_fraction_sites(cluster, df_files=df_files, label=label, normalise=False)
clustersinfo = get_clustersinfo(cluster, fraction_sites)

plot_cluster_composition(
    fraction_sites,
    ".",
    level,
    label=label,
    normalise=False,
    algorithm=algorithm,
)

In [None]:
# Level-0 clusters of the "trisbm" run, broken down by labels[0].
algorithm = "trisbm"
level = 0
label = labels[0]

cluster = get_cluster_given_l(level, ".", algorithm=algorithm)
fraction_sites = get_fraction_sites(cluster, df_files=df_files, label=label, normalise=False)
clustersinfo = get_clustersinfo(cluster, fraction_sites)

plot_cluster_composition(
    fraction_sites,
    ".",
    level,
    label=label,
    normalise=False,
    algorithm=algorithm,
)

In [None]:
# Level-0 clusters of the "topsbm" run, broken down by labels[1].
algorithm = "topsbm"
level = 0
label = labels[1]

cluster = get_cluster_given_l(level, ".", algorithm=algorithm)
fraction_sites = get_fraction_sites(cluster, df_files=df_files, label=label, normalise=False)
clustersinfo = get_clustersinfo(cluster, fraction_sites)

plot_cluster_composition(
    fraction_sites,
    ".",
    level,
    label=label,
    normalise=False,
    algorithm=algorithm,
)

In [None]:
# Level-0 clusters of the "trisbm" run, broken down by labels[1].
algorithm = "trisbm"
level = 0
label = labels[1]

cluster = get_cluster_given_l(level, ".", algorithm=algorithm)
fraction_sites = get_fraction_sites(cluster, df_files=df_files, label=label, normalise=False)
clustersinfo = get_clustersinfo(cluster, fraction_sites)

plot_cluster_composition(
    fraction_sites,
    ".",
    level,
    label=label,
    normalise=False,
    algorithm=algorithm,
)