# Process data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the two expression matrices; in both files the first CSV column
# becomes the row index.
df_messanger, df_miRNA = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("mainTable_fpkm.csv", "mainTable_mirna.csv")
)

In [None]:
# Load the file manifest and preview its first two rows.
df_files = pd.read_csv("files_manifest.dat")
df_files.iloc[:2]

In [None]:
# Keep only the first manifest row for every mRNA sample id, then do the
# same for every miRNA sample id.
df_files = (
    df_files
    .drop_duplicates(subset=["sample_submitter_id"], keep="first")
    .drop_duplicates(subset=["sample_submitter_id_mirna"], keep="first")
)

In [None]:
# Index the manifest by mRNA sample id and align it with the columns of the
# mRNA table; columns without a manifest entry come back from reindex as
# all-NaN rows and are dropped.
df_files = (
    df_files
    .set_index("sample_submitter_id")
    .reindex(index=df_messanger.columns)
    .dropna(axis=0, how="all")
)

# Restrict the mRNA table to the samples that survived, then relabel its
# columns with the corresponding case ids.
df_messanger = df_messanger.reindex(columns=df_files.index)
df_messanger.columns = df_files["cases.0.submitter_id"]
df_messanger.iloc[:2]

In [None]:
# Re-index the manifest by miRNA sample id and align it with the columns of
# the miRNA table; columns without a manifest entry come back from reindex
# as all-NaN rows and are dropped.
df_files = (
    df_files
    .reset_index()
    .set_index("sample_submitter_id_mirna")
    .reindex(index=df_miRNA.columns)
    .dropna(axis=0, how="all")
)

# Restrict the miRNA table to the samples that survived, then relabel its
# columns with the corresponding case ids.
df_miRNA = df_miRNA.reindex(columns=df_files.index)
df_miRNA.columns = df_files["cases.0.submitter_id"]
df_miRNA.iloc[:2]

In [None]:
# Stack the miRNA rows underneath the mRNA rows, with the miRNA table first
# aligned to the mRNA column order. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0; with default
# arguments it yields the same row-wise concatenation.
df = pd.concat([df_messanger, df_miRNA.reindex(columns=df_messanger.columns)])
df.to_csv("mainTable_all.csv", index=True)

In [None]:
# Preview the first two rows of the merged table.
df.iloc[:2]

In [None]:
# Move the current index back into a regular column, then key the manifest
# by case id so it lines up with the renamed expression-table columns.
df_files = df_files.reset_index()
df_files = df_files.set_index("cases.0.submitter_id")

In [None]:
# Show every manifest field for one example case.
df_files.loc["TCGA-D8-A140"]

# Filter genes
## HVG

In [None]:
import scanpy as sc

In [None]:
# Build an AnnData object from the mRNA table: the table is aligned to the
# manifest's case ids, columns that are entirely NaN are dropped, and the
# result is transposed so rows are samples. The manifest rows whose case id
# appears among the mRNA columns are attached as per-sample annotations.
adata = sc.AnnData(
    X=df_messanger.reindex(columns=df_files.index).dropna(axis=1, how="all").T,
    obs=df_files.loc[df_files.index.isin(df_messanger.columns)],
)

In [None]:
# Log-transform in place, flag the 3000 most variable genes (50 bins), and
# save the diagnostic plot.
sc.pp.log1p(adata, copy=False)
sc.pp.highly_variable_genes(adata, n_bins=50, n_top_genes=3000)
sc.pl.highly_variable_genes(adata, save="hvg.pdf", log=False)

In [None]:
# Names of the genes flagged in the 'highly_variable' column of adata.var.
# The column is used directly as a boolean mask; comparing it with
# `== True` was redundant.
hvg = adata.var.loc[adata.var["highly_variable"]].index
# Sample labels, in the order stored on the AnnData object.
samples = adata.obs.index

In [None]:
# Display the index of selected highly variable genes.
hvg

In [None]:
# Write the merged table restricted to the highly variable genes (rows) and
# the samples kept in the AnnData object (columns).
(
    df
    .reindex(index=hvg, columns=samples)
    .to_csv("mainTable_hv.csv")
)

## HVmiRNA

In [None]:
# Build an AnnData object from the miRNA table: the table is aligned to the
# manifest's case ids, columns that are entirely NaN are dropped, and the
# result is transposed so rows are samples. The whole manifest is attached
# as per-sample annotations.
adata = sc.AnnData(
    X=df_miRNA.reindex(columns=df_files.index).dropna(axis=1, how="all").T,
    obs=df_files,
)

In [None]:
# Log-transform in place, flag the 1000 most variable miRNAs (50 bins), and
# save the diagnostic plot.
sc.pp.log1p(adata, copy=False)
sc.pp.highly_variable_genes(adata, n_bins=50, n_top_genes=1000)
sc.pl.highly_variable_genes(adata, save="hvmiRNA.pdf", log=False)

# Make Graph

In [None]:
import graph_tool.all as gt
from time import time
import seaborn as sns

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# each cell is executed.
%load_ext autoreload
%autoreload 2

# Add the sibling directory to the import path so that the `trisbm` module
# imported below can be found there.
import sys
sys.path.append("../trisbm")

from trisbm import trisbm

In [None]:
model = trisbm()
# Rows passed to the graph: the highly variable genes followed by every
# miRNA. The log(1 + x) transform is applied with one vectorized np.log1p
# call on the whole frame rather than DataFrame.applymap with a Python
# lambda, which runs once per cell and is deprecated in pandas >= 2.1.
# get_kind labels identifiers containing "ENSG" as kind 1 and everything
# else as kind 2.
model.make_graph(
    np.log1p(df.reindex(index=np.concatenate([hvg, df_miRNA.index]))),
    get_kind=lambda word: 1 if "ENSG" in word else 2,
)

In [None]:
# Display the value returned by the model's private _get_shape() helper.
# NOTE(review): presumably the node counts per kind — confirm against trisbm.
model._get_shape()

In [None]:
# Keep a handle on the graph object stored on the model and display it.
g = model.g
g

In [None]:
# Persist the graph to disk.
g.save("graph_breast_trisbm.xml.gz")

In [None]:
# Largest entry of the adjacency matrix weighted by the "count" edge property.
gt.adjacency(g, weight=g.edge_properties["count"]).toarray().max()

In [None]:
# Spot-check one entry of the merged table on a log2(x + 1) scale.
# NOTE(review): the table handed to make_graph is transformed with the
# natural log (np.log), not log2 — confirm which base is intended here.
np.log2(df["TCGA-D8-A140"]["ENSG00000000938"]+1)

## sbmtm

In [None]:
# Add the sibling directory to the import path so that the `sbmtm` module
# can be imported from it.
import sys
sys.path.append("../hSBM_Topicmodel/")

In [None]:
from sbmtm import sbmtm

In [None]:
model = sbmtm()
# Input table: highly variable genes only, with any sample column that
# contains a NaN removed. The log(1 + x) transform is applied with one
# vectorized np.log1p call rather than DataFrame.applymap with a Python
# lambda, which runs once per cell and is deprecated in pandas >= 2.1.
model.make_graph_from_BoW_df(
    np.log1p(df.reindex(index=hvg).dropna(how="any", axis=1))
)
model.save_graph("graph_breast_hsbm.xml.gz")