In [None]:
# Reproducibility stamp: load the `watermark` IPython extension and print the
# author tag together with the versions of the packages listed after -p.
# NOTE(review): the remaining flags (-m -g -r -v) presumably add machine, git
# and Python details -- confirm against the watermark documentation.
%load_ext watermark
%watermark -a Filippo_Valle -p numpy,pandas,seaborn,matplotlib,topicpy,sklearn,cloudpickle,plotly,nsbm -m -g -r -v

In [None]:
import pandas as pd
import sys
from topicpy.hsbmpy import get_file
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Run configuration, consumed by the loading and plotting cells below.
l = 0                    # hierarchy level whose topic/word distributions are loaded
algorithm = "trisbm"     # sub-folder name and file-name prefix of the model output
directory = "brca/"      # dataset folder holding files.dat and the algorithm sub-folder

In [None]:
# Metadata column of files.dat used to annotate each sample.
label = "Subtype_Selected"

# Per-document topic distribution, indexed by document name. The numeric
# 'i_doc' column is discarded and an empty 'tissue' column is placed first
# (it is filled in by the next cell).
df_topics = (
    pd.read_csv("%s/%s/%s_level_%d_topic-dist.csv"%(directory,algorithm,algorithm,l))
    .set_index('doc')
    .drop('i_doc', axis=1)
)
df_topics.insert(0, 'tissue', '')

# Average topic size: for every column of the word-distribution table count
# the strictly positive entries, then average those counts.
word_dist = pd.read_csv("%s/%s/%s_level_%d_word-dist.csv"%(directory,algorithm,algorithm,l), index_col=0)
t_size = (word_dist > 0).sum().mean()

# Sample metadata; rows that are entirely empty are dropped.
df_files = pd.read_csv("%s/files.dat"%directory, index_col=0).dropna(how='all', axis=0)

# Fail early when the requested annotation is not present in the metadata.
if label not in df_files.columns:
    raise AttributeError(f"{label} not valid. Available: {df_files.columns}")

# shape[1] - 1 because the 'tissue' column is not a topic.
print("Working at level %d with %d topics and average size: %d"%(l,df_topics.shape[1]-1, round(t_size)))

In [None]:
# Annotate every sample (row of df_topics) with its `label` value taken from
# the metadata table. Samples whose lookup fails are reported and tagged
# "unknown" so that the rest of the analysis can proceed.
# NOTE(review): get_file comes from topicpy.hsbmpy; it is assumed to return the
# metadata record of `sample`, indexable by column name -- confirm.
for sample in df_topics.index.values:
    try:
        df_topics.at[sample,'tissue']=("%s"%(get_file(sample,df_files)[label]))
    except Exception:
        # `except Exception` (not a bare `except:`) keeps the best-effort
        # fallback while still letting KeyboardInterrupt / SystemExit
        # propagate, so the cell can be interrupted.
        print(*sys.exc_info())
        print(sample)
        df_topics.at[sample,'tissue']="unknown"

In [None]:
# Topics as rows, samples as columns; the columns are labelled by tissue and
# ordered so that samples of the same tissue are contiguous.
df_cmap = df_topics.sort_values(by='tissue').set_index('tissue').transpose()
# Centre each topic: subtract the topic's mean over all samples from its row.
df_cmap = df_cmap.subtract(df_cmap.mean(axis=1),axis=0)

# Create a colour palette with one colour per *unique* tissue label.
# Sizing it by the unique labels (rather than by the number of sample columns)
# gives each tissue an evenly spaced hue and no longer relies on duplicate
# dictionary keys being overwritten.
unique_tissues = df_cmap.columns.unique()
network_pal = sns.color_palette('husl',n_colors=len(unique_tissues))

# Dictionary mapping each tissue label to its colour from the palette above.
network_lut = dict(zip(unique_tissues, network_pal))
# Per-sample colours, aligned with the columns of df_cmap.
network_col = df_cmap.columns.map(network_lut)

In [None]:
def get_box_data(topic, data=None):
    """Group the values of one topic by tissue, ready for a box plot.

    Parameters
    ----------
    topic
        Row label of the topic to extract.
    data : pandas.DataFrame, optional
        Frame with topics as rows and one column per sample, where each
        column is labelled with the sample's tissue. Defaults to the
        notebook-level ``df_cmap``.

    Returns
    -------
    box_data : list of numpy.ndarray
        One array per tissue holding the topic's values for the samples of
        that tissue, in the same order as ``tissues``.
    tissues : pandas.Index
        Unique tissue labels in order of first appearance in the columns.
    """
    if data is None:
        data = df_cmap

    tissues = data.columns.unique()
    # Extract the topic's row and the per-sample labels once, instead of
    # transposing and re-indexing the whole frame for every tissue.
    sample_labels = data.columns.to_numpy()
    topic_values = data.loc[topic].to_numpy()

    box_data = [topic_values[sample_labels == tissue] for tissue in tissues]
    return box_data, tissues

In [None]:
# Draw, save and display one box plot per topic: the centred topic values of
# the samples, grouped by tissue.
for topic in df_cmap.index: # all
    box_data, tissues = get_box_data(topic)

    fig, ax = plt.subplots(figsize=(18,15))
    ax.boxplot(box_data, sym="o", flierprops={"c":"gray", "ms":20, "lw":5}, boxprops = {"lw":10}, whiskerprops={"lw":2}, capprops={"lw":5}, medianprops={"c":"red", "lw":10})
    # Alternative representation, kept for reference:
    #ax.violinplot(box_data)

    ax.tick_params(labelsize=35)
    ax.set_title(topic, fontsize=35)

    # One tick per box, labelled with the corresponding tissue.
    ax.set_xticklabels(tissues, rotation = 75)
    ax.set_ylabel("Centered $P($topic $|$ sample$)$", fontsize=35)

    plt.tight_layout()
    fig.savefig(f"box_{algorithm}_{l}{topic}.pdf")
    plt.show()
    # Release the figure explicitly: without this every topic leaves an open
    # figure behind and memory grows with the number of topics.
    plt.close(fig)