# Import

In [1]:
# Load the autoreload extension and use mode 2, which re-imports modules
# before each cell execution so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Turn off Jedi-based tab completion in IPython.
%config IPCompleter.use_jedi = False

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path
from IPython.display import display
import pickle

import os
import warnings
warnings.filterwarnings("ignore")
%config InlineBackend.figure_format = 'png'
plt.rcParams['pdf.fonttype'] = 'truetype'
plt.rcParams['svg.fonttype'] = 'none'
plt.rcParams['figure.dpi'] = 120
sns.set_style('white')

In [3]:
from functions.utils import *
from functions.clustering import *

# Data

In [4]:
# Expression table to classify. To run on your own cohort, replace the path
# below with your file, e.g. 'your_sample.tsv'.
expression_path = '/uftp2/Datasets/TCGA/current_version/data/projects/SKCM/expressions.tsv.gz'
expression = read_dataset(expression_path)

# Preview: rows are genes, columns are samples.
expression.head()

Unnamed: 0_level_0,TCGA-FW-A3I3-06,TCGA-FS-A1ZD-06,TCGA-EE-A2M6-06,TCGA-WE-A8K4-01,TCGA-FS-A4FD-06,TCGA-EE-A3AD-06,TCGA-GN-A4U7-06,TCGA-ER-A19J-06,TCGA-D3-A5GU-06,TCGA-W3-AA1V-06,...,TCGA-ER-A3ET-06,TCGA-EE-A20C-06,TCGA-XV-A9W5-01,TCGA-EB-A4OZ-01,TCGA-WE-AAA3-06,TCGA-EE-A2MR-06,TCGA-EE-A3J4-06,TCGA-EE-A2GH-06,TCGA-D3-A8GR-06,TCGA-GN-A4U8-11
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1BG,0.0,0.0,0.0,0.0,0.036089,0.0,0.0,0.0,0.0,0.06477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1CF,0.025217,0.007251,0.004564,0.024397,0.006793,0.025941,0.013212,0.016306,0.030594,0.034704,...,0.0,0.020345,0.017169,0.011669,0.011014,0.016859,0.015384,0.0,0.010907,0.004148
A2M,73.571877,504.510696,525.628828,140.915524,406.081849,2926.982979,457.965691,1438.106707,225.667659,98.306524,...,2703.834395,1944.369174,888.760562,69.545721,253.478875,371.905618,551.380077,264.856697,298.912886,310.56233
A2ML1,24.33331,0.273186,0.158535,0.109855,0.324048,3.859605,0.408411,3.658979,0.435077,0.0641,...,0.070537,0.05967,6.711321,9.773731,0.206873,0.155532,0.089693,0.16723,0.055587,0.173989
A3GALT2,0.032785,0.158301,0.048754,0.08675,0.0,0.0,0.0,0.034769,0.0,0.074897,...,0.0,0.04606,0.323631,0.0,0.078421,0.0,0.027621,0.227214,0.059622,0.0


# Calculate features and classify

In [5]:
# Pre-trained classifier shipped with the repository.
# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
model_path = 'model/ovr_knn_calibrated.pickle'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [9]:
# Bundle of resources saved alongside the model; the keys read in this
# notebook are 'gmt_dict', 'progeny_coeffs' and 'X'.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
ie_clusters_path = 'model/IE_clusters.pickle'
with open(ie_clusters_path, 'rb') as ie_file:
    ie_dict = pickle.load(ie_file)

# Gene signatures, passed through gmt_genes_alt_names together with the genes
# available in `expression` (presumably to remap signature genes to the names
# used in this dataset; confirm in functions/).
gmt = gmt_genes_alt_names(ie_dict['gmt_dict'], expression.index, verbose=True)

# Progeny coefficients with their index moved into a regular column.
prog_coeffs = ie_dict['progeny_coeffs'].reset_index()

Matched: 356
Trying to find new names for 1 genes in 19706 known
querying 1-1...done.
Finished.
1 input query terms found dup hits:
	[('TRBC1', 2)]
Pass "returnall=True" to return complete lists of duplicate or missing query terms.
1 genes were not converted


In [20]:
# ssgsea_formula is given the transposed table (samples as rows).
expression_by_sample = expression.T
ssgsea_df = ssgsea_formula(expression_by_sample, gmt)

# run_progeny is given the gene-by-sample table; its result is transposed
# before being stored.
progeny_scores = run_progeny(expression, prog_coeffs=prog_coeffs)
progen_df = progeny_scores.T

In [22]:
# Put both score tables side by side, then keep exactly the columns of
# ie_dict['X'], in that order.
feature_columns = ie_dict['X'].columns
combined_scores = pd.concat([ssgsea_df, progen_df], axis=1)

# Scale the selected features with the project's median_scale helper.
features_df = median_scale(combined_scores[feature_columns])
features_df.head()

Unnamed: 0,Metabolic suppression of CTL,B cells,M1 cytokines,Treg cells,pDC,T cells,TLS_NL,Anti-tumor chemokines,Lymphoid_checkpoints,NK cells,...,Adipocytes,EGFR,MAPK,Hypoxia,Hypoxia_factors,Glycolysis,PI3K,Autophagy,Acidosis,Proliferation_rate
TCGA-FW-A3I3-06,-1.65771,-0.755566,-1.626324,-2.089942,-2.114987,-1.440859,-1.514973,-1.389576,-1.251763,-1.09962,...,-1.581918,1.794733,1.078535,-1.150224,0.38536,-2.048782,-0.597578,-3.494884,-0.080543,-0.376407
TCGA-FS-A1ZD-06,-0.363473,-1.466634,-2.593835,-2.134535,-1.292842,-1.797645,-2.187366,-2.454614,-2.039196,-0.101073,...,-1.295091,-0.309912,2.13307,-0.362246,0.762197,-0.239327,1.631694,0.295712,-0.919787,1.538199
TCGA-EE-A2M6-06,-0.157712,-0.375979,0.164812,0.077103,-0.067641,-0.589477,-0.516655,-0.685683,-0.131481,0.368931,...,-0.381848,3.345504,2.519268,0.941616,-0.298938,1.736002,1.472398,-0.96139,-1.047898,2.296255
TCGA-WE-A8K4-01,1.365995,-0.221239,0.794101,0.373436,0.295421,1.341003,0.416438,0.693754,2.306717,1.585943,...,-1.165885,-1.525357,-1.668083,2.589781,-0.132541,1.670949,-2.066148,1.578933,0.248664,0.600184
TCGA-FS-A4FD-06,0.111906,-0.014856,-0.135466,-1.206618,-0.765715,-0.593245,-0.347583,0.328282,-0.382169,-0.915084,...,0.153395,-1.651716,0.343234,-1.068462,-0.040024,-3.112608,0.32704,-0.578446,1.136598,1.559939


In [26]:
# Class probabilities for every sample: one row per sample, one column per
# class known to the model.
probas = pd.DataFrame(
    model.predict_proba(features_df),
    index=features_df.index,
    columns=model.classes_,
)

# Cutoff for class prediction: a sample whose probabilities never exceed it
# gets 'Unclassified' = 1, so that label wins the idxmax below. For all other
# samples the column stays NaN and is skipped by idxmax.
cutoff = 0.47
no_confident_class = ~(probas > cutoff).any(axis=1)
probas.loc[no_confident_class, 'Unclassified'] = 1

# Predicted label = column with the highest value in each row.
class_predict = probas.idxmax(axis=1)

class_predict.value_counts()

Immune Desert                       109
Highly Immune-Enriched, Inflamed     64
Fibrotic, Angiogenic, Myeloid        62
Immune-Enriched, Fibrotic            48
Fibrotic, Hypoxic                    43
Immune-Enriched, Hypoxic             34
Lymphoid-Cell-Enriched               34
Unclassified                         25
Faintly Infiltrated, Angiogenic      18
B-Cell-Enriched, Angiogenic          14
dtype: int64