## Import packages and functions

In [1]:
import pandas as pd
import joblib
from keras.models import load_model

## Load the input data

In [2]:
# Read the tab-separated input table; the 'Sample ID' column becomes the row index.
example_data = pd.read_csv(
    'example_data.tsv',
    sep='\t',
    index_col='Sample ID',
)
# Plain NumPy matrix of the table contents (one row per sample).
X = example_data.to_numpy()

## Process the input data

In [3]:
# Restore the previously saved scaler object and apply it to the raw matrix.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a trusted source.
Scaler = joblib.load(filename='../models/Scaler/Scaler.joblib')
X_scaled = Scaler.transform(X)

## Load the models you want to use

In [4]:
# Restore the three saved models: the .joblib files through joblib,
# the .h5 file through Keras.
Ridge = joblib.load(filename='../models/Ridge/Ridge.joblib')
RFR = joblib.load(filename='../models/RFR/RFR.joblib')
MLP = load_model(filepath='../models/MLP/MLP.h5')

## Run the models

In [5]:
# Ridge and the MLP are fed the scaled matrix, RFR the raw matrix.
# NOTE(review): presumably RFR was trained on unscaled features -- confirm.
Ridge_purity = Ridge.predict(X_scaled)
RFR_purity = RFR.predict(X)
# Flatten the MLP output into a 1-D vector.
MLP_purity = MLP.predict(X_scaled).ravel()

## View the results as a data frame

In [6]:
# Stack the three prediction vectors as rows labelled by model name, label the
# columns with the sample IDs, then transpose so that each row is one sample.
pd.DataFrame(
    data=[Ridge_purity, RFR_purity, MLP_purity],
    index=['Ridge', 'RFR', 'MLP'],
    columns=example_data.index,
).transpose()

Unnamed: 0_level_0,Ridge,RFR,MLP
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sample_1,0.920105,0.914423,0.923452
Sample_2,0.652405,0.697304,0.667602
Sample_3,0.890411,0.868287,0.870883
Sample_4,0.789208,0.702283,0.794525
Sample_5,0.591327,0.629323,0.569975
Sample_6,0.486609,0.464711,0.467527


## Save the results to a file

In [7]:
# Build the per-sample prediction table (rows: samples, columns: models) ...
result = pd.DataFrame(
    data=[Ridge_purity, RFR_purity, MLP_purity],
    index=['Ridge', 'RFR', 'MLP'],
    columns=example_data.index,
).transpose()
# ... and write it out as a tab-separated file.
result.to_csv(path_or_buf='example_result.tsv', sep='\t')

# Using the top-ranked gene sets (Top 100 / Top 30)

## Load the gene lists

In [8]:
# The list file is tab-separated and has no header row, so name the columns by hand.
top100_gene_list = pd.read_csv(
    '../GeneList/Top100.txt',
    sep='\t',
    header=None,
)
top100_gene_list.columns = ['Ensembl gene id', 'Gene symbol']
# Preview the first five entries.
top100_gene_list.head(5)

Unnamed: 0,Ensembl gene id,Gene symbol
0,ENSG00000147443.11,DOK2
1,ENSG00000100368.12,CSF2RB
2,ENSG00000158714.9,SLAMF8
3,ENSG00000110324.8,IL10RA
4,ENSG00000143119.11,CD53


In [9]:
# Same layout as the Top100 list: tab-separated, no header row.
top30_gene_list = pd.read_csv(
    '../GeneList/Top30.txt',
    sep='\t',
    header=None,
)
top30_gene_list.columns = ['Ensembl gene id', 'Gene symbol']

## Extract top genes from original input data

In [10]:
def _select_genes(data, gene_ids):
    """Return the columns of ``data`` named in ``gene_ids``, in that order.

    Args:
        data: DataFrame whose column labels are gene ids.
        gene_ids: iterable of gene ids to keep.

    Returns:
        A DataFrame holding only the requested columns.

    Raises:
        KeyError: if any requested gene id is not a column of ``data``; the
            message reports how many are missing and shows a few of them.
    """
    gene_ids = list(gene_ids)
    missing = [gene for gene in gene_ids if gene not in data.columns]
    if missing:
        # Fail early with an actionable message instead of relying on the
        # generic pandas indexing error.
        raise KeyError(
            '{} of {} gene ids are missing from the input data, e.g. {}'.format(
                len(missing), len(gene_ids), missing[:5]
            )
        )
    return data[gene_ids]


# `example_data` is the table read from 'example_data.tsv' earlier in this notebook.
example_data_top100 = _select_genes(example_data, top100_gene_list['Ensembl gene id'])
X_top100 = example_data_top100.values

example_data_top30 = _select_genes(example_data, top30_gene_list['Ensembl gene id'])
X_top30 = example_data_top30.values

## Process the data

In [11]:
# Restore the scaler saved for each gene subset ...
Scaler_top100 = joblib.load(filename='../models/Scaler/Scaler_top100.joblib')
Scaler_top30 = joblib.load(filename='../models/Scaler/Scaler_top30.joblib')

# ... and apply each one to its matching matrix.
X_top100_scaled = Scaler_top100.transform(X_top100)
X_top30_scaled = Scaler_top30.transform(X_top30)

## Load the models you want to use

In [12]:
# Models saved for the Top100 gene set.
Ridge_top100 = joblib.load(filename='../models/Ridge/Ridge_top100.joblib')
MLP_top100 = load_model(filepath='../models/MLP/MLP_top100.h5')

# Models saved for the Top30 gene set.
RFR_top30 = joblib.load(filename='../models/RFR/RFR_top30.joblib')
MLP_top30 = load_model(filepath='../models/MLP/MLP_top30.h5')

## Run the models

In [13]:
# Top100: both models are fed the scaled matrix; the MLP output is flattened to 1-D.
Ridge_top100_purity = Ridge_top100.predict(X_top100_scaled)
MLP_top100_purity = MLP_top100.predict(X_top100_scaled).ravel()

# Top30: RFR is fed the raw matrix, the MLP the scaled one.
RFR_top30_purity = RFR_top30.predict(X_top30)
MLP_top30_purity = MLP_top30.predict(X_top30_scaled).ravel()

## View the results as a data frame

In [14]:
# One row per model, columns labelled with the sample IDs; transpose so that
# each row of the displayed table is one sample.
pd.DataFrame(
    data=[
        Ridge_top100_purity,
        MLP_top100_purity,
        RFR_top30_purity,
        MLP_top30_purity,
    ],
    index=['Ridge_top100', 'MLP_top100', 'RFR_top30', 'MLP_top30'],
    columns=example_data_top100.index,
).transpose()

Unnamed: 0_level_0,Ridge_top100,MLP_top100,RFR_top30,MLP_top30
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sample_1,0.877812,0.893113,0.911913,0.920209
Sample_2,0.667252,0.703523,0.680491,0.729512
Sample_3,0.888481,0.842372,0.881929,0.892949
Sample_4,0.782178,0.777399,0.716106,0.761243
Sample_5,0.5795,0.598675,0.6305,0.614695
Sample_6,0.456734,0.451512,0.466977,0.467309
