In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import Image
from core.mixer import Mixer
from core.cell_types import CellTypes
from core.model import DeconvolutionModel
from core.plotting import print_cell_matras, cells_p, print_all_cells_in_one
from core.utils import *
import numpy as np 
import gc

In [3]:
# Directory holding the Kassandra training tables. Every input path below is
# derived from it, so this is the only line to edit on another machine.
DATA_DIR = '/home/shpakb/Desktop/kassandra_data'

# Annotation and expression tables for all models.
# NOTE(review): these two files have a .tsv extension but are read with the
# default comma separator, unlike the laboratory tables below, which pass
# sep='\t' explicitly. Confirm the on-disk format before changing either.
data_annot = pd.read_csv(f'{DATA_DIR}/all_models_annot.tsv', index_col=0)
data_expr = pd.read_csv(f'{DATA_DIR}/all_models_expr.tsv', index_col=0)

# Laboratory expression and annotation tables (tab-separated).
lab_expr = pd.read_csv(f'{DATA_DIR}/laboratory_data_expressions.tsv', sep='\t', index_col=0)
lab_annot = pd.read_csv(f'{DATA_DIR}/laboratory_data_annotation.tsv', sep='\t', index_col=0)

In [4]:
# Column of data_annot that holds the cell-type label used by this model.
model_column = 'Blood_model_annot'

# --- Cancer samples -------------------------------------------------------
# Rows flagged as cancer cells; the matching expression columns are taken by
# sample id.
samples = data_annot.loc[data_annot['Tumor_model_annot'] == 'cancer_cells'].index
cancer_expr = data_expr[samples]
# Work on an explicit copy: adding a column to a slice of data_annot is a
# chained assignment (SettingWithCopyWarning), which the global warnings filter
# would otherwise hide.
cancer_annot = data_annot.loc[samples, ['Dataset']].copy()
cancer_annot['Tumor_type'] = cancer_annot['Dataset']
cancer_annot = cancer_annot[['Tumor_type', 'Dataset']]

# --- Normal cell samples --------------------------------------------------
# Every sample that has a label in the model column.
samples = data_annot.loc[~data_annot[model_column].isna()].index
cells_expr = data_expr[samples]

cells_annot = data_annot.loc[samples, [model_column, 'Dataset']]
cells_annot.columns = ['Cell_type', 'Dataset']
# Laboratory samples are placed first, so they win if an id is duplicated.
cells_annot = pd.concat([lab_annot, cells_annot])
# Samples without a dataset label get their own sample id as the dataset.
dataset_missing = cells_annot['Dataset'].isna()
cells_annot.loc[dataset_missing, 'Dataset'] = cells_annot.loc[dataset_missing].index
cells_expr = pd.concat([lab_expr, cells_expr], axis=1)

# Make sure there are no repeated samples. Intersecting the id sets is not
# enough on its own: selecting a label from an axis that contains duplicates
# returns every occurrence, so duplicated ids are dropped explicitly first
# (keeping the first occurrence). The guards avoid an extra copy of the large
# tables when the ids are already unique.
if not cells_annot.index.is_unique:
    cells_annot = cells_annot.loc[~cells_annot.index.duplicated(keep='first')]
if not cells_expr.columns.is_unique:
    cells_expr = cells_expr.loc[:, ~cells_expr.columns.duplicated(keep='first')]

# Keep only samples present in both tables, in a deterministic (sorted) order.
samples = sorted(set(cells_annot.index) & set(cells_expr.columns))
cells_expr = cells_expr[samples]
cells_annot = cells_annot.loc[samples]
assert cells_expr.shape[1] == cells_annot.shape[0], 'expression/annotation sample mismatch'

print(cells_expr.shape, cells_annot.shape)
print(cancer_expr.shape, cancer_annot.shape)

(18792, 3626) (3626, 2)
(18792, 2166) (2166, 2)


In [5]:
# Load the cell-type configuration and list the subtypes of 'General_cells'
# that have no sample annotated with that exact label.
cell_types = CellTypes.load('configs/full_blood_model.yaml')
annotated_labels = cells_annot['Cell_type'].unique()  # computed once, not per candidate
missing_cts = [
    candidate
    for candidate in cell_types.get_all_subtypes('General_cells')
    if candidate not in annotated_labels
]
missing_cts

['Lymphoid_cells', 'Memory_T_helpers']

In [6]:
# Adding missing cell types.
# For every subtype of 'General_cells' that has no annotated sample, the
# samples of its direct subtypes are duplicated under the parent label.
# The list is recomputed from the current cells_annot on each run, so
# re-running this cell after it has succeeded adds nothing.
cell_types = CellTypes.load('configs/full_blood_model.yaml')
annotated_labels = cells_annot['Cell_type'].unique()
missing_cts = [
    candidate
    for candidate in cell_types.get_all_subtypes('General_cells')
    if candidate not in annotated_labels
]

for ct in missing_cts:
    subtypes = cell_types.get_direct_subtypes(ct)
    # Explicit copy: the frame is modified below and must not be treated as a
    # slice of cells_annot (chained assignment / SettingWithCopyWarning).
    annot = cells_annot.loc[cells_annot['Cell_type'].isin(subtypes)].copy()
    expr = cells_expr[annot.index]
    annot['Cell_type'] = ct
    # Suffix the sample ids with the parent label so the copies do not collide
    # with the original samples; each copy becomes its own dataset.
    annot.index = annot.index + f'_{ct}'
    annot['Dataset'] = annot.index
    expr.columns = expr.columns + f'_{ct}'
    # cells_expr / cells_annot are extended inside the loop on purpose: a
    # later missing type can pick up samples added for an earlier one.
    cells_expr = pd.concat([cells_expr, expr], axis=1)
    cells_annot = pd.concat([cells_annot, annot])

# Make sure there are no repeated samples. Selecting labels from an axis with
# duplicates returns every occurrence, so duplicated ids are dropped
# explicitly (first occurrence kept) before the two tables are aligned.
if not cells_annot.index.is_unique:
    cells_annot = cells_annot.loc[~cells_annot.index.duplicated(keep='first')]
if not cells_expr.columns.is_unique:
    cells_expr = cells_expr.loc[:, ~cells_expr.columns.duplicated(keep='first')]

samples = sorted(set(cells_annot.index) & set(cells_expr.columns))
cells_expr = cells_expr[samples]
cells_annot = cells_annot.loc[samples]
print(cells_expr.shape, cells_annot.shape)

(18792, 4213) (4213, 2)


In [7]:
# Free up memory: drop the raw input tables; the training cell below only
# consumes the cells_* / cancer_* tables derived from them.
# Note: re-running this cell without reloading the data raises NameError.
del data_annot, data_expr
del lab_expr, lab_annot
gc.collect()

64

In [8]:
# Model training
# The mixer receives the normal-cell and cancer-cell tables prepared above.
# NOTE(review): num_av and num_points are passed through unchanged; their
# exact meaning is defined in core.mixer.Mixer — confirm there before tuning.
mixer = Mixer(
    cell_types=cell_types,
    cells_expr=cells_expr,
    cells_annot=cells_annot,
    tumor_expr=cancer_expr,
    tumor_annot=cancer_annot,
    num_av=3,
    num_points=300000,
)

# Two-step model; the parameters of each step are read from the TSV files
# given here.
model = DeconvolutionModel(
    cell_types,
    boosting_params_first_step='configs/boosting_params/lgb_parameters_first_step.tsv',
    boosting_params_second_step='configs/boosting_params/lgb_parameters_second_step.tsv',
)
model.fit(mixer)

Checking normal cells expressions...
Expressions OK
Checking cancer cells expressions...
Expressions OK
Generating mixes for B_cells model
Fitting B_cells model
Trained in:  1.0 sec.


Generating mixes for Basophils model
Fitting Basophils model
Trained in:  1.0 sec.


Generating mixes for CD27neg_Memory_B_cells model
Fitting CD27neg_Memory_B_cells model
Trained in:  2.3 sec.


Generating mixes for CD4_T_cells model
Fitting CD4_T_cells model
Trained in:  1.4 sec.


Generating mixes for CD57neg_Cytotoxic_NK_cells model
Fitting CD57neg_Cytotoxic_NK_cells model
Trained in:  2.3 sec.


Generating mixes for CD57pos_Cytotoxic_NK_cells model
Fitting CD57pos_Cytotoxic_NK_cells model
Trained in:  1.9 sec.


Generating mixes for CD8_T_cells model
Fitting CD8_T_cells model
Trained in:  1.1 sec.


Generating mixes for CDC model
Fitting CDC model
Trained in:  2.5 sec.


Generating mixes for Central_memory_CD8_T_cells model
Fitting Central_memory_CD8_T_cells model
Trained in:  3.3 sec.


Generating 