In [87]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from helpers import *

In [88]:
#   Read main dataset and annotation about cell type adn centriole existence 
df = pd.read_csv('./data/S7_table.csv')
annotation = pd.read_csv('./data/annotation.csv')
data = pd.read_csv('./data/data_gaps.csv')


In [89]:
# Create a dictionary of existing centriol
annotation = annotation.drop(columns = ['cell_status'])
annotation['centriole'] = np.where(annotation['centriole'] == 'absent', 0, annotation['centriole'])
annotation['centriole'] = np.where(annotation['centriole'] == 'present', 1, annotation['centriole'])

# dictionary of centriole
annotation_dictionary = annotation.set_index('cell_type').T.to_dict('list')

In [90]:
# Delete from main dataset undefferentiated cell type and adjusted.tmp equals 0.0  
df.drop(df[df['adjusted.tpm.estimate'] == 0.0].index, inplace = True)
df.drop(df[df["cell.bin"].apply(lambda x: define_undefferentiated(x)) == True].index, inplace = True)

# drop columns which will not be used
df.drop('raw.tpm.estimate', inplace=True, axis=1)
df.drop('bootstrap.median.tpm', inplace=True, axis=1)
df.drop('ci.95p.lb', inplace=True, axis=1)
df.drop('ci.80p.lb', inplace=True, axis=1)
df.drop('ci.95p.ub', inplace=True, axis=1)
df.drop('ci.80p.ub', inplace=True, axis=1)

# add column with cell type and cell bin
df['cell_type'] = df["cell.bin"].apply(lambda x: split_cell_type(x, 1))
df['cell_bin'] = df["cell.bin"].apply(lambda x: split_cell_type(x, 0))

In [91]:
test_gene = df[df['gene'] == 'aap-1']
test_gene.head()

Unnamed: 0,gene,gene.id,cell.bin,adjusted.tpm.estimate,cell_type,cell_bin
0,aap-1,WBGene00000001,ADE:390_510,14.5,ADE,390_510
2,aap-1,WBGene00000001,ADF:390_510,30.7,ADF,390_510
6,aap-1,WBGene00000001,ADL:330_390,55.0,ADL,330_390
7,aap-1,WBGene00000001,ADL:390_510,59.4,ADL,390_510
8,aap-1,WBGene00000001,ADL:510_650,49.4,ADL,510_650


In [92]:
# create a new dataFrame for new data
general_df = pd.DataFrame()
keys = ['210_270', '270_330', '330_390', '390_450', '450_510', '510_580', '580_650', 'gt_650']

for gene, group in tqdm(df.groupby(['gene'])):
    general_df_cell = pd.DataFrame()
    for cell_type, cell in group.groupby(['cell_type']):
        dict_cell_tb = {}
        for i in keys:
            dict_cell_tb[i] = 0.0
        # assing existiing estimate
        for index in range(cell['cell_bin'].size):
            value_tb_table = cell["cell_bin"].values[index]
            estimate = cell['adjusted.tpm.estimate'].values[index]
            if cell['adjusted.tpm.estimate'].size != 1:
                if value_tb_table in dict_cell_tb.keys():
                    dict_cell_tb[value_tb_table] = estimate
                else: 
                    for k in dict_cell_tb:
                        k_split = k.split('_')
                        tb_split = value_tb_table.split('_')
                        if (k_split[0] == tb_split[0] or k_split[1] == tb_split[0] or k_split[0] == tb_split[1] or k_split[1] == tb_split[1]) > 0:
                            if k_split[0] == tb_split[0]:
                                dict_cell_tb[k] =  estimate
                            if dict_cell_tb[k] == 0.0: 
                                    dict_cell_tb[k] =  estimate
                list_d = list(dict_cell_tb.values())
                median = np.median(list_d)
                for k in dict_cell_tb:
                    if dict_cell_tb[k] == 0.0: 
                        dict_cell_tb[k] =  median
                # add dataFrame of bins and bins for particular cell
                df_cell = pd.DataFrame.from_dict([dict_cell_tb])
                df_cell["cell_type"] = cell_type
                df_cell['centriole'] = annotation_dictionary[cell_type]
                df_cell['gene'] =  gene
        if general_df_cell.empty:
            general_df_cell = df_cell
        else:
            general_df_cell = pd.concat([general_df_cell, df_cell], axis=0)
    # add dataFrame of cell type and bins for particular gene
    if general_df.empty:
        general_df = general_df_cell
    else:
        general_df = pd.concat([general_df, general_df_cell], axis=0)

100%|██████████| 17194/17194 [32:56<00:00,  8.70it/s]


In [93]:
general_df

Unnamed: 0,210_270,270_330,330_390,390_450,450_510,510_580,580_650,gt_650,cell_type,centriole,gene
0,25.60,25.60,51.2,162.0,162.0,84.7,25.60,25.60,IL2,0,B0250.18
0,25.60,25.60,51.2,162.0,162.0,84.7,25.60,25.60,IL2,0,B0250.18
0,1.15,1.15,19.7,2.3,2.3,2.3,1.15,1.15,ADL,0,2L52.1
0,1.15,1.15,19.7,2.3,2.3,2.3,1.15,1.15,ADL,0,2L52.1
0,1.15,1.15,19.7,2.3,2.3,2.3,1.15,1.15,ADL,0,2L52.1
...,...,...,...,...,...,...,...,...,...,...,...
0,130.70,130.70,130.7,130.7,261.4,722.2,1596.90,1007.50,mu_int_mu_anal,0,zyx-1
0,1.35,1.35,3.4,4.4,4.6,2.7,1.35,1.35,pm3_pm4_pm5,0,zyx-1
0,1.35,1.35,3.4,4.4,4.6,2.7,1.35,1.35,pm3_pm4_pm5,0,zyx-1
0,1.35,1.35,3.4,4.4,4.6,2.7,1.35,1.35,pm3_pm4_pm5,0,zyx-1


In [94]:
# change order of column
data = general_df[["gene","cell_type","210_270","270_330", "330_390", "390_450", "450_510", "510_580", "580_650", "gt_650", "centriole"]]

# save resulting dataset for future calculation 
data.to_csv('./data/data_without_one_estimate_for_cell.csv', index=False)