# Preparation of T_data
This notebook takes the `data.feather`, `anno.feather` and specimen-ID files and creates the log-CPM and annotation input files required by the Cpl_AE_TE model.

In [1]:
import feather
import numpy as np
import pandas as pd
import scipy.io as sio
from cplAE_TE.utils.load_helpers import get_paths, load_dataset, load_summary_files

# Exclusive lower bound on the 'BetaScore' column of the gene-set file;
# only genes scoring above it are kept when the data table is restricted.
beta_threshold = 0.4

def set_raw_data_paths(base_path='/Users/fahimehb/Documents/git-workspace/coupledAE-patchseq/data/proc/'):
    """Build the dictionary of file locations used by this notebook.

    Args:
        base_path: Directory that holds the input files and receives the
            outputs. Defaults to the location that was previously
            hard-coded. A trailing path separator is appended when missing,
            because the returned values (including 'save_path') are used
            with plain string concatenation.

    Returns:
        dict with the keys 'save_path' (the directory prefix), 'T_dat',
        'T_ann', 'gene_set' and 'specimen_ids' (full file paths).
    """
    import os  # local import keeps this cell self-contained

    # Joining with an empty component only adds a separator if one is absent,
    # so the default (which already ends in '/') is left unchanged.
    base_path = os.path.join(base_path, '')

    file_names = (
        ('T_dat', 'data.feather'),
        ('T_ann', 'anno.feather'),
        ('gene_set', 'good_genes_beta_score.csv'),
        ('specimen_ids', 'inh_spec_ids.txt'),
    )

    pth = {'save_path': base_path}
    for key, file_name in file_names:
        pth[key] = base_path + file_name
    return pth

In [2]:
# Resolve the file locations, then read the data and annotation feather files.
pth = set_raw_data_paths()
T_dat, T_ann = (feather.read_dataframe(pth[key]) for key in ('T_dat', 'T_ann'))

In [3]:
# Keep only the samples whose specimen id appears in the specimen-id list
# (pth['specimen_ids'], i.e. 'inh_spec_ids.txt'). The file has no header row,
# so its single unnamed column 0 is renamed to 'specimen_id'.
ids = pd.read_csv(pth['specimen_ids'], header=None).rename(columns={0: 'specimen_id'})

# Annotation columns carried forward to the saved annotation file.
annotation_columns = ['spec_id_label',
                      'sample_id',
                      'Tree_first_cl_id',
                      'Tree_first_cl_label',
                      'Tree_first_cl_color',
                      'Tree_call_label']

# spec_id_label is cast to int64 so it can be compared with the ids as read
# from the text file.
is_listed = T_ann['spec_id_label'].astype(np.int64).isin(ids['specimen_id'])
T_ann = T_ann.loc[is_listed, annotation_columns].reset_index(drop=True)
print(T_ann.shape)

(3819, 6)


In [4]:
# Preview the first rows of the filtered annotation table.
T_ann.head()

Unnamed: 0,spec_id_label,sample_id,Tree_first_cl_id,Tree_first_cl_label,Tree_first_cl_color,Tree_call_label
0,888001481,P1S4_190617_001_A01,57.0,Lamp5 Fam19a1 Tmem182,#FFB8CE,Core
1,736493069,P2S4_180813_052_A01,57.0,Lamp5 Fam19a1 Tmem182,#FFB8CE,Core
2,644941196,P9S4_171024_408_A01,57.0,Lamp5 Fam19a1 Tmem182,#FFB8CE,Core
3,658075752,PAS4_180123_452_A01,57.0,Lamp5 Fam19a1 Tmem182,#FFB8CE,Core
4,867017156,PAS4_190513_452_A01,57.0,Lamp5 Fam19a1 Tmem182,#FFB8CE,Core


In [5]:
# Preview the first rows of the data table as loaded (all columns, all samples).
T_dat.head()

Unnamed: 0,sample_id,0610005C13Rik,0610006L08Rik,0610007P14Rik,0610009B22Rik,0610009E02Rik,0610009L18Rik,0610009O20Rik,0610010B08Rik,0610010F05Rik,...,n-R5s134,n-R5s136,n-R5s138,n-R5s139,n-R5s141,n-R5s142,n-R5s143,n-R5s144,n-R5s146,n-R5s149
0,P1S4_170807_003_A01,0.0,0.0,85.439281,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,P1S4_190214_003_A01,0.0,0.0,89.029484,14.206833,51.144597,0.0,0.0,0.0,158.169402,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,P1S4_190624_004_A01,0.0,0.0,69.735493,19.924427,0.0,0.0,0.0,0.0,4.981107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,P1S4_190725_002_A01,0.0,0.0,63.821876,23.123868,0.0,0.0,79.546106,0.0,58.272148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,P1S4_190812_002_A01,0.0,0.0,54.232876,0.0,14.790784,0.0,0.0,0.0,212.987295,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Read the gene-set table and collect the names of the genes whose BetaScore
# is strictly above beta_threshold.
gene_table = pd.read_csv(pth['gene_set'])
passes_threshold = gene_table['BetaScore'] > beta_threshold
keep_gene_id = gene_table.loc[passes_threshold, 'Gene'].to_list()

# Keep the sample identifier plus the selected gene columns only.
keepcols = ['sample_id']
keepcols.extend(keep_gene_id)
T_dat = T_dat.loc[:, keepcols]

In [7]:
# Preview the data table after restricting it to the selected gene columns.
T_dat.head()

Unnamed: 0,sample_id,Lhx6,Gad2,Slc32a1,Neurod2,Slc6a1,Gad1,Adarb2,Sv2b,Nrn1,...,Adrb1,Chst1,Trps1,2610100L16Rik,Gnaz,Pth2r,Ier5l,Cmya5,Coro2a,Spsb4
0,P1S4_170807_003_A01,0.0,0.0,0.0,50.085096,0.0,0.0,0.0,788.59474,377.111308,...,56.95952,6.874425,65.798067,0.0,54.013338,0.0,0.0,0.0,5.892364,0.0
1,P1S4_190214_003_A01,0.0,0.0,0.0,26.519421,62.510063,0.0,0.0,374.113257,104.183439,...,9.471222,125.020126,20.836688,0.0,38.832009,0.0,0.947122,22.730932,3.788489,0.0
2,P1S4_190624_004_A01,0.0,1.992443,0.0,84.678813,0.0,0.0,0.0,263.002432,82.686371,...,44.82996,99.622133,90.656141,241.085562,22.913091,0.0,4.981107,0.0,0.0,0.0
3,P1S4_190725_002_A01,0.0,0.0,0.0,61.047012,0.0,0.0,0.924955,299.68533,180.366171,...,95.270337,3.699819,77.696197,71.221514,18.499094,0.0,14.799276,7.399638,24.973778,0.0
4,P1S4_190812_002_A01,0.0,0.0,0.0,105.507595,102.549438,0.0,0.0,257.359648,657.696878,...,7.888418,76.912079,126.214693,26.623412,53.246824,0.0,0.0,0.0,41.414196,0.0


In [8]:
# Restrict to samples in the annotation dataframe and put the rows in the
# same order as T_ann.
ann_sample_ids = T_ann['sample_id']

# reindex() silently creates an all-NaN row for every label it cannot find,
# so make sure each annotated sample really has a row in the data table.
missing_samples = ann_sample_ids[~ann_sample_ids.isin(T_dat['sample_id'])]
assert missing_samples.empty, (
    '{} annotated samples are missing from the data table, e.g. {}'.format(
        len(missing_samples), missing_samples.head().tolist()))

# Chained (non-inplace) form: filter rows, index by sample_id, reorder to the
# annotation order, then turn sample_id back into a regular column.
T_dat = (T_dat[T_dat['sample_id'].isin(ann_sample_ids)]
         .set_index(keys='sample_id')
         .reindex(labels=ann_sample_ids)
         .reset_index(drop=False))

In [9]:
# Apply log(x + 1) to the cpm values of the selected gene columns.
# np.log is the natural logarithm (base e), not base 2.
# NOTE(review): if a base-2 transform was intended this would need np.log2 —
# confirm which scale the downstream model expects before changing it.
T_dat[keep_gene_id] = np.log(T_dat[keep_gene_id]+1)

In [10]:
# Preview the data table after sample filtering/reordering and the log transform.
T_dat.head()

Unnamed: 0,sample_id,Lhx6,Gad2,Slc32a1,Neurod2,Slc6a1,Gad1,Adarb2,Sv2b,Nrn1,...,Adrb1,Chst1,Trps1,2610100L16Rik,Gnaz,Pth2r,Ier5l,Cmya5,Coro2a,Spsb4
0,P1S4_190617_001_A01,0.0,7.035926,3.91454,0.0,5.673589,7.626291,8.495708,0.0,0.0,...,3.617912,0.0,3.707914,4.609378,0.0,0.0,0.0,0.0,0.0,0.0
1,P2S4_180813_052_A01,0.0,5.678864,2.888838,0.0,6.923124,6.884398,8.493333,0.0,0.0,...,0.0,5.310021,0.0,4.461687,0.0,0.0,0.0,4.522157,0.0,0.0
2,P9S4_171024_408_A01,0.0,5.400001,0.0,0.0,7.171507,6.689848,8.398767,0.0,0.0,...,0.0,0.0,0.855346,0.0,0.0,0.0,0.0,0.0,0.855346,0.0
3,PAS4_180123_452_A01,0.0,6.680487,4.195907,0.0,6.120281,7.039112,7.851552,3.503117,0.0,...,2.37614,0.0,4.370792,3.442537,2.058511,0.0,3.983817,0.0,5.337449,0.0
4,PAS4_190513_452_A01,0.0,7.369056,3.585549,0.0,6.524931,7.472464,9.796245,0.0,0.0,...,2.74358,2.986679,1.813628,1.48654,4.470492,0.0,0.0,0.0,0.0,0.0


In [11]:
# Sanity check: the sample ids of both tables must match row by row.
ann_ids = T_ann['sample_id'].sort_index(axis=0)
dat_ids = T_dat['sample_id'].sort_index(axis=0)
assert (ann_ids == dat_ids).all(), 'Order of annotation and data samples is different!'

# Write both tables as CSV files (without the index) under the save prefix.
save_path = pth['save_path']
for frame, file_name in ((T_dat, 'inh_T_data.csv'),
                         (T_ann, 'inh_T_annotations.csv')):
    frame.to_csv(save_path + file_name, index=False)

# T_dat.to_csv('/Users/fahimehb/Documents/Coupled_AE_EXC_patchseq/dat/proc/T_data.csv',index=False)
# T_ann.to_csv('/Users/fahimehb/Documents/Coupled_AE_EXC_patchseq/dat/proc/T_annotations.csv',index=False)

In [12]:
# Show the directory prefix the CSV files were written to.
save_path

'/Users/fahimehb/Documents/git-workspace/coupledAE-patchseq/data/proc/'

In [14]:
# Show the (rows, columns) shape of the annotation table.
T_ann.shape

(3819, 6)