 ### Pre-processing and curation of T data
 - Updated dataset to match cells used in Gouwens et al. 2020

In [1]:
import feather
import numpy as np
import pandas as pd
import scipy.io as sio
from cplAE_TE.utils.load_helpers import get_paths, load_dataset, load_summary_files

# Gene-selection cutoff: only genes whose `BetaScore` in the gene-set file is
# strictly greater than this value are kept when T_dat is restricted by genes.
beta_threshold = 0.4

def set_raw_data_paths(base_path='/Users/fruity/Dropbox/AllenInstitute/CellTypes/dat/raw/patchseq-inh/'):
    """Return the locations of the raw input files used by this notebook.

    Args:
        base_path: Directory that holds the raw files. Defaults to the
            original author's local directory so existing calls keep working;
            pass another directory to run the notebook on a different machine.
            A trailing separator is optional.

    Returns:
        dict mapping a short key to a path string:
            'T_dat'        -> data.feather
            'T_ann'        -> anno.feather
            'gene_set'     -> good_genes_beta_score.csv
            'specimen_ids' -> specimen_ids.txt
            'color_ref'    -> color_ref.csv
    """
    # Local import keeps this block self-contained; os.path.join inserts the
    # separator only when base_path does not already end with one.
    import os

    file_names = {'T_dat': 'data.feather',
                  'T_ann': 'anno.feather',
                  'gene_set': 'good_genes_beta_score.csv',
                  'specimen_ids': 'specimen_ids.txt',
                  'color_ref': 'color_ref.csv'}
    return {key: os.path.join(base_path, name) for key, name in file_names.items()}

In [2]:
# Resolve the raw file locations, then read the data table and the annotation table
pth = set_raw_data_paths()
T_dat, T_ann = (feather.read_dataframe(pth[key]) for key in ('T_dat', 'T_ann'))

In [3]:
#Restrict the annotations to the specimens published in Gouwens et al. 2020
ids = pd.read_csv(pth['specimen_ids'], header=None).rename(columns={0: 'specimen_id'})

#Specimen labels are cast to int64 before being matched against the published ids
is_published = T_ann['spec_id_label'].astype(np.int64).isin(ids['specimen_id'])
annotation_columns = ['spec_id_label',
                      'sample_id',
                      'topLeaf_id',
                      'topLeaf_label',
                      'topLeaf_color',
                      'Tree_call_label']
T_ann = T_ann.loc[is_published, annotation_columns].reset_index(drop=True)

In [4]:
# Match the mapping confidence labels with the data published in Gouwens et al. 2020:
#   Core, I1 -> highly_consistent
#   I2, I3   -> mod_consistent
#   PoorQ    -> inconsistent
tree_call = T_ann['Tree_call_label']
condlist = [tree_call.isin(['Core', 'I1']),
            tree_call.isin(['I2', 'I3']),
            tree_call == 'PoorQ']

choicelist  = ['highly_consistent',
               'mod_consistent',
               'inconsistent']

# A Tree_call_label matched by none of the conditions gets an explicit sentinel
# rather than a misspelled 'highly consistent' label, which would silently pass
# the later "!= 'inconsistent'" filter. Fail loudly if any such label exists.
T_ann['mapping_confidence'] = np.select(condlist, choicelist, default='unassigned')
unassigned_calls = T_ann.loc[T_ann['mapping_confidence'] == 'unassigned', 'Tree_call_label'].unique()
assert len(unassigned_calls) == 0, 'Unexpected Tree_call_label values: {}'.format(list(unassigned_calls))

T_ann = T_ann.rename(columns={'topLeaf_id':'cluster_id',
                      'topLeaf_color':'cluster_color',
                      'topLeaf_label':'cluster'})

# show the resulting mapping
T_ann[['mapping_confidence','Tree_call_label']].drop_duplicates().sort_values(by='mapping_confidence').reset_index(drop=True)

Unnamed: 0,mapping_confidence,Tree_call_label
0,highly_consistent,Core
1,highly_consistent,I1
2,inconsistent,PoorQ
3,mod_consistent,I2
4,mod_consistent,I3


In [5]:
#Number of cells per mapping-confidence category.
#These counts match the numbers in Fig. S1 G of Gouwens et al. 2020.
(
    T_ann['mapping_confidence']
    .value_counts()
    .to_frame()
)

Unnamed: 0,mapping_confidence
highly_consistent,2954
mod_consistent,900
inconsistent,415


 - Drop cells that are mapped inconsistently to leaf nodes
 - Drop cell types with fewer than `10` samples

In [6]:
#Discard cells whose mapping confidence is 'inconsistent' and renumber the rows
T_ann = T_ann.loc[T_ann['mapping_confidence'].ne('inconsistent')].reset_index(drop=True)

In [7]:
#Keep only cell types (clusters) that are represented by at least 10 samples
samples_per_type = T_ann['cluster'].value_counts()
keep_types = samples_per_type.index[samples_per_type >= 10].values
T_ann = T_ann.loc[T_ann['cluster'].isin(keep_types)]

In [8]:
#Reference table giving, for each cluster label, its color and dendrogram id
ref_df = (
    pd.read_csv(pth['color_ref'])[['cluster_label','cluster_color','dendcluster_id']]
    .rename(columns={'cluster_label':'cluster','dendcluster_id':'cluster_id'})
)

#Columns retained in the annotation dataframe after each merge below
annotation_columns = ['spec_id_label','sample_id','cluster','cluster_color','cluster_id','mapping_confidence']

#Replace the existing color/id columns with the reference values:
#the left-hand (existing) columns receive the '_old' suffix and are then dropped
T_ann = T_ann.merge(right=ref_df,
                    how='left',
                    left_on='cluster',
                    right_on='cluster',
                    suffixes=('_old',''))[annotation_columns]

#Re-number the remaining cluster ids as consecutive integers starting at 0
#(np.unique already returns its values in sorted order)
old_id = np.unique(T_ann['cluster_id'].values)
new_id = np.arange(old_id.size)
ref_df = pd.DataFrame({'cluster_id':new_id,'old_id':old_id})

#Swap each old id for its new consecutive id
T_ann = T_ann.merge(right=ref_df,
                    how='left',
                    left_on='cluster_id',
                    right_on='old_id',
                    suffixes=('_old',''))[annotation_columns]

In [9]:
#Order rows by cluster id so the annotation file is easier for a human to read
T_ann = T_ann.sort_values(by='cluster_id', axis=0).reset_index(drop=True)

In [10]:
#Compare with the dataset version used in the Gala et al. 2020 bioRxiv preprint
import scipy.io as sio

def comparison_check(T_ann, old_mat_path='/Users/fruity/Dropbox/AllenInstitute/CellTypes/code/cplAE_patchseq_TE/data/proc/PS_v5_beta_0-4_pc_scaled_ipfx_eqTE.mat'):
    """Compare per-cluster sample counts in `T_ann` with an earlier dataset.

    Args:
        T_ann: annotation dataframe with a 'cluster' column.
        old_mat_path: path of the .mat file holding the earlier dataset; it
            must contain a 'cluster' array. Defaults to the file used for the
            bioRxiv version on the original author's machine.

    Returns:
        DataFrame with columns 'index' (cluster label), 'new_count',
        'old_count' and 'diff' (new_count - old_count), all counts as
        integers. Clusters present in only one dataset get a count of 0 in
        the other.
    """
    def _cluster_counts(labels, count_name):
        # Name the label column and the count column explicitly: the names
        # produced by value_counts() differ between pandas 1.x and 2.x, so
        # relying on the defaults breaks the merge below on newer versions.
        counts = pd.Series(labels).value_counts()
        return counts.rename_axis('index').reset_index(name=count_name)

    new_counts = _cluster_counts(T_ann['cluster'], 'new_count')

    O = sio.loadmat(old_mat_path, squeeze_me=True)
    old_counts = _cluster_counts(O['cluster'], 'old_count')

    compare_df = new_counts.merge(old_counts, how='outer', left_on='index', right_on='index')
    # Only the count columns can be missing after the outer merge
    compare_df = compare_df.fillna({'new_count': 0, 'old_count': 0})
    compare_df['diff'] = compare_df['new_count'] - compare_df['old_count']
    compare_df = compare_df.astype({'new_count': 'int', 'old_count': 'int', 'diff': 'int'})
    return compare_df

# Display the per-cluster count comparison for the curated annotation dataframe
comparison_check(T_ann)

Unnamed: 0,index,new_count,old_count,diff
0,Sst Calb2 Pdlim5,260,237,23
1,Pvalb Reln Itm2a,224,203,21
2,Sst Hpse Cbln4,221,199,22
3,Lamp5 Lsp1,175,158,17
4,Pvalb Tpbg,167,148,19
5,Pvalb Sema3e Kank4,166,152,14
6,Sst Rxfp1 Prdm8,141,136,5
7,Lamp5 Plch2 Dock5,138,130,8
8,Sst Esm1,131,112,19
9,Vip Ptprt Pkp2,123,114,9


In [11]:
#Select the genes whose BetaScore exceeds the threshold
gene_scores = pd.read_csv(pth['gene_set'])
keep_gene_id = gene_scores.loc[gene_scores['BetaScore'] > beta_threshold, 'Gene'].to_list()

#Restrict T data to the sample identifier plus the selected genes
keepcols = ['sample_id', *keep_gene_id]
T_dat = T_dat[keepcols]

In [12]:
#Keep only the samples present in the annotation dataframe and put the rows
#in the same order as the annotations
T_dat = (
    T_dat.loc[T_dat['sample_id'].isin(T_ann['sample_id'])]
    .set_index(keys='sample_id')
    .reindex(labels=T_ann['sample_id'])
    .reset_index(drop=False)
)


In [13]:
#Row i of the annotations must describe row i of the data before anything is written
ann_samples = T_ann['sample_id'].sort_index(axis=0)
dat_samples = T_dat['sample_id'].sort_index(axis=0)
assert (ann_samples == dat_samples).all(), 'Order of annotation and data samples is different!'

#Write both tables as CSV files without the row index
for frame, out_path in ((T_dat, '/Users/fruity/Dropbox/AllenInstitute/CellTypes/dat/proc/patchseq-inh/T_data.csv'),
                        (T_ann, '/Users/fruity/Dropbox/AllenInstitute/CellTypes/dat/proc/patchseq-inh/T_annotations.csv')):
    frame.to_csv(out_path, index=False)