In [108]:
import json
import os
from multiprocessing import pool

import numpy as np
import pandas as pd
import requests
import seaborn as sns
from gensim.utils import simple_preprocess
from nltk.stem import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from tqdm.notebook import tqdm

import nferx_py.fn as nf

In [2]:
# English Snowball stemmer instance. NOTE(review): no other cell shown in this
# notebook references `stemmer` — confirm whether it is still needed.
stemmer = SnowballStemmer('english')

In [3]:
# Credentials are read from the environment so the API key is never stored in
# (or committed with) the notebook. Export NFERX_API_KEY (and optionally
# NFERX_USER, which defaults to 'nfer') before starting the kernel.
# A missing NFERX_API_KEY raises KeyError here instead of failing inside nferX.
nf.authenticate(os.environ.get('NFERX_USER', 'nfer'), os.environ['NFERX_API_KEY'])
# Set both the 'server' and 'api_server' defaults to 'preview'.
nf.modify_defaults('server', 'preview')
nf.modify_defaults('api_server', 'preview')
# Kept at module level: passed as `auth=` to the direct `requests` call below.
AUTH = nf.AUTH

2020-05-11 12:17:30,292 : INFO : Authentication successful - nferX is online
2020-05-11 12:17:30,293 : INFO : Default parameter server modified
2020-05-11 12:17:30,294 : INFO : Default parameter api_server modified


### notes
1. Get local scores for all gene/cell combinations from single cell api for Tabula muris (study 2) and mouse cell atlas (study 15)
2. Do "intra-study" testing: for each Tabula Muris cluster, (1) identify the subset of genes to consider (e.g. top 0.05% by Cohen's d vs. all other cells), (2) calculate the cosine similarity between the mean CP10K vector of this gene subset and the literature score vectors for all Tabula Muris cell types with this same gene subset. This would give us a ranked list of literature-derived labels for each cluster. Then repeat this with Mouse Cell Atlas. **Update:** cosine similarity did not work out, so we switched to calculating the vector norm (root of the sum of squares) of the literature scores of the top genes selected from cluster A, across all clusters.
-------------step 3 yet to be done------------------------
3. Do "inter-study" testing: for each Tabula Muris cluster, (1) identify the subset of genes to consider (e.g. top 0.05% by Cohen's d vs. all other cells), (2) calculate the cosine similarity between the mean CP10K vector of this gene subset and the literature score vectors for all Mouse Cell Atlas cell types with this same gene subset. Then repeat this to label all Mouse Cell Atlas clusters using the literature vectors from the Tabula Muris dataset. This will require some handling of synonyms/cell "families", which I provided to Katie for this analysis. We can go over that later, since step 2 alone will take some time to set up and optimize.

### POST to single cell api to get the table

In [4]:
URL = 'https://pre-staging.nferx.com/singlecellapi/study2/summary?'
# NOTE(review): `headers` is defined but not passed to the request below.
headers = {'content-type' : 'application/json'}
# One page with a very large row limit — presumably sized to return the whole
# study-2 table in a single response (TODO confirm).
res = requests.post(url = URL, params = {'page': 1, 'rows': 3412880}, auth = AUTH)
# Fail loudly on an HTTP error instead of hitting an opaque JSON/KeyError below.
res.raise_for_status()
data_all = pd.DataFrame(res.json()['result']['data'])

# Build the cleaned table without in-place mutation so re-running the cell is safe.
data_all = data_all.drop(columns=['localScoreTissueType', 'greaterThanZero'])
# Each entry of localScoreCellType is a mapping; keep only its 'localScore' value.
data_all.localScoreCellType = [d['localScore'] for d in data_all.localScoreCellType]
# Cluster id = "<cellType>;<tissueType>" with spaces replaced by underscores.
data_all['cluster'] = data_all.cellType.str.replace(' ', '_') + ';'+data_all.tissueType.str.replace(' ', '_')

2020-05-11 12:19:34,533 : INFO : NumExpr defaulting to 8 threads.


In [5]:
# Preview the first rows of the cleaned summary table.
data_all.head()

Unnamed: 0,mean,cohenD,localScoreCellType,countOfCells,cohendNA,geneId,cellType,tissueType,geneName,cluster
0,8.853313,8.359104,0.615536,155,False,19188,erythroblasts,bone marrow,Beta-s,erythroblasts;bone_marrow
1,7.706127,4.943145,1.05881,390,False,11271,pancreatic A cells,pancreas,Gcg,pancreatic_A_cells;pancreas
2,5.716988,4.647337,2.573489,24,False,23282,mast cells,lung,Tmsb4x,mast_cells;lung
3,5.122446,4.616773,4.121519,4394,False,5689,microglia,brain,Hexb,microglia;brain
4,5.003388,4.5077,2.240599,89,False,17014,type II pneumocytes,lung,Sftpb,type_II_pneumocytes;lung


In [6]:
# Two gene x cluster matrices built from the long table: one holding the
# literature local score, the other holding Cohen's d.
gene_cluster_localScore, gene_cluster_cohend = (
    data_all.pivot(index = 'geneName', columns = 'cluster', values = value_column)
    for value_column in ('localScoreCellType', 'cohenD')
)

In [195]:
def get_top_genes_vector(df, top_n = None, quantile = 0.995):
    """Return the ``mean`` values of the genes with the highest Cohen's d in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``geneName``, ``cohenD`` and ``mean``. In this
        notebook it is one group of ``data_all.groupby('cluster')``.
    top_n : int, optional
        If truthy, keep only the ``top_n`` selected genes with the largest
        ``mean``, sorted in descending order. Otherwise all selected genes are
        returned in their original order.
    quantile : float, default 0.995
        Genes whose ``cohenD`` is greater than or equal to this quantile of
        ``df.cohenD`` are selected.

    Returns
    -------
    pandas.Series
        ``mean`` values indexed by ``geneName``.
    """
    # The cutoff used to be hard-coded to 0.995, silently ignoring `quantile`.
    cutoff = df['cohenD'].quantile(quantile)
    by_gene = df.set_index('geneName')
    top_genes_vec = by_gene.loc[by_gene['cohenD'] >= cutoff, 'mean']
    if top_n:
        # Explicit positional slice: never interpreted as a label lookup.
        return top_genes_vec.sort_values(ascending = False).iloc[:top_n]

    return top_genes_vec

In [223]:
def get_matrix(top_genes_mean, metric = 'cosine', local_scores = None):
    """Score one cluster's top genes against every cluster's literature local scores.

    Parameters
    ----------
    top_genes_mean : pandas.DataFrame
        Rows for one cluster with the columns ``geneName`` and ``mean``; a
        ``cluster`` column is dropped if present.
    metric : {'cosine', 'norm'}, default 'cosine'
        ``'cosine'``: cosine similarity between the mean-expression vector and
        each cluster's local-score vector over the same genes.
        ``'norm'``: root of the sum of squared local scores of those genes,
        computed per cluster (the mean values themselves are not used).
    local_scores : pandas.DataFrame, optional
        Gene x cluster matrix of local scores. Defaults to the notebook-level
        ``gene_cluster_localScore``.

    Returns
    -------
    pandas.Series
        One score per cluster (indexed by the columns of ``local_scores``).

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported values.
    """
    # The default used to be the typo 'cosiine', and unknown metrics fell
    # through both branches, ending in an UnboundLocalError on `cs`.
    if metric not in ('cosine', 'norm'):
        raise ValueError("metric must be 'cosine' or 'norm', got %r" % (metric,))
    if local_scores is None:
        local_scores = gene_cluster_localScore

    #matrix 1: top-genes means (shape: 1*N)
    # errors='ignore' keeps this working when the grouping column is not
    # handed to the applied function.
    top_genes_mean = (
        top_genes_mean
        .drop(columns = 'cluster', errors = 'ignore')
        .set_index('geneName')
        .transpose()
    )
    #matrix 2: top-genes(N) vs all - clusters'(M) local score (shape: N*M)
    gene_local_score_subset = local_scores.loc[top_genes_mean.columns,:]

    assert top_genes_mean.shape[1] == gene_local_score_subset.shape[0]
    if metric == 'cosine':
        cs = cosine_similarity(top_genes_mean.to_numpy(), gene_local_score_subset.transpose())
        cs = pd.Series(cs.reshape(-1), index = gene_local_score_subset.columns)
    else:
        # metric == 'norm': Euclidean norm of each cluster's local-score column.
        cs = gene_local_score_subset.pow(2).sum().pow(1/2)

    return cs

In [224]:
# For each top-gene count, score every cluster against all clusters' literature
# local scores with the 'norm' metric and write one CSV per count.
n_genes = [5, 10, 25]
for n in n_genes:
    # Per cluster: `mean` values of its top-n genes (see get_top_genes_vector).
    cluster_mean_vectors = data_all.groupby('cluster').apply(lambda x: get_top_genes_vector(x, top_n = n))
    # Flatten the grouped result back into columns before regrouping by cluster.
    cluster_mean_vectors = cluster_mean_vectors.reset_index()
    # Rows: source cluster whose top genes were used; columns: scored clusters.
    res = cluster_mean_vectors.groupby('cluster').apply(lambda x: get_matrix(x, 'norm'))
    # Suffix the row labels with '_TRUE' to tell them apart from the column labels.
    res.index = res.index + '_TRUE'
    res.to_csv('./data/RSS_top_%s_genes.csv'%n)
# NOTE: after the loop `n` and `res` hold the last iteration's values (n = 25),
# and `res` no longer refers to the HTTP response of the API cell.

In [225]:
# For every source cluster (row), list the scored clusters from highest to lowest score.
# Uses whatever `res` currently holds (the last top-n run of the loop above).
res.apply(lambda x: x.sort_values(ascending = False).index, axis = 1)

cluster
Alveolar_macrophages;lung_TRUE               Index(['macrophages;spleen', 'macrophages;musc...
Astrocytes;brain_TRUE                        Index(['Astrocytes;brain', 'Bergmann_glial_cel...
B_cells;adipose_tissue_TRUE                  Index(['B_cells;breast', 'B_cells;spleen', 'B_...
B_cells;bone_marrow_TRUE                     Index(['B_cells;muscle_organ', 'B_cells;adipos...
B_cells;breast_TRUE                          Index(['erythrocytes;vasculature', 'Blood_cell...
                                                                   ...                        
smooth_muscle_cells;heart_TRUE               Index(['fibroblasts;vasculature', 'fibroblasts...
stem_cells_of_epidermis;skin_of_body_TRUE    Index(['professional_antigen_presenting_cells;...
stromal_cells;breast_TRUE                    Index(['fibroblasts;heart', 'fibroblasts;vascu...
stromal_cells;lung_TRUE                      Index(['fibroblasts;heart', 'fibroblasts;vascu...
type_II_pneumocytes;lung_TRUE             

### Previous code

In [None]:
collection =  'patrick-cells-pl-200304'
n = 10
def query_lab(gene_list, n=n, collection=collection, window = 101):
    """Query nferX signals for the first ``n`` genes and pivot them to gene x token scores.

    Parameters
    ----------
    gene_list : sequence of str
        Gene names; only ``gene_list[0:n]`` is queried.
    n : int
        Number of leading genes to query. The default is the value of the
        module-level ``n`` at definition time.
    collection : str
        Passed to ``nf.get_signals_lab`` as ``control_collection``.
    window : int, default 101
        Passed through to ``nf.get_signals_lab``.

    Returns
    -------
    pandas.DataFrame
        float32 frame indexed by ``gene`` with one column per ``token`` holding
        ``score``. Empty if no query returned tabular data.
    """
    per_gene_frames = []
    for gene in tqdm(gene_list[0:n]):
        signals = nf.get_signals_lab(query = gene, control_collection = collection,
                                     window = window)
        # A str return is skipped — presumably an error / no-result message
        # from the API (TODO confirm).
        if isinstance(signals, str):
            continue
        signals = pd.DataFrame(signals)
        signals['gene'] = gene
        per_gene_frames.append(signals)

    # Nothing usable came back: return an empty frame instead of failing in pivot.
    if not per_gene_frames:
        return pd.DataFrame(dtype = 'float32')

    # Concatenate once instead of growing a frame inside the loop.
    result_df = pd.concat(per_gene_frames)
    return result_df.pivot(index = 'gene', columns = 'token', values = 'score').astype('float32')

In [None]:
# Smoke test: run the literature query for the first entry only.
# NOTE(review): `top_genes` is not defined in any cell shown here — confirm its source.
df_test = top_genes.iloc[0:1].apply(query_lab)

In [None]:
# Run the literature query for every entry of `top_genes`.
# NOTE(review): `top_genes` is not defined in any cell shown here — confirm its source.
df_all = top_genes.apply(query_lab)

In [None]:
# Flat set of every token that belongs to any synonym group.
# NOTE: `syns` is built in a later cell of this notebook, so that cell must run first.
syns_all  =  set([tok for ls in syns for tok in ls])
def sort_df(df, q = [0.3, 0.5, 0.75], by = [0.75, 0.5, 0.3]):
    """Collapse synonym rows of a score table and rank the result by quantiles.

    The input is transposed first, so the columns of ``df`` become the row
    labels that are matched against the synonym groups in the global ``syns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table; presumably the gene x token output of ``query_lab``
        (TODO confirm).
    q : list of float
        Quantiles computed for every collapsed column.
    by : list of float
        Quantile columns used, in this order, to sort the result descending.

    Returns
    -------
    pandas.DataFrame
        One row per collapsed label with the quantile columns first, sorted
        descending by ``by``, with duplicate rows dropped.

    Notes
    -----
    Reads the globals ``n``, ``syns`` and ``syns_all`` at call time.
    """
    df_collapsed = pd.DataFrame()
    df = df.T
    # Row labels that are not part of any synonym group.
    no_match = df.index.difference(syns_all)
#     print(no_match)
    # count = global n minus the number of NaN entries in each row.
    df.insert(loc = 0, column = 'count', value = n - df.isna().sum(1))
    for i, syn in tqdm(enumerate(syns)):
        df_subset = df[df.index.isin(syn)]
        if not df_subset.empty:
#             print(i)
#             print('\n',df_subset.index)
            
            # Collapse the group to its column-wise maximum, named after the
            # member with the highest `count`.
            df_idx = df_subset['count'].idxmax()
            df_max = df_subset.max()
            df_max.name = df_idx
            df_collapsed = pd.concat([df_collapsed, df_max], axis=1)
    # Labels outside every synonym group are carried over unchanged.
    if not df.loc[no_match,:].empty:
        df_collapsed = pd.concat([df_collapsed, df.loc[no_match,:].T], axis = 1)
    # NOTE(review): the `count` row is part of df_collapsed here, so it is
    # included in the quantile computation — confirm this is intended.
    quantiles = df_collapsed.quantile(q, axis = 0)
    # Put the quantile rows on top, then transpose so each label is a row again.
    df_collapsed = pd.concat([quantiles, df_collapsed]).T
    df_collapsed = df_collapsed.sort_values(by = by, ascending = False)
    
    return df_collapsed.drop_duplicates()

In [None]:
# Inspect the first query result, transposed.
df_all[0].T

In [None]:
# Apply sort_df to every element of df_all, keeping the original order.
sorted_dfs = list(map(sort_df, df_all))

In [None]:
# Columns of the first query result that are not in any synonym group.
# NOTE(review): `z` is not used by any other cell shown here.
z = df_all[0].columns.difference(syns_all)

In [None]:
# Spot-check the scores of a single token column in the first query result.
df_all[0].loc[:,'yolk_sac_derived_macrophages']

In [None]:
# Stack the leading tokens of every sorted table into one long frame.
lit_df = pd.DataFrame()
for i, idx in enumerate(top_genes.index):
    # sorted_dfs[i] is paired with the i-th index entry of top_genes.
    df = sorted_dfs[i].copy(deep = True)
#     df.insert(loc = 0, column = 'count', value = n - df.isna().sum(1))
    # presumably top_genes is indexed by (tissue, cell type) tuples — TODO confirm
    df.insert(loc = 0, column = 'tissue', value = idx[0])
    df.insert(loc = 0, column = 'true_cell', value = idx[1])
    # Move the label index into the first column and name that column 'token'.
    df = df.reset_index(drop = False, inplace = False)
    df.columns = ['token'] + df.columns[1:].to_list()
#     df = df[df['count'] > 3]
    # NOTE(review): .loc[0:10] is label-based and inclusive, so this keeps up to
    # 11 rows per table — confirm whether 10 was intended.
    lit_df = pd.concat([lit_df, df.loc[0:10, ['token', 'true_cell', 'tissue', 'count']]])

In [None]:
# Persist the stacked per-cluster token table.
lit_df.to_csv('./data/t10_genes_per_cluster.csv')

In [None]:
# Display the stacked per-cluster token table.
lit_df

In [None]:
# Tab-separated token file read without a header row; columns are named in the next cell.
ees_tokens = pd.read_csv('./data/appendTokens.csv', sep = '\t', header=None)

In [None]:
# Name the two columns: a token and one synonym per row.
ees_tokens.columns = ['token', 'syn']

In [None]:
# Series indexed by token; each value is the list of that token's `syn` entries.
synonyms = ees_tokens.groupby('token').apply(lambda x: x.syn.to_list())

In [None]:
# One set per token holding the token itself plus all of its synonyms.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
all_syns = [{token, *syn_list} for token, syn_list in synonyms.items()]

In [None]:
tokens = sorted_dfs[0].index
# Identity matrix labelled by token on both axes. The axes were previously
# labelled with `idx`, a leftover loop variable from the lit_df cell, which does
# not match the len(tokens) x len(tokens) shape.
syn_mat = pd.DataFrame(np.eye(len(tokens)), index=tokens, columns=tokens)

In [None]:
# Copy of the token/synonym pairs with a constant indicator column, pivoted in
# both directions; missing pairs are filled with 0.
ees_ = ees_tokens.assign(val = 1)
ees_1 = ees_.pivot_table(index='token', columns='syn', values = 'val').fillna(0)
ees_2 = ees_.pivot_table(index='syn', columns='token', values = 'val').fillna(0)

In [None]:
# Labels that appear both as a token (ees_1 rows) and as a synonym (ees_2 rows).
ees_1.index.intersection(ees_2.index)

In [None]:
# Number of synonym sets (one per token).
len(all_syns)

In [None]:
# Compare every ordered pair of synonym sets. Pairs that share at least one
# token are logged, then recorded under `duplicates` when the two sets are
# equal and under `subset` otherwise.
subset = {i: [] for i, _ in enumerate(all_syns)}
duplicates = {i: [] for i, _ in enumerate(all_syns)}
for i, syn_set in enumerate(all_syns):
    for j, syn_set_ in enumerate(all_syns):
        if i == j:
            continue
        if len(syn_set.intersection(syn_set_)) == 0:
            continue
        print('\n',i, j, '\nIntersection: ', syn_set.intersection(syn_set_))
        bucket = duplicates if syn_set_ == syn_set else subset
        bucket[i].append(j)
                
                

In [None]:
# Display, per synonym-set position, the positions of overlapping but non-identical sets.
subset

In [None]:
# Build the synonym groups: a token with overlapping sets is merged with them
# (merged groups are added only once); a token with no overlaps keeps its own
# synonyms plus itself.
syns = []
for k, v in subset.items():
    if not v:
        syns.append(list(set(synonyms.iloc[k] + [synonyms.index[k]])))
        continue
    syn_subset = synonyms.iloc[[k]+v]
    print(syn_subset.index)
    merged_tokens = syn_subset.explode().to_list() + syn_subset.index.to_list()
    s = list(set(merged_tokens))
    if s not in syns:
        syns.append(s)

In [None]:
# Display the synonym groups.
syns

In [None]:
# Rows of `d` whose label belongs to the second synonym group.
# NOTE(review): `d` is not defined in any cell shown here — confirm its source.
d[d.index.isin(syns[1])]