In [1]:
import anndata as ad
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import sys
import torch
import os
import matplotlib.pyplot as plt
import glob

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

  data = yaml.load(f.read()) or {}
  import pandas.util.testing as tm


### silhouette score

In [2]:
meta = pd.read_csv('data/rna.meta.csv',sep='\t')

ct_int = meta['seurat_annotations'].astype('category').cat.codes.to_numpy().astype('int')
ct_int = np.concatenate((ct_int,ct_int), axis=0)

tech_int = np.zeros(2*len(meta))
tech_int[len(meta):] = 1


In [3]:
merge_list = glob.glob('results/raw/*embed*') + ['results/raw/Seurat_umap.csv']
len(merge_list)

11

In [4]:
from sklearn.metrics import silhouette_score


method_id_list = []
for i in range(len(merge_list)):
    method_id = merge_list[i].split('/')[2].split('_')[0]
    if 'UMAP' in merge_list[i]:
        method_id = '%s (UMAP)'%(method_id)        
    method_id_list.append(method_id)

silh_score_df = pd.DataFrame({'method':method_id_list,
                              'cell_type_score':0,
                              'modality_score':0,
                              'batch_score':0})

for i in range(len(merge_list)):
    merge = pd.read_csv(merge_list[i]).iloc[:,1:].to_numpy()
    silh_score_df.iloc[i,1] = silhouette_score(merge, ct_int)
    silh_score_df.iloc[i,2] = 1 - silhouette_score(merge, tech_int)


In [5]:
silh_score_df #all

Unnamed: 0,method,cell_type_score,modality_score,batch_score
0,bindSC,0.073523,0.997772,0
1,Liger,0.101955,0.998244,0
2,Liger,0.091363,0.99804,0
3,Liger,0.115715,0.999181,0
4,SiaNN,0.262235,0.997367,0
5,Liger,0.105813,0.998752,0
6,Liger,0.071535,0.997577,0
7,Liger,0.128193,0.998793,0
8,Liger,0.091524,0.997854,0
9,Liger,0.098597,0.997398,0


In [6]:
silh_score_df.to_csv('results/silh_score.csv',index=False) #all

### FOSCTTM

In [8]:
def foscttm(neighborhood_file):
    neighborhood = open(neighborhood_file,'r').readlines()
    neighborhood = [int(i.strip()) for i in neighborhood]
    neighborhood = np.array(neighborhood)
    
    return np.mean(neighborhood)/len(neighborhood)

In [9]:
file_list = glob.glob('results/raw/neigh*')
file_list[0]

'results/raw/neighborhood_Liger.2.txt'

In [10]:
method_id_list = []
batch_list = []
for i in range(len(file_list)):
    method_id = file_list[i].split('_')[1].split('.')[0]
    if 'UMAP' in file_list[i]:
        method_id = '%s (UMAP)'%(method_id)        
    method_id_list.append(method_id)


foscttm_df = pd.DataFrame({'method':method_id_list,
                              'foscttm':0})

for i in range(len(file_list)):
    foscttm_df.iloc[i,1] = foscttm(file_list[i])

In [11]:
foscttm_df = foscttm_df.sort_values('foscttm')
foscttm_df.iloc[0:20,:]

Unnamed: 0,method,foscttm
5,bindSC,0.051458
9,Seurat,0.065906
1,siaNN,0.069865
2,Liger,0.078723
6,Liger,0.083967
3,Liger,0.086246
0,Liger,0.088387
8,Liger,0.090141
7,Liger,0.092339
4,Liger,0.092506


In [12]:
foscttm_df.to_csv('results/foscttm_score.csv',index=False)

### Label transfer accuracy

In [15]:
file_list = glob.glob('results/raw/*acc*')
file_list = sorted(file_list)
file_list[0]

'results/raw/Liger_transfer_acc.1.csv'

In [16]:
file_list

['results/raw/Liger_transfer_acc.1.csv',
 'results/raw/Liger_transfer_acc.2.csv',
 'results/raw/Liger_transfer_acc.3.csv',
 'results/raw/Liger_transfer_acc.4.csv',
 'results/raw/Liger_transfer_acc.5.csv',
 'results/raw/Liger_transfer_acc.6.csv',
 'results/raw/Liger_transfer_acc.7.csv',
 'results/raw/Liger_transfer_acc.8.csv',
 'results/raw/Seurat_acc.txt',
 'results/raw/SiaNN_transfer_acc.csv',
 'results/raw/bindSC_transfer_acc.csv']

In [17]:
method_id_list = []
for i in range(len(file_list)):
    method_id = file_list[i].split('/')[2].split('_')[0]
    if 'UMAP' in file_list[i]:
        method_id = '%s (UMAP)'%(method_id)        
    method_id_list.append(method_id)

pred_df = pd.read_csv(file_list[1],index_col=0)
pred_df

acc_ct_df = pd.DataFrame(np.zeros((len(file_list), pred_df.shape[0]+1)))

In [18]:
acc_ct_df.columns = ['method'] + pred_df['cell_type'].tolist()
acc_ct_df['method'] = method_id_list

In [19]:
acc_ct_df.columns[1:]

Index(['CD4 Naive', 'CD4 TCM', 'CD8 Naive', 'CD16 Mono', 'NK', 'Treg',
       'CD14 Mono', 'cDC', 'CD8 TEM_1', 'Intermediate B', 'Naive B', 'Plasma',
       'CD4 TEM', 'MAIT', 'Memory B', 'gdT', 'pDC', 'CD8 TEM_2', 'HSPC'],
      dtype='object')

In [20]:
for i in range(len(file_list)):
    if method_id_list[i] == 'Seurat':
        seurat = pd.read_csv(file_list[i],sep='\t')
        seurat = seurat.iloc[:,[0,3]]
        seurat.columns = ['cell_type', 'Seurat']
        seurat.index = seurat['cell_type']
        preded_ct = np.intersect1d(acc_ct_df.columns[1:],seurat.index)
        unpreded_ct = np.setdiff1d(acc_ct_df.columns[1:],seurat.index)
        #seurat.insert
        for ct in unpreded_ct:
            seurat.loc[-1] = [ct, 0.0]
            seurat.index = seurat.index[0:-1].tolist() + [ct]
        seurat = seurat.loc[acc_ct_df.columns[1:],:]
        assert np.all(acc_ct_df.columns[1:].to_numpy() == seurat['cell_type'].to_numpy())
        acc_ct_df.iloc[i,1:] = seurat.iloc[:,1].to_numpy()
    else:
        pred_df = pd.read_csv(file_list[i],index_col=0)
        assert np.all(acc_ct_df.columns[1:].to_numpy() == pred_df['cell_type'].to_numpy())
        acc_ct_df.iloc[i,1:] = pred_df.iloc[:,1].to_numpy()

In [21]:
acc_ct_df

Unnamed: 0,method,CD4 Naive,CD4 TCM,CD8 Naive,CD16 Mono,NK,Treg,CD14 Mono,cDC,CD8 TEM_1,Intermediate B,Naive B,Plasma,CD4 TEM,MAIT,Memory B,gdT,pDC,CD8 TEM_2,HSPC
0,Liger,0.902044,0.787641,0.91844,0.484436,0.897436,0.006173,0.940967,0.651515,0.452308,0.903683,0.056338,0.0,0.637584,0.29927,0.865229,0.19863,0.971698,0.751397,0.038462
1,Liger,0.853418,0.791123,0.946099,0.51751,0.940171,0.0,0.928876,0.681818,0.396923,0.810198,0.246479,0.0,0.496644,0.532847,0.849057,0.068493,0.981132,0.715084,0.115385
2,Liger,0.902748,0.725849,0.936879,0.422179,0.92735,0.246914,0.945946,0.641414,0.556923,0.872521,0.422535,0.0,0.614094,0.664234,0.838275,0.184932,0.971698,0.653631,0.0
3,Liger,0.406624,0.805048,0.98156,0.546693,0.895299,0.080247,0.94239,0.686869,0.467692,0.804533,0.267606,0.0,0.332215,0.532847,0.862534,0.308219,0.981132,0.731844,0.0
4,Liger,0.880197,0.759791,0.937589,0.533074,0.925214,0.0,0.920697,0.686869,0.483077,0.756374,0.330986,0.0,0.711409,0.686131,0.859838,0.246575,0.971698,0.77933,0.346154
5,Liger,0.921776,0.724978,0.679433,0.476654,0.897436,0.0,0.940967,0.540404,0.369231,0.798867,0.274648,0.0,0.278523,0.715328,0.838275,0.10274,0.981132,0.631285,0.0
6,Liger,0.877378,0.875544,0.941135,0.548638,0.897436,0.0,0.941323,0.686869,0.470769,0.76204,0.260563,0.0,0.234899,0.437956,0.854447,0.061644,0.981132,0.865922,0.0
7,Liger,0.847075,0.816362,0.949645,0.468872,0.929487,0.265432,0.91643,0.611111,0.452308,0.858357,0.098592,0.0,0.436242,0.613139,0.743935,0.273973,0.990566,0.695531,0.384615
8,Seurat,0.939,0.829,0.987,0.702,0.942,0.79,0.927,0.636,0.757,0.943,0.824,0.833,0.852,0.912,0.954,0.329,0.962,0.936,0.846
9,SiaNN,0.749824,0.823325,0.887943,0.77821,0.952991,0.080247,0.970484,0.722222,0.581538,0.844193,0.176056,0.555556,0.513423,0.693431,0.884097,0.424658,0.981132,0.843575,0.846154


In [22]:
file_list_2 = glob.glob('results/raw/*prediction*')
file_list_2 = sorted(file_list_2)
len(file_list_2)

11

In [23]:
method_id_list = []
for i in range(len(file_list_2)):
    method_id = file_list_2[i].split('/')[2].split('_')[0]
    if 'UMAP' in file_list_2[i]:
        method_id = '%s (UMAP)'%(method_id)        
    method_id_list.append(method_id)


acc_df = pd.DataFrame({'method':method_id_list,
                              'total_acc':0})

for i in range(len(file_list_2)):
    pred_df = pd.read_csv(file_list_2[i])
    acc_df.iloc[i,1] = np.mean(pred_df['cell_type'] == pred_df['prediction'])




In [24]:
acc_ct_df.insert(1, 'total', acc_df['total_acc'].to_numpy())
acc_ct_df

Unnamed: 0,method,total,CD4 Naive,CD4 TCM,CD8 Naive,CD16 Mono,NK,Treg,CD14 Mono,cDC,...,Intermediate B,Naive B,Plasma,CD4 TEM,MAIT,Memory B,gdT,pDC,CD8 TEM_2,HSPC
0,Liger,0.802247,0.902044,0.787641,0.91844,0.484436,0.897436,0.006173,0.940967,0.651515,...,0.903683,0.056338,0.0,0.637584,0.29927,0.865229,0.19863,0.971698,0.751397,0.038462
1,Liger,0.793892,0.853418,0.791123,0.946099,0.51751,0.940171,0.0,0.928876,0.681818,...,0.810198,0.246479,0.0,0.496644,0.532847,0.849057,0.068493,0.981132,0.715084,0.115385
2,Liger,0.807914,0.902748,0.725849,0.936879,0.422179,0.92735,0.246914,0.945946,0.641414,...,0.872521,0.422535,0.0,0.614094,0.664234,0.838275,0.184932,0.971698,0.653631,0.0
3,Liger,0.745486,0.406624,0.805048,0.98156,0.546693,0.895299,0.080247,0.94239,0.686869,...,0.804533,0.267606,0.0,0.332215,0.532847,0.862534,0.308219,0.981132,0.731844,0.0
4,Liger,0.806665,0.880197,0.759791,0.937589,0.533074,0.925214,0.0,0.920697,0.686869,...,0.756374,0.330986,0.0,0.711409,0.686131,0.859838,0.246575,0.971698,0.77933,0.346154
5,Liger,0.748655,0.921776,0.724978,0.679433,0.476654,0.897436,0.0,0.940967,0.540404,...,0.798867,0.274648,0.0,0.278523,0.715328,0.838275,0.10274,0.981132,0.631285,0.0
6,Liger,0.805993,0.877378,0.875544,0.941135,0.548638,0.897436,0.0,0.941323,0.686869,...,0.76204,0.260563,0.0,0.234899,0.437956,0.854447,0.061644,0.981132,0.865922,0.0
7,Liger,0.792739,0.847075,0.816362,0.949645,0.468872,0.929487,0.265432,0.91643,0.611111,...,0.858357,0.098592,0.0,0.436242,0.613139,0.743935,0.273973,0.990566,0.695531,0.384615
8,Seurat,0.892336,0.939,0.829,0.987,0.702,0.942,0.79,0.927,0.636,...,0.943,0.824,0.833,0.852,0.912,0.954,0.329,0.962,0.936,0.846
9,SiaNN,0.824145,0.749824,0.823325,0.887943,0.77821,0.952991,0.080247,0.970484,0.722222,...,0.844193,0.176056,0.555556,0.513423,0.693431,0.884097,0.424658,0.981132,0.843575,0.846154


In [25]:
acc_ct_df.to_csv('results/label_trasfer_acc.csv',index=False)