In [2]:
import argparse
from utils import prepare_save_dir
# from STELLAR import STELLAR
import numpy as np
import torch
import pandas as pd
import anndata
import scanpy as sc
import pickle
import sys
sys.path.append("../")
# from datasets import GraphDataset
from sklearn.metrics import f1_score, accuracy_score

In [3]:
parser = argparse.ArgumentParser(description='STELLAR')
parser.add_argument('--dataset', default='TonsilBE', help='dataset setting')
parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)')
parser.add_argument('--name', type=str, default='STELLAR')
parser.add_argument('--epochs', type=int, default=50)
parser.add_argument('--lr', type=float, default=1e-3) # learning rate
parser.add_argument('--wd', type=float, default=5e-2) # weight decay
parser.add_argument('--num-heads', type=int, default=13)
parser.add_argument('--num-seed-class', type=int, default=3)
parser.add_argument('--sample-rate', type=float, default=0.5) # downsample dataset by using 50% of cells
parser.add_argument('-b', '--batch-size', default=1, type=int,
                metavar='N', help='mini-batch size')
parser.add_argument('--distance_thres', default=50, type=int)# distance threshold for constructing the graph
parser.add_argument('--savedir', type=str, default='./') # output directory

parser.add_argument('--use-processed-graph', type=bool, default=False) # whether to use already preprocessed graph or construct the graph 

_StoreAction(option_strings=['--use-processed-graph'], dest='use_processed_graph', nargs=None, const=None, default=False, type=<class 'bool'>, choices=None, help=None, metavar=None)

In [4]:
args = parser.parse_args(args=[])
args.cuda = torch.cuda.is_available()
args.device = torch.device("cuda" if args.cuda else "cpu")

In [5]:
df = pd.read_csv('./data/region1_2_healthy_mouse_lymph_simplified.csv')
train_df = df.loc[df['region'] == 1]
test_df = df.loc[df['region'] == 2]
train_y = train_df['cluster'].str.lower()
test_y = test_df['cluster'].str.lower()
cell_types = np.sort(list(set(test_y))).tolist()
cell_type_dict = {}
for i, cell_type in enumerate(cell_types):
    cell_type_dict[cell_type] = i

In [6]:
df = pd.read_csv('./data/region1_2_healthy_mouse_lymph_simplified.csv')
train_df = df.loc[df['region'] == 1]
test_df = df.loc[df['region'] == 2]
train_y = train_df['cluster'].str.lower()
test_y = test_df['cluster'].str.lower()
cell_types_train = np.sort(list(set(train_y))).tolist()
class_train = [i for i in range(len(cell_types_train))]
cell_type_dict_train = {}
inverse_dict_train = {}
cell_types_test = np.sort(list(set(test_y))).tolist()
cell_type_dict_test = {}
inverse_dict_test = {}
for i, cell_type in enumerate(cell_types_train):
    cell_type_dict_train[cell_type] = i
    inverse_dict_train[i] = cell_type
for i, cell_type in enumerate(cell_types_test):
    cell_type_dict_test[cell_type] = i
    inverse_dict_test[i] = cell_type

In [7]:
cell_types_train

['b-cell',
 'cd206+ lymphatic vessels',
 'cortical sinuses',
 'dc',
 'endothelial vessels',
 'eosinophils',
 'macrophage',
 'neutrophils',
 'nk cells',
 't-cell',
 'undefined']

In [8]:
inverse_dict_train, inverse_dict_test

({0: 'b-cell',
  1: 'cd206+ lymphatic vessels',
  2: 'cortical sinuses',
  3: 'dc',
  4: 'endothelial vessels',
  5: 'eosinophils',
  6: 'macrophage',
  7: 'neutrophils',
  8: 'nk cells',
  9: 't-cell',
  10: 'undefined'},
 {0: 'b-cell',
  1: 'dc',
  2: 'endothelial vessels',
  3: 'eosinophils',
  4: 'lyve1+ lymphatic vessels',
  5: 'macrophage',
  6: 'monocytes',
  7: 'neutrophils',
  8: 'nk cells',
  9: 't-cell',
  10: 'unannotated'})

In [10]:
array_loaded = np.load('./experiments/run/MouseLymph_epoch_320_batch_32_wd_1e-05_results.npy')

In [11]:
pred_label = []
for i,j in enumerate(array_loaded):
    if j not in class_train:
        pred_label.append('novel')
    else:
        pred_label.append(inverse_dict_train[j])
# final = np.array([test_y,pred_label]).T
# print(len(final))
# final = np.delete(final,np.where(final == 'novel')[0],axis = 0)
# final = np.delete(final,np.where(~((final == 'glandular_epi') | (final == 'secretory_epithelial') | (final == 'paneth')))[0],axis = 0)
# row_idx, col_idx = np.where(~((final == 'glandular_epi') | (final == 'secretory_epithelial') | (final == 'paneth')))
# print(len(final))


In [12]:
test_y

23544    macrophage
23545    macrophage
23546        b-cell
23547    macrophage
23548        b-cell
            ...    
41136    macrophage
41137    macrophage
41138    macrophage
41139    macrophage
41140    macrophage
Name: cluster, Length: 17597, dtype: object

In [24]:
pred_label[:10]

['macrophage',
 'macrophage',
 'b-cell',
 'macrophage',
 'undefined',
 'b-cell',
 'b-cell',
 'b-cell',
 'macrophage',
 'b-cell']

In [15]:
import pandas as pd

In [17]:
df = pd.DataFrame({'label': test_y, 'pred': pred_label})

In [18]:
df.head()


Unnamed: 0,label,pred
23544,macrophage,macrophage
23545,macrophage,macrophage
23546,b-cell,b-cell
23547,macrophage,macrophage
23548,b-cell,undefined


In [19]:
def preprocess_results_matrix(result_df):
    # novel_cell_types = ['glandular_epi', 'secretory_epithelial', 'paneth']
    novel_cell_types = ['undefined']
    # drop novel cell types from the results df
    result_df_known = result_df.loc[~result_df['label'].isin(novel_cell_types)]

    return result_df_known



In [20]:
df_known = preprocess_results_matrix(df)

In [21]:
df_known['label'].value_counts()

t-cell                      5919
b-cell                      5660
macrophage                  1988
dc                          1591
endothelial vessels         1344
unannotated                  471
lyve1+ lymphatic vessels     401
nk cells                      79
monocytes                     64
eosinophils                   40
neutrophils                   40
Name: label, dtype: int64

In [22]:
df_known['pred'].value_counts()

t-cell                      5967
b-cell                      5218
macrophage                  2210
dc                          1454
endothelial vessels          912
undefined                    748
cortical sinuses             653
eosinophils                  153
cd206+ lymphatic vessels     115
nk cells                      86
neutrophils                   81
Name: pred, dtype: int64

In [23]:
from sklearn.metrics import precision_recall_fscore_support, classification_report
# metrics = precision_recall_fscore_support(final[:,0],final[:,1],average = 'weighted')
classification = classification_report(df_known['label'], df_known['pred'])
print(classification)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

                  b-cell       0.93      0.86      0.89      5660
cd206+ lymphatic vessels       0.00      0.00      0.00         0
        cortical sinuses       0.00      0.00      0.00         0
                      dc       0.69      0.63      0.66      1591
     endothelial vessels       0.77      0.52      0.62      1344
             eosinophils       0.25      0.97      0.40        40
lyve1+ lymphatic vessels       0.00      0.00      0.00       401
              macrophage       0.68      0.76      0.72      1988
               monocytes       0.00      0.00      0.00        64
             neutrophils       0.46      0.93      0.61        40
                nk cells       0.78      0.85      0.81        79
                  t-cell       0.87      0.88      0.88      5919
             unannotated       0.00      0.00      0.00       471
               undefined       0.00      0.00      0.00         0

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
classification = classification_report(df['label'], df['pred'])
print(classification)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                          precision    recall  f1-score   support

                  b-cell       0.91      0.88      0.89      5660
cd206+ lymphatic vessels       0.00      0.00      0.00         0
        cortical sinuses       0.00      0.00      0.00         0
                      dc       0.71      0.63      0.67      1591
     endothelial vessels       0.79      0.52      0.63      1344
             eosinophils       0.31      1.00      0.48        40
lyve1+ lymphatic vessels       0.00      0.00      0.00       401
              macrophage       0.70      0.76      0.73      1988
               monocytes       0.00      0.00      0.00        64
             neutrophils       0.55      0.93      0.69        40
                nk cells       0.77      0.84      0.80        79
                  t-cell       0.87      0.87      0.87      5919
             unannotated       0.00      0.00      0.00       471
               undefined       0.00      0.00      0.00         0

        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
a = [0.00,
0.90, 
0.77, 
0.03, 
0.00,    
0.84,  
0.95, 
0.96, 
0.70, 
0.86, 
0.95]

In [20]:
b = [
0.90, 
0.77, 
0.03,     
0.84,  
0.95, 
0.96, 
0.70, 
0.86, 
0.95]

In [21]:
sum(a) / len(a)

0.6327272727272728

In [22]:
sum(b) / len(b)

0.7733333333333334

In [23]:
classification

'                     precision    recall  f1-score   support\n\n                  b       0.00      0.00      0.00         0\n        endothelial       0.38      0.96      0.54      6181\n             innate       0.97      0.89      0.93      4282\n              nerve       0.91      0.94      0.93      2047\n              novel       0.00      0.00      0.00         0\n               pdpn       0.98      0.65      0.78       914\n             plasma       0.95      0.95      0.95      1177\n       smoothmuscle       0.00      0.00      0.00      9023\nsquamous_epithelial       0.99      0.56      0.72      1077\n             stroma       0.86      0.85      0.85      4218\n                  t       0.97      0.93      0.95      1416\n\n           accuracy                           0.62     30335\n          macro avg       0.64      0.61      0.60     30335\n       weighted avg       0.54      0.62      0.55     30335\n'

In [24]:
print(classification)

                     precision    recall  f1-score   support

                  b       0.00      0.00      0.00         0
        endothelial       0.38      0.96      0.54      6181
             innate       0.97      0.89      0.93      4282
              nerve       0.91      0.94      0.93      2047
              novel       0.00      0.00      0.00         0
               pdpn       0.98      0.65      0.78       914
             plasma       0.95      0.95      0.95      1177
       smoothmuscle       0.00      0.00      0.00      9023
squamous_epithelial       0.99      0.56      0.72      1077
             stroma       0.86      0.85      0.85      4218
                  t       0.97      0.93      0.95      1416

           accuracy                           0.62     30335
          macro avg       0.64      0.61      0.60     30335
       weighted avg       0.54      0.62      0.55     30335

