In [1]:
import argparse
from utils import prepare_save_dir
from STELLAR import STELLAR
import numpy as np
import torch
import pandas as pd
import anndata
import scanpy as sc
import pickle
import sys
sys.path.append("../")
from datasets import GraphDataset, load_tonsilbe_data, load_hubmap_data
from sklearn.metrics import f1_score, accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
parser = argparse.ArgumentParser(description='STELLAR')
parser.add_argument('--dataset', default='TonsilBE', help='dataset setting')
parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)')
parser.add_argument('--name', type=str, default='STELLAR')
parser.add_argument('--epochs', type=int, default=50)
parser.add_argument('--lr', type=float, default=1e-3) # learning rate
parser.add_argument('--wd', type=float, default=5e-2) # weight decay
parser.add_argument('--num-heads', type=int, default=13)
parser.add_argument('--num-seed-class', type=int, default=3)
parser.add_argument('--sample-rate', type=float, default=0.5) # downsample dataset by using 50% of cells
parser.add_argument('-b', '--batch-size', default=1, type=int,
                metavar='N', help='mini-batch size')
parser.add_argument('--distance_thres', default=50, type=int)# distance threshold for constructing the graph
parser.add_argument('--savedir', type=str, default='./') # output directory

parser.add_argument('--use-processed-graph', type=bool, default=False) # whether to use already preprocessed graph or construct the graph 

_StoreAction(option_strings=['--use-processed-graph'], dest='use_processed_graph', nargs=None, const=None, default=False, type=<class 'bool'>, choices=None, help=None, metavar=None)

In [3]:
args = parser.parse_args(args=[])
args.cuda = torch.cuda.is_available()
args.device = torch.device("cuda" if args.cuda else "cpu")

In [4]:
df = pd.read_csv('./data/BE_Tonsil_l3_dryad.csv')
train_df = df.loc[df['sample_name'] == 'tonsil']
test_df = df.loc[df['sample_name'] == 'Barretts Esophagus']
train_y = train_df['cell_type'].str.lower()
test_y = test_df['cell_type'].str.lower()
cell_types = np.sort(list(set(test_y))).tolist()
cell_type_dict = {}
for i, cell_type in enumerate(cell_types):
    cell_type_dict[cell_type] = i

In [5]:
df = pd.read_csv('./data/BE_Tonsil_l3_dryad.csv')
train_df = df.loc[df['sample_name'] == 'tonsil']
test_df = df.loc[df['sample_name'] == 'Barretts Esophagus']
train_y = train_df['cell_type'].str.lower()
test_y = test_df['cell_type'].str.lower()
cell_types_train = np.sort(list(set(train_y))).tolist()
class_train = [i for i in range(len(cell_types_train))]
cell_type_dict_train = {}
inverse_dict_train = {}
cell_types_test = np.sort(list(set(test_y))).tolist()
cell_type_dict_test = {}
inverse_dict_test = {}
for i, cell_type in enumerate(cell_types_train):
    cell_type_dict_train[cell_type] = i
    inverse_dict_train[i] = cell_type
for i, cell_type in enumerate(cell_types_test):
    cell_type_dict_test[cell_type] = i
    inverse_dict_test[i] = cell_type

In [6]:
cell_types_train

['b',
 'endothelial',
 'innate',
 'nerve',
 'pdpn',
 'plasma',
 'smoothmuscle',
 'squamous_epithelial',
 'stroma',
 't']

In [7]:
inverse_dict_train, inverse_dict_test

({0: 'b',
  1: 'endothelial',
  2: 'innate',
  3: 'nerve',
  4: 'pdpn',
  5: 'plasma',
  6: 'smoothmuscle',
  7: 'squamous_epithelial',
  8: 'stroma',
  9: 't'},
 {0: 'endothelial',
  1: 'glandular_epi',
  2: 'innate',
  3: 'nerve',
  4: 'paneth',
  5: 'pdpn',
  6: 'plasma',
  7: 'secretory_epithelial',
  8: 'smoothmuscle',
  9: 'squamous_epithelial',
  10: 'stroma',
  11: 't'})

In [8]:
array_loaded = np.load('/Users/yunseokj/stellar/experiments/STELLAR_run/TonsilBE_STELLAR/TonsilBE_results_proper_distance50_batch32_wd5e.npy')

In [9]:
pred_label = []
for i,j in enumerate(array_loaded):
    if j not in class_train:
        pred_label.append('novel')
    else:
        pred_label.append(inverse_dict_train[j])
# final = np.array([test_y,pred_label]).T
# print(len(final))
# final = np.delete(final,np.where(final == 'novel')[0],axis = 0)
# final = np.delete(final,np.where(~((final == 'glandular_epi') | (final == 'secretory_epithelial') | (final == 'paneth')))[0],axis = 0)
# row_idx, col_idx = np.where(~((final == 'glandular_epi') | (final == 'secretory_epithelial') | (final == 'paneth')))
# print(len(final))


In [10]:
test_y

173968         innate
173969    endothelial
173970    endothelial
173971    endothelial
173972    endothelial
             ...     
219921         innate
219922         innate
219923         innate
219924         innate
219925         innate
Name: cell_type, Length: 45958, dtype: object

In [11]:
pred_label

['endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'innate',
 'stroma',
 'innate',
 'endothelial',
 'endothelial',
 'innate',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'innate',
 'innate',
 'endothelial',
 'endothelial',
 'novel',
 'endothelial',
 'endothelial',
 'endothelial',
 'innate',
 'endothelial',
 'novel',
 'endothelial',
 'endothelial',
 'endothelial',
 'novel',
 'endothelial',
 'endothelial',
 'innate',
 'endothelial',
 'innate',
 'endothelial',
 'innate',
 'innate',
 'endothelial',
 'endothelial',
 'stroma',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'innate',
 'innate',
 'innate',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'innate',
 'endothelial',
 'endothelial',
 'innate',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',
 'endothelial',

In [12]:
import pandas as pd

In [13]:
df = pd.DataFrame({'label': test_y, 'pred': pred_label})

In [14]:
df.head()


Unnamed: 0,label,pred
173968,innate,endothelial
173969,endothelial,endothelial
173970,endothelial,endothelial
173971,endothelial,endothelial
173972,endothelial,endothelial


In [15]:
def preprocess_results_matrix(result_df):
    novel_cell_types = ['glandular_epi', 'secretory_epithelial', 'paneth']
    
    # drop novel cell types from the results df
    result_df_known = result_df.loc[~result_df['label'].isin(novel_cell_types)]

    return result_df_known



In [16]:
df_known = preprocess_results_matrix(df)

In [17]:
df_known['label'].value_counts()

smoothmuscle           9023
endothelial            6181
innate                 4282
stroma                 4218
nerve                  2047
t                      1416
plasma                 1177
squamous_epithelial    1077
pdpn                    914
Name: label, dtype: int64

In [18]:
from sklearn.metrics import precision_recall_fscore_support, classification_report
# metrics = precision_recall_fscore_support(final[:,0],final[:,1],average = 'weighted')
classification = classification_report(df_known['label'], df_known['pred'])
print(classification)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

                  b       0.00      0.00      0.00         0
        endothelial       0.38      0.96      0.54      6181
             innate       0.97      0.89      0.93      4282
              nerve       0.91      0.94      0.93      2047
              novel       0.00      0.00      0.00         0
               pdpn       0.98      0.65      0.78       914
             plasma       0.95      0.95      0.95      1177
       smoothmuscle       0.00      0.00      0.00      9023
squamous_epithelial       0.99      0.56      0.72      1077
             stroma       0.86      0.85      0.85      4218
                  t       0.97      0.93      0.95      1416

           accuracy                           0.62     30335
          macro avg       0.64      0.61      0.60     30335
       weighted avg       0.54      0.62      0.55     30335



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
a = [0.00,
0.90, 
0.77, 
0.03, 
0.00,    
0.84,  
0.95, 
0.96, 
0.70, 
0.86, 
0.95]

In [20]:
b = [
0.90, 
0.77, 
0.03,     
0.84,  
0.95, 
0.96, 
0.70, 
0.86, 
0.95]

In [21]:
sum(a) / len(a)

0.6327272727272728

In [22]:
sum(b) / len(b)

0.7733333333333334

In [23]:
classification

'                     precision    recall  f1-score   support\n\n                  b       0.00      0.00      0.00         0\n        endothelial       0.38      0.96      0.54      6181\n             innate       0.97      0.89      0.93      4282\n              nerve       0.91      0.94      0.93      2047\n              novel       0.00      0.00      0.00         0\n               pdpn       0.98      0.65      0.78       914\n             plasma       0.95      0.95      0.95      1177\n       smoothmuscle       0.00      0.00      0.00      9023\nsquamous_epithelial       0.99      0.56      0.72      1077\n             stroma       0.86      0.85      0.85      4218\n                  t       0.97      0.93      0.95      1416\n\n           accuracy                           0.62     30335\n          macro avg       0.64      0.61      0.60     30335\n       weighted avg       0.54      0.62      0.55     30335\n'

In [24]:
print(classification)

                     precision    recall  f1-score   support

                  b       0.00      0.00      0.00         0
        endothelial       0.38      0.96      0.54      6181
             innate       0.97      0.89      0.93      4282
              nerve       0.91      0.94      0.93      2047
              novel       0.00      0.00      0.00         0
               pdpn       0.98      0.65      0.78       914
             plasma       0.95      0.95      0.95      1177
       smoothmuscle       0.00      0.00      0.00      9023
squamous_epithelial       0.99      0.56      0.72      1077
             stroma       0.86      0.85      0.85      4218
                  t       0.97      0.93      0.95      1416

           accuracy                           0.62     30335
          macro avg       0.64      0.61      0.60     30335
       weighted avg       0.54      0.62      0.55     30335

