# Feed Forward Neural Network 

In [1]:
import sys  
sys.path.insert(0, '../scripts')

In [2]:
dataset_train = '../dataset/68kPBMC_processed.h5ad'
dataset_test = '../dataset/smartseq2.h5ad'

In [3]:
import pickle
from utils import *

In [4]:
import scanpy as sc
import os
from numpy.random import seed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn import metrics

In [5]:
adata_train, adata_test = train_test_split(sc.read(dataset_train), train_frac=0.80)
# adata_test=sc.read(it_clust_dataset_test)

In [6]:
len(adata_train.to_df())

54840

In [7]:
print("Starting preprocessing...")
train_dic = preprocess(adata_train, min_cells=0,min_genes=0, get_hvgs = True, scale_and_hvgs = True, calculate_hvg_and_log1p = False)
test_dic = preprocess(adata_test, min_cells=0, min_genes=0)

Starting preprocessing...


  view_to_actual(adata)


In [8]:
col= [i for i in train_dic['hvg'].index]
print(col)

['AL627309.1', 'RP11-54O7.1', 'RP11-54O7.17', 'HES4', 'ISG15', 'TNFRSF18', 'C1orf233', 'C1orf222', 'GABRD', 'PLCH2', 'RP3-395M20.7', 'LINC01134', 'NPHP4', 'SLC2A7', 'RBP7', 'KIF1B', 'AGTRAP', 'MTHFR', 'NPPA-AS1', 'KIAA2013', 'PLOD1', 'TNFRSF1B', 'EFHD2', 'CTRC', 'PLEKHM2', 'EPHA2', 'PADI2', 'PADI4', 'ARHGEF10L', 'ALDH4A1', 'RP1-43E13.2', 'CDA', 'RP3-329E20.2', 'HSPG2', 'C1QA', 'C1QC', 'C1QB', 'EPHB2', 'RP5-1057J7.6', 'E2F2', 'ID3', 'CLIC4', 'RUNX3', 'PAQR7', 'EXTL1', 'ZNF593', 'ZNF683', 'FAM46B', 'SLC9A1', 'FGR', 'IFI6', 'PTAFR', 'RP5-1092A3.5', 'RP11-442N24__B.1', 'SDC3', 'ZCCHC17', 'SERINC2', 'RP11-73M7.9', 'SPOCD1', 'KIAA1522', 'PHC2', 'TFAP2E', 'EVA1B', 'CSF3R', 'LINC01137', 'ZC3H12A', 'MYCL', 'MFSD2A', 'KCNQ4', 'KDM4A-AS1', 'CCDC24', 'SLC6A9', 'HPDL', 'MAST2', 'LURAP1', 'FAAH', 'LINC00853', 'PDZK1IP1', 'TAL1', 'PODN', 'SLC1A7', 'RP11-117D22.2', 'AL161915.1', 'C1orf177', 'RP11-90C4.1', 'JUN', 'EFCAB7', 'CACHD1', 'AK4', 'WLS', 'RP4-677H15.4', 'ST6GALNAC3', 'NEXN-AS1', 'NEXN', 'RP5-8

In [9]:
train_adata_pp =  train_dic['data']
print(train_dic['hvg'])
test_adata_pp =  test_dic['data'][:, intersection(col, test_dic['data'].var.index)]
train_adata_pp = train_dic['data'][:, intersection(col, train_dic['data'].var.index)]

AL627309.1      True
RP11-54O7.1     True
RP11-54O7.17    True
HES4            True
ISG15           True
                ... 
AP001469.9      True
S100B           True
MT-ND2          True
MT-CO1          True
AC011043.1      True
Name: highly_variable, Length: 2000, dtype: bool


In [10]:
train_df = train_adata_pp.to_df()
test_df = test_adata_pp.to_df()

In [11]:
## taking common genes
print("Taking common genes...")
final_columns = list(set(train_df.columns).intersection(set(test_df.columns)))
print('Common columns', len(final_columns))
final_columns = [i for i in final_columns if i != 'celltype'] 
train_df = train_df[final_columns]
test_df = test_df[final_columns]

Taking common genes...
Common columns 2000


In [12]:
y_train = train_adata_pp.obs.celltype.to_list()
y_test = test_adata_pp.obs.celltype.to_list()

In [13]:
print(len(y_train))
print(len(y_test))

54840
13711


In [14]:
X_train = train_df.to_numpy()
X_test = test_df.to_numpy()

In [15]:
X_test.shape

(13711, 2000)

In [16]:
mapped_data = get_data_mapping(X_train, y_train)

In [17]:
for i in mapped_data:
    print(i, len(mapped_data[i]))

CD56+ NK 4633
CD14+ Monocyte 2628
CD8+ Cytotoxic T 9177
CD4+/CD25 T Reg 11370
CD8+/CD45RA+ Naive Cytotoxic 17561
CD4+/CD45RA+/CD25- Naive T 2220
Dendritic 1508
CD19+ B 3031
CD4+/CD45RO+ Memory 2490
CD34+ 204
CD4+ T Helper2 18


In [18]:
mapping = get_mapping(y_train)

In [19]:
y_test_lab = convert_y_to_mapping(y_test, mapping)
y_test_lab = np.array(y_test_lab)

In [20]:
y_train_lab = convert_y_to_mapping(y_train, mapping)
y_train_lab = np.array(y_train_lab)

In [21]:
from sklearn.neural_network import MLPClassifier
train_model(MLPClassifier(random_state= 3, max_iter=300, verbose=True), X_train, y_train, X_test, y_test, mapping)

Iteration 1, loss = 1.93357332
Iteration 2, loss = 1.69377945
Iteration 3, loss = 1.55791618
Iteration 4, loss = 1.42126229
Iteration 5, loss = 1.32648970
Iteration 6, loss = 1.24927596
Iteration 7, loss = 1.17741226
Iteration 8, loss = 1.10668093
Iteration 9, loss = 1.04889214
Iteration 10, loss = 1.00753676
Iteration 11, loss = 0.96955319
Iteration 12, loss = 0.92648782
Iteration 13, loss = 0.89226175
Iteration 14, loss = 0.85378911
Iteration 15, loss = 0.83647518
Iteration 16, loss = 0.80013147
Iteration 17, loss = 0.77812291
Iteration 18, loss = 0.75559933
Iteration 19, loss = 0.72709935
Iteration 20, loss = 0.70131031
Iteration 21, loss = 0.68778612
Iteration 22, loss = 0.66340083
Iteration 23, loss = 0.66051341
Iteration 24, loss = 0.64440788
Iteration 25, loss = 0.63124317
Iteration 26, loss = 0.60950150
Iteration 27, loss = 0.59863304
Iteration 28, loss = 0.58005391
Iteration 29, loss = 0.56163442
Iteration 30, loss = 0.55304324
Iteration 31, loss = 0.54059123
Iteration 32, los

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                              precision    recall  f1-score   support

              CD4+ T Helper2       0.88      0.72      0.79       678
         CD4+/CD45RO+ Memory       0.95      0.15      0.25       785
                       CD34+       0.97      0.87      0.92        45
CD8+/CD45RA+ Naive Cytotoxic       0.00      0.00      0.00         1
              CD14+ Monocyte       0.51      0.71      0.60      2740
                     CD19+ B       0.20      0.67      0.30       571
  CD4+/CD45RA+/CD25- Naive T       0.46      0.37      0.41       635
             CD4+/CD25 T Reg       0.44      0.29      0.35      1226
            CD8+ Cytotoxic T       0.65      0.40      0.50      2268
                   Dendritic       0.70      0.73      0.72      4407
                    CD56+ NK       0.00      0.00      0.00       355

                    accuracy                           0.56     13711
                   macro avg       0.52      0.44      0.44     13711
                we

  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
calculate_freq(y_test)

{'Dendritic': 355,
 'CD8+/CD45RA+ Naive Cytotoxic': 4407,
 'CD14+ Monocyte': 678,
 'CD19+ B': 785,
 'CD4+/CD25 T Reg': 2740,
 'CD4+/CD45RA+/CD25- Naive T': 571,
 'CD56+ NK': 1226,
 'CD8+ Cytotoxic T': 2268,
 'CD4+/CD45RO+ Memory': 635,
 'CD34+': 45,
 'CD4+ T Helper2': 1}