In [421]:
'''
If you run this script, it will generate 41 models on working directory.
'''

'\nIf you run this script, it will generate 41 models on working directory.\n'

In [422]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import pandas as pd
import pyreadr
import joblib
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve
import warnings

warnings.filterwarnings("ignore")

In [423]:
#CU gene expression and metadata
result = pyreadr.read_r('gene_exp.rds')
gene_exp = result[None]
result = pyreadr.read_r('metadata_patient.rds') 
cu_meta = result[None]
#common gene list
result = pyreadr.read_r('gene_combined.rds')
gene_combined = result[None]
#mean and sd of negative group in CU
result = pyreadr.read_r('tpm_negative_mean.rds') 
tpm_negative_mean = result[None]
result = pyreadr.read_r('tpm_negative_sd.rds') 
tpm_negative_sd = result[None]

#Broad gene expression and metadata, cpm normalized
broad_meta = pd.read_csv('sampleLabels-severe-mild_broad.txt', header = 0, sep = '\t')
result = pyreadr.read_r('cpm_broad.rds')
broad_gene_exp = result[None]



#broad_negative_mean, broad_negative_sd
result = pyreadr.read_r('broad_negative_mean.rds')
broad_negative_mean = result[None]
result = pyreadr.read_r('broad_negative_sd.rds')
broad_negative_sd = result[None]

#uw gene expression and metadata, tpm normalized
result = pyreadr.read_r('uw_gene_exp.rds')
uw_gene_exp = result[None]
result = pyreadr.read_r('uw_meta.rds')
uw_meta = result[None]

#uw negative mean and sd
result = pyreadr.read_r('uw_negative_mean.rds')
uw_negative_mean = result[None]
result = pyreadr.read_r('uw_negative_sd.rds')
uw_negative_sd = result[None]

In [424]:
negative_samples = cu_meta[cu_meta['diagnosis'] == 'Negative']
broad_negative_samples = broad_meta[broad_meta['group'] == 'Negative']
uw_negative_samples = uw_meta[uw_meta['group'] == 'Negative']

negative_sample_ids = negative_samples['sample.id'].values
broad_negative_sample_ids = broad_negative_samples['sample'].values
uw_negative_sample_ids = uw_negative_samples['sample'].values

gene_exp_filtered = gene_exp.drop(columns=negative_sample_ids)
broad_gene_exp_filtered = broad_gene_exp.drop(columns = broad_negative_sample_ids)
uw_gene_exp_filtered = uw_gene_exp.drop(columns = uw_negative_sample_ids)

In [425]:
gene_exp = (gene_exp_filtered - np.array(tpm_negative_mean))/np.array(tpm_negative_sd)
broad_gene_exp = (broad_gene_exp_filtered - np.array(broad_negative_mean))/np.array(broad_negative_sd)
uw_gene_exp = (uw_gene_exp_filtered - np.array(uw_negative_mean))/np.array(uw_negative_sd)

In [426]:
#find the common gene

common_rows = gene_exp.index.intersection(gene_combined.iloc[:, 0])
common_rows = common_rows.intersection(broad_gene_exp.index)
common_rows = common_rows.intersection(uw_gene_exp.index)

gene_exp_filtered = gene_exp.loc[common_rows]
broad_gene_exp_filtered = broad_gene_exp.loc[common_rows]
uw_gene_exp_filtered = uw_gene_exp.loc[common_rows]


In [404]:
'''
merged each meta data with its corresponding diagnosis
'''
merged = cu_meta.merge(gene_exp_filtered.T, left_on='sample.id', right_index=True)
merged_broad = broad_meta.merge(broad_gene_exp_filtered.T, left_on='sample', right_index=True)
merged_uw = uw_meta.merge(uw_gene_exp_filtered.T, left_on = 'sample', right_index= True)
def average_range(s):
    if isinstance(s, str) and '-' in s:
        split_s = s.split('-')
        return (float(split_s[0]) + float(split_s[1])) / 2
    else:
        return float(s)

merged.iloc[:, 2] = merged.iloc[:, 2].apply(average_range)
merged.iloc[:, 2] = merged.iloc[:, 2].fillna(merged.iloc[:, 2].mean())

In [405]:
#remove unnecessary columns 
columns_to_remove = ['sample', 'batch', 'cohort', 'group', 'sex', 'viral.load', 'immune.frac', 'monocyte.neutrophil']
merged_uw = merged_uw.drop(columns=columns_to_remove)
merged_broad = merged_broad.drop(columns=columns_to_remove)
merged_uw = merged_uw.dropna(subset=[merged_uw.columns[0]])
merged_broad = merged_broad.dropna(subset=[merged_broad.columns[0]])

test_set = pd.concat([merged_uw, merged_broad], ignore_index=True)
#remove those do not have a diagnosis.
test_set = test_set.dropna(subset=[test_set.columns[0]])
merged = merged.dropna(subset=[merged.columns[10]])

In [428]:
#test if the age is valid
merged.iloc[:, 2]

0      69.0
1      84.0
2      69.0
3      65.0
4      61.0
       ... 
373    25.0
377    21.0
379    20.0
380    19.0
394     3.0
Name: age, Length: 247, dtype: float64

In [429]:
selected_columns = merged.iloc[:, 12:]
abs_values = selected_columns.abs()
average_abs_values = abs_values.mean()
ranked_average_abs_values = average_abs_values.sort_values(ascending=False)

In [430]:
#test if I did the feature selection ranking correctly
ranked_average_abs_values.index

Index(['ADM', 'C1QC', 'MAFF', 'C1QA', 'UGT2A2', 'PCSK5', 'CDKN1A', 'POLG2',
       'CALB2', 'IFRD1',
       ...
       'PGAM4', 'MRGPRD', 'ALDOB', 'ZNF729', 'NPIPB7', 'IL21', 'GPR21',
       'HSFX3', 'GREM1', 'NAT8L'],
      dtype='object', length=16769)

In [None]:
# t test for each individual groups, decides gene ranking.
'''
divide test sets, currently working on.
'''

In [417]:
# training set features, which is CU data
merged[ranked_average_abs_values.index[:].values].head(10)

Unnamed: 0,ADM,C1QC,MAFF,C1QA,UGT2A2,PCSK5,CDKN1A,POLG2,CALB2,IFRD1,...,PGAM4,MRGPRD,ALDOB,ZNF729,NPIPB7,IL21,GPR21,HSFX3,GREM1,NAT8L
0,0.276546,2.62625,1.36005,2.569217,1.128772,1.497387,0.754544,0.724857,0.646,0.058803,...,-0.270068,2.475687,0.089797,0.221081,-0.268207,-0.227223,-0.184736,-0.281767,-0.114647,-0.186995
1,1.21557,1.756729,0.686204,1.538383,4.045701,0.453945,1.921785,0.674481,0.497058,1.000116,...,-0.270068,-0.342867,-0.338212,-0.335045,-0.268207,-0.227223,-0.184736,-0.281767,-0.114647,-0.186995
2,0.985007,1.65337,1.608142,1.697204,0.563492,1.633918,0.0247,-0.817963,1.429505,0.693123,...,-0.270068,-0.342867,-0.338212,5.097917,-0.268207,2.055352,-0.184736,-0.281767,-0.114647,-0.186995
3,-0.246479,2.719658,-0.962831,2.435366,0.78517,-0.358566,0.149126,-0.436889,0.039628,-0.558979,...,-0.270068,-0.342867,-0.338212,-0.335045,0.383004,2.251069,-0.184736,-0.281767,-0.114647,-0.186995
4,1.228187,-0.22933,1.873999,0.389895,-1.021106,0.167216,0.113133,-0.561348,-0.757904,0.912762,...,-0.270068,-0.342867,-0.338212,-0.335045,-0.268207,-0.227223,-0.184736,-0.281767,-0.114647,-0.186995
5,-1.034742,1.49859,2.294009,0.991459,-0.171434,2.193169,-0.328368,1.898371,1.579794,0.301865,...,-0.270068,-0.342867,4.201808,0.736237,-0.268207,-0.227223,-0.184736,-0.281767,-0.114647,-0.186995
6,0.064729,0.391373,-0.834157,0.144134,0.279558,-0.677354,0.34447,-0.08753,-0.757904,-0.656192,...,-0.270068,-0.342867,-0.338212,-0.335045,-0.268207,-0.227223,-0.184736,-0.281767,-0.114647,-0.186995
7,1.180479,1.936716,1.415578,1.01078,-0.285508,1.436415,1.703089,-0.394174,0.49854,0.518835,...,-0.270068,0.765959,-0.338212,0.55813,-0.268207,-0.227223,-0.184736,-0.281767,-0.114647,-0.186995
9,0.647595,0.777555,-0.841302,0.749844,-0.590974,-0.663757,0.085756,-0.302965,-0.757904,0.223048,...,-0.270068,-0.342867,3.286919,-0.335045,0.976582,-0.227223,-0.184736,-0.281767,-0.114647,-0.186995
10,-0.170295,0.770293,0.506925,0.964098,0.979285,1.12245,-0.824242,-0.238914,1.046889,-0.975287,...,1.760617,-0.342867,-0.338212,-0.168897,-0.268207,2.193853,-0.184736,-0.281767,-0.114647,-0.186995


In [415]:
# broad test set features
merged_broad[ranked_average_abs_values.index[:].values].head(10)

Unnamed: 0,ADM,C1QC,MAFF,C1QA,UGT2A2,PCSK5,CDKN1A,POLG2,CALB2,IFRD1,...,PGAM4,MRGPRD,ALDOB,ZNF729,NPIPB7,IL21,GPR21,HSFX3,GREM1,NAT8L
1,0.2363,3.472624,0.193327,3.038665,-0.561946,-2.826852,0.173153,1.146304,-0.535224,1.47704,...,-0.634896,-0.410624,-0.431235,-0.433764,-0.208514,-0.403845,-0.208514,3.040453,0.718992,-0.468885
2,-0.027975,2.619862,0.075775,2.301321,-0.561946,-0.943936,0.483482,1.574767,1.436511,1.137956,...,-0.634896,-0.410624,-0.431235,-0.433764,-0.208514,-0.403845,-0.208514,-0.208514,-0.055522,-0.468885
6,-1.000149,-0.53573,-2.048586,-0.50601,-0.561946,-2.582281,-2.388993,-1.975744,-0.535224,-3.471038,...,-0.634896,-0.410624,-0.431235,-0.433764,-0.208514,-0.403845,-0.208514,-0.208514,-0.670288,-0.468885
7,0.964635,2.022554,1.069712,2.626175,-0.561946,3.771577,2.559705,2.144832,1.436511,1.23715,...,3.529859,-0.410624,2.590001,-0.433764,28.271311,-0.403845,-0.208514,-0.208514,-0.670288,-0.468885
9,0.763485,2.16746,0.321936,3.180711,-0.561946,3.582943,0.582374,1.004703,-0.535224,0.100035,...,-0.634896,-0.410624,1.474953,2.407575,-0.208514,-0.403845,28.271311,-0.208514,-0.055522,-0.468885
10,0.112944,-0.53573,0.602245,-0.50601,1.426521,0.005474,-0.491255,-0.443608,-0.535224,-0.118556,...,-0.634896,-0.410624,1.474953,-0.433764,-0.208514,3.500276,-0.208514,-0.208514,-0.670288,-0.468885
12,-1.582748,0.317032,-0.909293,-0.50601,-0.561946,-0.75796,-1.384124,-0.774639,-0.535224,-1.216393,...,-0.634896,-0.410624,-0.431235,-0.433764,-0.208514,-0.403845,-0.208514,-0.208514,-0.670288,-0.468885
15,-4.266899,-0.53573,-3.797581,-0.50601,-0.561946,-8.495457,-5.32223,-4.152731,-0.535224,-7.658854,...,-0.634896,-0.410624,-0.431235,-0.433764,-0.208514,-0.403845,-0.208514,-0.208514,-1.721238,-0.468885
17,0.979691,4.002605,1.362973,4.315601,-0.561946,-4.294407,1.379953,3.081334,2.9572,2.495262,...,-0.634896,-0.410624,3.381141,-0.433764,28.271311,-0.403845,-0.208514,-0.208514,-1.721238,-0.468885
20,0.78796,-0.53573,1.36115,-0.50601,1.426521,0.990563,0.59289,0.598886,-0.535224,1.760872,...,-0.634896,-0.410624,-0.431235,2.407575,-0.208514,3.500276,-0.208514,-0.208514,0.380662,-0.468885


In [416]:
#uw test set features
merged_uw[ranked_average_abs_values.index[:].values].head(10)

Unnamed: 0,ADM,C1QC,MAFF,C1QA,UGT2A2,PCSK5,CDKN1A,POLG2,CALB2,IFRD1,...,PGAM4,MRGPRD,ALDOB,ZNF729,NPIPB7,IL21,GPR21,HSFX3,GREM1,NAT8L
0,-0.952709,1.79071,0.404689,1.009556,0.913233,2.139796,-2.044403,-0.589416,-0.34277,-1.409894,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,-0.562016,-0.26675
1,-0.952709,1.273605,-1.237085,3.012774,-0.473767,0.868761,-0.790095,-0.185775,1.362967,-0.218003,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,13.536342,-0.562016,-0.26675
2,-0.008554,1.438283,0.818681,3.564784,-0.473767,1.73382,-0.118356,-0.345643,-0.34277,0.370306,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,-0.299455,0.129207
3,-0.293877,1.715327,-0.07894,2.254197,-0.473767,2.345984,1.643689,0.137445,-0.34277,0.696894,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,5.266127,-0.26675
4,-0.952709,1.289118,0.00486,-0.669526,-0.473767,1.416269,0.167711,-0.041793,-0.34277,-1.169578,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,-0.562016,-0.26675
5,-0.952709,-0.123405,-1.286824,1.181369,2.252699,-0.412064,-1.039037,-0.589416,3.461726,0.896437,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,1.18683,2.276102
6,-0.952709,1.392937,-1.286824,-0.669526,-0.473767,1.095368,0.318971,-0.589416,-0.34277,-0.376357,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,3.80067,-0.165635,1.667376,-0.26675
7,-0.952709,-1.009553,-1.286824,-0.669526,-0.473767,-0.056175,-2.044403,-0.589416,-0.34277,-1.179737,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,-0.562016,-0.26675
8,-0.952709,1.25546,-1.286824,0.996054,-0.473767,-0.698944,-1.147966,0.855868,-0.34277,-0.08582,...,-0.204007,-0.183912,-0.332987,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,-0.03228,0.274155
9,-0.952709,0.408372,-0.469761,-0.178736,-0.262506,-0.324567,-1.102637,0.45729,-0.34277,-0.020273,...,-0.204007,-0.183912,5.750699,-0.210106,-0.160465,-0.145865,-0.202372,-0.165635,2.231201,-0.26675


In [216]:
# training, Age not included
best_model = [None]*20
accuracy = [None]*20
auroc = [None]*20

for i in range(20):
    X = merged[ranked_average_abs_values.index[0:1 + i]].values
    y = merged['diagnosis'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
    print(X.shape)
    params = {'hidden_layer_sizes': [(10,), (15,), (20,)],
              'alpha': [1e-4, 1e-5, 1e-6],
              'solver': ['sgd', 'adam'],
              'random_state': [123]}
    model = MLPClassifier()

    grid_search = GridSearchCV(model, param_grid=params, cv=5)
    grid_search.fit(X_train, y_train)
    print("Best parameters: ", grid_search.best_params_)
    print("Best accuracy: ", grid_search.best_score_)

    best_model[i] = grid_search.best_estimator_
    joblib.dump(best_model[i], f'best_no_age_{i}.joblib')
    pred = best_model[i].predict(X_test)
    accuracy[i] = accuracy_score(y_test, pred)
    print("Test accuracy: ", accuracy[i])
    pred_probs = best_model[i].predict_proba(X_test)
    # Calculate the AUROC score, expected over 70%
    auroc[i] = roc_auc_score(y_test, pred_probs[:, 1])
    print("AUROC: ", auroc[i])


(247, 1)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'random_state': 123, 'solver': 'sgd'}
Best accuracy:  0.6903846153846155
Test accuracy:  0.76
AUROC:  0.4451754385964912
(247, 2)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'random_state': 123, 'solver': 'sgd'}
Best accuracy:  0.6903846153846155
Test accuracy:  0.76
AUROC:  0.5241228070175439
(247, 3)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'random_state': 123, 'solver': 'adam'}
Best accuracy:  0.7156410256410257
Test accuracy:  0.76
AUROC:  0.7149122807017544
(247, 4)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'random_state': 123, 'solver': 'sgd'}
Best accuracy:  0.6905128205128206
Test accuracy:  0.76
AUROC:  0.6535087719298246
(247, 5)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'random_state': 123, 'solver': 'sgd'}
Best accuracy:  0.7206410256410256
Test accuracy:  0.64
AUROC:  0.6929824561403508
(247, 6)
Best parameters

In [309]:
# taining, Age included as the first feature

_best_model = [None]*21
_accuracy = [None]*21
_auroc = [None]*21

for i in range(21):
    selected_columns = merged[ranked_average_abs_values.index[0: i]]
    selected_columns.insert(0, 'age', merged['age'])
    X = selected_columns.values
    y = merged['diagnosis'].values
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
    print(X.shape)
    params = {'hidden_layer_sizes': [(10,), (15,), (20,)],
              'alpha': [1e-4, 1e-5, 1e-6],
              'solver': ['sgd', 'adam'],
              'random_state': [123]}
    model = MLPClassifier()

    grid_search = GridSearchCV(model, param_grid=params, cv=5)
    grid_search.fit(X, y)
    print("Best parameters: ", grid_search.best_params_)
    print("Best accuracy: ", grid_search.best_score_)

    _best_model[i] = grid_search.best_estimator_
    joblib.dump(_best_model[i], f'best_with_age_{i}.joblib')
    pred = _best_model[i].predict(X)
    _accuracy[i] = accuracy_score(y, pred)
    print("Test accuracy: ", _accuracy[i])
    pred_probs = _best_model[i].predict_proba(X)
    # Calculate the AUROC score, expected over 70%
    _auroc[i] = roc_auc_score(y, pred_probs[:, 1])
    print("AUROC: ", _auroc[i])


(247, 1)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'random_state': 123, 'solver': 'sgd'}
Best accuracy:  0.7044897959183674
Test accuracy:  0.7125506072874493
AUROC:  0.7550779404818139
(247, 2)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (20,), 'random_state': 123, 'solver': 'adam'}
Best accuracy:  0.7044897959183674
Test accuracy:  0.7044534412955465
AUROC:  0.5758148323098725
(247, 3)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'random_state': 123, 'solver': 'adam'}
Best accuracy:  0.7044897959183674
Test accuracy:  0.7044534412955465
AUROC:  0.5028341993386869
(247, 4)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (15,), 'random_state': 123, 'solver': 'adam'}
Best accuracy:  0.6964081632653061
Test accuracy:  0.6963562753036437
AUROC:  0.5252716107699573
(247, 5)
Best parameters:  {'alpha': 0.0001, 'hidden_layer_sizes': (10,), 'random_state': 123, 'solver': 'sgd'}
Best accuracy:  0.7044897959183674
Test accuracy:

In [431]:
'''
Used before, now it is irrelevant.
'''
# #Age is the only feature, use kneighbors.
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler 
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import classification_report, confusion_matrix


# scaler = StandardScaler()
# scaler.fit(X)

# # X_train = scaler.transform(X_train)
# # X_test = scaler.transform(X_test) 
# classifier = KNeighborsClassifier(n_neighbors=5)
# classifier.fit(X, y) 
# y_predict = classifier.predict(X)
# print(confusion_matrix(y, y_predict))
# print(classification_report(y, y_predict))
# pred_probs = classifier.predict_proba(X)
# auroc = roc_auc_score(y, pred_probs[:, 1])
# print("AUROC: ", auroc)


In [432]:
'''
Used before, now it is irrelevant.
'''
# #5-fold validation for K_neighbor
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# classifier = KNeighborsClassifier(n_neighbors=5)
# scores = cross_val_score(classifier, X_scaled, y, cv=5)

# print("Cross-validation scores: ", scores)
# print("Mean cross-validation score: ", np.mean(scores))

# classifier.fit(X_scaled, y)
# y_predict = classifier.predict(X_scaled)

# print(confusion_matrix(y, y_predict))
# print(classification_report(y, y_predict))

# pred_probs = classifier.predict_proba(X_scaled)
# auroc = roc_auc_score(y, pred_probs[:, 1])
# print("AUROC: ", auroc)

In [433]:
'''
Used before, can do this later for best of the best model selection.
'''
# params = {
#     'hidden_layer_sizes': [(20,), (15, 10,), (10, 10,), (20, 10, 5), (30, 20, 10)],
#     'activation': ['relu', 'tanh', 'logistic'],
#     'solver': ['lbfgs', 'sgd', 'adam'],
#     'alpha': [1e-4, 1e-5, 1e-6],
#     'learning_rate': ['constant', 'invscaling', 'adaptive'],
#     'random_state': [123],
# }
# model = MLPClassifier()
# grid_search = GridSearchCV(model, param_grid=params, cv=5)
# grid_search.fit(X_train, y_train)
# print("Best parameters: ", grid_search.best_params_)
# print("Best accuracy: ", grid_search.best_score_)
# best_model = grid_search.best_estimator_
# #save
# joblib.dump(best_model, 'best_model.pkl')
# pred = best_model.predict(X_test)
# accuracy = accuracy_score(y_test, pred)
# print("Test accuracy: ", accuracy)
# pred_probs = best_model.predict_proba(X_test)
# auroc = roc_auc_score(y_test, pred_probs[:, 1])
# print(best_model)
# print("AUROC: ", auroc)
# num_hidden_layers = len(best_model.hidden_layer_sizes)
# print("Number of hidden layers: ", num_hidden_layers)
# print("Number of neurons in each hidden layer: ", best_model.hidden_layer_sizes)



'\nUsed before, can do this later for best of the best model selection.\n'

In [435]:
# pull out my test set labels
test_y = test_set['class'].values

In [436]:
# test if my test set features are valid
test_X

array([[ 3.10000000e+01, -9.52709465e-01,  1.79071042e+00, ...,
         9.45848382e-01, -5.84472983e-01, -1.17108752e+00],
       [ 5.10000000e+01, -9.52709465e-01,  1.27360539e+00, ...,
         6.34016576e-01, -5.84472983e-01,  4.75703162e-01],
       [ 4.60000000e+01, -8.55404010e-03,  1.43828264e+00, ...,
        -5.73009222e-01, -5.84472983e-01,  2.46082673e-01],
       ...,
       [ 2.50000000e+01, -3.57986899e+00, -5.35729817e-01, ...,
        -9.46718889e-01, -1.02239914e+00, -4.58362307e+00],
       [ 3.50000000e+01,  8.85720084e-01,  1.16979304e+00, ...,
         9.26978664e-01, -1.02239914e+00,  2.56591350e+00],
       [ 5.50000000e+01, -2.67166474e+00, -5.35729817e-01, ...,
        -9.46718889e-01, -1.02239914e+00, -3.89185733e+00]])

In [396]:
# print out what my test set labels look like
test_y

array(['Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Severe', 'Mild', 'Mild', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Mild', 'Mild', 'Mild',
       'Mild', 'Severe', 'Severe', 'Mild', 'Mild', 'Mild', 'Mild',
       'Severe', 'Mild', 'Severe', 'Mild', 'Severe', 'Severe', 'Severe',
       'Mild', 'Severe', 'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Severe', 'Severe',
       'Severe', 'Mild', 'Severe', 'Severe', 'Mild', 'Mild', 'Severe',
       'Severe', 'Severe', 'Severe', 'Severe', 'Severe', 'Mild', 'Severe',
       'Severe', 'Severe', 'Severe', 'Severe'], dtype=object)

In [354]:
# Apply my models without age as a feature to the merged test set.

best_no_age_models = []
predictions = []
scores = []
auroc = []

for i in range(20):
    model = joblib.load(f'best_no_age_{i}.joblib')
#     best_no_age_models.append(model)
    test_X = test_set[ranked_average_abs_values.index[0:1 + i]].values
    prediction = model.predict(test_X)
    predictions.append(prediction)

    # Calculate accuracy of the prediction
    score = accuracy_score(test_y, prediction)
    scores.append(score)
    print("Test accuracy: ", scores[i])
    probs = model.predict_proba(test_X)
#     print(probs)
    _auroc = roc_auc_score(test_y, probs[:, 0])
    auroc.append(_auroc)
    print("AUROC: ", auroc[i])

Test accuracy:  0.6666666666666666
AUROC:  0.4120095124851368
Test accuracy:  0.6666666666666666
AUROC:  0.5279429250891796
Test accuracy:  0.6896551724137931
AUROC:  0.47919143876337694
Test accuracy:  0.632183908045977
AUROC:  0.4875148632580262
Test accuracy:  0.6781609195402298
AUROC:  0.4239001189060642
Test accuracy:  0.6091954022988506
AUROC:  0.5838287752675387
Test accuracy:  0.6436781609195402
AUROC:  0.5677764565992865
Test accuracy:  0.5977011494252874
AUROC:  0.5689655172413793
Test accuracy:  0.5632183908045977
AUROC:  0.6028537455410227
Test accuracy:  0.5747126436781609
AUROC:  0.6070154577883472
Test accuracy:  0.5172413793103449
AUROC:  0.5725326991676576
Test accuracy:  0.6206896551724138
AUROC:  0.5439952437574316
Test accuracy:  0.5632183908045977
AUROC:  0.5844233055885851
Test accuracy:  0.5402298850574713
AUROC:  0.5921521997621879
Test accuracy:  0.5862068965517241
AUROC:  0.5909631391200951
Test accuracy:  0.6091954022988506
AUROC:  0.5095124851367421
Test acc

In [394]:
# Apply my models with age as the first feature to the merged test set.

best_with_age_models = []
_predictions = []
_scores = []
_auroc = []

for i in range(21):
    model = joblib.load(f'best_with_age_{i}.joblib')
    selected_columns = test_set[ranked_average_abs_values.index[0: i]]
    selected_columns.insert(0, 'age', test_set['age'])
    test_X = selected_columns.values
#     print(test_X)
    prediction = model.predict(test_X)
    _predictions.append(prediction)

    # Calculate accuracy of the prediction
    score = accuracy_score(test_y, prediction)
    _scores.append(score)
    print("Test accuracy: ", scores[i])
    probs = model.predict_proba(test_X)
    print(probs)
    auroc = roc_auc_score(test_y, probs[:, 0])
    _auroc.append(auroc)
    print("AUROC: ", _auroc[i])

Test accuracy:  0.6666666666666666
[[0.59690824 0.40309176]
 [0.56565453 0.43434547]
 [0.57352732 0.42647268]
 [0.56565453 0.43434547]
 [0.52266642 0.47733358]
 [0.61530628 0.38469372]
 [0.60920733 0.39079267]
 [0.59999511 0.40000489]
 [0.6122612  0.3877388 ]
 [0.56565453 0.43434547]
 [0.55616361 0.44383639]
 [0.57666626 0.42333374]
 [0.60461051 0.39538949]
 [0.55774847 0.44225153]
 [0.59381367 0.40618633]
 [0.55774847 0.44225153]
 [0.54503933 0.45496067]
 [0.57823343 0.42176657]
 [0.50341353 0.49658647]
 [0.58604507 0.41395493]
 [0.53546788 0.46453212]
 [0.55774847 0.44225153]
 [0.57195561 0.42804439]
 [0.59071163 0.40928837]
 [0.60307407 0.39692593]
 [0.56565453 0.43434547]
 [0.58448607 0.41551393]
 [0.52106379 0.47893621]
 [0.54822278 0.45177722]
 [0.57666626 0.42333374]
 [0.50983511 0.49016489]
 [0.59381367 0.40618633]
 [0.59690824 0.40309176]
 [0.56249588 0.43750412]
 [0.53546788 0.46453212]
 [0.55299049 0.44700951]
 [0.57823343 0.42176657]
 [0.61985694 0.38014306]
 [0.59071163 0.

In [392]:
# the prediction of my models with age
_predictions

[array(['Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild'],
       dtype='<U6'),
 array(['Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
        'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'M

In [374]:
# show test labels again
test_y

array(['Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Severe', 'Mild', 'Mild', 'Mild', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Mild', 'Mild', 'Mild',
       'Mild', 'Severe', 'Severe', 'Mild', 'Mild', 'Mild', 'Mild',
       'Severe', 'Mild', 'Severe', 'Mild', 'Severe', 'Severe', 'Severe',
       'Mild', 'Severe', 'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Mild',
       'Mild', 'Mild', 'Mild', 'Mild', 'Severe', 'Severe', 'Severe',
       'Severe', 'Mild', 'Severe', 'Severe', 'Mild', 'Mild', 'Severe',
       'Severe', 'Severe', 'Severe', 'Severe', 'Severe', 'Mild', 'Severe',
       'Severe', 'Severe', 'Severe', 'Severe'], dtype=object)

In [371]:
np.count_nonzero(test_y == "Severe")

29

In [373]:
np.count_nonzero(test_y == "Mild")

58