In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Read the combined assay + clinical table for the NBL cohort.
# NOTE(review): the recorded output of this cell ends with what looks like the
# tail of a warning -- presumably pandas' mixed-dtype warning; confirm, and
# consider passing dtype= / low_memory=False if so.
NBL_data = pd.read_csv("NBL_assay_clinical.csv")

# Treat the literal string 'Unknown' as a missing value. `np.nan` is used because
# the `np.float` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
NBL_data = NBL_data.replace('Unknown', np.nan)

# File / bookkeeping metadata columns that are removed before any modelling.
columns = ['Comment', 'data_type', 'updated_datetime', 'file_name','submitter_id','file_id',
           'file_size','id','created_datetime','md5sum','data_format','access','state',
           'data_category','type','experimental_strategy','project.project_id','entity_id',
           'case_id','entity_submitter_id','entity_type']
# Rebind instead of mutating in place so the cell reads as one explicit step.
NBL_data = NBL_data.drop(columns=columns)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows of the table after the metadata columns were dropped.
NBL_data.head()

Unnamed: 0.1,Unnamed: 0,TARGET USI,Diagnostic ID,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,...,ENSG00000281649.1,ENSG00000281691.1,ENSG00000281706.1,ENSG00000281741.1,ENSG00000281789.1,ENSG00000281896.1,ENSG00000281912.1,__no_feature,__ambiguous,__alignment_not_unique
0,0,TARGET-30-PAIFXV,01A,Female,White,,2004,Event,630.0,Dead,...,4.703077,-2.539632,-1.150738,-2.683996,-2.347017,-1.124745,0.144663,16.10492,14.46502,17.50624
1,1,TARGET-30-PAIPGU,01A,Female,White,,898,Death,436.0,Dead,...,3.411992,-2.575069,-1.661957,-3.032512,-0.908861,-1.379287,-0.084676,16.18287,14.43767,17.07436
2,2,TARGET-30-PAISNS,01A,Male,White,,1070,Event,410.0,Dead,...,4.507146,-3.390368,0.279517,-4.745955,-1.674654,-0.258184,-1.125036,15.6474,14.6414,16.73996
3,3,TARGET-30-PAITCI,01A,Male,White,,728,Event,232.0,Dead,...,4.677114,-3.331185,-5.240817,0.99649,0.421057,-1.924573,0.028438,16.05813,14.44632,16.8517
4,4,TARGET-30-PAIVHE,01A,Male,White,Not Hispanic or Latino,1123,Event,672.0,Dead,...,4.554736,-1.448875,-3.387508,-2.257757,-1.448875,-0.796058,-0.465148,16.43831,14.30633,16.74036


In [4]:
# Show the dimensions of NBL_data as (number of rows, number of columns).
NBL_data.shape

(150, 21440)

In [5]:
# Summarise only the object-dtype columns; the column labels of that summary
# are the names of the columns that still have to be encoded numerically.
object_data = NBL_data.describe(include=['object'])
cols = object_data.columns.tolist()

In [6]:
# Replace every object-dtype column with the integer codes of its categories.
# Missing entries come out as -1 (visible in the 'Ethnicity' column of the preview).
for column_name in cols:
    as_category = NBL_data[column_name].astype('category')
    NBL_data[column_name] = as_category.cat.codes
NBL_data.head()

Unnamed: 0.1,Unnamed: 0,TARGET USI,Diagnostic ID,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,...,ENSG00000281649.1,ENSG00000281691.1,ENSG00000281706.1,ENSG00000281741.1,ENSG00000281789.1,ENSG00000281896.1,ENSG00000281912.1,__no_feature,__ambiguous,__alignment_not_unique
0,0,0,0,0,3,-1,2004,2,630.0,1,...,4.703077,-2.539632,-1.150738,-2.683996,-2.347017,-1.124745,0.144663,16.10492,14.46502,17.50624
1,1,1,0,0,3,-1,898,1,436.0,1,...,3.411992,-2.575069,-1.661957,-3.032512,-0.908861,-1.379287,-0.084676,16.18287,14.43767,17.07436
2,2,2,0,1,3,-1,1070,2,410.0,1,...,4.507146,-3.390368,0.279517,-4.745955,-1.674654,-0.258184,-1.125036,15.6474,14.6414,16.73996
3,3,3,0,1,3,-1,728,2,232.0,1,...,4.677114,-3.331185,-5.240817,0.99649,0.421057,-1.924573,0.028438,16.05813,14.44632,16.8517
4,4,4,0,1,3,1,1123,2,672.0,1,...,4.554736,-1.448875,-3.387508,-2.257757,-1.448875,-0.796058,-0.465148,16.43831,14.30633,16.74036


In [7]:
# Re-check the table dimensions after the category encoding step.
NBL_data.shape

(150, 21440)

In [8]:
# Number of samples in each (encoded) COG risk group.
risk_group_sizes = NBL_data.groupby('COG Risk Group').size()
print(risk_group_sizes)

COG Risk Group
0    119
1     17
2     14
dtype: int64


In [9]:
# Take the prediction target out of the frame and append it again as the final
# column, now under the name 'Risk group'.
target_source_name = 'COG Risk Group'
riskGroup_column = NBL_data.pop(target_source_name)
NBL_data['Risk group'] = riskGroup_column

In [10]:
# Metric name handed to cross_val_score via `scoring=` in the model-comparison cell.
# NOTE(review): the risk groups are imbalanced (119 / 17 / 14 samples in the
# group-size output), so plain accuracy can look good for a model that mostly
# predicts the majority group -- consider 'balanced_accuracy' as well.
scoring = 'accuracy'

In [11]:
# Check whether any entry of the table is NaN.
np.any(np.isnan(NBL_data))

True

In [12]:
# Replace every remaining NaN with the sentinel value -1.
# NOTE(review): -1 lies inside the range of the expression values shown in the
# previews (e.g. -1.150738), so filled entries cannot be told apart from real
# measurements -- confirm this imputation is intended.
NBL_data=NBL_data.fillna(-1)

In [13]:
# Repeat the NaN check to confirm that the fill left no missing values.
np.any(np.isnan(NBL_data))

False

In [14]:
# Run pd.to_numeric over each column so that every column has a numeric dtype.
NBL_data = NBL_data.apply(pd.to_numeric)

In [15]:
# separate out validation data set
# The target ('Risk group') is the last column of NBL_data, so features and labels
# are selected relative to the end of the array. The previous hard-coded index
# (21439) only worked for a table that is exactly 21440 columns wide.
NBL_array = NBL_data.values
X = NBL_array[:, :-1]   # every column except the last: the features
Y = NBL_array[:, -1]    # last column: the risk-group label
validation_size = 0.20  # fraction of samples held out for validation
seed = 42               # fixed seed so the split is reproducible
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

In [16]:
# create an array to house each model
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the installed version before upgrading.
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier(random_state = seed)))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))

# create ten folds, shared by every model so all are scored on identical splits.
# `random_state` is intentionally not passed: without shuffle=True it never had any
# effect, and newer scikit-learn raises a ValueError for that combination. The
# folds are therefore the same consecutive, unshuffled splits as before.
kfold = model_selection.KFold(n_splits=10)

# create an array to house the accuracy estimations for each model
results = []
names = []
for name, model in models:
    # per-fold scores of this model on the training portion only
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # report "<model>: <mean score> (<standard deviation across folds>)"
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    

LR: 0.941667 (0.065085)




LDA: 0.925000 (0.078617)
KNN: 0.800000 (0.076376)
CART: 0.883333 (0.055277)
NB: 0.866667 (0.092796)
SVM: 0.783333 (0.113039)


### AML Models in Order of Descending Accuracy (from "Bioinformatics Final Project_AML_EBL.ipynb"):
CART (94.0%), LR (83.9%), LDA (81.9%), NB (77.8%), KNN (53.8%), SVM (53%)

### NBL Models in Order of Descending Accuracy (from this notebook):
LR (94.2%), LDA (92.5%), CART (88.3%), NB (86.7%), KNN (80.0%), SVM (78.3%)

### WT Models in Order of Descending Accuracy (from "Bioinformatics Final Project_WT_EBL.ipynb"):
LR (43.0%), NB (41.9%), KNN (39.2%), LDA (37.4%), CART (31.7%), SVM (28.6%)