In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# Read the AML assay/clinical table, turn the literal string 'Unknown' into a
# real missing value, and discard the columns listed below (file and
# submission metadata, judging by their names).
AML_data = pd.read_csv("AML_assay_clinical.csv")
# np.float was only an alias of the builtin float; it was deprecated in
# NumPy 1.20 and removed in 1.24, so np.float('nan') now raises AttributeError.
# np.nan is the same NaN value and works on every NumPy version.
AML_data = AML_data.replace('Unknown', np.nan)
columns = ['Comment','Refractory Timepoint sent for Induction Failure Project', 'data_type','updated_datetime','file_name', 'submitter_id','file_id',
           'file_size','id','created_datetime','md5sum','data_format','access','state','data_category','type','experimental_strategy',
           'project.project_id','entity_id','case_id','entity_submitter_id', 'entity_type']
# Rebind the result rather than mutating with inplace=True.
AML_data = AML_data.drop(columns=columns)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first rows of the table right after loading and column removal.
AML_data.head()

Unnamed: 0.1,Unnamed: 0,TARGET USI,Diagnostic ID,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,...,ENSG00000281649.1,ENSG00000281691.1,ENSG00000281706.1,ENSG00000281741.1,ENSG00000281789.1,ENSG00000281896.1,ENSG00000281912.1,__no_feature,__ambiguous,__alignment_not_unique
0,0,TARGET-20-PABLDZ,04A,Female,White,Not Hispanic or Latino,2455,Relapse,714,Alive,...,5.486997,-0.834782,-1.296158,-5.407333,-0.256885,1.024227,-3.844758,17.53958,15.68347,18.76896
1,1,TARGET-20-PADYIR,04A,Male,White,Not Hispanic or Latino,1159,Relapse,373,Dead,...,5.981757,0.545861,-2.077616,-1.572028,-2.077616,2.026762,-2.342754,16.88417,14.59825,18.32622
2,2,TARGET-20-PADYIR,09A,Male,White,Not Hispanic or Latino,1159,Relapse,373,Dead,...,6.877301,1.096532,-2.710611,-1.947448,-1.421239,2.485413,-2.448536,17.55232,14.80745,17.93863
3,3,TARGET-20-PADZCG,09A,Female,,Not Hispanic or Latino,5325,Relapse,314,Dead,...,5.465605,-0.330849,-1.843229,-0.801464,-0.364707,2.164604,-0.152689,17.12435,14.60608,17.85268
4,4,TARGET-20-PADZCG,04A,Female,,Not Hispanic or Latino,5325,Relapse,314,Dead,...,4.989745,1.008904,-2.298227,-1.264844,0.362001,2.474975,-1.534493,17.4779,14.67804,18.92434


In [4]:
# Show the dimensions of AML_data as a (rows, columns) tuple.
AML_data.shape

(187, 21470)

In [5]:
# Summarise only the object-dtype columns; the labels of that summary are the
# names of the columns that the next cell converts to integer codes.
object_data = AML_data.describe(include=['object'])
cols = object_data.columns.tolist()

In [6]:
# Replace every object-dtype column with its integer category codes.
# Missing values have no category, so pandas assigns them the code -1.
for column_name in cols:
    as_category = AML_data[column_name].astype('category')
    AML_data[column_name] = as_category.cat.codes
AML_data.head()

Unnamed: 0.1,Unnamed: 0,TARGET USI,Diagnostic ID,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,...,ENSG00000281649.1,ENSG00000281691.1,ENSG00000281706.1,ENSG00000281741.1,ENSG00000281789.1,ENSG00000281896.1,ENSG00000281912.1,__no_feature,__ambiguous,__alignment_not_unique
0,0,0,1,0,5,1,2455,3,714,0,...,5.486997,-0.834782,-1.296158,-5.407333,-0.256885,1.024227,-3.844758,17.53958,15.68347,18.76896
1,1,1,1,1,5,1,1159,3,373,1,...,5.981757,0.545861,-2.077616,-1.572028,-2.077616,2.026762,-2.342754,16.88417,14.59825,18.32622
2,2,1,2,1,5,1,1159,3,373,1,...,6.877301,1.096532,-2.710611,-1.947448,-1.421239,2.485413,-2.448536,17.55232,14.80745,17.93863
3,3,2,2,0,-1,1,5325,3,314,1,...,5.465605,-0.330849,-1.843229,-0.801464,-0.364707,2.164604,-0.152689,17.12435,14.60608,17.85268
4,4,2,1,0,-1,1,5325,3,314,1,...,4.989745,1.008904,-2.298227,-1.264844,0.362001,2.474975,-1.534493,17.4779,14.67804,18.92434


In [7]:
# Dimensions of the frame after the categorical columns were integer-encoded.
AML_data.shape

(187, 21470)

In [8]:
# Number of samples per encoded 'Risk group' value; the code -1 stands for
# rows whose risk group was missing before encoding.
risk_group_sizes = AML_data.groupby('Risk group').size()
print(risk_group_sizes)

Risk group
-1    10
 0    12
 1    72
 2    93
dtype: int64


In [9]:
# Re-append 'Risk group' (the prediction target) so that it becomes the last
# column: keep a reference to it, delete it in place, then add it back.
riskGroup_column = AML_data['Risk group']
del AML_data['Risk group']
AML_data['Risk group'] = riskGroup_column

In [10]:
# Metric name passed to cross_val_score through its `scoring` argument.
scoring = 'accuracy'

In [11]:
# Is there at least one NaN anywhere in the frame?
np.any(np.isnan(AML_data))

True

In [12]:
# Replace every remaining NaN with the sentinel value -1.
# NOTE(review): the expression columns previewed above contain negative values,
# so -1 is not outside the data range - confirm this sentinel is intended.
AML_data=AML_data.fillna(-1)

In [13]:
# Re-run the NaN check to confirm that the fill left no missing values.
np.any(np.isnan(AML_data))

False

In [14]:
# Run pd.to_numeric over each column so the whole frame has numeric dtypes.
AML_data = AML_data.apply(pd.to_numeric)

In [15]:
# separate out validation data set
AML_array = AML_data.values
# 'Risk group' was moved to the last column in an earlier cell, so slice
# relative to the end of the array instead of hard-coding the column count
# (21469); the split then stays correct if columns are added or dropped.
X = AML_array[:, :-1]
Y = AML_array[:, -1]
# NOTE(review): rows whose risk group was missing carry the code -1 and are
# kept in Y as a class of their own - confirm that this is intended.
# NOTE(review): X still holds the row-index and patient-identifier columns
# visible in the preview - confirm they should be used as features.
validation_size = 0.20
seed = 42
# Hold out 20% of the samples; the fixed seed makes the split reproducible.
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

In [17]:
# Candidate classifiers, each paired with the short label used in the report.
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state = seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# Ten consecutive folds shared by every model. The original call passed
# random_state without shuffle=True; that seed never had any effect, and
# recent scikit-learn releases reject the combination with a ValueError.
# Dropping it keeps exactly the same folds (the training rows were already
# shuffled by train_test_split). The splitter is loop-invariant, so it is
# built once instead of once per model.
kfold = model_selection.KFold(n_splits=10)

# Per-model arrays of fold accuracies, with the matching labels in `names`.
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean accuracy with its standard deviation across folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    

LR: 0.839048 (0.112159)




LDA: 0.819048 (0.133197)
KNN: 0.538095 (0.092655)
CART: 0.940000 (0.141264)
NB: 0.778095 (0.130934)
SVM: 0.530000 (0.130341)


### AML Models in Order of Descending Accuracy (from this notebook): 
CART (94.0%), LR (83.9%), LDA (81.9%), NB (77.8%), KNN (53.8%), SVM (53.0%)

### NBL Models in Order of Descending Accuracy (from "Bioinformatics Final Project_NBL_EBL.ipynb"):
LR (94.2%), LDA (92.5%), CART (88.3%), NB (86.7%), KNN (80.0%), SVM (78.3%)

### WT Models in Order of Descending Accuracy (from "Bioinformatics Final Project_WT_EBL.ipynb"):
LR (43.0%), NB (41.9%), KNN (39.2%), LDA (37.4%), CART (31.7%), SVM (28.6%)

## Ensemble Method

In [None]:
# create base models for the ensemble
model1 = LinearDiscriminantAnalysis()

# Keyword corrected from "sovler" to "solver": LogisticRegression has no
# "sovler" parameter, so the misspelled call raised a TypeError.
model2 = LogisticRegression(solver="liblinear", multi_class='ovr')

