In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [2]:
# read the combined clinical + assay table
WT_data = pd.read_csv("WT_assay_clinical.csv")
# mark 'Unknown' entries as missing values.
# np.nan is used directly: the np.float alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, so np.float('nan') fails on current installs.
WT_data = WT_data.replace('Unknown', np.nan)
# create an array of columns whose data should be irrelevant to prediction
# (note: 'Comment ' really does carry a trailing space in the column name)
columns = ['Comment ', 'Discovery or Validation', 'updated_datetime', 'file_name', 'submitter_id', 'file_id',
           'file_size', 'id', 'created_datetime', 'md5sum', 'data_format', 'access', 'state', 'data_category',
           'experimental_strategy', 'project.project_id', 'entity_id', 'case_id', 'entity_submitter_id']
# drop the columns in that array from the dataframe; assigning the result
# (instead of inplace=True) keeps the statement chainable and explicit
WT_data = WT_data.drop(columns=columns)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# preview the first five rows of WT_data after the metadata columns were dropped
WT_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,TARGET USI,Diagnostic ID,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,...,ENSG00000281649.1,ENSG00000281691.1,ENSG00000281706.1,ENSG00000281741.1,ENSG00000281789.1,ENSG00000281896.1,ENSG00000281912.1,__no_feature,__ambiguous,__alignment_not_unique
0,0,TARGET-50-CAAAAC,01A,Female,White,Not Hispanic or Latino,2244,Relapse,1184,DEAD,...,4.775164,-2.141151,-1.167538,-4.218016,-0.495778,-0.5333,-2.029702,16.04047,14.33014,17.94722
1,1,TARGET-50-CAAAAH,01A,Female,White,Not reported,1710,Relapse,165,DEAD,...,4.756811,-2.772979,-1.340607,-0.322102,-1.011248,-3.610656,-2.523513,15.91781,15.09286,18.31959
2,2,TARGET-50-CAAAAJ,01A,Female,White,Not Hispanic or Latino,401,Relapse,169,ALIVE,...,4.862147,-2.544237,-5.998956,-5.438748,-1.37111,-0.943032,-0.862982,15.61571,14.65771,17.50719
3,3,TARGET-50-CAAAAL,01A,Female,Black or African American,Not Hispanic or Latino,1599,Relapse,221,ALIVE,...,5.09378,-1.53245,-5.363655,-3.050407,-2.257779,-0.584042,-0.711118,15.29275,14.57354,17.69767
4,4,TARGET-50-CAAAAM,01A,Female,White,Not Hispanic or Latino,2750,,3260,ALIVE,...,4.869119,-3.374885,-0.428489,-4.398735,-1.927109,-1.470202,-1.874332,15.78805,14.97145,17.8124


In [4]:
# show dimensions of WT_data as a (rows, columns) tuple
WT_data.shape

(132, 21429)

In [5]:
# get a list of the columns in WT_data whose data type is 'object'.
# select_dtypes picks the columns by dtype directly, so no summary statistics
# are computed just to read the column names, and a frame without any object
# columns yields an empty list instead of an error.
object_data = WT_data.select_dtypes(include=['object'])
cols = list(object_data)

In [6]:
# replace every object column with the integer codes of its categories;
# pandas assigns the code -1 to missing values, so NaN does not survive this step
for object_col in cols:
    as_category = WT_data[object_col].astype('category')
    WT_data[object_col] = as_category.cat.codes
# preview the encoded frame
WT_data.head()

Unnamed: 0.1,Unnamed: 0,TARGET USI,Diagnostic ID,Gender,Race,Ethnicity,Age at Diagnosis in Days,First Event,Event Free Survival Time in Days,Vital Status,...,ENSG00000281649.1,ENSG00000281691.1,ENSG00000281706.1,ENSG00000281741.1,ENSG00000281789.1,ENSG00000281896.1,ENSG00000281912.1,__no_feature,__ambiguous,__alignment_not_unique
0,0,0,0,0,3,1,2244,2,1184,1,...,4.775164,-2.141151,-1.167538,-4.218016,-0.495778,-0.5333,-2.029702,16.04047,14.33014,17.94722
1,1,1,0,0,3,2,1710,2,165,1,...,4.756811,-2.772979,-1.340607,-0.322102,-1.011248,-3.610656,-2.523513,15.91781,15.09286,18.31959
2,2,2,0,0,3,1,401,2,169,0,...,4.862147,-2.544237,-5.998956,-5.438748,-1.37111,-0.943032,-0.862982,15.61571,14.65771,17.50719
3,3,3,0,0,0,1,1599,2,221,0,...,5.09378,-1.53245,-5.363655,-3.050407,-2.257779,-0.584042,-0.711118,15.29275,14.57354,17.69767
4,4,4,0,0,3,1,2750,0,3260,0,...,4.869119,-3.374885,-0.428489,-4.398735,-1.927109,-1.470202,-1.874332,15.78805,14.97145,17.8124


In [7]:
# display dimensions of WT_data dataframe after the categorical encoding
# (encoding replaces columns in place, so the shape should be unchanged)
WT_data.shape

(132, 21429)

In [8]:

# number of samples in each encoded 'Stage' category
stage_counts = WT_data.groupby('Stage').size()
print(stage_counts)

Stage
0    16
1    55
2    40
3     4
4     1
5    15
6     1
dtype: int64


In [9]:
# make 'Stage' (the prediction target) the last column of the data frame:
# take it out, then re-insert it after all remaining columns
stageGroup_column = WT_data.pop('Stage')
WT_data.insert(len(WT_data.columns), 'Stage', stageGroup_column)

In [10]:
# metric name passed to cross_val_score when the models are compared below
scoring = 'accuracy'

In [11]:
# sanity check: does any NaN remain in WT_data after the categorical encoding?
np.any(np.isnan(WT_data))

False

In [12]:
# replace any remaining missing values with the sentinel -1
# NOTE(review): the NaN check just before this cell printed False, so this
# looks like a no-op for the current data - confirm before relying on it
WT_data=WT_data.fillna(-1)

In [13]:
# re-run the NaN check after fillna; it should report False
np.any(np.isnan(WT_data))

False

In [14]:
# coerce every column to a numeric dtype; pd.to_numeric (default errors='raise')
# fails loudly if a column still holds values that cannot be parsed as numbers
WT_data = WT_data.apply(pd.to_numeric)

In [17]:
# separate out validation data set
WT_array = WT_data.values
# 'Stage' was moved to the last column earlier, so slice relative to the end of
# the array instead of hard-coding the column count (previously 21428); for the
# current (132, 21429) frame this selects exactly the same features and target
X = WT_array[:, :-1]
Y = WT_array[:, -1]
# NOTE(review): X still contains 'Unnamed: 0.1', 'Unnamed: 0' and the encoded
# 'TARGET USI' identifier, which are row indices / IDs rather than measurements -
# consider dropping them before modelling
validation_size = 0.20
seed = 42
# hold out 20% of the samples; the fixed seed makes the split reproducible
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

In [18]:
# create an array to house each (short name, estimator) pair to be compared
models = [
    ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=seed)),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# create ten folds once: the splitter does not depend on the model.
# random_state is intentionally not passed. Without shuffle=True it has no
# effect on the folds, and scikit-learn >= 0.24 raises ValueError when
# random_state is set while shuffle is False. Leaving shuffle off keeps the
# folds identical to the ones the recorded results were computed on
# (the training rows were already shuffled by train_test_split).
kfold = model_selection.KFold(n_splits=10)

# create arrays to house the accuracy estimations and the name of each model
results = []
names = []
for name, model in models:
    # per-fold scores for this model on the training portion only
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # report "name: mean (std)" across the ten folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    

LR: 0.430000 (0.115569)




LDA: 0.373636 (0.138734)
KNN: 0.391818 (0.101024)
CART: 0.317273 (0.122673)
NB: 0.419091 (0.122539)
SVM: 0.286364 (0.093795)


### AML Models in Order of Descending Accuracy (from "Bioinformatics Final Project_AML_EBL.ipynb"): 
CART (94.0%), LR (83.9%), LDA (81.9%), NB (77.8%), KNN (53.8%), SVM (53.0%)

### NBL Models in Order of Descending Accuracy (from "Bioinformatics Final Project_NBL_EBL.ipynb"):
LR (94.2%), LDA (92.5%), CART (88.3%), NB (86.7%), KNN (80.0%), SVM (78.3%)

### WT Models in Order of Descending Accuracy (from this notebook):
LR (43.0%), NB (41.9%), KNN (39.2%), LDA (37.4%), CART (31.7%), SVM (28.6%)