# Intro to Machine Learning Final Project

### Bryce Readyhough 
### Robert Carter-Barkman

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
import statistics

Loading in the dataset and displaying the first few rows

In [2]:
dataset = pd.read_csv("Colorectal_GSE44076.csv")

In [3]:
dataset.shape

(194, 49388)

In [4]:
dataset.tail()

Unnamed: 0,samples,type,11715100_at,11715101_s_at,11715102_x_at,11715103_x_at,11715104_s_at,11715105_at,11715106_x_at,11715107_s_at,...,AFFX-r2-TagO-3_at,AFFX-r2-TagO-5_at,AFFX-r2-TagQ-3_at,AFFX-r2-TagQ-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at
189,839,adenocarcinoma,2.949097,3.986188,3.269828,4.364195,2.962831,2.205185,2.573192,2.47753,...,1.818492,2.331696,2.409947,2.085928,6.93409,4.980856,5.952154,2.11515,1.987921,2.199241
190,840,adenocarcinoma,2.667807,3.916853,2.835128,3.335915,2.549352,2.070472,2.900022,2.370676,...,2.152947,2.166213,2.254691,2.047614,7.078561,5.03652,5.890267,1.951415,2.212261,2.019528
191,841,adenocarcinoma,3.280977,4.412883,3.490449,3.44394,2.907395,2.170415,2.843549,2.488809,...,1.896274,2.438472,2.198505,2.270089,6.521763,4.35232,5.219437,2.146429,2.131785,2.041606
192,842,adenocarcinoma,4.870618,5.608517,4.559007,4.708531,2.577943,2.287743,2.58673,2.298392,...,1.97208,2.524833,2.070878,2.236429,7.015886,4.995253,6.068636,2.063866,2.298868,2.246761
193,843,adenocarcinoma,3.45325,4.562236,3.322978,4.270554,2.253526,2.150965,2.510634,2.596587,...,1.770813,2.367669,2.712675,1.825407,7.011039,4.301665,5.633862,2.070172,2.289038,2.180846


In [5]:
mapVars = ['type']

def binaryMap(x):
    return x.map({"adenocarcinoma" : 1, "normal" : 0})

dataset[mapVars] = dataset[mapVars].apply(binaryMap)

In [6]:
XVals = dataset.iloc[:,2:49389].values
print(XVals[:2])
YVals = dataset.iloc[:,1].values
print(YVals[:2])

[[4.03308298 3.99132789 3.52248674 ... 2.19190714 2.0894311  2.19667821]
 [3.11727335 4.04725753 3.04270444 ... 2.01357765 2.18919182 2.10655443]]
[0 0]


In [7]:
principalComponents = PCA(50).fit_transform(XVals)

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(principalComponents, YVals, test_size = 0.2, random_state = 1, shuffle = True)

In [9]:
classifier = SVC(kernel='rbf', C=1E10)
classifier.fit(X_train, np.ravel(Y_train))

SVC(C=10000000000.0)

In [10]:
Y_pred = classifier.predict(X_test)

In [11]:
cnf_matrix = confusion_matrix(Y_test, Y_pred)
print(cnf_matrix)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
precisison = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
print("Accuracy: " + str(accuracy))
print("precisison: " + str(precisison))
print("Recall: " + str(recall))

[[20  0]
 [ 0 19]]
Accuracy: 1.0
precisison: 1.0
Recall: 1.0


In [12]:
classifier = SVC(kernel='poly', C=1E9)
classifier.fit(X_train, np.ravel(Y_train))

SVC(C=1000000000.0, kernel='poly')

In [13]:
Y_pred = classifier.predict(X_test)

In [14]:
cnf_matrix = confusion_matrix(Y_test, Y_pred)
print(cnf_matrix)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
precisison = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
print("Accuracy: " + str(accuracy))
print("precisison: " + str(precisison))
print("Recall: " + str(recall))

[[20  0]
 [ 0 19]]
Accuracy: 1.0
precisison: 1.0
Recall: 1.0


In [15]:
classifier = SVC(kernel='linear', C=1E10)
classifier.fit(X_train, np.ravel(Y_train))

SVC(C=10000000000.0, kernel='linear')

In [16]:
Y_pred = classifier.predict(X_test)

In [17]:
cnf_matrix = confusion_matrix(Y_test, Y_pred)
print(cnf_matrix)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
precisison = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
print("Accuracy: " + str(accuracy))
print("precisison: " + str(precisison))
print("Recall: " + str(recall))

[[20  0]
 [ 0 19]]
Accuracy: 1.0
precisison: 1.0
Recall: 1.0


In [19]:
KFoldsClassifier = SVC(kernel='poly', C=1E9)

evalMetrics = ['accuracy', 'precision', 'recall']
K5Folds = KFold(n_splits=10, random_state=1, shuffle=True)
Scores = cross_validate(KFoldsClassifier, XVals, YVals, scoring=evalMetrics, cv=K5Folds, n_jobs=-1)

print(Scores)

{'fit_time': array([0.63210988, 0.56559825, 0.60060477, 0.59360409, 0.58710265,
       0.42757392, 0.63511109, 0.62960958, 0.48208356, 0.56409931]), 'score_time': array([0.01750422, 0.03650665, 0.0240047 , 0.02250504, 0.02700472,
       0.00750184, 0.00650096, 0.00750113, 0.0070014 , 0.02950406]), 'test_accuracy': array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.94736842, 1.        ]), 'test_precision': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), 'test_recall': array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 0.83333333, 1.        ])}
