# Intro to Machine Learning Final Project

### Bryce Readyhough 
### Robert Carter-Barkman

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
import statistics

Loading in the dataset and displaying the first few rows

In [2]:
dataset = pd.read_csv("Breast_GSE70947.csv")

In [3]:
dataset.shape

(289, 35983)

In [4]:
dataset.head()

Unnamed: 0,samples,type,NM_144987,NM_013290,ENST00000322831,NM_001625,lincRNA:chr7:226042-232442_R,NM_032391,ENST00000238571,XR_108906,...,lincRNA:chr4:77860976-77869926_F,NM_152343,NM_001005327,NM_001039355,lincRNA:chr21:44456656-44468556_R,lincRNA:chr9:4869500-4896050_F,NM_016053,NM_001080425,ENST00000555638,ENST00000508993
0,GSM1823702_252800417016_S01_GE1_107_Sep09_1_2,normal,8.693318,7.718016,6.044438,10.747077,9.133777,4.735581,5.634732,4.670231,...,7.570363,6.368684,4.784042,10.747723,5.0905,5.994149,10.649336,8.969439,4.985693,5.0905
1,GSM1823703_252800417016_S01_GE1_107_Sep09_2_1,normal,9.37598,7.072232,6.976741,10.429671,9.5265,5.221089,5.425187,4.860931,...,7.903335,5.713115,4.421074,11.2992,4.447052,4.421074,10.746854,8.174489,4.464177,4.536891
2,GSM1823704_252800416877_S01_GE1_107_Sep09_2_3,normal,8.943442,7.964573,6.269055,10.825025,9.396855,5.258506,5.824921,4.964604,...,7.705765,6.595364,4.41087,10.576807,5.003699,6.529257,10.430034,8.473468,4.668447,5.084127
3,GSM1823705_252800416894_S01_GE1_107_Sep09_1_1,normal,9.020798,7.824639,6.165165,11.646788,8.776462,4.648655,6.676692,4.770186,...,6.633058,5.786781,4.572984,11.17509,4.990888,6.669871,11.110395,8.880818,4.537626,4.648655
4,GSM1823706_252800416894_S01_GE1_107_Sep09_1_3,normal,8.806154,7.555348,6.230969,11.635247,8.911383,4.518054,6.520691,4.540453,...,6.211581,5.538635,4.613828,12.014365,4.979883,6.414621,10.909805,9.5265,4.67049,4.613828


In [5]:
mapVars = ['type']

def binaryMap(x):
    return x.map({"breast_adenocarcinoma" : 1, "normal" : 0})

dataset[mapVars] = dataset[mapVars].apply(binaryMap)

In [6]:
XVals = dataset.iloc[:,2:35984].values
print(XVals[:2])
YVals = dataset.iloc[:,1].values
print(YVals[:2])

[[8.69331779 7.71801596 6.04443769 ... 8.96943885 4.98569278 5.09049955]
 [9.37597965 7.07223164 6.97674149 ... 8.17448935 4.46417694 4.53689133]]
[0 0]


In [7]:
principalComponents = PCA(50).fit_transform(XVals)

In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(principalComponents, YVals, test_size = 0.2, random_state = 1, shuffle = True)

In [9]:
classifier = SVC(kernel='rbf', C=1E10)
classifier.fit(X_train, np.ravel(Y_train))

SVC(C=10000000000.0)

In [10]:
Y_pred = classifier.predict(X_test)

In [11]:
cnf_matrix = confusion_matrix(Y_test, Y_pred)
print(cnf_matrix)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
precisison = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
print("Accuracy: " + str(accuracy))
print("precisison: " + str(precisison))
print("Recall: " + str(recall))

[[23  3]
 [ 4 28]]
Accuracy: 0.8793103448275862
precisison: 0.9032258064516129
Recall: 0.875


In [12]:
classifier = SVC(kernel='poly', C=1E9)
classifier.fit(X_train, np.ravel(Y_train))

SVC(C=1000000000.0, kernel='poly')

In [13]:
Y_pred = classifier.predict(X_test)

In [14]:
cnf_matrix = confusion_matrix(Y_test, Y_pred)
print(cnf_matrix)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
precisison = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
print("Accuracy: " + str(accuracy))
print("precisison: " + str(precisison))
print("Recall: " + str(recall))

[[20  6]
 [ 4 28]]
Accuracy: 0.8275862068965517
precisison: 0.8235294117647058
Recall: 0.875


In [15]:
classifier = SVC(kernel='linear', C=1E10)
classifier.fit(X_train, np.ravel(Y_train))

SVC(C=10000000000.0, kernel='linear')

In [16]:
Y_pred = classifier.predict(X_test)

In [17]:
cnf_matrix = confusion_matrix(Y_test, Y_pred)
print(cnf_matrix)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
precisison = metrics.precision_score(Y_test, Y_pred)
recall = metrics.recall_score(Y_test, Y_pred)
print("Accuracy: " + str(accuracy))
print("precisison: " + str(precisison))
print("Recall: " + str(recall))

[[21  5]
 [ 3 29]]
Accuracy: 0.8620689655172413
precisison: 0.8529411764705882
Recall: 0.90625


In [18]:
KFoldsClassifier = SVC(kernel='poly', C=1E9)

evalMetrics = ['accuracy', 'precision', 'recall']
K5Folds = KFold(n_splits=10, random_state=1, shuffle=True)
Scores = cross_validate(KFoldsClassifier, XVals, YVals, scoring=evalMetrics, cv=K5Folds, n_jobs=-1)

print(Scores)

{'fit_time': array([1.68629932, 2.21788859, 1.99985147, 1.89033461, 2.00485182,
       2.1028676 , 1.7783134 , 2.10086942, 2.09686995, 2.12087369]), 'score_time': array([0.12501907, 0.06301117, 0.18203092, 0.13952088, 0.18053055,
       0.14002562, 0.13752747, 0.130023  , 0.1400249 , 0.11752057]), 'test_accuracy': array([0.89655172, 0.86206897, 0.96551724, 0.89655172, 0.93103448,
       0.93103448, 0.86206897, 0.96551724, 0.89655172, 0.92857143]), 'test_precision': array([0.8       , 1.        , 1.        , 1.        , 0.90909091,
       1.        , 1.        , 1.        , 0.83333333, 1.        ]), 'test_recall': array([1.        , 0.8       , 0.92857143, 0.8125    , 0.90909091,
       0.88235294, 0.75      , 0.91666667, 0.90909091, 0.85714286])}


In [19]:
sorted(Scores)

['fit_time', 'score_time', 'test_accuracy', 'test_precision', 'test_recall']

In [20]:
import statistics

KFoldsAccuracy = statistics.mean(Scores['test_accuracy'])
KFoldsPrecision = statistics.mean(Scores['test_precision'])
KFoldsRecall = statistics.mean(Scores['test_recall'])

print("Accuracy:",KFoldsAccuracy) 
print("Precision:",KFoldsPrecision) 
print("Recall:",KFoldsRecall)

Accuracy: 0.9135467980295566
Precision: 0.9542424242424242
Recall: 0.8765415711739241
