# Imports

In [1]:
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
%config InlineBackend.figure_format = 'retina'
from scipy.spatial import distance
from scipy import stats 
import statistics

In [2]:
#Initialize lists for results section
SVM_8020_testing = []
SVM_5050_testing = []
SVM_2080_testing = []

DT_8020_testing = []
DT_5050_testing = []
DT_2080_testing = []

KNN_8020_testing = []
KNN_5050_testing = []
KNN_2080_testing = []

SVM_8020_training = []
SVM_5050_training = []
SVM_2080_training = []

DT_8020_training = []
DT_5050_training = []
DT_2080_training = []

KNN_8020_training = []
KNN_5050_training = []
KNN_2080_training = []

SVM_8020_validation = []
SVM_5050_validation = []
SVM_2080_validation = []

DT_8020_validation = []
DT_5050_validation = []
DT_2080_validation = []

KNN_8020_validation = []
KNN_5050_validation = []
KNN_2080_validation = []

# Breast Cancer Dataset

In [3]:
# Load data
df_breast_cancer = pd.read_table('breast-cancer-wisconsin.data',',',header=None)
df_breast_cancer.columns = ['ID','Clump Thickness','Unif. of Cell Size','Unif. of Cell Shape','Marginal Adhesion',\
                            'Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Diagnosis']
# Clean data
df_breast_cancer.replace('?', np.nan, inplace=True)
# drop rows with missing data
df_breast_cancer.dropna(inplace=True)
# Change labels from 2s and 4s to 1s and 0s
df_breast_cancer['Diagnosis'].replace(2,0,inplace=True)
df_breast_cancer['Diagnosis'].replace(4,1,inplace=True)

X_and_Y = df_breast_cancer.values # Load data from file.
X_and_Y = X_and_Y.astype('int')
np.random.shuffle(X_and_Y)   # Shuffle the data.
X = X_and_Y[:, 0:-1]         # First column to second last column: Features (numerical values) 
Y = X_and_Y[:, -1]           # Last column: Labels (0 or 1)
print(X.shape, Y.shape)      # Check the shapes.

(683, 10) (683,)


**TRIAL 1**

**80/20 Training and Testing Split**

In [4]:
# 2) SPLIT DATA 80/20
X_train_val = X[:int(0.8*len(X))] 
X_test      = X[int(0.8*len(X)):]    
Y_train_val = Y[:int(0.8*len(Y))] 
Y_test      = Y[int(0.8*len(Y)):] 
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(546, 10) (137, 10) (546,) (137,)


**SVM with RBF Kernel**

In [5]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
Validation Accuracy: 0.6575091575091575




In [6]:
#Report Training Accuracy
print("Training Accuracy:", SVM_8020_training_acc)

Training Accuracy: 1.0


In [7]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 1,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
SVM_8020_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

Test accuracy: 0.6423357664233577


**Decision Tree**

In [8]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 4}
Validation Accuracy: 0.9578754578754579




In [9]:

#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
DT_8020_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

Test accuracy: 0.927007299270073


**K Nearest Neighbors**

In [10]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 2}
Validation Accuracy: 0.6428571428571429




In [11]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=2, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
KNN_8020_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6642335766423357


**50/50 Training and Testing Split**

In [12]:
# 2) SPLIT DATA 50/50
X_train_val = X[:int(0.5*len(X))] # Get features from train + val set.
X_test      = X[int(0.5*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.5*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.5*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(341, 10) (342, 10) (341,) (342,)


**SVM with RBF Kernel**

In [13]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score'])  
SVM_5050_training.append(SVM_5050_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 1, 'gamma': 1e-07, 'kernel': 'rbf'}
Validation Accuracy: 0.656891495601173




In [14]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 1,kernel='rbf', gamma=.0000001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Testing Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.6023391812865497


**Decision Tree**

In [15]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(SVM_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 3}
Validation Accuracy: 0.9560117302052786




In [16]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
DT_5050_testing.append(test_acc_DT_1)

#Report Testing Accuracy 
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9649122807017544


**K Nearest Neighbors**

In [17]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 6}
Validation Accuracy: 0.6686217008797654




In [18]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=6, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report testing accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6023391812865497


**20/80 Training and Testing Split**

In [19]:
# 2) SPLIT DATA 20/80
X_train_val = X[:int(0.2*len(X))] # Get features from train + val set.
X_test      = X[int(0.2*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.2*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.2*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(136, 10) (547, 10) (136,) (547,)


**SVM with RBF Kernel**

In [20]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 0.1, 'gamma': 1e-07, 'kernel': 'rbf'}
Validation Accuracy: 0.6544117647058824




In [21]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 0.1,kernel='rbf', gamma=.0000001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

# Report Testing Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.6489945155393053


**Decision Tree**

In [22]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 3}
Validation Accuracy: 0.8602941176470589




In [23]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

# Report Testing Accuracy
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9506398537477148


**K Nearest Neighbors**

In [24]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)

print("Validation Accuracy:", validation_acc_KNN_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameter: {'n_neighbors': 4}
Validation Accuracy: 0.6911764705882353
Validation Accuracy: 0.6911764705882353




In [25]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=4, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

#Report Testing Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6380255941499086


**TRIAL 2**

In [26]:
#Shuffle data again
X_and_Y = df_breast_cancer.values # Load data from file.
X_and_Y = X_and_Y.astype('int')
np.random.shuffle(X_and_Y)   # Shuffle the data.
X = X_and_Y[:, 0:-1]         # First column to second last column: Features (numerical values) 
Y = X_and_Y[:, -1]           # Last column: Labels (0 or 1)

In [27]:
# 2) SPLIT DATA 80/20
X_train_val = X[:int(0.8*len(X))] 
X_test      = X[int(0.8*len(X)):]    
Y_train_val = Y[:int(0.8*len(Y))] 
Y_test      = Y[int(0.8*len(Y)):] 
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(546, 10) (137, 10) (546,) (137,)


**SVM with RBF Kernel**

In [28]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 1, 'gamma': 1e-06, 'kernel': 'rbf'}
Validation Accuracy: 0.6703296703296703




In [29]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 1,kernel='rbf', gamma=.000001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of 80/20 accuracy
SVM_8020_testing.append(test_acc_SVM_1)

# Report Test Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.5985401459854015


**Decision Tree**

In [30]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)


#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 5}
Validation Accuracy: 0.9487179487179487




In [31]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of 80/20 accuracy
DT_8020_testing.append(test_acc_DT_1)

#Report Testing Accuracy
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9635036496350365


**K Nearest Neighbors**

In [32]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 2}
Validation Accuracy: 0.663003663003663




In [33]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=2, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of 80/20 accuracy
KNN_8020_testing.append(test_acc_KNN_1)

#Report Testing Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6496350364963503


In [34]:
#50/50 Training and Testing Split
X_train_val = X[:int(0.5*len(X))] 
X_test      = X[int(0.5*len(X)):]
Y_train_val = Y[:int(0.5*len(Y))] 
Y_test      = Y[int(0.5*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(341, 10) (342, 10) (341,) (342,)


**SVM with RBF Kernel**

In [35]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
Validation Accuracy: 0.6862170087976539




In [36]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 10,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Testing Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.6198830409356725


**Decision Tree**

In [37]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_5050_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 4}
Validation Accuracy: 0.9530791788856305




In [38]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
DT_5050_testing.append(test_acc_DT_1)

# Report testing accuracy
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9444444444444444


**K Nearest Neighbors**

In [39]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 4}
Validation Accuracy: 0.6950146627565983




In [40]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=4, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report Testing Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6257309941520468


In [41]:
#20/80 Training and Testing Split
X_train_val = X[:int(0.2*len(X))] 
X_test      = X[int(0.2*len(X)):] 
Y_train_val = Y[:int(0.2*len(Y))] 
Y_test      = Y[int(0.2*len(Y)):] 
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(136, 10) (547, 10) (136,) (547,)


**SVM with RBF Kernel**

In [42]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 0.1, 'gamma': 1e-07, 'kernel': 'rbf'}
Validation Accuracy: 0.6617647058823529




In [43]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 0.1,kernel='rbf', gamma=.0000001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

#Report Testing Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.6471663619744058


**Decision Tree**

In [44]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 3}
Validation Accuracy: 0.9191176470588235




In [45]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Testing Accuracy
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9542961608775137


**K Nearest Neighbors**

In [46]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 2}
Validation Accuracy: 0.6691176470588235




In [47]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=2, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

#Report Testing Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6307129798903108


**TRIAL 3**

In [48]:
#Shuffle Data and Split data for third trial
X_and_Y = df_breast_cancer.values 
X_and_Y = X_and_Y.astype('int')
np.random.shuffle(X_and_Y)   
X = X_and_Y[:, 0:-1]          
Y = X_and_Y[:, -1]   

In [49]:
#80/20 Training and Testing Split 
X_train_val = X[:int(0.8*len(X))] 
X_test      = X[int(0.8*len(X)):]    
Y_train_val = Y[:int(0.8*len(Y))] 
Y_test      = Y[int(0.8*len(Y)):] 
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(546, 10) (137, 10) (546,) (137,)


**SVM with RBF Kernel**

In [50]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 0.1, 'gamma': 1e-07, 'kernel': 'rbf'}
Validation Accuracy: 0.6611721611721612




In [51]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 0.1,kernel='rbf', gamma=.0000001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

SVM_8020_testing.append(test_acc_SVM_1)

# Report Testing Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.6058394160583942


**Decision Tree**

In [52]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 3}
Validation Accuracy: 0.945054945054945




In [53]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of 80/20 accuracy
DT_8020_testing.append(test_acc_DT_1)

# Report Testing Accuracy
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9343065693430657


**K Nearest Neighbors**

In [54]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 6}
Validation Accuracy: 0.6538461538461539




In [55]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=6, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of 80/20 accuracy
KNN_8020_testing.append(test_acc_KNN_1)

# Report Testing Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.583941605839416


In [56]:
#50/50 Training and Testing Split
X_train_val = X[:int(0.5*len(X))] 
X_test      = X[int(0.5*len(X)):]
Y_train_val = Y[:int(0.5*len(Y))] 
Y_test      = Y[int(0.5*len(Y)):]
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(341, 10) (342, 10) (341,) (342,)


**SVM with RBF Kernel**

In [57]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 0.1, 'gamma': 1e-07, 'kernel': 'rbf'}
Validation Accuracy: 0.6598240469208211




In [58]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 0.1,kernel='rbf', gamma=.0000001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Testing Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.6403508771929824


**Decision Tree**

In [59]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_5050_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 3}
Validation Accuracy: 0.9589442815249267




In [60]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
DT_5050_testing.append(test_acc_DT_1)

# Report Testing Accuracy
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9298245614035088


**K Nearest Neighbors**

In [61]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 4}
Validation Accuracy: 0.6598240469208211




In [62]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=4, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

#Report Testing Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6432748538011696


In [63]:
#20/80 Training and Testing Split
X_train_val = X[:int(0.2*len(X))] 
X_test      = X[int(0.2*len(X)):] 
Y_train_val = Y[:int(0.2*len(Y))] 
Y_test      = Y[int(0.2*len(Y)):] 
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

(136, 10) (547, 10) (136,) (547,)


**SVM with RBF Kernel**

In [64]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
Validation Accuracy: 0.7352941176470589




In [65]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 1,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

# Report Testing Accuracy
print('Test Accuracy:',test_acc_SVM_1)

Test Accuracy: 0.6288848263254113


**Decision Tree**

In [66]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")

#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'max_depth': 5}
Validation Accuracy: 0.9191176470588235




In [67]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Testing Accuracy
print("Test Accuracy:", test_acc_DT_1)

Test Accuracy: 0.9360146252285192


**K Nearest Neighbors**

In [68]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

Best Parameter: {'n_neighbors': 6}
Validation Accuracy: 0.6838235294117647




In [69]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=6, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

# Report Testing Accuracy
print("Test Accuracy:", test_acc_KNN_1)

Test Accuracy: 0.6544789762340036


** Accuracies Over Partitions: Breast Cancer Dataset **

**SVM Test, Training, and Validation Accuracies **

In [70]:
# Report SVM test accuracies by partition
print("SVM TEST ACCURACIES")
SVM_Testing_Accuracy_8020_Breast_Cancer = statistics.mean(SVM_8020_testing)
print("SVM 80/20 Testing Accuracy:", SVM_Testing_Accuracy_8020_Breast_Cancer)

SVM_Testing_Accuracy_5050_Breast_Cancer = statistics.mean(SVM_5050_testing)
print("SVM 50/50 Testing Accuracy:", SVM_Testing_Accuracy_5050_Breast_Cancer)

SVM_Testing_Accuracy_2080_Breast_Cancer = statistics.mean(SVM_2080_testing)
print("SVM 20/80 Testing Accuracy:", SVM_Testing_Accuracy_2080_Breast_Cancer)

print("SVM VALIDATION ACCURACIES")
SVM_Validation_Accuracy_8020_Breast_Cancer = statistics.mean(SVM_8020_validation)
print("SVM 80/20 Validation Accuracy:", SVM_Validation_Accuracy_8020_Breast_Cancer)

SVM_Validation_Accuracy_5050_Breast_Cancer = statistics.mean(SVM_5050_validation)
print("SVM 50/50 Validation Accuracy:", SVM_Validation_Accuracy_5050_Breast_Cancer)

SVM_Validation_Accuracy_2080_Breast_Cancer = statistics.mean(SVM_2080_validation)
print("SVM 20/80 Validation Accuracy:", SVM_Validation_Accuracy_2080_Breast_Cancer)

print("SVM TRAINING ACCURACIES")
SVM_Training_Accuracy_8020_Breast_Cancer = statistics.mean(SVM_8020_training)
print("SVM 80/20 Training Accuracy:", SVM_Training_Accuracy_8020_Breast_Cancer)

SVM_Training_Accuracy_5050_Breast_Cancer = statistics.mean(SVM_5050_training)
print("SVM 50/50 Training Accuracy:", SVM_Training_Accuracy_5050_Breast_Cancer)

SVM_Training_Accuracy_2080_Breast_Cancer = statistics.mean(SVM_2080_training)
print("SVM 20/80 Training Accuracy:", SVM_Training_Accuracy_2080_Breast_Cancer)

SVM TEST ACCURACIES
SVM 80/20 Testing Accuracy: 0.6155717761557178
SVM 50/50 Testing Accuracy: 0.6208576998050682
SVM 20/80 Testing Accuracy: 0.6416819012797075
SVM VALIDATION ACCURACIES
SVM 80/20 Validation Accuracy: 0.663003663003663
SVM 50/50 Validation Accuracy: 0.667644183773216
SVM 20/80 Validation Accuracy: 0.6838235294117647
SVM TRAINING ACCURACIES
SVM 80/20 Training Accuracy: 1.0
SVM 50/50 Training Accuracy: 1.0
SVM 20/80 Training Accuracy: 1.0


**DT Test, Training, and Validation Accuracies **

In [71]:
#Report Decision Tree accuracies by partition
print("DECISION TREE TESTING ACCURACIES")
DT_Testing_Accuracy_8020_Breast_Cancer = statistics.mean(DT_8020_testing)
print("DT 80/20 Testing Accuracy:", DT_Testing_Accuracy_8020_Breast_Cancer)

DT_Testing_Accuracy_5050_Breast_Cancer = statistics.mean(DT_5050_testing)
print("DT 50/50 Testing Accuracy:", DT_Testing_Accuracy_5050_Breast_Cancer)

DT_Testing_Accuracy_2080_Breast_Cancer = statistics.mean(DT_2080_testing)
print("DT 20/80 Testing Accuracy:", DT_Testing_Accuracy_2080_Breast_Cancer)

print("DECISION TREE VALIDATION ACCURACIES")
DT_Validation_Accuracy_8020_Breast_Cancer = statistics.mean(DT_8020_validation)
print("DT 80/20 Validation Accuracy:", DT_Validation_Accuracy_8020_Breast_Cancer)

DT_Validation_Accuracy_5050_Breast_Cancer = statistics.mean(DT_5050_validation)
print("DT 50/50 Validation Accuracy:", DT_Validation_Accuracy_5050_Breast_Cancer)

DT_Validation_Accuracy_2080_Breast_Cancer = statistics.mean(DT_2080_validation)
print("DT 20/80 Validation Accuracy:", DT_Validation_Accuracy_2080_Breast_Cancer)

print("DECISION TREE TRAINING ACCURACIES")
DT_Training_Accuracy_8020_Breast_Cancer = statistics.mean(DT_8020_training)
print("DT 80/20 Training Accuracy:", DT_Training_Accuracy_8020_Breast_Cancer)

DT_Training_Accuracy_5050_Breast_Cancer = statistics.mean(DT_5050_training)
print("DT 50/50 Training Accuracy:", DT_Training_Accuracy_5050_Breast_Cancer)

DT_Training_Accuracy_2080_Breast_Cancer = statistics.mean(DT_2080_training)
print("DT 20/80 Training Accuracy:", DT_Training_Accuracy_2080_Breast_Cancer)


DECISION TREE TESTING ACCURACIES
DT 80/20 Testing Accuracy: 0.9416058394160584
DT 50/50 Testing Accuracy: 0.9463937621832359
DT 20/80 Testing Accuracy: 0.9469835466179158
DECISION TREE VALIDATION ACCURACIES
DT 80/20 Validation Accuracy: 0.9523809523809523
DT 50/50 Validation Accuracy: 0.9560117302052786
DT 20/80 Validation Accuracy: 0.8995098039215687
DECISION TREE TRAINING ACCURACIES
DT 80/20 Training Accuracy: 0.9890084386348397
DT 50/50 Training Accuracy: 0.9912044550926999
DT 20/80 Training Accuracy: 1.0


**KNN Test, Training, and Validation Accuracies **

In [72]:
#Report KNN accuracies by partition
print("KNN TREE TESTING ACCURACIES")
KNN_Testing_Accuracy_8020_Breast_Cancer = statistics.mean(KNN_8020_testing)
print("KNN 80/20 Testing Accuracy:", KNN_Testing_Accuracy_8020_Breast_Cancer)

KNN_Testing_Accuracy_5050_Breast_Cancer = statistics.mean(KNN_5050_testing)
print("KNN 50/50 Testing Accuracy:", KNN_Testing_Accuracy_5050_Breast_Cancer)

KNN_Testing_Accuracy_2080_Breast_Cancer = statistics.mean(KNN_2080_testing)
print("KNN 20/80 Testing Accuracy:", KNN_Testing_Accuracy_2080_Breast_Cancer)

print("KNN TREE VALIDATION ACCURACIES")
KNN_Accuracy_Validation_8020_Breast_Cancer = statistics.mean(KNN_8020_validation)
print("KNN 80/20 Validation Accuracy:", KNN_Accuracy_Validation_8020_Breast_Cancer)

KNN_Accuracy_Validation_5050_Breast_Cancer = statistics.mean(KNN_5050_validation)
print("KNN 50/50 Validation Accuracy:", KNN_Accuracy_Validation_5050_Breast_Cancer)

KNN_Accuracy_Validation_2080_Breast_Cancer = statistics.mean(KNN_2080_validation)
print("KNN 20/80 Validation Accuracy:", KNN_Accuracy_Validation_2080_Breast_Cancer)

print("KNN TREE TRAINING ACCURACIES")
KNN_Accuracy_Training_8020_Breast_Cancer = statistics.mean(KNN_8020_training)
print("KNN 80/20 Training Accuracy:", KNN_Accuracy_Training_8020_Breast_Cancer)

KNN_Accuracy_Training_5050_Breast_Cancer = statistics.mean(KNN_5050_training)
print("KNN 50/50 Training Accuracy:", KNN_Accuracy_Training_5050_Breast_Cancer)

KNN_Accuracy_Training_2080_Breast_Cancer = statistics.mean(KNN_2080_training)
print("KNN 20/80 Training Accuracy:", KNN_Accuracy_Training_2080_Breast_Cancer)

KNN TREE TESTING ACCURACIES
KNN 80/20 Testing Accuracy: 0.6326034063260341
KNN 50/50 Testing Accuracy: 0.6237816764132553
KNN 20/80 Testing Accuracy: 0.6410725167580743
KNN TREE VALIDATION ACCURACIES
KNN 80/20 Validation Accuracy: 0.6532356532356532
KNN 50/50 Validation Accuracy: 0.6744868035190615
KNN 20/80 Validation Accuracy: 0.6813725490196079
KNN TREE TRAINING ACCURACIES
KNN 80/20 Training Accuracy: 1.0
KNN 50/50 Training Accuracy: 1.0
KNN 20/80 Training Accuracy: 1.0


# Heart Disease Dataset

In [None]:
#Initialize lists for results section
SVM_8020_testing = []
SVM_5050_testing = []
SVM_2080_testing = []

DT_8020_testing = []
DT_5050_testing = []
DT_2080_testing = []

KNN_8020_testing = []
KNN_5050_testing = []
KNN_2080_testing = []

SVM_8020_training = []
SVM_5050_training = []
SVM_2080_training = []

DT_8020_training = []
DT_5050_training = []
DT_2080_training = []

KNN_8020_training = []
KNN_5050_training = []
KNN_2080_training = []

SVM_8020_validation = []
SVM_5050_validation = []
SVM_2080_validation = []

DT_8020_validation = []
DT_5050_validation = []
DT_2080_validation = []

KNN_8020_validation = []
KNN_5050_validation = []
KNN_2080_validation = []

In [None]:
# 1) Load data.
df_heart_disease = pd.read_table('processed.cleveland.data',',',header=None)
df_heart_disease.columns = ['Age','Sex','Chest pain','Resting Blood Pressure','Cholesterol','Fasting Blood Sugar','Rest ECG', \
                           'Thalac','Exang','Oldpeak','Slope','Ca','Thal','Diagnosis']
#Drop rows with missing data
df_heart_disease.replace('?', np.nan, inplace=True)
df_heart_disease.dropna(inplace=True)
#Replace values 1,2,3,4 in diagnosis to 1 for presence of heart disease
df_heart_disease['Diagnosis'].replace([1,2,3,4],1,inplace=True)
X_and_Y = df_heart_disease.values # Load data from file.
X_and_Y = X_and_Y.astype('float')
np.random.shuffle(X_and_Y)   # Shuffle the data.
X = X_and_Y[:, 0:-1]         # First column to second last column: Features (numerical values) 
Y = X_and_Y[:, -1]           # Last column: Labels (0 or 1)
print(X.shape, Y.shape)      # Check the shapes.

**TRIAL 1**

In [None]:
# 2) SPLIT DATA 80/20
X_train_val = X[:int(0.8*len(X))]
X[:int(0.8*len(X))] # Get features from train + val set.
X_test      = X[int(0.8*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.8*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.8*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM with RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
SVM_8020_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
DT_8020_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=5, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
KNN_8020_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 50/50
X_train_val = X[:int(0.5*len(X))] # Get features from train + val set.
X_test      = X[int(0.5*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.5*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.5*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_5050_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_5050_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
DT_5050_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)


In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=6, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 20/80
X_train_val = X[:int(0.2*len(X))] # Get features from train + val set.
X_test      = X[int(0.2*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.2*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.2*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_2080_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=1)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=3, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

**TRIAL 2**

In [None]:
#Shuffle Data and Split data for third trial
X_and_Y = df_heart_disease.values 
X_and_Y = X_and_Y.astype('float')
np.random.shuffle(X_and_Y)   
X = X_and_Y[:, 0:-1]          
Y = X_and_Y[:, -1]

In [None]:
#80/20 Training and Testing Split
X_train_val = X[:int(0.8*len(X))] # Get features from train + val set.
X_test      = X[int(0.8*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.8*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.8*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_8020_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 1,kernel='rbf', gamma=.00001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
SVM_8020_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)


In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
DT_8020_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=4, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
KNN_8020_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
#50/50 Training and Testing Split
X_train_val = X[:int(0.5*len(X))] # Get features from train + val set.
X_test      = X[int(0.5*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.5*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.5*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_5050_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.00001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_5050_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=6, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
#20/80 Training and Testing Split
X_train_val = X[:int(0.2*len(X))] # Get features from train + val set.
X_test      = X[int(0.2*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.2*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.2*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_2080_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 10,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=2, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

**TRIAL 3**

In [None]:
#Shuffle Data and Split data for third trial
X_and_Y = df_heart_disease.values 
X_and_Y = X_and_Y.astype('float')
np.random.shuffle(X_and_Y)   
X = X_and_Y[:, 0:-1]          
Y = X_and_Y[:, -1]

In [None]:
#80/20 Training and Testing Split
X_train_val = X[:int(0.8*len(X))] # Get features from train + val set.
X_test      = X[int(0.8*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.8*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.8*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_8020_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
SVM_8020_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
DT_8020_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=5, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
KNN_8020_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
#50/50 Training and Testing Split
X_train_val = X[:int(0.5*len(X))] # Get features from train + val set.
X_test      = X[int(0.5*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.5*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.5*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_5050_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_5050_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)


#Print training accuracy 
print("Training Accuracy:", )

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=3, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
#20/80 Training and Testing Split
X_train_val = X[:int(0.2*len(X))] # Get features from train + val set.
X_test      = X[int(0.2*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.2*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.2*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_2080_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 1,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)


In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=5, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

** Accuracies Over Partitions: Heart Disease Dataset **

**SVM Test, Training, and Validation Accuracies **

In [None]:
# Report SVM test accuracies by partition
print("SVM TEST ACCURACIES")
SVM_Testing_Accuracy_8020_heart_disease = statistics.mean(SVM_8020_testing)
print("SVM 80/20 Testing Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

SVM_Testing_Accuracy_5050_heart_disease  = statistics.mean(SVM_5050_testing)
print("SVM 50/50 Testing Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

SVM_Testing_Accuracy_2080_heart_disease  = statistics.mean(SVM_2080_testing)
print("SVM 20/80 Testing Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

print("SVM VALIDATION ACCURACIES")
SVM_Validation_Accuracy_8020_heart_disease  = statistics.mean(SVM_8020_validation)
print("SVM 80/20 Validation Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

SVM_Validation_Accuracy_5050_heart_disease  = statistics.mean(SVM_5050_validation)
print("SVM 50/50 Validation Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

SVM_Validation_Accuracy_2080_heart_disease  = statistics.mean(SVM_2080_validation)
print("SVM 20/80 Validation Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

print("SVM TRAINING ACCURACIES")
SVM_Training_Accuracy_8020_heart_disease  = statistics.mean(SVM_8020_training)
print("SVM 80/20 Training Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

SVM_Training_Accuracy_5050_heart_disease  = statistics.mean(SVM_5050_training)
print("SVM 50/50 Training Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

SVM_Training_Accuracy_2080_heart_disease  = statistics.mean(SVM_2080_training)
print("SVM 20/80 Training Accuracy:", SVM_Testing_Accuracy_8020_heart_disease)

**DT Test, Training, and Validation Accuracies **

In [None]:
#Report Decision Tree accuracies by partition
print("DECISION TREE TESTING ACCURACIES")
DT_Testing_Accuracy_8020_heart_disease = statistics.mean(DT_8020_testing)
print("DT 80/20 Testing Accuracy:", DT_Testing_Accuracy_8020_heart_disease)

DT_Testing_Accuracy_5050_heart_disease = statistics.mean(DT_5050_testing)
print("DT 50/50 Testing Accuracy:", DT_Testing_Accuracy_5050_heart_disease)

DT_Testing_Accuracy_2080_heart_disease = statistics.mean(DT_2080_testing)
print("DT 20/80 Testing Accuracy:", DT_Testing_Accuracy_2080_heart_disease)

print("DECISION TREE VALIDATION ACCURACIES")
DT_Validation_Accuracy_8020_heart_disease = statistics.mean(DT_8020_validation)
print("DT 80/20 Validation Accuracy:", DT_Validation_Accuracy_8020_heart_disease)

DT_Validation_Accuracy_5050_heart_disease = statistics.mean(DT_5050_validation)
print("DT 50/50 Validation Accuracy:", DT_Validation_Accuracy_5050_heart_disease)

DT_Validation_Accuracy_2080_heart_disease = statistics.mean(DT_2080_validation)
print("DT 20/80 Validation Accuracy:", DT_Validation_Accuracy_2080_heart_disease)

print("DECISION TREE TRAINING ACCURACIES")
DT_Training_Accuracy_8020_heart_disease = statistics.mean(DT_8020_training)
print("DT 80/20 Training Accuracy:", DT_Training_Accuracy_8020_heart_disease)

DT_Training_Accuracy_5050_heart_disease = statistics.mean(DT_5050_training)
print("DT 50/50 Training Accuracy:", DT_Training_Accuracy_5050_heart_disease)

DT_Training_Accuracy_2080_heart_disease = statistics.mean(DT_2080_training)
print("DT 20/80 Training Accuracy:", DT_Training_Accuracy_2080_heart_disease)


**KNN Test, Training, and Validation Accuracies **

In [None]:
#Report KNN accuracies by partition
print("KNN TESTING ACCURACIES")
KNN_Testing_Accuracy_8020_heart_disease = statistics.mean(KNN_8020_testing)
print("KNN 80/20 Testing Accuracy:", KNN_Testing_Accuracy_8020_heart_disease)

KNN_Testing_Accuracy_5050_heart_disease = statistics.mean(KNN_5050_testing)
print("KNN 50/50 Testing Accuracy:", KNN_Testing_Accuracy_5050_heart_disease)

KNN_Testing_Accuracy_2080_heart_disease = statistics.mean(KNN_2080_testing)
print("KNN 20/80 Testing Accuracy:", KNN_Testing_Accuracy_2080_heart_disease)

print("KNN VALIDATION ACCURACIES")
KNN_Accuracy_Validation_8020_heart_disease = statistics.mean(KNN_8020_validation)
print("KNN 80/20 Validation Accuracy:", KNN_Accuracy_Validation_8020_heart_disease)

KNN_Accuracy_Validation_5050_heart_disease = statistics.mean(KNN_5050_validation)
print("KNN 50/50 Validation Accuracy:", KNN_Accuracy_Validation_5050_heart_disease)

KNN_Accuracy_Validation_2080_heart_disease = statistics.mean(KNN_2080_validation)
print("KNN 20/80 Validation Accuracy:", KNN_Accuracy_Validation_2080_heart_disease)

print("KNN TRAINING ACCURACIES")
KNN_Accuracy_Training_8020_heart_disease = statistics.mean(KNN_8020_training)
print("KNN 80/20 Training Accuracy:", KNN_Accuracy_Training_8020_heart_disease)

KNN_Accuracy_Training_5050_heart_disease = statistics.mean(KNN_5050_training)
print("KNN 50/50 Training Accuracy:", KNN_Accuracy_Training_5050_heart_disease)

KNN_Accuracy_Training_2080_heart_disease = statistics.mean(KNN_2080_training)
print("KNN 20/80 Training Accuracy:", KNN_Accuracy_Training_2080_heart_disease)

## Wine Quality Dataset

In [None]:
#Load in data
df_wine_quality = pd.read_csv('winequality.csv',';',header=None)
df_wine_quality.columns = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide', \
                          'total sulfur dioxide','density','ph','sulphates','alcohol','quality']
#Drop the first row
df_wine_quality = df_wine_quality.iloc[1:]

#Adjust labels to fit biary classification
df_wine_quality['quality'] = pd.to_numeric(df_wine_quality['quality'],downcast='integer')
df_wine_quality['quality'].replace([0,1,2,3,4,5],0,inplace=True)
df_wine_quality['quality'].replace([6,7,8,9,10],1,inplace=True)

# Load Data
X_and_Y = df_wine_quality.values 
X_and_Y = X_and_Y.astype('float')
np.random.shuffle(X_and_Y)   # Shuffle the data.
X = X_and_Y[:, 0:-1]         # First column to second last column: Features (numerical values) 
Y = X_and_Y[:, -1]           # Last column: Labels (0 or 1)
print(X.shape, Y.shape)      # Check the shapes.

In [None]:
#Initialize lists for results section
SVM_8020_testing = []
SVM_5050_testing = []
SVM_2080_testing = []

DT_8020_testing = []
DT_5050_testing = []
DT_2080_testing = []

KNN_8020_testing = []
KNN_5050_testing = []
KNN_2080_testing = []

SVM_8020_training = []
SVM_5050_training = []
SVM_2080_training = []

DT_8020_training = []
DT_5050_training = []
DT_2080_training = []

KNN_8020_training = []
KNN_5050_training = []
KNN_2080_training = []

SVM_8020_validation = []
SVM_5050_validation = []
SVM_2080_validation = []

DT_8020_validation = []
DT_5050_validation = []
DT_2080_validation = []

KNN_8020_validation = []
KNN_5050_validation = []
KNN_2080_validation = []

**TRIAL 1**

In [None]:
# 2) SPLIT DATA 80/20
X_train_val = X[:int(0.8*len(X))] # Get features from train + val set.
X_test      = X[int(0.8*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.8*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.8*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM with RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_8020_training_acc)

#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
SVM_8020_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
DT_8020_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K-Nearest Neighbors (KNN)**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=1, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
KNN_8020_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 50/50
X_train_val = X[:int(0.5*len(X))] # Get features from train + val set.
X_test      = X[int(0.5*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.5*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.5*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_5050_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

#Print training accuracy 
print("Training Accuracy:", )

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_5050_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=1, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 20/80
X_train_val = X[:int(0.2*len(X))] # Get features from train + val set.
X_test      = X[int(0.2*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.2*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.2*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_2080_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=6, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

** TRIAL 2 **

In [None]:
# Load Data
X_and_Y = df_wine_quality.values 
X_and_Y = X_and_Y.astype('float')
np.random.shuffle(X_and_Y)   # Shuffle the data.
X = X_and_Y[:, 0:-1]         # First column to second last column: Features (numerical values) 
Y = X_and_Y[:, -1]           # Last column: Labels (0 or 1)

In [None]:
# 2) SPLIT DATA 80/20
X_train_val = X[:int(0.8*len(X))] # Get features from train + val set.
X_test      = X[int(0.8*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.8*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.8*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_8020_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
SVM_8020_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
DT_8020_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=1, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
KNN_8020_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 50/50
X_train_val = X[:int(0.5*len(X))] # Get features from train + val set.
X_test      = X[int(0.5*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.5*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.5*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_5050_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_5050_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=1, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 20/80
X_train_val = X[:int(0.2*len(X))] # Get features from train + val set.
X_test      = X[int(0.2*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.2*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.2*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_2080_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=3, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

** TRIAL 3 **

In [None]:
# Load Data
X_and_Y = df_wine_quality.values 
X_and_Y = X_and_Y.astype('float')
np.random.shuffle(X_and_Y)   # Shuffle the data.
X = X_and_Y[:, 0:-1]         # First column to second last column: Features (numerical values) 
Y = X_and_Y[:, -1]           # Last column: Labels (0 or 1)
print(X.shape, Y.shape)      # Check the shapes.

In [None]:
# 2) SPLIT DATA 80/20
X_train_val = X[:int(0.8*len(X))] # Get features from train + val set.
X_test      = X[int(0.8*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.8*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.8*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_8020_training_acc = max(result['mean_train_score']) 
SVM_8020_training.append(SVM_8020_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_8020_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_8020_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
SVM_8020_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_8020_training_acc = max(result['mean_train_score']) 
DT_8020_training.append(DT_8020_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_8020_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
DT_8020_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_8020_training_acc = max(result['mean_train_score']) 
KNN_8020_training.append(KNN_8020_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_8020_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=1, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 80/20
KNN_8020_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 50/50
X_train_val = X[:int(0.5*len(X))] # Get features from train + val set.
X_test      = X[int(0.5*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.5*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.5*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_5050_training_acc = max(result['mean_train_score']) 
SVM_5050_training.append(SVM_5050_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_5050_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_5050_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
SVM_5050_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_5050_training_acc = max(result['mean_train_score']) 
DT_5050_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_5050_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=4)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_5050_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_5050_training_acc = max(result['mean_train_score']) 
KNN_5050_training.append(KNN_5050_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_5050_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=1, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 50/50
KNN_5050_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

In [None]:
# 2) SPLIT DATA 20/80
X_train_val = X[:int(0.2*len(X))] # Get features from train + val set.
X_test      = X[int(0.2*len(X)):] # Get features from test set.     
Y_train_val = Y[:int(0.2*len(Y))] # Get labels from train + val set.
Y_test      = Y[int(0.2*len(Y)):] # Get labels from test set.
print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

**SVM w/ RBF Kernel**

In [None]:
#initialize SVM with rbf kernel
classifier = svm.SVC(kernel='rbf')
C_list     = [0.1, 1, 10, 100] # Different C to try.
gamma_list = [1e-7, 1e-6, 1e-5, 1e-4] # Different gamma to try.
#create parameter grid to be used in GridSearchCV
param_grid = {'C': C_list,"gamma": gamma_list,'kernel':['rbf']}

#Search for optimal C and Gamma values
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 3)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#append training accuracy
result = grid_search.cv_results_
SVM_2080_training_acc = max(result['mean_train_score']) 
SVM_2080_training.append(SVM_2080_training_acc)

#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#Report Validation Accuracy
validation_acc_SVM_1 = grid_search.best_score_
SVM_2080_validation.append(validation_acc_SVM_1)
print("Validation Accuracy:", grid_search.best_score_)

#Report Training Accuracy
print("Training Accuracy:", SVM_2080_training_acc)

In [None]:
#Report Testing Accuracy for SVM w/ Optimal Hyperparameters
classifier = svm.SVC(C = 100,kernel='rbf', gamma=.0001)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_SVM_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
SVM_2080_testing.append(test_acc_SVM_1)

#Report Accuracy
print('Test accuracy:',test_acc_SVM_1)

**Decision Tree**

In [None]:
#Define Parameters
D_list = [1,2,3,4,5]
param_grid = {'max_depth': D_list}
#create tree classifier using sklearn
classifier = tree.DecisionTreeClassifier(criterion="entropy")
#Use grid search to find the best D using 5 fold validation 
grid_search = GridSearchCV(classifier, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)


#print accuracy score and optimal parameter
print("Best Parameters:", grid_search.best_params_)

#append training accuracy
result = grid_search.cv_results_
DT_2080_training_acc = max(result['mean_train_score']) 
DT_2080_training.append(DT_2080_training_acc)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)

#Report Validation Accuracy
validation_acc_DT_1 = grid_search.best_score_
DT_2080_validation.append(validation_acc_DT_1)
print("Validation Accuracy:", grid_search.best_score_)

In [None]:
#Report Testing Accuracy for Decision Tree w/ Optimal Depth
classifier = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_DT_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
DT_2080_testing.append(test_acc_DT_1)

#Report Accuracy
print("Test accuracy:", test_acc_DT_1)

**K Nearest Neighbors**

In [None]:
#Initialize KNN classifier using sklearn
k_list = [1,2,3,4,5,6]
param_grid = {'n_neighbors': k_list}
#create KNN classifier using euclidean distance
clf = KNeighborsClassifier(p=2)
#Use grid search to find optimal K
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv= 5)
grid_search = grid_search.fit(X_train_val, Y_train_val)

#Obtain Training Accuracy from Cross Validation
train_acc_array = np.reshape(grid_search.cv_results_["mean_train_score"],(-1,1))

#Obtrain Validation Accuracy from Cross Validation
val_acc_array = np.reshape(grid_search.cv_results_["mean_test_score"],(-1,1))

#append training accuracy
result = grid_search.cv_results_
KNN_2080_training_acc = max(result['mean_train_score']) 
KNN_2080_training.append(KNN_2080_training_acc)

#Print Optimal Parameters and Accuracy
print("Best Parameter:",  grid_search.best_params_)

#Report Validation Accuracy
validation_acc_KNN_1 = grid_search.best_score_
KNN_2080_validation.append(validation_acc_KNN_1)
print("Validation Accuracy:", validation_acc_KNN_1)

In [None]:
# Report Testing Accuracy w/ optimal number of neighbors 
# and Euclidean distance measure
classifier = KNeighborsClassifier(n_neighbors=3, p=2)
classifier = classifier.fit(X_train_val, Y_train_val)
test_acc_KNN_1 = classifier.score(X_test, Y_test)

# Add to list of testing accuracies for 20/80
KNN_2080_testing.append(test_acc_KNN_1)

# Report Accuracy
print("Test Accuracy:", test_acc_KNN_1)

** Accuracies Over Partitions: Wine Quality **

**SVM Test, Training, and Validation Accuracies **

In [None]:
# Report SVM test accuracies by partition
print("SVM TEST ACCURACIES")
SVM_Testing_Accuracy_8020_wine_quality = statistics.mean(SVM_8020_testing)
print("SVM 80/20 Testing Accuracy:", SVM_Testing_Accuracy_8020_wine_quality)

SVM_Testing_Accuracy_5050_wine_quality = statistics.mean(SVM_5050_testing)
print("SVM 50/50 Testing Accuracy:", SVM_Testing_Accuracy_5050_wine_quality)

SVM_Testing_Accuracy_2080_wine_quality = statistics.mean(SVM_2080_testing)
print("SVM 20/80 Testing Accuracy:", SVM_Testing_Accuracy_2080_wine_quality)

print("SVM VALIDATION ACCURACIES")
SVM_Validation_Accuracy_8020_wine_quality = statistics.mean(SVM_8020_validation)
print("SVM 80/20 Validation Accuracy:", SVM_Validation_Accuracy_8020_wine_quality)

SVM_Validation_Accuracy_5050_wine_quality = statistics.mean(SVM_5050_validation)
print("SVM 50/50 Validation Accuracy:", SVM_Validation_Accuracy_5050_wine_quality)

SVM_Validation_Accuracy_2080_wine_quality = statistics.mean(SVM_2080_validation)
print("SVM 20/80 Validation Accuracy:", SVM_Validation_Accuracy_2080_wine_quality)

print("SVM TRAINING ACCURACIES")
SVM_Training_Accuracy_8020_wine_quality = statistics.mean(SVM_8020_training)
print("SVM 80/20 Training Accuracy:", SVM_Training_Accuracy_8020_wine_quality)

SVM_Training_Accuracy_5050_wine_quality = statistics.mean(SVM_5050_training)
print("SVM 50/50 Training Accuracy:", SVM_Training_Accuracy_5050_wine_quality)

SVM_Training_Accuracy_2080_wine_quality = statistics.mean(SVM_2080_training)
print("SVM 20/80 Training Accuracy:", SVM_Training_Accuracy_2080_wine_quality)

**DT Test, Training, and Validation Accuracies **

In [None]:
#Report Decision Tree accuracies by partition
print("DECISION TREE TESTING ACCURACIES")
DT_Testing_Accuracy_8020_wine_quality = statistics.mean(DT_8020_testing)
print("DT 80/20 Testing Accuracy:", DT_Testing_Accuracy_8020_wine_quality)

DT_Testing_Accuracy_5050_wine_quality = statistics.mean(DT_5050_testing)
print("DT 50/50 Testing Accuracy:", DT_Testing_Accuracy_5050_wine_quality)

DT_Testing_Accuracy_2080_wine_quality = statistics.mean(DT_2080_testing)
print("DT 20/80 Testing Accuracy:", DT_Testing_Accuracy_2080_wine_quality)

print("DECISION TREE VALIDATION ACCURACIES")
DT_Validation_Accuracy_8020_wine_quality = statistics.mean(DT_8020_validation)
print("DT 80/20 Validation Accuracy:", DT_Validation_Accuracy_8020_wine_quality)

DT_Validation_Accuracy_5050_wine_quality = statistics.mean(DT_5050_validation)
print("DT 50/50 Validation Accuracy:", DT_Validation_Accuracy_5050_wine_quality)

DT_Validation_Accuracy_2080_wine_quality = statistics.mean(DT_2080_validation)
print("DT 20/80 Validation Accuracy:", DT_Validation_Accuracy_2080_wine_quality)

print("DECISION TREE TRAINING ACCURACIES")
DT_Training_Accuracy_8020_wine_quality = statistics.mean(DT_8020_training)
print("DT 80/20 Training Accuracy:", DT_Training_Accuracy_8020_wine_quality)

DT_Training_Accuracy_5050_wine_quality = statistics.mean(DT_5050_training)
print("DT 50/50 Training Accuracy:", DT_Training_Accuracy_5050_wine_quality)

DT_Training_Accuracy_2080_wine_quality = statistics.mean(DT_2080_training)
print("DT 20/80 Training Accuracy:", DT_Training_Accuracy_2080_wine_quality)


**KNN Test, Training, and Validation Accuracies **

In [None]:
#Report KNN accuracies by partition
print("KNN TESTING ACCURACIES")
KNN_Testing_Accuracy_8020_wine_quality = statistics.mean(KNN_8020_testing)
print("KNN 80/20 Testing Accuracy:", KNN_Testing_Accuracy_8020_wine_quality)

KNN_Testing_Accuracy_5050_wine_quality = statistics.mean(KNN_5050_testing)
print("KNN 50/50 Testing Accuracy:", KNN_Testing_Accuracy_5050_wine_quality)

KNN_Testing_Accuracy_2080_wine_quality = statistics.mean(KNN_2080_testing)
print("KNN 20/80 Testing Accuracy:", KNN_Testing_Accuracy_2080_wine_quality)

print("KNN VALIDATION ACCURACIES")
KNN_Accuracy_Validation_8020_wine_quality = statistics.mean(KNN_8020_validation)
print("KNN 80/20 Validation Accuracy:", KNN_Accuracy_Validation_8020_wine_quality)

KNN_Accuracy_Validation_5050_wine_quality = statistics.mean(KNN_5050_validation)
print("KNN 50/50 Validation Accuracy:", KNN_Accuracy_Validation_5050_wine_quality)

KNN_Accuracy_Validation_2080_wine_quality = statistics.mean(KNN_2080_validation)
print("KNN 20/80 Validation Accuracy:", KNN_Accuracy_Validation_2080_wine_quality)

print("KNN TRAINING ACCURACIES")
KNN_Accuracy_Training_8020_wine_quality = statistics.mean(KNN_8020_training)
print("KNN 80/20 Training Accuracy:", KNN_Accuracy_Training_8020_wine_quality)

KNN_Accuracy_Training_5050_wine_quality = statistics.mean(KNN_5050_training)
print("KNN 50/50 Training Accuracy:", KNN_Accuracy_Training_5050_wine_quality)

KNN_Accuracy_Training_2080_wine_quality = statistics.mean(KNN_2080_training)
print("KNN 20/80 Training Accuracy:", KNN_Accuracy_Training_2080_wine_quality)

** Accuracies Over Partition for All Datasets**

In [None]:
SVM_Validation_5050 = (SVM_Validation_Accuracy_5050_Breast_Cancer + SVM_Validation_Accuracy_5050_wine_quality + SVM_Validation_Accuracy_5050_heart_disease)/3
DT_Validation_5050 = (DT_Validation_Accuracy_5050_Breast_Cancer + DT_Validation_Accuracy_5050_wine_quality + DT_Validation_Accuracy_5050_heart_disease)/3
KNN_Validation5050 = (KNN_Accuracy_Validation_5050_Breast_Cancer + KNN_Accuracy_Validation_5050_wine_quality + KNN_Accuracy_Validation_5050_heart_disease)/3

SVM_Training_2080 = (SVM_Training_Accuracy_2080_Breast_Cancer + SVM_Training_Accuracy_2080_wine_quality + SVM_Training_Accuracy_2080_heart_disease)/3
DT_Training_2080 = (DT_Training_Accuracy_2080_Breast_Cancer + DT_Training_Accuracy_2080_wine_quality + DT_Training_Accuracy_2080_heart_disease)/3
KNN_Training_2080 = (KNN_Accuracy_Training_2080_Breast_Cancer + KNN_Accuracy_Training_2080_wine_quality + KNN_Accuracy_Training_2080_heart_disease)/3

In [None]:
#Accuracy per dataset SVM
print("SVM Testing Accuracy 80/20 Breast Cancer:",SVM_Testing_Accuracy_8020_Breast_Cancer)
print("SVM Testing Accuracy 50/50 Breast Cancer:",SVM_Testing_Accuracy_5050_Breast_Cancer)
print("SVM Testing Accuracy 20/80 Breast Cancer:",SVM_Testing_Accuracy_2080_Breast_Cancer)

print("SVM Testing Accuracy 80/20 Heart Disease:",SVM_Testing_Accuracy_8020_heart_disease)
print("SVM Testing Accuracy 50/50 Heart Disease:",SVM_Testing_Accuracy_5050_heart_disease)
print("SVM Testing Accuracy 20/80 Heart Disease:",SVM_Testing_Accuracy_2080_heart_disease)

print("SVM Testing Accuracy 80/20 Wine Quality:",SVM_Testing_Accuracy_8020_wine_quality)
print("SVM Testing Accuracy 50/50 Wine Quality:",SVM_Testing_Accuracy_5050_wine_quality)
print("SVM Testing Accuracy 20/80 Wine Quality:",SVM_Testing_Accuracy_2080_wine_quality)

#Accuracy per dataset Decision Tree
print("DT Testing Accuracy 80/20 Breast Cancer:",DT_Testing_Accuracy_8020_Breast_Cancer)
print("DT Testing Accuracy 50/50 Breast Cancer:",DT_Testing_Accuracy_5050_Breast_Cancer)
print("DT Testing Accuracy 20/80 Breast Cancer:",DT_Testing_Accuracy_2080_Breast_Cancer)

print("DT Testing Accuracy 80/20 Heart Disease:",DT_Testing_Accuracy_8020_heart_disease)
print("DT Testing Accuracy 50/50 Heart Disease:",DT_Testing_Accuracy_5050_heart_disease)
print("DT Testing Accuracy 20/80 Heart Disease:",DT_Testing_Accuracy_2080_heart_disease)

print("DT Testing Accuracy 80/20 Wine Quality:",DT_Testing_Accuracy_8020_wine_quality)
print("DT Testing Accuracy 50/50 Wine Quality:",DT_Testing_Accuracy_5050_wine_quality)
print("DT Testing Accuracy 20/80 Wine Quality:",DT_Testing_Accuracy_2080_wine_quality)

#Accuracy per dataset KNN
print("KNN Testing Accuracy 80/20 Breast Cancer:",KNN_Testing_Accuracy_8020_Breast_Cancer)
print("KNN Testing Accuracy 50/50 Breast Cancer:",KNN_Testing_Accuracy_5050_Breast_Cancer)
print("KNN Testing Accuracy 20/80 Breast Cancer:",KNN_Testing_Accuracy_2080_Breast_Cancer)

print("KNN Testing Accuracy 80/20 Heart Disease:",KNN_Testing_Accuracy_8020_heart_disease)
print("KNN Testing Accuracy 50/50 Heart Disease:",KNN_Testing_Accuracy_5050_heart_disease)
print("KNN Testing Accuracy 20/80 Heart Disease:",KNN_Testing_Accuracy_2080_heart_disease)

print("KNN Testing Accuracy 80/20 Wine Quality:",KNN_Testing_Accuracy_8020_wine_quality)
print("KNN Testing Accuracy 50/50 Wine Quality:",KNN_Testing_Accuracy_5050_wine_quality)
print("KNN Testing Accuracy 20/80 Wine Quality:",KNN_Testing_Accuracy_2080_wine_quality)

In [None]:
#Average Accuracy by Partition and Classifier
SVM_Testing_8020 = (SVM_Testing_Accuracy_8020_Breast_Cancer + SVM_Testing_Accuracy_8020_wine_quality + SVM_Testing_Accuracy_8020_heart_disease)/3
DT_Testing_8020 = (DT_Testing_Accuracy_8020_Breast_Cancer + DT_Testing_Accuracy_8020_wine_quality + DT_Testing_Accuracy_8020_heart_disease)/3
KNN_Testing_8020 = (KNN_Testing_Accuracy_8020_Breast_Cancer + KNN_Testing_Accuracy_8020_wine_quality + KNN_Testing_Accuracy_8020_heart_disease)/3

#Average Accuracy by Partition and Classifier
SVM_Testing_5050 = (SVM_Testing_Accuracy_5050_Breast_Cancer + SVM_Testing_Accuracy_5050_wine_quality + SVM_Testing_Accuracy_5050_heart_disease)/3
DT_Testing_5050 = (DT_Testing_Accuracy_5050_Breast_Cancer + DT_Testing_Accuracy_5050_wine_quality + DT_Testing_Accuracy_5050_heart_disease)/3
KNN_Testing_5050 = (KNN_Testing_Accuracy_5050_Breast_Cancer + KNN_Testing_Accuracy_5050_wine_quality + KNN_Testing_Accuracy_5050_heart_disease)/3

#Average Accuracy by Partition and Classifier
SVM_Testing_2080 = (SVM_Testing_Accuracy_2080_Breast_Cancer + SVM_Testing_Accuracy_2080_wine_quality + SVM_Testing_Accuracy_2080_heart_disease)/3
DT_Testing_2080 = (DT_Testing_Accuracy_2080_Breast_Cancer + DT_Testing_Accuracy_2080_wine_quality + DT_Testing_Accuracy_2080_heart_disease)/3
KNN_Testing_2080 = (KNN_Testing_Accuracy_2080_Breast_Cancer + KNN_Testing_Accuracy_2080_wine_quality + KNN_Testing_Accuracy_2080_heart_disease)/3


print("SVM Testing Accuracy 80/20:",SVM_Testing_8020)
print("Decision Trees Testing Accuracy 80/20:",DT_Testing_8020)
print("KNN Testing Accuracy 80/20:",KNN_Testing_8020)
print("")
print("SVM Testing Accuracy 50/50:",SVM_Testing_5050)
print("Decision Trees Testing Accuracy 50/50:",DT_Testing_5050)
print("KNN Testing Accuracy 50/50:",KNN_Testing_5050)
print("")
print("SVM Testing Accuracy 20/80:",SVM_Testing_2080)
print("Decision Trees Testing Accuracy 20/80:",DT_Testing_2080)
print("KNN Testing Accuracy 20/80:",KNN_Testing_2080)


