In [28]:
# Load the original dataset
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# Seed passed as random_state to the sampling calls that split the data.
SEED = 0

# Read the CSV with header=None (columns are labelled 0, 1, 2, ...), discard
# column 0 and rename column 1 to "class" so it can be selected as the target.
data = (
    pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data',
        header=None,
    )
    .drop(columns=0)
    .rename(columns={1: "class"})
)

In [29]:
# Three-way split: 60% of the rows are sampled for training; the rows left
# over are halved into validation and test (roughly 20% / 20% of the data).
train = data.sample(frac=0.6, random_state=SEED)
data_remaining = data.drop(index=train.index)
val = data_remaining.sample(frac=0.5, random_state=SEED)
test = data_remaining.drop(index=val.index)

In [30]:
# Feature matrices: every column of each split except the "class" label.
X_train, X_val, X_test = (
    subset.drop(columns="class") for subset in (train, val, test)
)

# Target vectors: the "class" column of each split.
y_train, y_val, y_test = (subset["class"] for subset in (train, val, test))

In [31]:
# Standardize the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to all three splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

# Encode the class labels as integers; the mapping is learned from the
# training labels and reused for the validation and test labels.
encoder = preprocessing.LabelEncoder()
encoder.fit(y_train)
y_train, y_val, y_test = (
    encoder.transform(labels) for labels in (y_train, y_val, y_test)
)

In [32]:

# Baseline: 3-nearest-neighbour classifier with the cosine metric, fitted on
# the training split and scored by accuracy on the test split.
clf = KNeighborsClassifier(metric='cosine', n_neighbors=3)
clf.fit(X_train, y_train)
Y_predTest = clf.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=Y_predTest))

0.9649122807017544


In [40]:
# Grid search over KNN hyper-parameters: k = 1..99 crossed with the 'l1' and
# 'l2' metrics, scored by cross-validated accuracy on the training split.
knn = KNeighborsClassifier()
n_neighbors = [k for k in range(1, 100)]
param_grid = {'n_neighbors': n_neighbors, 'metric': ['l1', 'l2']}
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    return_train_score=False,
    verbose=3,
    n_jobs=-1,
)
# NOTE(review): the fitted KNN search is stored under the name
# `grid_search_svm`; the name is kept because the evaluation cell reads it.
grid_search_svm = knn_grid.fit(X_train, y_train)
print(grid_search_svm.best_params_)

Fitting 5 folds for each of 198 candidates, totalling 990 fits
[CV 1/5] END ..........metric=l1, n_neighbors=1;, score=0.928 total time=   0.0s
[CV 2/5] END ..........metric=l1, n_neighbors=1;, score=0.956 total time=   0.0s
[CV 3/5] END ..........metric=l1, n_neighbors=1;, score=0.985 total time=   0.0s
[CV 4/5] END ..........metric=l1, n_neighbors=1;, score=0.941 total time=   0.0s
[CV 5/5] END ..........metric=l1, n_neighbors=1;, score=0.926 total time=   0.0s
[CV 1/5] END ..........metric=l1, n_neighbors=2;, score=0.942 total time=   0.0s
[CV 2/5] END ..........metric=l1, n_neighbors=2;, score=0.926 total time=   0.0s
[CV 3/5] END ..........metric=l1, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 4/5] END ..........metric=l1, n_neighbors=2;, score=0.956 total time=   0.0s
[CV 5/5] END ..........metric=l1, n_neighbors=2;, score=0.897 total time=   0.0s
[CV 1/5] END ..........metric=l1, n_neighbors=3;, score=0.942 total time=   0.0s
[CV 2/5] END ..........metric=l1, n_neighbors=

In [27]:
# Scratch check: show the integers produced by range(1, 10), i.e. 1 through 9.
print([*range(1, 10)])

[1, 2, 3, 4, 5, 6, 7, 8, 9]


In [None]:
# Grid search on an SVM model.
#
# SVC has its own hyper-parameters, so it gets a dedicated grid instead of
# reusing the KNN `param_grid`: `n_neighbors` and `metric` are not SVC
# parameters and GridSearchCV fails when asked to set them on the estimator.
svm = SVC()
svm_param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}
# verbose=1 prints only the fit summary rather than one line per fold;
# n_jobs=-1 matches the parallel setting used for the KNN search.
svm_grid = GridSearchCV(
    svm,
    param_grid=svm_param_grid,
    scoring='accuracy',
    return_train_score=False,
    verbose=1,
    n_jobs=-1,
)
# y_train is a NumPy array once LabelEncoder.transform has been applied, so it
# has no `.values` attribute; np.ravel flattens an array or a Series to 1-D.
grid_search_svm = svm_grid.fit(X_train, np.ravel(y_train))
print(grid_search_svm.best_params_)

In [None]:
# Report evaluation metrics on the validation and test splits.

# Best estimator of whichever grid search was last bound to `grid_search_svm`.
clf_new = grid_search_svm.best_estimator_

# accuracy_score takes (y_true, y_pred); accuracy is symmetric, but the
# conventional order keeps these calls consistent with the other metric calls.
val_y_pred = clf_new.predict(X_val)
print("Validation Accuracy : {:.2f}".format(accuracy_score(y_val, val_y_pred)))

test_y_pred = clf_new.predict(X_test)
print("Overall Accuracy : {:.2f}".format(accuracy_score(y_test, test_y_pred)))

# average='micro' on a single-label problem makes precision, recall and F1 all
# equal to accuracy, so they added no information beyond the line above.
# average='binary' reports them for the positive class (encoded label 1).
# NOTE(review): label 1 is presumably the malignant class ('M') — confirm
# with encoder.classes_.
precision, recall, fscore, support = score(y_test, test_y_pred, average='binary')
print('Precision: {:.2f}'.format(precision))
print('Recall: {:.2f}'.format(recall))
print('Fscore: {:.2f}'.format(fscore))