In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from LVQ import LVQ

In [2]:
# Column labels for the raw data file, in file order. They are supplied
# explicitly to read_csv instead of being taken from a header row.
colnames = [
    'Sample_code_number',
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Class',
]
data = pd.read_csv("breast-cancer-wisconsin.data", header=None, names=colnames)
# Show the first rows as this cell's displayed value.
data.head()


Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:

# Data pre-processing
# Map the numeric class codes to readable labels (2 -> Benign, 4 -> Malignant).
data = data.replace({'Class': {2: "Benign", 4: "Malignant"}})
# Alternative kept for reference: replace the '?' missing-value marker with 1
# instead of dropping the affected rows.
# data = data.replace({'?': 1})
# Drop every row whose Bare_Nuclei holds the '?' missing-value marker.
bare_nuclei_present = data['Bare_Nuclei'] != "?"
data = data.loc[bare_nuclei_present]

# Report how many rows survive the filter.
total_samples = data['Sample_code_number'].count()
print("Number of rows\t: {}".format(total_samples))

# Columns kept for modelling: everything except the sample identifier.
cat_vars = [
    'Clump_Thickness',
    'Uniformity_of_Cell_Size',
    'Uniformity_of_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Class',
]
data_final = data.loc[:, cat_vars]

Number of rows	: 683


In [4]:
# Per-column count of null entries. The '?' markers are not nulls; rows
# containing them in Bare_Nuclei were already removed during pre-processing.
data_final.isna().sum()

Clump_Thickness                0
Uniformity_of_Cell_Size        0
Uniformity_of_Cell_Shape       0
Marginal_Adhesion              0
Single_Epithelial_Cell_Size    0
Bare_Nuclei                    0
Bland_Chromatin                0
Normal_Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [5]:
# Number of distinct labels in the Class column.
data_final.loc[:, 'Class'].nunique()

2

In [6]:
# Bare_Nuclei is not an integer column yet (it used to contain the '?' marker),
# so cast it to int; every other column is carried over unchanged.
data_final = data_final.assign(
    Bare_Nuclei=lambda frame: frame["Bare_Nuclei"].astype(int)
)
data_final.head()

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,Benign
1,5,4,4,5,7,10,3,2,1,Benign
2,3,1,1,1,2,2,3,1,1,Benign
3,6,8,8,1,3,4,3,7,1,Benign
4,4,1,1,3,2,1,3,1,1,Benign


In [7]:
# Disabled experiment: min-max scale every feature column (all but the last
# column, Class) and re-attach the labels. It is not executed; LVQ.fit is
# called on the unscaled data_final with normalize=False.
# NOTE(review): if this is re-enabled, df_scale is built with a fresh 0..n-1
# index while data_final keeps the index labels left after the '?' rows were
# filtered out, so pd.concat(frames, axis=1) would align on mismatched labels.
# Reset the index of data_final before concatenating.
# col_names = data_final.columns
# from sklearn import preprocessing
# min_max_scaler = preprocessing.MinMaxScaler()
# scaled_dataframe = data_final.iloc[:, :-1]
# # print(scaled_dataframe.head())
# np_scaled = min_max_scaler.fit_transform(scaled_dataframe)
# df_scale = pd.DataFrame(np_scaled, columns=col_names[:-1])
# frames = [df_scale, data_final.iloc[:, -1]]
# result = pd.concat(frames, axis=1)
# result.head()

In [13]:
# LVQ: fit the project-local LVQ helper on the cleaned frame and keep the
# frame it returns as reduced_data.
# The positional arguments are forwarded unchanged. Judging by the training
# log, 0.3 is the initial learning rate; 500 and 100 are presumably the
# codebook size and the epoch count — TODO confirm against LVQ.fit.
# NOTE(review): no random seed is set here; if LVQ.fit draws random numbers the
# result will differ between runs — confirm and seed if so.
lvq_positional_args = (500, 0.3, 100)
obj = LVQ()
reduced_data = obj.fit(data_final, *lvq_positional_args, normalize=False)

>epoch=0, lrate=0.300, error=7614.649
>epoch=1, lrate=0.297, error=7506.475
>epoch=2, lrate=0.294, error=7084.430
>epoch=3, lrate=0.291, error=6471.083
>epoch=4, lrate=0.288, error=6158.788
>epoch=5, lrate=0.285, error=5862.694
>epoch=6, lrate=0.282, error=5643.074
>epoch=7, lrate=0.279, error=5592.361
>epoch=8, lrate=0.276, error=5597.942
>epoch=9, lrate=0.273, error=5634.625
>epoch=10, lrate=0.270, error=5524.763
>epoch=11, lrate=0.267, error=5493.201
>epoch=12, lrate=0.264, error=5512.912
>epoch=13, lrate=0.261, error=5394.562
>epoch=14, lrate=0.258, error=5458.473
>epoch=15, lrate=0.255, error=5442.027
>epoch=16, lrate=0.252, error=5329.265
>epoch=17, lrate=0.249, error=5456.333
>epoch=18, lrate=0.246, error=5402.558
>epoch=19, lrate=0.243, error=5368.247
>epoch=20, lrate=0.240, error=5339.579
>epoch=21, lrate=0.237, error=5345.022
>epoch=22, lrate=0.234, error=5333.432
>epoch=23, lrate=0.231, error=5339.173
>epoch=24, lrate=0.228, error=5307.173
>epoch=25, lrate=0.225, error=5322.

In [14]:
# Separate the feature columns from the label column.
X = reduced_data.drop(columns='Class')
y = reduced_data[['Class']]  # deliberately a one-column DataFrame

# Hold out 34.9% of the rows for testing; random_state pins the shuffle.
# NOTE(review): both splits are drawn from reduced_data (the LVQ output), not
# from data_final — confirm this is the intended evaluation set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.349
)

In [15]:
# Preview the first five rows of the LVQ output.
reduced_data.head(n=5)

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
0,5.0,1.0,1.0,1.0,3.0,8.0,2.0,2.0,1.0,Malignant
1,6.113859,1.856281,3.42814,0.772283,4.541999,3.374003,4.029861,9.799577,1.0,Malignant
2,6.0,3.0,0.703,1.0,3.0,4.0,0.109,1.0,1.0,Benign
3,1.0,1.0,3.0,10.0,3.0,1.0,1.0,2.0,1.0,Malignant
4,1.0,1.0,6.0,2.0,3.0,1.0,1.0,1.0,1.0,Benign


In [16]:
# Number of neighbours consulted for each prediction.
n = 5
knn = KNeighborsClassifier(n_neighbors=n)
# y_train is a one-column frame; flatten it to the 1-D label array fit() expects.
knn.fit(X_train, np.asarray(y_train).ravel())

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
# Evaluate the fitted KNN classifier on the held-out split.
# accuracy_score, classification_report and confusion_matrix are provided by
# the notebook's import cell, so nothing is imported mid-notebook here.
y_pred = knn.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
# Rows of the matrix are the true classes, columns the predicted classes.
print("\nConfusion Matrix: \n{}\n".format(confusion_matrix(y_test, y_pred)))
print("\nClassification Report: \n{}\n".format(classification_report(y_test, y_pred)))

Accuracy: 0.6285714285714286

Confution Matrix: 
[[90 17]
 [48 20]]


Classification Report: 
              precision    recall  f1-score   support

      Benign       0.65      0.84      0.73       107
   Malignant       0.54      0.29      0.38        68

    accuracy                           0.63       175
   macro avg       0.60      0.57      0.56       175
weighted avg       0.61      0.63      0.60       175


