In [4]:
# Converting file from .data to .csv

filename = "breast-cancer-wisconsin.data"
# headers = "ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses"
headers = "f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,c"

with open(filename) as file:
    data = file.read()

data = data.replace('?', 'NaN')
filename = filename.replace(".data", ".csv")

with open(filename, "w") as file:
    file.write(headers + "\n")
    file.write(data)


In [1]:
# Preparing data

import pandas as pd
import io

filename = "breast-cancer-wisconsin.csv"

with open(filename) as file:
    data_string = file.read()
    
data = io.StringIO(data_string)
df = pd.read_csv(data, sep=",")

# Clearing data from empty feature's values
df = df.dropna()

# Change values for class ('2' to '0' and '4' to '1')
df['c'] = df['c'].replace(2, 0)
df['c'] = df['c'].replace(4, 1)

df.to_csv(filename)

In [2]:
# Division data to dataframes: X for data set and y for tags

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

print(X)

     Unnamed: 0  Unnamed: 0.1  Unnamed: 0.1.1       f0  f1  f2  f3  f4  f5  \
0             0             0               0  1000025   5   1   1   1   2   
1             1             1               1  1002945   5   4   4   5   7   
2             2             2               2  1015425   3   1   1   1   2   
3             3             3               3  1016277   6   8   8   1   3   
4             4             4               4  1017023   4   1   1   3   2   
..          ...           ...             ...      ...  ..  ..  ..  ..  ..   
678         678           678             694   776715   3   1   1   1   3   
679         679           679             695   841769   2   1   1   1   2   
680         680           680             696   888820   5  10  10   3   7   
681         681           681             697   897471   4   8   6   4   3   
682         682           682             698   897471   4   8   8   5   4   

       f6  f7  f8  f9  
0     1.0   3   1   1  
1    10.0   3  

In [3]:
# Kendall' Tau Test - feature ranking

from scipy.stats import kendalltau

kendall_test = {}

for feature in X.columns[2:]:
    tau, p_value = kendalltau(X[feature], y)
    kendall_test[feature] = tau

{k: v for k, v in sorted(kendall_test.items(), key=lambda item: item[1])}

{'Unnamed: 0.1.1': -0.16473589876453607,
 'f0': -0.09095618046074797,
 'f9': 0.5091730914657034,
 'f1': 0.5936490155008523,
 'f7': 0.6577258126105182,
 'f4': 0.675633325449535,
 'f8': 0.6925037262305894,
 'f5': 0.7062878064817635,
 'f3': 0.7604191544017106,
 'f6': 0.7764688877762504,
 'f2': 0.7823879275615498}

In [5]:
# Simple validation

# Data division
from sklearn.model_selection import train_test_split

TEST_SIZE = 0.2


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE)

print("Length of train group:", len(X_train), "/", format(len(X_train)/len(X)*100, '.2f') + "%")
print("Length of test group: ", len(X_test), "/", format(len(X_test)/len(X)*100, '.2f') + "%")


# Building and testing the model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

NEIGHBORS = 5
METRIC = 'manhattan'

clf = KNeighborsClassifier(n_neighbors=NEIGHBORS, metric=METRIC)    # K-NN classifier
clf.fit(X_train, y_train)                 # fitting the estimator
y_pred = clf.predict(X_test)             # prediction on test data
score = accuracy_score(y_test, y_pred)   # accuracy in prediciting tags
print("Accuracy score: %.3f" % score)

Length of train group: 546 / 79.94%
Length of test group:  137 / 20.06%
Accuracy score: 0.577


In [49]:
# Multiple cross-validation

from sklearn.model_selection import RepeatedKFold
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# K-NN parameters:
NEIGHBORS = 1        # tested values: 1, 5, 10
M1 = 'manhattan'     # tested metrics: manhattan
M2 = 'euclidean'     # and euclidean

# Cross-validation parameters:
FOLDS = 2
REPEATS = 5

# Preparing classifiers
clfs = {
    'kNN_1M' : KNeighborsClassifier(n_neighbors=1,  metric=M1),  # manhattan, 1  neighbor
    'kNN_5M' : KNeighborsClassifier(n_neighbors=5,  metric=M1),  # manhattan, 5  neighbors
    'kNN_10M': KNeighborsClassifier(n_neighbors=10, metric=M1),  # manhattan, 10 neighbors
    'kNN_1E' : KNeighborsClassifier(n_neighbors=1,  metric=M2),  # euclidean, 1  neighbor
    'kNN_5E' : KNeighborsClassifier(n_neighbors=5,  metric=M2),  # euclidean, 5  neighbors
    'kNN_10E': KNeighborsClassifier(n_neighbors=10, metric=M2)   # euclidean, 10 neighbors
}

# Features used for test:
FEATURES = X[['f2']].copy()
ALL_FEATURES = X.copy()

results = np.zeros((len(clfs), FOLDS*REPEATS))


rkf = RepeatedKFold(n_splits=FOLDS, n_repeats=REPEATS, random_state=1410)    # creating rkf object with parameters

for fold, (train_index, test_index) in enumerate(rkf.split(FEATURES)):    # cross-validation   TODO 1. Petla zautomatyzowna po cechach, 2. Wyniki dla kazdego klasyfikatora
    X_train = X.loc[train_index]
    X_test = X.loc[test_index]
    y_train = y.loc[train_index]
    y_test = y.loc[test_index]
    
    for clf_index, clf_name in enumerate(clfs):
        # build model
        clf = clone(clfs[clf_name])
        clf.fit(X_train, y_train)
        
        # test model
        y_pred = clf.predict(X_test)
        results[clf_index, fold] = accuracy_score(y_test, y_pred)
    
    
# Print results
for result in results:
    mean_result = np.mean(result)    # counting mean of scores
    print(mean_result)


0.6160998782390974
0.6313337106206377
0.6570938587916516
0.6067277186122688
0.6207859580525116
0.6544597074308449
