In [39]:
# Converting file from .data to .csv

filename = "breast-cancer-wisconsin.data"
# headers = "ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses"
headers = "f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,c"

with open(filename) as file:
    data = file.read()

data = data.replace('?', 'NaN')
filename = filename.replace(".data", ".csv")

with open(filename, "w") as file:
    file.write(headers + "\n")
    file.write(data)


In [40]:
# Preparing data

import pandas as pd
import io

filename = "breast-cancer-wisconsin.csv"

with open(filename) as file:
    data_string = file.read()
    
data = io.StringIO(data_string)
df = pd.read_csv(data, sep=",")

# Clearing data from empty feature's values
df = df.dropna()

# Change values for class ('2' to '0' and '4' to '1')
df['c'] = df['c'].replace(2, 0)
df['c'] = df['c'].replace(4, 1)

df.to_csv(filename)

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'c'], dtype='object')


In [100]:
# Division data to dataframes: X for data set and Y for tags

X = df.iloc[:,:-1]
y = df.iloc[:,-1]

print(X)

     Unnamed: 0       f0  f1  f2  f3  f4  f5    f6  f7  f8  f9
0             0  1000025   5   1   1   1   2   1.0   3   1   1
1             1  1002945   5   4   4   5   7  10.0   3   2   1
2             2  1015425   3   1   1   1   2   2.0   3   1   1
3             3  1016277   6   8   8   1   3   4.0   3   7   1
4             4  1017023   4   1   1   3   2   1.0   3   1   1
..          ...      ...  ..  ..  ..  ..  ..   ...  ..  ..  ..
678         694   776715   3   1   1   1   3   2.0   1   1   1
679         695   841769   2   1   1   1   2   1.0   1   1   1
680         696   888820   5  10  10   3   7   3.0   8  10   2
681         697   897471   4   8   6   4   3   4.0  10   6   1
682         698   897471   4   8   8   5   4   5.0  10   4   1

[683 rows x 11 columns]


In [102]:
# Kendall' Tau Test - feature ranking

from scipy.stats import kendalltau

kendall_test = {}

for feature in X.columns[2:]:
    tau, p_value = kendalltau(X[feature], y)
    kendall_test[feature] = tau

{k: v for k, v in sorted(kendall_test.items(), key=lambda item: item[1])}

{'f9': 0.5091730914657034,
 'f1': 0.5936490155008523,
 'f7': 0.6577258126105182,
 'f4': 0.675633325449535,
 'f8': 0.6925037262305894,
 'f5': 0.7062878064817635,
 'f3': 0.7604191544017106,
 'f6': 0.7764688877762504,
 'f2': 0.7823879275615498}

In [103]:
# Data division

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print("Length of train group:", len(X_train), "/", format(len(X_train)/len(df)*100, '.2f') + "%")
print("Length of test group: ", len(X_test), "/", format(len(X_test)/len(df)*100, '.2f') + "%")

Length of train group: 546 / 79.94%
Length of test group:  137 / 20.06%


     Unnamed: 0       f0  f1  f2  f3  f4  f5    f6  f7  f8  f9  c
0             0  1000025   5   1   1   1   2   1.0   3   1   1  0
1             1  1002945   5   4   4   5   7  10.0   3   2   1  0
2             2  1015425   3   1   1   1   2   2.0   3   1   1  0
3             3  1016277   6   8   8   1   3   4.0   3   7   1  0
4             4  1017023   4   1   1   3   2   1.0   3   1   1  0
..          ...      ...  ..  ..  ..  ..  ..   ...  ..  ..  .. ..
677         693   763235   3   1   1   1   2   1.0   2   1   2  0
678         694   776715   3   1   1   1   3   2.0   1   1   1  0
679         695   841769   2   1   1   1   2   1.0   1   1   1  0
680         696   888820   5  10  10   3   7   3.0   8  10   2  1
681         697   897471   4   8   6   4   3   4.0  10   6   1  1

[682 rows x 12 columns]
