In [39]:
# Convert the raw ".data" dump into a CSV file that starts with a header row.

filename = "breast-cancer-wisconsin.data"

# Short column names used as the CSV header.
# "c" is the class label. f0..f9 presumably map, in order, onto:
#   ID, Clump Thickness, Uniformity of Cell Size, Uniformity of Cell Shape,
#   Marginal Adhesion, Single Epithelial Cell Size, Bare Nuclei,
#   Bland Chromatin, Normal Nucleoli, Mitoses
# (TODO confirm against the dataset description).
headers = "f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,c"

# Load the whole file and rewrite every '?' placeholder as 'NaN' in one step,
# so the affected cells are parsed as missing values when the CSV is loaded.
with open(filename) as file:
    data = file.read().replace('?', 'NaN')

# Same base name, ".csv" extension.
filename = filename.replace(".data", ".csv")

# Header line first, then the (patched) original content unchanged.
with open(filename, "w") as file:
    file.write(f"{headers}\n{data}")


In [40]:
# Preparing data
#
# Loads the converted CSV, removes rows with missing values, recodes the class
# column 'c' (2 -> 0, 4 -> 1) and writes the result back to the same file.
# Because input and output are the same file, this cell has to tolerate its
# own output so that it can safely be run more than once.

import pandas as pd
import io  # unused in this cell; left in place in case other cells rely on it being imported

filename = "breast-cancer-wisconsin.csv"

df = (
    pd.read_csv(filename)
    # to_csv() below stores the row index as an unnamed first column, which
    # read_csv() returns as "Unnamed: 0" on the next load. Drop such columns so
    # a re-run does not stack up one more index column every time.
    .loc[:, lambda frame: ~frame.columns.str.startswith("Unnamed")]
    # Remove rows that have a missing value in any column.
    .dropna()
    # Recode the class label in a single pass: 2 -> 0 and 4 -> 1.
    .assign(c=lambda frame: frame['c'].replace({2: 0, 4: 1}))
)

# The row index is still written as a leading column, exactly as before, so the
# on-disk column layout is unchanged for any code that reads this file by
# column position.
df.to_csv(filename)

Index(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'c'], dtype='object')


In [43]:
# Kendall's Tau Test
#
# Computes Kendall's tau between every feature column and the class column 'c',
# stores the coefficients in `kendall_test` (feature name -> tau) and displays
# them sorted from weakest to strongest.

import pandas as pd
import io  # unused in this cell; left in place in case other cells rely on it being imported
from scipy.stats import kendalltau

filename = "breast-cancer-wisconsin.csv"

df = pd.read_csv(filename)

# Select the feature columns by name instead of by position. A positional slice
# silently picks the wrong columns as soon as the file gains or loses a leading
# index column ("Unnamed: ..."). Excluded, as with the previous positional
# slice on the current file layout:
#   - 'c'  : the class column the features are tested against
#   - 'f0' : appears to be the sample ID rather than a feature (TODO confirm)
non_feature_columns = {'f0', 'c'}
feature_columns = [
    column
    for column in df.columns
    if column not in non_feature_columns and not str(column).startswith("Unnamed")
]

kendall_test = {}

for feature in feature_columns:
    # kendalltau returns (statistic, p-value); only the statistic is kept.
    tau, _p_value = kendalltau(df[feature], df['c'])
    kendall_test[feature] = tau

# Cell output: coefficients in ascending order of tau.
dict(sorted(kendall_test.items(), key=lambda item: item[1]))



{'f9': 0.5091730914657034,
 'f1': 0.5936490155008523,
 'f7': 0.6577258126105182,
 'f4': 0.675633325449535,
 'f8': 0.6925037262305894,
 'f5': 0.7062878064817635,
 'f3': 0.7604191544017106,
 'f6': 0.7764688877762504,
 'f2': 0.7823879275615498}