In [65]:
# Converting file from .data to .csv

# Raw input file; it is rewritten below as a CSV with a header row.
filename = "breast-cancer-wisconsin.data"
# Descriptive column names kept for reference (presumably f1..f10 in this order -- TODO confirm):
# headers = "ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses"
headers = "f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,c"

# Load the whole file and swap every "?" placeholder for the literal "NaN".
with open(filename) as file:
    data = file.read().replace("?", "NaN")

# Same base name, .csv extension.
filename = filename.replace(".data", ".csv")

# Emit the header line followed by the patched contents.
with open(filename, "w") as file:
    file.write(f"{headers}\n{data}")


In [53]:
# Kolmogorov - Smirnov Test

import pandas as pd
import io
from scipy.stats import kstest

filename = "breast-cancer-wisconsin.csv"

# The conversion step wrote the literal "NaN" for missing entries; read_csv
# parses that token as a floating-point NaN (f7 therefore loads as float).
df = pd.read_csv(filename, sep=",")

print(df)

# Drop rows whose f7 value is missing. Comparing against the string 'NaN'
# never matches a float NaN, so the previous filter removed nothing and the
# missing values reached kstest.
df = df.dropna(subset=["f7"])

# Run a one-sample Kolmogorov-Smirnov test on every column except the first
# (f1) and the last (c).
# NOTE(review): kstest(..., "norm") compares against the *standard* normal
# (mean 0, std 1); the columns are passed in unstandardized -- confirm that
# this is the intended reference distribution.
for feature in df.columns[1:-1]:
    print(feature)
    # Keep the result in its own name instead of overwriting the loop variable.
    ks_result = kstest(df[feature], "norm")
    print(ks_result)



          f1  f2  f3  f4  f5  f6    f7  f8  f9  f10  c
0    1000025   5   1   1   1   2   1.0   3   1    1  2
1    1002945   5   4   4   5   7  10.0   3   2    1  2
2    1015425   3   1   1   1   2   2.0   3   1    1  2
3    1016277   6   8   8   1   3   4.0   3   7    1  2
4    1017023   4   1   1   3   2   1.0   3   1    1  2
..       ...  ..  ..  ..  ..  ..   ...  ..  ..  ... ..
694   776715   3   1   1   1   3   2.0   1   1    1  2
695   841769   2   1   1   1   2   1.0   1   1    1  2
696   888820   5  10  10   3   7   3.0   8  10    2  4
697   897471   4   8   6   4   3   4.0  10   6    1  4
698   897471   4   8   8   5   4   5.0  10   4    1  4

[699 rows x 11 columns]


In [67]:
# Kendall's Tau Test

import pandas as pd
import io
from scipy.stats import kendalltau

filename = "breast-cancer-wisconsin.csv"

# Missing entries were written as "NaN" by the conversion step and are parsed
# as floating-point NaN by read_csv.
df = pd.read_csv(filename, sep=",")

# H0 - there is no relation between the feature and kind of cancer
# H1 - there is a significant relation between the feature and kind of cancer

# Maps feature name -> Kendall's tau between that feature and the class column 'c'.
kendall_test = {}

for feature in df.columns[1:-1]:
    # Use only rows where both the feature and the class are present:
    # kendalltau's default nan_policy ('propagate') returns NaN as soon as
    # either input contains a NaN.
    complete = df[[feature, "c"]].dropna()
    tau, p_value = kendalltau(complete[feature], complete["c"])
    # Only tau is kept; p_value is computed but not reported.
    # Stored as a plain float so the printed dict shows bare numbers.
    kendall_test[feature] = float(tau)

print(kendall_test)



{'f2': 0.5936490155008523, 'f3': 0.7823879275615498, 'f4': 0.7604191544017106, 'f5': 0.675633325449535, 'f6': 0.7062878064817635, 'f7': 0.7764688877762504, 'f8': 0.6577258126105182, 'f9': 0.6925037262305894, 'f10': 0.5091730914657034}


In [None]:
import pandas as pd
import io
from sklearn.feature_selection import SelectKBest, f_classif

filename = "breast-cancer-wisconsin.csv"

# Read the CSV text into memory first ...
with open(filename) as file:
    data_string = file.read()

# ... then parse it from an in-memory text buffer.
data = io.StringIO(data_string)
df = pd.read_csv(data, sep=",")

# Selector configured with the f_classif scoring function and k=3.
# It is only constructed here; it is not fitted to df in this cell.
x = SelectKBest(score_func=f_classif, k=3)