## Data importing & feature selection

In [33]:
#%%writefile "select-filter.py"
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

# loading data to tables
file = open("heart.dat")
all = np.loadtxt(file, delimiter=" ")
X = all[:, :-1]
y = all[:, -1].astype(int)
features_quantity = len(X[0] - 1)

# feature selection
featureSelection = SelectKBest(f_classif)
featureSelection.fit(X, y)

# sorting by relevance
toSort = np.zeros(
    (len(featureSelection.scores_)), dtype=([("key", "<i4"), ("val", "<f8")])
)
for i in range(len(featureSelection.scores_)):
    toSort[i]["key"] = i + 1
    toSort[i]["val"] = featureSelection.scores_[i]
toSort = np.sort(toSort, order="val")[::-1]

feature_order = [] 
for i in range(features_quantity):
    print("%d. Feature no. %d, Score: %f" % (i+1, toSort[i][0], toSort[i][1]))
    feature_order.append(toSort[i][0])
    



1. Feature no. 13, Score: 101.985062
2. Feature no. 12, Score: 70.098351
3. Feature no. 9, Score: 57.169632
4. Feature no. 8, Score: 56.908981
5. Feature no. 3, Score: 56.554451
6. Feature no. 10, Score: 54.564568
7. Feature no. 11, Score: 34.477673
8. Feature no. 2, Score: 26.065310
9. Feature no. 1, Score: 12.651622
10. Feature no. 7, Score: 9.190846
11. Feature no. 4, Score: 6.630479
12. Feature no. 5, Score: 5.024383
13. Feature no. 6, Score: 0.071390


## Sorting the dataset
For convenience reasons, the dataset columns can be sorted by relevance.

In [34]:
def complement(arr):
    new_arr = []
    for x in range(len(arr)):
        new_arr.append(abs(arr[x]-len(arr)))
    return np.flip(new_arr)

permutation = complement(feature_order)

idx = np.empty_like(permutation)
idx[permutation] = np.arange(len(permutation))
X = X[:, idx]
print(X)



[[  3   3   2 ...   0   0  66]
 [  7   0   2 ...   0   0  52]
 [  7   0   0 ...   0   0   5]
 ...
 [  3   0   2 ...   0   0  38]
 [  6   0   0 ...   0   0 192]
 [  3   3   2 ...   1   0  30]]


Now the first column is the most relevant feature, the second is the second most important and so on.

## Diving data into test set and training set

### 2-fold cross validation

For validation, 2-fold cross validation repated 5 times is used.



In [35]:
from sklearn.model_selection import RepeatedStratifiedKFold

rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=10)


## KNN Alghorithm

Chosen k-values are 5, 7, 9 (odd values, because amount of classes is even).
Chosen metrics are:
- Euclidean distance
- Manhattan distance

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from numpy import mean
from numpy import std

metric_list = ['manhattan', 'euclidean']
max_features = 6

for top in range(max_features):
    print("\n")
    # select only top features
    X = X[:,:(top+1)]
    
    for p in metric_list:
        for k in range(5, 10, 2):
            model = KNeighborsClassifier(n_neighbors=k, metric=p)
            scores = []
            for train_index, test_index in rskf.split(X,y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                model.fit(X_train, y_train)
                predict = model.predict(X_test)
                scores.append(accuracy_score(y_test, predict))

            mean_score = mean(scores)
            std_score = std(scores)
            print('Accuracy(k=%d, %s metric, top %d features): %.3f (%.3f)' % (k, p, top+1, mean_score, std_score))
            






Accuracy(k=5, manhattan metric, top 1 features): 0.700 (0.107)
Accuracy(k=7, manhattan metric, top 1 features): 0.723 (0.097)
Accuracy(k=9, manhattan metric, top 1 features): 0.757 (0.028)
Accuracy(k=5, euclidean metric, top 1 features): 0.700 (0.107)
Accuracy(k=7, euclidean metric, top 1 features): 0.723 (0.097)
Accuracy(k=9, euclidean metric, top 1 features): 0.757 (0.028)


Accuracy(k=5, manhattan metric, top 2 features): 0.700 (0.107)
Accuracy(k=7, manhattan metric, top 2 features): 0.723 (0.097)
Accuracy(k=9, manhattan metric, top 2 features): 0.757 (0.028)
Accuracy(k=5, euclidean metric, top 2 features): 0.700 (0.107)
Accuracy(k=7, euclidean metric, top 2 features): 0.723 (0.097)
Accuracy(k=9, euclidean metric, top 2 features): 0.757 (0.028)


Accuracy(k=5, manhattan metric, top 3 features): 0.700 (0.107)
Accuracy(k=7, manhattan metric, top 3 features): 0.723 (0.097)
Accuracy(k=9, manhattan metric, top 3 features): 0.757 (0.028)
Accuracy(k=5, euclidean metric, top 3 features): 

## Statistical analysis

TBD