In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer, precision_score,recall_score
import itertools
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import StratifiedKFold

In [3]:
data = pd.read_csv('core/cleveland.csv')

In [4]:
data.num = data.num.apply(lambda x: 1 if x >0 else 0)

In [5]:
feature_list = data.columns[:-1]

In [6]:
feature_list

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object')

In [7]:
combinations = []
for r in range(4, len(feature_list) + 1):
    combinations.extend(itertools.combinations(feature_list, r))

combinations = [list(combo) for combo in combinations]

In [8]:
data_filtered=data.replace('?',pd.NA)

In [9]:
data_filtered = data_filtered.dropna()

In [10]:
features=data_filtered.corr()[['num']].apply(lambda x: abs(x)).sort_values('num',ascending=False).index[1:]

In [11]:
features

Index(['thal', 'ca', 'oldpeak', 'thalach', 'exang', 'cp', 'slope', 'sex',
       'age', 'restecg', 'trestbps', 'chol', 'fbs'],
      dtype='object')

In [12]:
class CustomKNeighbors(BaseEstimator, ClassifierMixin):
    def __init__(self, n_neighbors):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        self.nn = NearestNeighbors(n_neighbors=self.n_neighbors)
        self.nn.fit(X)
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        distances, indices = self.nn.kneighbors(X)
        predictions = []
        for idx in indices:
            neighbor_labels = self.y_train[idx]
            predicted_value = np.median(neighbor_labels)  # Calculate the median
            
            # Adjust the prediction based on the median value
            if predicted_value == 0.5:
                predictions.append(1)  # If median is 0.5, predict 1
            else:
                predictions.append(predicted_value)  # Otherwise, use the median
        return np.array(predictions)


In [13]:
results = []

feature_indices = {
    tuple(feature_combination): [data_filtered.columns.get_loc(col) for col in feature_combination]
    for feature_combination in combinations
}


X = data_filtered.drop(columns='num').values
y = data_filtered['num'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

param_grid = {'n_neighbors': np.arange(1, 21, 2)}
f1_scorer = make_scorer(f1_score, average='weighted', greater_is_better=True)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for feature_combination, indices in feature_indices.items():
    X_train_subset = X_train_scaled[:, indices]
    X_test_subset = X_test_scaled[:, indices]

    custom_knn = CustomKNeighbors(n_neighbors=5)

    grid_search = GridSearchCV(custom_knn, param_grid, cv=skf, scoring=f1_scorer, verbose=1)
    grid_search.fit(X_train_subset, y_train)

    best_model = grid_search.best_estimator_

    results.append({
        'feature_combination': feature_combination,
        'best_n_neighbors': grid_search.best_params_['n_neighbors'],
        'best_cv_score': grid_search.best_score_,
    })

knn_results = pd.DataFrame(results)
knn_results.to_csv('Knn_Results1.csv')

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Fitting 10 folds for each of 10 candidates, totalling 1

In [21]:
knn_results.to_csv('Knn_Results1.csv')

In [19]:
feature_combination

('age',
 'sex',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalach',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal')

In [28]:
feature_indices

{('age', 'sex', 'cp', 'trestbps'): [0, 1, 2, 3],
 ('age', 'sex', 'cp', 'chol'): [0, 1, 2, 4],
 ('age', 'sex', 'cp', 'fbs'): [0, 1, 2, 5],
 ('age', 'sex', 'cp', 'restecg'): [0, 1, 2, 6],
 ('age', 'sex', 'cp', 'thalach'): [0, 1, 2, 7],
 ('age', 'sex', 'cp', 'exang'): [0, 1, 2, 8],
 ('age', 'sex', 'cp', 'oldpeak'): [0, 1, 2, 9],
 ('age', 'sex', 'cp', 'slope'): [0, 1, 2, 10],
 ('age', 'sex', 'cp', 'ca'): [0, 1, 2, 11],
 ('age', 'sex', 'cp', 'thal'): [0, 1, 2, 12],
 ('age', 'sex', 'trestbps', 'chol'): [0, 1, 3, 4],
 ('age', 'sex', 'trestbps', 'fbs'): [0, 1, 3, 5],
 ('age', 'sex', 'trestbps', 'restecg'): [0, 1, 3, 6],
 ('age', 'sex', 'trestbps', 'thalach'): [0, 1, 3, 7],
 ('age', 'sex', 'trestbps', 'exang'): [0, 1, 3, 8],
 ('age', 'sex', 'trestbps', 'oldpeak'): [0, 1, 3, 9],
 ('age', 'sex', 'trestbps', 'slope'): [0, 1, 3, 10],
 ('age', 'sex', 'trestbps', 'ca'): [0, 1, 3, 11],
 ('age', 'sex', 'trestbps', 'thal'): [0, 1, 3, 12],
 ('age', 'sex', 'chol', 'fbs'): [0, 1, 4, 5],
 ('age', 'sex', 'ch

In [29]:
knn_result = pd.DataFrame(results)

In [161]:
knn_result.to_csv('

In [135]:
Y

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int6

In [155]:
combinations

[['age', 'sex', 'cp', 'trestbps'],
 ['age', 'sex', 'cp', 'chol'],
 ['age', 'sex', 'cp', 'fbs'],
 ['age', 'sex', 'cp', 'restecg'],
 ['age', 'sex', 'cp', 'thalach'],
 ['age', 'sex', 'cp', 'exang'],
 ['age', 'sex', 'cp', 'oldpeak'],
 ['age', 'sex', 'cp', 'slope'],
 ['age', 'sex', 'cp', 'ca'],
 ['age', 'sex', 'cp', 'thal'],
 ['age', 'sex', 'trestbps', 'chol'],
 ['age', 'sex', 'trestbps', 'fbs'],
 ['age', 'sex', 'trestbps', 'restecg'],
 ['age', 'sex', 'trestbps', 'thalach'],
 ['age', 'sex', 'trestbps', 'exang'],
 ['age', 'sex', 'trestbps', 'oldpeak'],
 ['age', 'sex', 'trestbps', 'slope'],
 ['age', 'sex', 'trestbps', 'ca'],
 ['age', 'sex', 'trestbps', 'thal'],
 ['age', 'sex', 'chol', 'fbs'],
 ['age', 'sex', 'chol', 'restecg'],
 ['age', 'sex', 'chol', 'thalach'],
 ['age', 'sex', 'chol', 'exang'],
 ['age', 'sex', 'chol', 'oldpeak'],
 ['age', 'sex', 'chol', 'slope'],
 ['age', 'sex', 'chol', 'ca'],
 ['age', 'sex', 'chol', 'thal'],
 ['age', 'sex', 'fbs', 'restecg'],
 ['age', 'sex', 'fbs', 'thalac

In [164]:
knn_result.describe()

Unnamed: 0,best_n_neighbors,best_score
count,7814.0,7814.0
mean,13.082416,0.786616
std,5.086163,0.036562
min,1.0,0.548922
25%,9.0,0.765779
50%,15.0,0.792818
75%,17.0,0.813828
max,19.0,0.857012


In [170]:
knn_results.best_score

NameError: name 'knn_results' is not defined

In [30]:
knn_results = pd.read_csv('knn_results.csv')

In [34]:
knn_results.feature

0                        ['age', 'sex', 'cp', 'trestbps']
1                            ['age', 'sex', 'cp', 'chol']
2                             ['age', 'sex', 'cp', 'fbs']
3                         ['age', 'sex', 'cp', 'restecg']
4                         ['age', 'sex', 'cp', 'thalach']
                              ...                        
7809    ['age', 'sex', 'cp', 'chol', 'fbs', 'restecg',...
7810    ['age', 'sex', 'trestbps', 'chol', 'fbs', 'res...
7811    ['age', 'cp', 'trestbps', 'chol', 'fbs', 'rest...
7812    ['sex', 'cp', 'trestbps', 'chol', 'fbs', 'rest...
7813    ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs'...
Name: feature, Length: 7814, dtype: object