In [103]:
import numpy as np
import pandas as pd
from collections import Counter 
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [104]:
df = pd.read_csv("cleveland.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [105]:
df.shape

(303, 14)

In [106]:
df['disease']=np.where(df['num']==0, 0, 1)
df.drop(columns='num', inplace=True)
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1


In [107]:
df['ca'] = df['ca'].apply(lambda x: float(0) if x == '?' else float(x))

In [108]:
df['thal'] = df['thal'].apply(lambda x: float(0) if x == '?' else float(x))

In [109]:
# Select features and target
X = df.drop('disease', axis=1)
y = df['disease']

# standardize the data (kNN is sensitive to scale)
X = (X - X.mean()) / X.std()
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,0.947160,0.685069,-2.248056,0.756274,-0.264463,2.390484,1.015005,0.017169,-0.69548,1.085542,2.270822,-0.709957,0.658044
1,1.389703,0.685069,0.876535,1.608559,0.759159,-0.416945,1.015005,-1.818896,1.43311,0.396526,0.648041,2.500744,-0.863997
2,1.389703,0.685069,0.876535,-0.664201,-0.341717,-0.416945,1.015005,-0.900864,1.43311,1.343924,0.648041,1.430510,1.165391
3,-1.929372,0.685069,-0.164995,-0.096011,0.063869,-0.416945,-0.995103,1.634655,-0.69548,2.119067,2.270822,-0.709957,-0.863997
4,-1.486829,-1.454889,-1.206525,-0.096011,-0.824558,-0.416945,1.015005,0.978917,-0.69548,0.310399,-0.974740,-0.709957,-0.863997
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,-1.044285,0.685069,-2.248056,-1.232391,0.334260,-0.416945,-0.995103,-0.769716,-0.69548,0.138144,0.648041,-0.709957,1.165391
299,1.500339,0.685069,0.876535,0.699455,-1.037008,2.390484,-0.995103,-0.376274,-0.69548,2.032940,0.648041,1.430510,1.165391
300,0.283345,0.685069,0.876535,-0.096011,-2.234453,-0.416945,-0.995103,-1.512885,1.43311,0.138144,0.648041,0.360277,1.165391
301,0.283345,-1.454889,-1.206525,-0.096011,-0.206522,-0.416945,1.015005,1.066349,-0.69548,-0.895381,0.648041,0.360277,-0.863997


# Custom Model

In [113]:
from sklearn.base import BaseEstimator, ClassifierMixin

# Custom kNN Classifier with BaseEstimator and ClassifierMixin for GridSearchCV compatibility
class CustomKNN(BaseEstimator, ClassifierMixin):
    def __init__(self, k=5):
        self.k = k
    
    # Euclidean distance calculation
    def euclidean_distance(self, row1, row2):
        return np.sqrt(np.sum((row1 - row2) ** 2))
    
    # Fit method to store the training data
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    # Prediction method
    def predict(self, X_test):
        predictions = []
        for test_point in X_test:
            # Calculate distance between test_point and all training points
            distances = []
            for i, train_point in enumerate(self.X_train):
                dist = self.euclidean_distance(test_point, train_point)
                distances.append((dist, self.y_train[i]))

            # Sort distances and get k-nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]

            # Get labels of the k-nearest neighbors and do majority voting
            k_nearest_labels = [label for _, label in k_nearest]
            most_common = Counter(k_nearest_labels).most_common(1)
            predictions.append(most_common[0][0])

        return np.array(predictions)

# Checking all the features combination from 3 upto 13

In [114]:
import itertools
com = []
for i in range(3, 12):  
    com.append(list(itertools.combinations(range(13), i)))
f_com = [tup for sublist in com for tup in sublist]

In [115]:
saved = []
for i in range(len(f_com)):
    features = []
    for j in range(len(f_com[i])):
        features.append(df.columns[f_com[i][j]])
    #print(features)
    # Initialize kNN model
    k = 5  # You can adjust this based on cross-validation results
    knn = CustomKNN(k=k)
    X = df[features].values
    y = df['disease'].values

    # Initialize KFold cross-validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # Arrays to hold evaluation metrics for each fold
    precision_scores = []
    recall_scores = []
    f1_scores = []

    # 10-fold cross-validation
    fold = 1
    for train_index, test_index in kf.split(X):
        #print(f"Fold {fold}")
        fold += 1
        
        # Split the data into training and test sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Fit the model
        knn.fit(X_train, y_train)
        
        # Make predictions
        y_pred = knn.predict(X_test)
        
        # Calculate metrics for this fold
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        # Append metrics to lists
        precision_scores.append(precision)
        recall_scores.append(recall)
        f1_scores.append(f1)
        
        # Print fold results
        #print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

    # Calculate average metrics across all folds
    mean_precision = np.mean(precision_scores)
    mean_recall = np.mean(recall_scores)
    mean_f1 = np.mean(f1_scores)
    saved.append((features, mean_precision, mean_recall, mean_f1))
    # Print the final average metrics
    # print("\nFinal Results (Averaged over 10 folds):")
    # print(f'Mean Precision: {mean_precision:.4f}')
    # print(f'Mean Recall: {mean_recall:.4f}')
    # print(f'Mean F1 Score: {mean_f1:.4f}')

In [116]:
import json
# Save the list to a file
with open("my_list.json", "w") as f:
    json.dump(saved, f)

# Load the list from the file
with open("my_list.json", "r") as f:
    loaded_list = json.load(f)

In [117]:
len(loaded_list)

8086

In [118]:
findings = pd.DataFrame()

In [119]:
findings['features'] = [loaded_list[i][0] for i in range(len(loaded_list))]
findings['precision'] = [loaded_list[i][1] for i in range(len(loaded_list))]
findings['recall'] = [loaded_list[i][2] for i in range(len(loaded_list))]
findings['f1'] = [loaded_list[i][3] for i in range(len(loaded_list))]

In [120]:
findings

Unnamed: 0,features,precision,recall,f1
0,"[age, sex, cp]",0.657621,0.679363,0.658082
1,"[age, sex, trestbps]",0.514637,0.531945,0.516291
2,"[age, sex, chol]",0.488714,0.472127,0.462639
3,"[age, sex, fbs]",0.607764,0.549424,0.552925
4,"[age, sex, restecg]",0.538236,0.490566,0.498375
...,...,...,...,...
8081,"[sex, cp, trestbps, chol, restecg, thalach, ex...",0.628142,0.535162,0.568684
8082,"[sex, cp, trestbps, fbs, restecg, thalach, exa...",0.715588,0.603983,0.647857
8083,"[sex, cp, chol, fbs, restecg, thalach, exang, ...",0.626239,0.595760,0.598022
8084,"[sex, trestbps, chol, fbs, restecg, thalach, e...",0.633697,0.535162,0.571065


In [121]:
findings.iloc[findings['f1'].idxmax(), :]

features     [cp, fbs, exang, ca, thal]
precision                      0.878244
recall                         0.804011
f1                             0.830261
Name: 1996, dtype: object

# Searching for K value

In [123]:
# Define parameter grid for GridSearchCV (different values of k)
param_grid = {
    'k': [3, 5, 7, 9, 11, 13, 15, 17]  # Different values of k for grid search
}
knn = CustomKNN()
# Initialize GridSearchCV with 10-fold cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='f1')

X = df[["cp", "fbs", "exang", "ca", "thal"]].values
y = df['disease'].values
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from grid search
best_params = grid_search.best_params_
print(f'Best parameters found: {best_params}')

# Use the best model found by GridSearchCV
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Output the results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')


Best parameters found: {'k': 3}
Precision: 0.9041
Recall: 0.8998
F1 Score: 0.9010


# Checking Consistancy

In [124]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_pred = best_model.predict(X_test)
# Calculate evaluation metrics
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Output the results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Precision: 0.8849
Recall: 0.8858
F1 Score: 0.8851


# Sample data

In [137]:
sample_data = pd.read_csv('cleveland-test-sample.csv')
sample_data.replace('?', pd.NA, inplace=True)
sample_data.dropna(inplace=True)
sample_data.drop('Unnamed: 0',inplace=True, axis=1)
sample_data['ca']= sample_data['ca'].astype('float')
sample_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,44.0,1.0,3.0,120.0,226.0,0.0,0.0,169.0,0.0,0.0,1.0,0.0,3.0,0
1,62.0,0.0,3.0,130.0,263.0,0.0,0.0,97.0,0.0,1.2,2.0,1.0,7.0,1
2,65.0,0.0,3.0,160.0,360.0,0.0,2.0,151.0,0.0,0.8,1.0,0.0,3.0,0
3,41.0,1.0,3.0,112.0,250.0,0.0,0.0,179.0,0.0,0.0,1.0,0.0,3.0,0
4,37.0,0.0,3.0,120.0,215.0,0.0,0.0,170.0,0.0,0.0,1.0,0.0,3.0,0
5,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,0
6,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
7,63.0,1.0,4.0,130.0,330.0,1.0,2.0,132.0,1.0,1.8,1.0,3.0,7.0,1
8,43.0,1.0,4.0,115.0,303.0,0.0,0.0,181.0,0.0,1.2,2.0,0.0,3.0,0
9,65.0,0.0,3.0,140.0,417.0,1.0,2.0,157.0,0.0,0.8,1.0,1.0,3.0,0


In [138]:
sample_data_x = sample_data[["cp", "fbs", "exang", "ca", "thal"]].values
sample_data_y = sample_data['disease'].values

In [139]:
y_pred = best_model.predict(sample_data_x)
# Calculate evaluation metrics
precision = precision_score(sample_data_y, y_pred, average='macro')
recall = recall_score(sample_data_y, y_pred, average='macro')
f1 = f1_score(sample_data_y, y_pred, average='macro')

# Output the results
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Precision: 0.8505
Recall: 0.8413
F1 Score: 0.8436
