In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import statistics
from sklearn.model_selection import KFold

ANALYSIS 1:

In [None]:
# Load the raw data set from 'cleveland.csv' in the working directory.
df = pd.read_csv('cleveland.csv')

In [None]:
# Preview the first rows to check the column names and raw values.
df.head()

In [None]:
# Number of rows in the raw frame.
df.shape[0]
#303 points

In [None]:
# Binarise the target: any positive `num` becomes 1, everything else 0.
df['num'] = (df['num'] > 0).astype(int)

In [None]:
# Drop rows containing NaN. `dropna` returns a new frame, so the result has to
# be assigned back; calling it without assignment removes nothing.
# '?' placeholders are not NaN and are filtered out separately.
df = df.dropna()
# Remaining NaN count per column as a sanity check (all zeros after the drop).
df.isna().sum()

In [None]:
# Keep only the rows whose `thal` value is not the '?' placeholder.
df_filtered = df.loc[df['thal'].ne('?')]

In [None]:
# Row count after removing the '?' entries of `thal`.
df_filtered.shape[0]

In [None]:
# Show the rows that still carry the '?' placeholder in `ca`.
df_filtered.loc[df_filtered['ca'].eq('?')]

In [None]:
# Keep only the rows whose `ca` value is not the '?' placeholder.
df_filtered = df_filtered.loc[df_filtered['ca'].ne('?')]

In [None]:
# Row count after removing the '?' entries of `ca` as well.
df_filtered.shape[0]

In [None]:
# Convert every column to float in a single step. `astype` returns a new frame,
# which avoids assigning column-by-column into a filtered slice of `df` (the
# pattern that triggers pandas' SettingWithCopyWarning).
df_filtered = df_filtered.astype(float)

In [None]:
# Min-max scale every column: (value - column minimum) / (column maximum - column minimum).
column_min = df_filtered.min()
column_span = df_filtered.max() - column_min
df_filtered = (df_filtered - column_min) / column_span

In [None]:
# Print every column (all but the last one) whose Pearson correlation with
# `num` has a p-value below 0.05.
target = df_filtered['num']
for feature in df_filtered.columns[:-1]:
    corr, p_value = stats.pearsonr(df_filtered[feature], target)
    # `not <` (rather than `>=`) so that a NaN p-value is skipped as well.
    if not p_value < 0.05:
        continue
    print("The pearson correlation between "+feature+" and value is : "+str(corr)+" "+str(p_value))

For the columns listed above, the p-value of the Pearson correlation with the output is below 0.05, which means we can be confident that these columns are correlated with the target.

In [None]:
# Feature matrix: the 11 predictor columns as a NumPy array.
feature_columns = ['age', 'sex', 'cp', 'trestbps', 'restecg', 'thalach',
                   'exang', 'oldpeak', 'slope', 'ca', 'thal']
X = df_filtered[feature_columns].to_numpy()
# Two-feature alternative kept for reference:
# X=df_filtered[['trestbps',  'cp']].values

In [None]:
# Target vector: the binary `num` column as a NumPy array.
Y = df_filtered.loc[:, 'num'].to_numpy()

In [None]:
 # Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
 X_train, X_test, y_train, y_test = train_test_split(
     X,
     Y,
     test_size=0.1,
     random_state=42,
 )

We have filtered and normalized our data up to this step. Now we create the training and testing functions that we will later use on the k-fold splits of the data.

In [None]:
def evaluate(model,X_train,y_train,labels=[0,1]):
    """Score majority-vote nearest-neighbour predictions for ``X_train``.

    For every row of ``X_train`` the neighbours returned by
    ``model.kneighbors`` vote with their ``y_train`` label. A row is predicted
    0 only when strictly more neighbours carry label 0 than any other label;
    ties and non-zero majorities are predicted 1.

    Parameters
    ----------
    model : fitted neighbour index whose ``kneighbors(X)`` returns
        ``(distances, indices)``.
    X_train : array-like of rows to query.
    y_train : array-like of labels; it is flattened and addressed by position
        with the returned neighbour indices.
    labels : class labels forwarded to ``precision_recall_fscore_support``
        (previously ignored in favour of a hard-coded ``[0, 1]``). The list
        default is never mutated.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, support)`` exactly as returned by
        ``precision_recall_fscore_support``.
    """
    # NOTE(review): if `model` was fitted on this same `X_train`, each row is
    # presumably returned among its own neighbours and votes for itself —
    # confirm whether that optimistic training score is intended.
    _, indices = model.kneighbors(X_train)
    # Positional lookup of every neighbour's label, one row of votes per query row.
    votes = np.asarray(y_train).ravel()[np.asarray(indices)]
    healthy = (votes == 0).sum(axis=1)
    sick = votes.shape[1] - healthy
    y_pred = np.where(healthy > sick, 0, 1)
    (p,r,f,s) = precision_recall_fscore_support(y_train, y_pred, labels=labels)
    return (p,r,f,s)

In [None]:
def test(model,y_train,X_test,y_test,labels=[0,1]):
    """Score majority-vote nearest-neighbour predictions for ``X_test``.

    For every row of ``X_test`` the neighbours returned by
    ``model.kneighbors`` vote with their ``y_train`` label. A row is predicted
    0 only when strictly more neighbours carry label 0 than any other label;
    ties and non-zero majorities are predicted 1.

    Parameters
    ----------
    model : fitted neighbour index whose ``kneighbors(X)`` returns
        ``(distances, indices)``.
    y_train : array-like of labels for the rows the returned indices refer to;
        it is flattened and addressed by position.
    X_test : array-like of rows to classify.
    y_test : true labels of ``X_test``, compared against the predictions.
    labels : class labels forwarded to ``precision_recall_fscore_support``
        (previously ignored in favour of a hard-coded ``[0, 1]``). The list
        default is never mutated.

    Returns
    -------
    tuple
        ``(precision, recall, fscore, support)`` exactly as returned by
        ``precision_recall_fscore_support``.
    """
    _, indices = model.kneighbors(X_test)
    # Positional lookup of every neighbour's label, one row of votes per test row.
    votes = np.asarray(y_train).ravel()[np.asarray(indices)]
    healthy = (votes == 0).sum(axis=1)
    sick = votes.shape[1] - healthy
    y_pred = np.where(healthy > sick, 0, 1)
    (p,r,f,s) = precision_recall_fscore_support(y_test, y_pred, labels=labels)
    return (p,r,f,s)

In [None]:
# Search the neighbour count k with 10-fold cross-validation.
# best_f_score_model = [best single-fold F1-score of class 0, k that achieved it]
feature_columns = ['age', 'sex', 'cp', 'trestbps', 'restecg', 'thalach',
                   'exang', 'oldpeak', 'slope', 'ca', 'thal']
best_f_score_model=[0,0]
kfold = KFold(10)
for k in range(2,len(X)//2):
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    # The fold indices are named `train_idx`/`test_idx` so that they cannot
    # overwrite the `test()` helper function.
    for train_idx, test_idx in kfold.split(X,Y):
        X_train, y_train = df_filtered[feature_columns].iloc[train_idx], df_filtered[['num']].iloc[train_idx]
        X_test, y_test = df_filtered[feature_columns].iloc[test_idx], df_filtered[['num']].iloc[test_idx]
        model = nn.fit(X_train.values)
        distances, indices = model.kneighbors(X_test.values)
        # Majority vote over the k neighbours of every test row; ties go to class 1.
        votes = y_train.values.ravel()[indices]
        healthy = (votes == 0).sum(axis=1)
        sick = votes.shape[1] - healthy
        y_pred = np.where(healthy > sick, 0, 1)
        (p,r,f,s) = precision_recall_fscore_support(y_test.values.ravel(), y_pred, labels=[0,1])

        # Store k itself. No inner loop reuses the name, so `k` is still the
        # neighbour count here and not the index of the last test row.
        if best_f_score_model[0] < f[0]:
            best_f_score_model[0] = f[0]
            best_f_score_model[1] = k

In [None]:
# Display [best F1-score found, value stored alongside it] from the search above.
best_f_score_model

The best model is the one whose neighbour count is shown above (28 in the recorded run), using the 11 attributes listed above.


In [None]:
# Repeat the 10-fold search, this time keeping every fold's scores.
# scores_for_k_value[j] holds the (precision, recall, fscore, support) tuples
# of all folds for k = j + 2.
feature_columns = ['age', 'sex', 'cp', 'trestbps', 'restecg', 'thalach',
                   'exang', 'oldpeak', 'slope', 'ca', 'thal']
scores_for_k_value=[]
best_f_score_model=[0,0]
kfold = KFold(10)
for k in range(2,len(X)//2):
    sample_score_array=[]
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    # The fold indices are named `train_idx`/`test_idx` so that they cannot
    # overwrite the `test()` helper function.
    for train_idx, test_idx in kfold.split(X,Y):
        X_train, y_train = df_filtered[feature_columns].iloc[train_idx], df_filtered[['num']].iloc[train_idx]
        X_test, y_test = df_filtered[feature_columns].iloc[test_idx], df_filtered[['num']].iloc[test_idx]
        model = nn.fit(X_train.values)
        distances, indices = model.kneighbors(X_test.values)
        # Majority vote over the k neighbours of every test row; ties go to class 1.
        votes = y_train.values.ravel()[indices]
        healthy = (votes == 0).sum(axis=1)
        sick = votes.shape[1] - healthy
        y_pred = np.where(healthy > sick, 0, 1)
        (p,r,f,s) = precision_recall_fscore_support(y_test.values.ravel(), y_pred, labels=[0,1])
        sample_score_array.append((p,r,f,s))

        # Store k itself. No inner loop reuses the name, so `k` is still the
        # neighbour count here and not the index of the last test row.
        if best_f_score_model[0] < f[0]:
            best_f_score_model[0] = f[0]
            best_f_score_model[1] = k
    scores_for_k_value.append(sample_score_array)

In [None]:
# Mean class-0 F1-score across the folds, one entry per candidate k.
# The divisor is the actual number of folds instead of a hard-coded 10, so the
# average stays correct if the fold count is ever changed.
f1_scores=[]
for k_measure_array in scores_for_k_value:
    fold_f1 = [scores[2][0] for scores in k_measure_array]
    f1_scores.append(sum(fold_f1) / len(fold_f1) if fold_f1 else 0.0)

In [None]:
# f1_scores[0] belongs to k = 2 (the search starts at 2), so number the scores
# from 2. This prints every score next to the k it was measured for, including
# the first one.
for k, score in enumerate(f1_scores, start=2):
    print(k, score)

In [None]:
import matplotlib.pyplot as plt
# Derive the x-values from the score list itself (first score belongs to k = 2)
# so the two sequences always have the same length.
k_values = range(2, 2 + len(f1_scores))
fig, ax = plt.subplots()
ax.plot(k_values, f1_scores)
ax.set_xlabel("k (number of neighbours)")
ax.set_ylabel("mean F1-score (class 0)")
ax.set_title("Mean cross-validated F1-score per k")
fig.savefig("img1.png")

In [None]:
# Median class-0 F1-score across the folds, one entry per candidate k.
# `statistics.median` sorts once and averages the two middle values for an even
# number of folds, so no fold count or index is hard-coded.
median_scores=[]
for k_measure_array in scores_for_k_value:
    fold_f1 = [scores[2][0] for scores in k_measure_array]
    median_scores.append(statistics.median(fold_f1))

In [None]:
# median_scores[0] belongs to k = 2 (the search starts at 2), so number the
# scores from 2. This prints every score next to the k it was measured for,
# including the first one.
for k, score in enumerate(median_scores, start=2):
    print(k, score)

In [None]:
import matplotlib.pyplot as plt
# Derive the x-values from the score list itself (first score belongs to k = 2)
# so the two sequences always have the same length.
k_values = range(2, 2 + len(median_scores))
fig, ax = plt.subplots()
ax.plot(k_values, median_scores)
ax.set_xlabel("k (number of neighbours)")
ax.set_ylabel("median F1-score (class 0)")
ax.set_title("Median cross-validated F1-score per k")
fig.savefig("img2.png")

In [None]:
# Hold-out evaluation of an 8-neighbour model on a 90/10 train/test split.
X = df_filtered[['age', 'sex', 'cp', 'trestbps', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal']].values
Y = df_filtered['num'].values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
model = NearestNeighbors(n_neighbors=8, metric='euclidean', algorithm='auto').fit(X_train)
distances, indices = model.kneighbors(X_test)

# Majority vote per test row: predict 0 only on a strict majority of label-0
# neighbours, otherwise 1.
y_pred = []
for neighbour_ids in indices:
    healthy_votes = sum(1 for idx in neighbour_ids if y_train[idx] == 0)
    sick_votes = len(neighbour_ids) - healthy_votes
    y_pred.append(0 if healthy_votes > sick_votes else 1)

(p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])

In [None]:
# Display precision, recall, F-score and support; each holds one value per label in [0, 1].
p,r,f,s

In [None]:
# `f[0]` is the F1-score of class 0, not the accuracy. Report the real accuracy
# (fraction of correctly classified test rows) under this label instead.
print("accuracy is ", accuracy_score(y_test, y_pred))

In [None]:
#Adding the code for prediction on a test dataset:
# NOTE(review): this cell splits the sample file itself into train/test, fits a
# new 12-neighbour model on it and scales it with the sample's own min/max; the
# model and scaling from the earlier cells are not reused. Confirm that this is
# the intended way to evaluate the sample.
df=pd.read_csv('cleveland-test-sample.csv')
df['disease'] = df.disease.apply(lambda x: (1 if x>0 else 0))
# `dropna` returns a new frame; assign it so rows containing NaN are really removed.
df = df.dropna()
# '?' placeholders are not NaN, so they are filtered out explicitly.
df_filtered=df[(df.thal!='?') & (df.ca!='?')]
# Cast every column except the last one to float in a single `astype` call,
# instead of assigning column-by-column into a filtered slice of `df`.
df_filtered = df_filtered.astype({column: float for column in df_filtered.columns[:-1]})
df_filtered=(df_filtered-df_filtered.min())/(df_filtered.max()-df_filtered.min())
X = df_filtered[['age', 'sex', 'cp', 'trestbps', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal']].values
# X=df_filtered[['trestbps',  'cp']].values
Y=df_filtered['disease'].values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1,random_state=42)
model= NearestNeighbors(n_neighbors=12, metric='euclidean', algorithm='auto').fit(X_train)
distances, indices = model.kneighbors(X_test)
# Majority vote per test row: predict 0 only on a strict majority of label-0
# neighbours, otherwise 1.
y_pred=[]
for neighbour_ids in indices:
    healthy = sum(1 for neighbour in neighbour_ids if y_train[neighbour]==0)
    sick = len(neighbour_ids) - healthy
    y_pred.append(0 if healthy>sick else 1)
(p,r,f,s) = precision_recall_fscore_support(y_test, y_pred, labels=[0,1])
print(p,r,f,s)