In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import random
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import statistics
from sklearn.model_selection import KFold

In [None]:
# The data file has no header row (a plain read turns the first sample into
# column names such as 'Iris-setosa'), so read it with header=None and name
# the columns explicitly to keep every sample as data.
df = pd.read_csv(
    'iris.data',
    header=None,
    names=["sepal length", "sepal width", "petal length", "petal width", "class"],
)
# Distinct values of the label column.
df['class'].unique()

In [None]:
# Give the five columns readable names, then list the distinct class labels.
df.columns = ["sepal length", "sepal width", "petal length", "petal width", "class"]
# `unique` must be called; printing the attribute only shows the bound method.
print(df['class'].unique())

In [None]:
# Re-apply the column names and turn the label into a binary target:
# 1 where the class is "Iris-versicolor", 0 for every other value.
df.columns = ["sepal length", "sepal width", "petal length", "petal width", "class"]
is_versicolor = df["class"].eq("Iris-versicolor")
df["class"] = is_versicolor.astype("int64")

In [None]:
# Render the current state of the frame as this cell's output.
df

In [None]:
# Count missing values per column before cleaning, then actually keep the
# cleaned frame: dropna() returns a new DataFrame and leaves `df` untouched
# unless the result is assigned back.
missing_per_column = df.isna().sum()
df = df.dropna()
missing_per_column

In [None]:
# Work on an independent copy: plain assignment would only alias `df`, so the
# in-place column casts applied to df_filtered would also modify `df`.
df_filtered = df.copy()

In [None]:
# Number of rows whose label equals 1.
print(int((df_filtered['class'] == 1).sum()))

In [None]:
# Convert every column except the last one to float, writing each converted
# column back into df_filtered under its original name.
float_columns = {
    name: df_filtered[name].astype(float)
    for name in list(df_filtered.columns)[:-1]
}
for name, as_float in float_columns.items():
    df_filtered[name] = as_float

In [None]:
# Min-max scale the four feature columns to [0, 1]. The label column is left
# out of the arithmetic so it keeps its original values instead of being
# divided along with the features.
feature_columns = ["sepal length", "sepal width", "petal length", "petal width"]
feature_min = df_filtered[feature_columns].min()
# A constant column has zero range; divide by 1 there so it maps to 0.0
# rather than producing NaN from 0/0.
feature_range = (df_filtered[feature_columns].max() - feature_min).replace(0, 1)
# Rebind to a new frame so the frame that was scaled from is not modified.
df_filtered = df_filtered.copy()
df_filtered[feature_columns] = (df_filtered[feature_columns] - feature_min) / feature_range

In [None]:
# Feature matrix: the four measurement columns as a 2-D NumPy array.
X = df_filtered.loc[:, ["sepal length", "sepal width", "petal length", "petal width"]].to_numpy()

In [None]:
# Target as a single-column 2-D NumPy array (the column is selected with a list).
Y = df_filtered.loc[:, ['class']].to_numpy()

In [None]:
def evaluate(model,X_train,y_train,labels=[0,1]):
    """Score majority-vote predictions for the samples in ``X_train``.

    Each sample is labelled 1 when strictly more than half of its neighbours
    (as returned by ``model.kneighbors``) carry label 1, otherwise 0.

    Parameters
    ----------
    model : object exposing ``kneighbors(X)`` returning ``(distances, indices)``.
    X_train : samples to query; also the samples being scored.
    y_train : labels for ``X_train``. May be a list, NumPy array, Series or
        single-column DataFrame; it is flattened to 1-D and indexed by position.
    labels : labels forwarded to ``precision_recall_fscore_support``
        (never mutated here).

    Returns
    -------
    tuple
        ``(precision, recall, fscore, support)`` as returned by
        ``precision_recall_fscore_support``.
    """
    # Flatten once so neighbour indices are applied positionally regardless of
    # the container type or any pandas index carried by the labels.
    y_true = np.asarray(y_train).ravel()
    # NOTE(review): presumably `model` was fitted on X_train; when the query
    # points are passed explicitly each sample can be returned as its own
    # nearest neighbour - confirm that is intended.
    _, indices = model.kneighbors(X_train)
    neighbour_labels = y_true[np.asarray(indices)]
    versicolor_votes = (neighbour_labels == 1).sum(axis=1)
    # Strict majority: ties fall to class 0.
    y_pred = (2 * versicolor_votes > neighbour_labels.shape[1]).astype(int).tolist()
    return precision_recall_fscore_support(y_true, y_pred, labels=labels)

In [None]:
def test(model,y_train,X_test,y_test,labels=[0,1]):
    distances, indices = model.kneighbors(X_test)
    y_pred=[]
    for i in range(len(X_test)):
        nearest_neighbours=indices[i]
        versicolor=0
        nonversicolor=0
        for neighbour in nearest_neighbours:
            if y_train[neighbour]==1:
                versicolor+=1
            else:
                nonversicolor+=1
        if versicolor>nonversicolor:
            y_pred.append(1)
        else:
            y_pred.append(0)
    (p,r,f,s) = precision_recall_fscore_support(y_test.values, y_pred, labels=[0, 1])
    return (p,r,f,s)

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Search k = 2..9 with 10-fold cross-validation and remember the k that gave
# the highest class-0 F-score on any single fold.
# best_f_score_model holds [best F-score seen, k that produced it].
# NOTE(review): KFold is used without shuffling, so folds follow file order;
# if the rows are grouped by class the folds are not representative - confirm.
# NOTE(review): selection uses the best single fold rather than the mean
# across folds - confirm that is intended.
feature_columns = ["sepal length", "sepal width", "petal length", "petal width"]
best_f_score_model=[0,0]
kfold = KFold(10)
for k in range(2,10):
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    # Distinct names for the fold indices: reusing `test` would overwrite the
    # test() helper, and reusing the k variable below would lose the k value.
    for train_idx, test_idx in kfold.split(X,Y):
        X_train,y_train = df_filtered[feature_columns].iloc[train_idx], df_filtered[['class']].iloc[train_idx]
        X_test,y_test = df_filtered[feature_columns].iloc[test_idx], df_filtered[['class']].iloc[test_idx]

        model=nn.fit(X_train.values)
        distances, indices = model.kneighbors(X_test.values)

        # Majority vote over each test row's neighbours; ties fall to class 0.
        train_labels = y_train.values.ravel()
        versicolor_votes = (train_labels[indices] == 1).sum(axis=1)
        y_pred = (2 * versicolor_votes > indices.shape[1]).astype(int).tolist()

        (p,r,f,s) = precision_recall_fscore_support(y_test.values.ravel(), y_pred, labels=[0,1])

        if best_f_score_model[0]<f[0]:
            best_f_score_model[0]=f[0]
            best_f_score_model[1]=k

In [None]:
# Display best_f_score_model: element 0 is the highest class-0 F-score kept by
# the search, element 1 is the value stored alongside it.
best_f_score_model

In [None]:
# Search k = 2 .. len(X)//2 - 1 with 2-fold cross-validation.
# scores_for_k_value gets one list per k holding the (p, r, f, s) tuple of each fold.
# best_f_score_model holds [best class-0 F-score on any single fold, its k].
# NOTE(review): KFold is used without shuffling, so folds follow file order;
# if the rows are grouped by class the folds are not representative - confirm.
feature_columns = ["sepal length", "sepal width", "petal length", "petal width"]
scores_for_k_value=[]
best_f_score_model=[0,0]
kfold = KFold(2)
for k in range(2,len(X)//2):
    sample_score_array=[]
    nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
    # Distinct names for the fold indices: reusing `test` would overwrite the
    # test() helper, and reusing the k variable below would lose the k value.
    for train_idx, test_idx in kfold.split(X,Y):
        X_train,y_train = df_filtered[feature_columns].iloc[train_idx], df_filtered[['class']].iloc[train_idx]
        X_test,y_test = df_filtered[feature_columns].iloc[test_idx], df_filtered[['class']].iloc[test_idx]
        model=nn.fit(X_train.values)
        distances, indices = model.kneighbors(X_test.values)

        # Majority vote over each test row's neighbours; ties fall to class 0.
        train_labels = y_train.values.ravel()
        versicolor_votes = (train_labels[indices] == 1).sum(axis=1)
        y_pred = (2 * versicolor_votes > indices.shape[1]).astype(int).tolist()

        (p,r,f,s) = precision_recall_fscore_support(y_test.values.ravel(), y_pred, labels=[0, 1])
        sample_score_array.append((p,r,f,s))

        print(f'precision={p}, recall={r}, f-score={f}, support={s}')
        if best_f_score_model[0]<f[0]:
            best_f_score_model[0]=f[0]
            best_f_score_model[1]=k
    scores_for_k_value.append(sample_score_array)

In [None]:
# Hold out 10% of the samples (fixed seed), index the rest with a 30-neighbour
# Euclidean model, and label each held-out sample 1 only when strictly more of
# its neighbours carry label 1 than not.
feature_names = ["sepal length", "sepal width", "petal length", "petal width"]
X = df_filtered[feature_names].to_numpy()
Y = df_filtered['class'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)
model = NearestNeighbors(n_neighbors=30, metric='euclidean', algorithm='auto').fit(X_train)
distances, indices = model.kneighbors(X_test)
y_pred = []
for neighbour_ids in indices:
    versicolor = sum(1 for label in y_train[neighbour_ids] if label == 1)
    nonversicolor = len(neighbour_ids) - versicolor
    y_pred.append(1 if versicolor > nonversicolor else 0)
(p, r, f, s) = precision_recall_fscore_support(y_test, y_pred, labels=[0, 1])

In [None]:
# Display the most recently computed precision, recall, F-score and support
# (each holds one entry per label in [0, 1]).
p,r,f,s