# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA

In [None]:
# Load the Iris dataset from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/iris-flower-dataset/IRIS.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Preview the last rows as well.
df.tail()

# Preprocessing

In [None]:
# Print a summary of the columns (dtypes and non-null counts).
df.info()

In [None]:
# Descriptive statistics, transposed so that each feature is one row.
df.describe().T

In [None]:
# List the distinct class labels present in the target column.
df["species"].unique()

In [None]:
# Count the rows per class to check how balanced the dataset is.
df["species"].value_counts()

In [None]:
# Plot a histogram of the species column (visual check of class balance).
df["species"].hist()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# One sub-frame per species; these feed the per-class scatter plot below.
# (Must run before the species column is label-encoded, while it still holds
# the original string names.)
Setosa = df.loc[df["species"] == "Iris-setosa"]
Versicolour = df.loc[df["species"] == "Iris-versicolor"]
Virginica = df.loc[df["species"] == "Iris-virginica"]

In [None]:
# Petal length vs. petal width, one colour per species, on a single axes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(
    Setosa["petal_length"], Setosa["petal_width"],
    c="b", marker="o", label="Setosa",
)
ax.scatter(
    Versicolour["petal_length"], Versicolour["petal_width"],
    c="orange", marker="o", label="Versicolour",
)
ax.scatter(
    Virginica["petal_length"], Virginica["petal_width"],
    c="g", marker="o", label="Virginica",
)
# The legend is built from the `label=` of each scatter call above.
leg = ax.legend()
ax.set_xlabel("petal_length")
ax.set_ylabel("petal_width")
ax.set_title(
    "classification of Setosa,Versicolour and Virginica according to petal_length and petal_width"
)

In [None]:
# Replace the species names with integer class labels, overwriting the column
# in place. `le` is kept so the original names can be recovered later
# (e.g. via le.inverse_transform).
le = LabelEncoder()
df['species']=le.fit_transform(df['species'])

# Correlation between Features

In [None]:
# Pairwise correlation of all columns (the species column is already
# integer-encoded at this point), rendered as an annotated heatmap.
plt.figure(figsize=(20, 10))
correlation_matrix = df.corr()
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Separate the feature columns (X) from the encoded target column (y).
X=df.drop("species",axis=1)
y=df["species"]

In [None]:
# Remember the feature names so they can be re-attached after scaling.
Features=X.columns

In [None]:
# Min-max scale every feature (MinMaxScaler's default range is [0, 1]) so that
# no single feature dominates the KNN distance computation.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in this notebook, so test-set min/max values leak
# into preprocessing. Consider fitting on the training rows only.
scaler=MinMaxScaler()
X=scaler.fit_transform(X)

In [None]:
# Wrap the scaled values back into a DataFrame with the saved column names.
X=pd.DataFrame(X,columns=Features)

In [None]:
# Preview the scaled features.
X.head()

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=True)

In [None]:
# Sanity-check the shapes of the four split pieces.
X_train.shape,y_train.shape,X_test.shape,y_test.shape

# KNN (built-in scikit-learn implementation)

In [None]:
# Baseline: scikit-learn's KNN with 15 neighbours on the scaled features.
knn = KNeighborsClassifier(algorithm="auto", n_neighbors=15)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Mean accuracy on the training and test splits, then the test accuracy again
# computed explicitly from the predictions.
print('KNNClassifierModel Train Score is : ' , knn.score(X_train, y_train))
print('KNNClassifierModel Test Score is : ' , knn.score(X_test, y_test))
print(accuracy_score(y_test, y_pred))

In [None]:
def plot_elbow_method(X_train, X_test, title, color='blue', ks=None):
    """Plot the test-set error rate of KNN against the number of neighbours.

    For every ``k`` in ``ks`` a ``KNeighborsClassifier`` is fit on ``X_train``
    and used to predict ``X_test``; the fraction of misclassified test rows is
    then drawn against ``k`` on a new matplotlib figure.

    NOTE: the labels are not parameters. The function reads the module-level
    ``y_train`` and ``y_test`` created by the train/test split, so the feature
    matrices passed in must be row-aligned with those globals.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    title : str
        Title of the figure.
    color : str, default 'blue'
        Line colour passed to ``plt.plot``.
    ks : iterable of int, optional
        Neighbour counts to evaluate. Defaults to the odd values 1, 3, ..., 99.

    Returns
    -------
    None
        The curve is drawn on a new figure; ``plt.show()`` is not called.
    """
    if ks is None:
        ks = range(1, 100, 2)
    # Materialise once so a one-shot iterator can be reused for plotting.
    ks = list(ks)

    err = []
    for k in ks:
        knn = KNeighborsClassifier(n_neighbors=k, algorithm="auto")
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        # Error rate = share of test rows whose prediction differs from the label.
        err.append(np.mean(y_pred != y_test))

    plt.figure()
    plt.title(title)
    plt.xlabel("Number of Ks")
    plt.ylabel("Error")
    plt.plot(ks, err, color=color)

In [None]:
# Error-vs-k curve for KNN on the scaled (non-PCA) features.
plot_elbow_method(X_train, X_test, "Elbow Method for X_train and X_test")

# Applying PCA, then using KNN

In [None]:
# Learn a 2-component projection from the training rows only, then apply the
# same projection to both splits.
pca = PCA(n_components=2).fit(X_train)
X_train_pca, X_test_pca = pca.transform(X_train), pca.transform(X_test)

In [None]:
# Sanity-check the shapes after the PCA projection.
X_train_pca.shape,y_train.shape,X_test_pca.shape,y_test.shape

In [None]:
# Same error-vs-k curve, this time on the PCA-projected features (drawn in red).
plot_elbow_method(X_train_pca, X_test_pca, "Elbow Method for PCA-transformed X_train and X_test", color='red')

In [None]:
# Same 15-neighbour scikit-learn KNN, now trained on the PCA-projected features.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train_pca, y_train)
y_pred = knn.predict(X_test_pca)

# Mean accuracy on both splits, then the test accuracy from the predictions.
print('KNNClassifierModel Train Score is : ' , knn.score(X_train_pca, y_train))
print('KNNClassifierModel Test Score is : ' , knn.score(X_test_pca, y_test))
print(accuracy_score(y_test, y_pred))

# Steps to build KNN
1. Choose the Value of K: Determine the value of K, which represents the number of nearest neighbors to consider. Typically, you can use techniques such as cross-validation to select the optimal K value.

2. Calculate Distance: For each data point in the test set, calculate its distance to all data points in the training set. Common distance metrics include Euclidean distance, Manhattan distance, and cosine distance (one minus the cosine similarity).

3. Find K Nearest Neighbors: Select the K data points from the training set that are closest to the test data point based on the calculated distances.
 
4. Majority Vote (Classification) or Weighted Average (Regression): For classification problems, assign the class label that is most frequent among the K nearest neighbors. For regression problems, compute the weighted average of the target values of the K nearest neighbors, where the weights are inversely proportional to the distance.

5. Make Predictions: Use the majority class or the computed average to make predictions for the test data points.

6. Evaluate the Model: Assess the performance of the KNN model using evaluation metrics such as accuracy, precision, recall, F1-score (for classification), or Mean Squared Error, Mean Absolute Error, R-squared (for regression).

7. Tune Hyperparameters: Fine-tune hyperparameters such as the value of K or the choice of distance metric based on the model's performance on the validation set.

# KNN from Scratch

In [None]:
from collections import Counter
from sklearn.neighbors import KDTree

class KNN:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of nearest training points that vote on each prediction.
    algorithm : {'brute', 'kd_tree'}, default 'brute'
        'brute' computes the distance from each query to every training point;
        'kd_tree' queries a ``KDTree`` that is built during ``fit``.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the supported names.
    """

    _ALGORITHMS = ('brute', 'kd_tree')

    def __init__(self, n_neighbors=5, algorithm='brute'):
        # Fail fast: an unknown algorithm used to surface only at predict time
        # as an UnboundLocalError.
        if algorithm not in self._ALGORITHMS:
            raise ValueError(
                f"algorithm must be one of {self._ALGORITHMS}, got {algorithm!r}"
            )
        self.n_neighbors = n_neighbors
        self.algorithm = algorithm
        self.x_train = None
        self.y_train = None
        self.tree = None

    def fit(self, x_train, y_train):
        """Store the training data and, for 'kd_tree', build the search tree."""
        self.x_train = np.array(x_train)
        self.y_train = np.array(y_train)
        # Rebuild (or clear) the tree on every fit so it never refers to stale data.
        self.tree = KDTree(self.x_train) if self.algorithm == 'kd_tree' else None

    def _nearest_indices(self, test_point):
        """Return the indices of the training points closest to ``test_point``."""
        if self.algorithm == 'brute':
            distances = np.sqrt(np.sum((self.x_train - test_point) ** 2, axis=1))
            return distances.argsort()[:self.n_neighbors]
        if self.algorithm == 'kd_tree':
            # query() takes a 2-D batch and returns 2-D (n_queries, k) arrays.
            # Take row 0 so the labels come back as a flat sequence; passing
            # the 2-D array on made Counter try to hash a NumPy row.
            _, indices = self.tree.query([test_point], k=self.n_neighbors)
            return indices[0]
        # Only reachable if `algorithm` was reassigned after construction.
        raise ValueError(f"unsupported algorithm: {self.algorithm!r}")

    def predict(self, x_test):
        """Predict a label for every row of ``x_test`` by majority vote.

        Returns a list with one label per query row. Ties are resolved in
        favour of the label encountered first among the nearest neighbours.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.x_train is None or self.y_train is None:
            raise RuntimeError("KNN.predict() called before fit()")

        x_test = np.array(x_test)
        predictions = []

        for test_point in x_test:
            nearest_labels = self.y_train[self._nearest_indices(test_point)]
            most_common_label = Counter(nearest_labels).most_common(1)[0][0]
            predictions.append(most_common_label)

        return predictions

    def accuracy(self, x_test, y_test):
        """Return the fraction of rows in ``x_test`` whose prediction equals ``y_test``."""
        predictions = self.predict(x_test)
        correct = sum(1 for pred, true in zip(predictions, y_test) if pred == true)
        return correct / len(y_test)

 


In [None]:
# Evaluate the from-scratch classifier (brute-force search, 5 neighbours).
knn = KNN(n_neighbors=5, algorithm='brute')

# Fit on the scaled training split.
knn.fit(X_train, y_train)

# Fraction of test rows that are predicted correctly.
accuracy = knn.accuracy(X_test, y_test)
print("Accuracy:", accuracy)

In [None]:
                                                        # Using the KNN implementation from scratch
# Sweep odd k from 1 to 99 with the from-scratch KNN and record, for each k,
# the test accuracy and the complementary error rate.
ks = list(range(1, 100, 2))
acc, err = [], []

for k in ks:
    knn = KNN(algorithm='brute', n_neighbors=k)
    knn.fit(X_train, y_train)
    accuracy = knn.accuracy(X_test, y_test)
    error = 1 - accuracy
    acc.append(accuracy)
    err.append(error)

# Draw both curves on one figure so the trade-off is visible at a glance.
plt.figure()
plt.plot(ks, acc, color='blue', label='Accuracy')
plt.plot(ks, err, color='red', label='Error')
plt.title("KNN Accuracy and Error")
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Accuracy/Error")
plt.legend()
plt.show()