### Machine Learning with KNN classification

In [1]:
# Step 1 import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# Step 2: Load the dataset
# Read the first worksheet of the Excel workbook into a DataFrame.
df = pd.read_excel(io='Assets/iris.xlsx', sheet_name=0)

In [5]:
# Step 3: Handle Missing Values
# Discard, in place, every row that contains at least one missing value.
df.dropna(axis=0, how='any', inplace=True)

In [6]:
# Display the DataFrame as it stands after the missing-value cleanup above.
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
154,6.7,3.1,5.6,2.4,Virginica
155,6.9,3.1,5.1,2.3,Virginica
156,7.7,2.8,6.7,2.0,Virginica
157,6.3,2.7,4.9,1.8,Virginica


In [7]:
# Step 4: Remove Duplicate Records
# Compare whole rows and keep only the first occurrence of each; the frame is
# modified in place and the surviving rows keep their original index labels.
df.drop_duplicates(subset=None, keep='first', inplace=True)

In [8]:
# Display the DataFrame again now that duplicate rows have been removed.
df

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
144,6.7,3.3,5.7,2.5,Virginica
145,6.7,3.0,5.2,2.3,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [17]:
# Step 5: Exploratory Data Analysis (EDA)
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called as its own statement; passing it to print() would emit the report
# first and then a stray "None" under the heading.
print("Dataset Info:")
df.info()

# Descriptive statistics of the numeric columns.
print("\nSummary Statistics:\n", df.describe())

# Pairwise correlation between the four measurement columns.
print("\n Correlation of the continuous features:\n", df[['sepal.length', 'sepal.width', 'petal.length', 'petal.width']].corr())

<class 'pandas.core.frame.DataFrame'>
Index: 132 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  132 non-null    float64
 1   sepal.width   132 non-null    float64
 2   petal.length  132 non-null    float64
 3   petal.width   132 non-null    float64
 4   variety       132 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.2+ KB
Dataset Info:
 None

Summary Statistics:
        sepal.length  sepal.width  petal.length  petal.width
count    132.000000   132.000000    132.000000   132.000000
mean       5.834848     3.069697      3.747727     1.195455
std        0.823415     0.441361      1.761655     0.767057
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.575000     0.300000
50%        5.800000     3.000000      4.300000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.

In [36]:
# Step 6: Split Features and Target Variable
y = df['variety']  # the label we want to predict
X = df.drop('variety', axis=1)  # every remaining column is a feature

# Train-Test Split: hold out 20% of the rows, stratified on the label so both
# partitions keep the same class proportions; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [41]:
# Step 7: Feature Scaling
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [42]:
# Step 8: Train KNN Classifier
# Build a classifier that votes among the 3 nearest neighbours and fit it on
# the scaled training features with their labels.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X=X_train_scaled, y=y_train)

In [43]:
# Predict a label for every row of the scaled test features.
y_pred = knn.predict(X=X_test_scaled)

In [44]:
# Display the labels the classifier predicted for the test set.
y_pred

array(['Versicolor', 'Setosa', 'Setosa', 'Virginica', 'Versicolor',
       'Versicolor', 'Setosa', 'Versicolor', 'Setosa', 'Versicolor',
       'Virginica', 'Setosa', 'Versicolor', 'Virginica', 'Versicolor',
       'Virginica', 'Virginica', 'Virginica', 'Versicolor', 'Virginica',
       'Setosa', 'Setosa', 'Setosa', 'Virginica', 'Setosa', 'Virginica',
       'Versicolor'], dtype=object)

In [45]:
# Step 9: Evaluate the classifier on the held-out test set
print("\nModel Accuracy:\n", accuracy_score(y_test, y_pred))

# Define labels: take them from the fitted model rather than hard-coding them,
# so they always reflect the classes the classifier was actually trained on.
labels = list(knn.classes_)

# Compute confusion matrix (rows = true class, columns = predicted class).
# Passing `labels` explicitly guarantees the matrix rows/columns are in the
# same order as the names attached to them below.
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Convert to DataFrame with labels
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

print("\nConfusion Matrix:\n", cm_df)
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Model Accuracy:
 0.9259259259259259

Confusion Matrix:
             Setosa  Versicolor  Virginica
Setosa           9           0          0
Versicolor       0           8          1
Virginica        0           1          8

Classification Report:
               precision    recall  f1-score   support

      Setosa       1.00      1.00      1.00         9
  Versicolor       0.89      0.89      0.89         9
   Virginica       0.89      0.89      0.89         9

    accuracy                           0.93        27
   macro avg       0.93      0.93      0.93        27
weighted avg       0.93      0.93      0.93        27



In [46]:
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9259259259259259

In [47]:
# Step 10: Prediction on New Data
# The scaler was fitted on a DataFrame, so the new observation is wrapped in a
# DataFrame carrying the same column names as the training features. Passing a
# bare nested list makes scikit-learn (>= 1.0) warn that X has no valid feature
# names, and gives no protection against supplying values in the wrong order.
new_sample = pd.DataFrame([[5.1, 3.5, 1.4, 0.2]], columns=X_train.columns)  # Example input

# Apply the already-fitted scaler; it must not be refitted on new data.
new_sample_scaled = scaler.transform(new_sample)

# predict() returns one label per input row; there is a single row here.
predicted_species = knn.predict(new_sample_scaled)
print(f'\nPredicted Species: {predicted_species[0]}')



Predicted Species: Setosa


