# 1- Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 2- Dataset Import: Iris Dataset

In [None]:
# Load the Iris measurements from IRIS.csv (path is relative to the working directory).
df = pd.read_csv('IRIS.csv')

In [None]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# 3- Dataset Information

In [None]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(150, 5)

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal_length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
sepal_width,150.0,3.054,0.433594,2.0,2.8,3.0,3.3,4.4
petal_length,150.0,3.758667,1.76442,1.0,1.6,4.35,5.1,6.9
petal_width,150.0,1.198667,0.763161,0.1,0.3,1.3,1.8,2.5


# 4- Species Distribution

In [None]:
# Number of samples per species, to check class balance.
df['species'].value_counts()

species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

# 5- Calculating Sepal and Petal Areas

In [None]:
# Derive one area feature per flower part using the ellipse-area formula
# pi * a * b, with a = length / 2 and b = width / 2 as the semi-axes.
for part in ("sepal", "petal"):
    half_length = df[f"{part}_length"] / 2
    half_width = df[f"{part}_width"] / 2
    df[f"{part}_area"] = np.pi * half_length * half_width

In [None]:
# Confirm the two derived area columns were appended.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,petal_area
0,5.1,3.5,1.4,0.2,Iris-setosa,14.019357,0.219911
1,4.9,3.0,1.4,0.2,Iris-setosa,11.545353,0.219911
2,4.7,3.2,1.3,0.2,Iris-setosa,11.812388,0.204204
3,4.6,3.1,1.5,0.2,Iris-setosa,11.199778,0.235619
4,5.0,3.6,1.4,0.2,Iris-setosa,14.137167,0.219911


In [None]:
# Mean sepal and petal area for each species.
area_columns = ["sepal_area", "petal_area"]
species_groups = df.groupby("species")
grouped_means = species_groups[area_columns].mean()

print(grouped_means)

                 sepal_area  petal_area
species                                
Iris-setosa       13.515760    0.284942
Iris-versicolor   12.979647    4.492792
Iris-virginica    15.460249    8.872015


In [None]:
# Encode each species name as an integer category code.
species_as_category = df["species"].astype("category")
df["species_encoded"] = species_as_category.cat.codes

# Pairwise correlation between the two area features and the encoded species.
# NOTE(review): the species code is a nominal label, so its correlation with the
# areas depends on the code order — treat it as a rough indicator only.
area_and_label = ["sepal_area", "petal_area", "species_encoded"]
correlation = df[area_and_label].corr()

print(correlation)

                 sepal_area  petal_area  species_encoded
sepal_area         1.000000    0.459484         0.301045
petal_area         0.459484    1.000000         0.950142
species_encoded    0.301045    0.950142         1.000000


In [None]:
# Confirm the species_encoded column was appended.
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area,petal_area,species_encoded
0,5.1,3.5,1.4,0.2,Iris-setosa,14.019357,0.219911,0
1,4.9,3.0,1.4,0.2,Iris-setosa,11.545353,0.219911,0
2,4.7,3.2,1.3,0.2,Iris-setosa,11.812388,0.204204,0
3,4.6,3.1,1.5,0.2,Iris-setosa,11.199778,0.235619,0
4,5.0,3.6,1.4,0.2,Iris-setosa,14.137167,0.219911,0


# 6- Checking for Missing Values in the Dataset

In [None]:
# Count missing values per column.
df.isnull().sum()

sepal_length       0
sepal_width        0
petal_length       0
petal_width        0
species            0
sepal_area         0
petal_area         0
species_encoded    0
dtype: int64

# 7- Splitting Features and Target Variable

In [None]:
# Feature matrix: every column except the species name and its integer code.
X = df.drop(columns=["species", "species_encoded"])

In [None]:
# Target vector: the integer species codes.
y = df['species_encoded']

# 8- Feature Scaling with Standardization

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature; the scaler is fitted on the full feature matrix.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Wrap the scaled array back into a DataFrame so the feature names are kept.
feature_names = X.columns
X = pd.DataFrame(data=X_scaled, columns=feature_names)

# 9- Training the K-Nearest Neighbors (KNN) Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a KNN classifier with default hyperparameters on the whole dataset.
untrained_knn = KNeighborsClassifier()
knn_model = untrained_knn.fit(X, y)

# 10- Randomly Selecting a Sample for Prediction

In [None]:
# Draw one row reproducibly (fixed seed) to use as a prediction example.
random_select = X.sample(n=1, random_state=0)

In [None]:
# Show the sampled (standardised) row; its index label identifies the observation.
random_select

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,sepal_area,petal_area
114,-0.052506,-0.587764,0.762759,1.579429,-0.466585,1.372327


In [None]:
# Predict the species code for the sampled row.
# Note: the row comes from X, the same data the model was fitted on.
knn_model.predict(random_select)

array([2], dtype=int8)

In [None]:
# True label of the sampled row. X carries a default 0..n-1 index, so the sampled
# row's index label is also its row position in y. Deriving it from random_select
# keeps the comparison correct if the sample or its random_state changes.
y.iloc[random_select.index[0]]

2

In [None]:
# Draw a second example row with a different seed and predict its species code.
random_select = X.sample(n=1, random_state=2)
knn_model.predict(random_select)

array([0], dtype=int8)

In [None]:
# Show the second sampled row; its index label identifies the observation.
random_select

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,sepal_area,petal_area
6,-1.506521,0.800654,-1.341272,-1.181504,-0.645292,-1.143765


In [None]:
# True label of the sampled row. X carries a default 0..n-1 index, so the sampled
# row's index label is also its row position in y. Deriving it from random_select
# keeps the comparison correct if the sample or its random_state changes.
y.iloc[random_select.index[0]]

0

# 11- Importing Metrics for Model Evaluation

In [None]:
# Evaluation utilities: classification report and ROC AUC for the in-sample checks,
# plus f1_score / make_scorer for building custom scorer objects.
from sklearn.metrics import classification_report, f1_score, make_scorer, roc_auc_score

In [None]:
# Predicted class for every row of X (the same data the model was fitted on).
y_pred = knn_model.predict(X)

In [None]:
# Keep the full probability matrix (one column per class). With three species,
# slicing column 1 would retain only the class-1 probability, which is a
# binary-classification idiom and not usable for multi-class ROC AUC.
y_prob = knn_model.predict_proba(X)

In [None]:
# Per-class precision / recall / F1 of the in-sample predictions.
report = classification_report(y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.96      0.94      0.95        50
           2       0.94      0.96      0.95        50

    accuracy                           0.97       150
   macro avg       0.97      0.97      0.97       150
weighted avg       0.97      0.97      0.97       150



In [None]:
# Class-probability estimates for every row of X, used for the ROC AUC below.
y_prob = knn_model.predict_proba(X)

In [None]:
# One-vs-rest ROC AUC, computed on the same data the model was fitted on,
# so it is an optimistic estimate.
roc_auc_score(y, y_prob, multi_class='ovr')

0.9972

In [None]:
from sklearn.model_selection import GridSearchCV, cross_validate

In [None]:
import warnings
# Silences every warning from this point on.
# NOTE(review): this also hides deprecation warnings from the libraries used below —
# consider filtering a specific category instead.
warnings.filterwarnings("ignore")

# 12- Implementing Cross-Validation with Accuracy, F1, and ROC AUC Scores

In [None]:
# Built-in scorer names replace the hand-built make_scorer objects:
#   'f1_weighted' -> f1_score with average='weighted'
#   'roc_auc_ovr' -> roc_auc_score with multi_class='ovr' on predict_proba output
# This needs no extra imports and avoids make_scorer's `needs_proba` argument,
# which scikit-learn deprecated in 1.4 in favour of `response_method`.
scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1_weighted',
    'roc_auc': 'roc_auc_ovr',
}

# 5-fold cross-validation of the current KNN configuration.
# NOTE(review): X was standardised on the full dataset beforehand, so every
# validation fold influenced the scaling statistics — consider a Pipeline.
cv_result = cross_validate(knn_model, X, y, cv=5, scoring=scoring)

In [None]:
# Per-fold timings and scores returned by cross_validate.
cv_result

{'fit_time': array([0.0039475 , 0.0034852 , 0.00251007, 0.00215816, 0.00217628]),
 'score_time': array([0.01766729, 0.01639462, 0.01065135, 0.01060009, 0.01043081]),
 'test_accuracy': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ]),
 'test_f1': array([0.96658312, 0.96658312, 0.89769821, 0.93333333, 1.        ]),
 'test_roc_auc': array([0.99333333, 0.99666667, 1.        , 0.99      , 1.        ])}

In [None]:
# Report the mean of each metric across the cross-validation folds.
for label, result_key in (
    ('Accuracy = ', 'test_accuracy'),
    ('F1 = ', 'test_f1'),
    ('Roc_auc = ', 'test_roc_auc'),
):
    print(label, cv_result[result_key].mean())

Accuracy =  0.9533333333333334
F1 =  0.9528395584015452
Roc_auc =  0.9960000000000001


In [None]:
# Let's see if a better model can be built

In [None]:
# Inspect the current hyperparameters before tuning.
knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [None]:
# Search grid: every neighbour count from 2 up to and including 49.
neighbor_candidates = range(2, 50)
knn_params = dict(n_neighbors=neighbor_candidates)

In [None]:
# Exhaustive 5-fold grid search over knn_params, using all available cores.
grid_search = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
knn_gs_best = grid_search.fit(X, y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
# Parameter combination selected by the grid search.
knn_gs_best.best_params_

{'n_neighbors': 8}

In [None]:
# Apply the best grid-search parameters and refit on all of X, y.
# NOTE(review): set_params is called on knn_model itself, so knn_model is reconfigured
# as well and knn_final presumably refers to the same estimator object — confirm
# before treating knn_model as the untuned baseline.
knn_final = knn_model.set_params(**knn_gs_best.best_params_).fit(X, y)

In [None]:
# Built-in scorer names replace the hand-built make_scorer objects:
#   'f1_weighted' -> f1_score with average='weighted'
#   'roc_auc_ovr' -> roc_auc_score with multi_class='ovr' on predict_proba output
# This needs no extra imports and avoids make_scorer's `needs_proba` argument,
# which scikit-learn deprecated in 1.4 in favour of `response_method`.
scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1_weighted',
    'roc_auc': 'roc_auc_ovr',
}

# 5-fold cross-validation of the tuned estimator. knn_final is named explicitly so
# the cell does not rely on knn_model having been reconfigured as a side effect.
cv_result = cross_validate(knn_final, X, y, cv=5, scoring=scoring)

In [None]:
# Per-fold timings and scores returned by cross_validate.
cv_result

{'fit_time': array([0.00285769, 0.00240684, 0.00205016, 0.00204515, 0.0022428 ]),
 'score_time': array([0.01252651, 0.01017904, 0.0098896 , 0.00996542, 0.01029038]),
 'test_accuracy': array([0.96666667, 0.96666667, 1.        , 0.9       , 1.        ]),
 'test_f1': array([0.96658312, 0.96658312, 1.        , 0.89974937, 1.        ]),
 'test_roc_auc': array([0.98666667, 0.99833333, 1.        , 0.99166667, 1.        ])}

In [None]:
# Report the mean of each metric across the cross-validation folds.
for label, result_key in (
    ('Accuracy = ', 'test_accuracy'),
    ('F1 = ', 'test_f1'),
    ('Roc_auc = ', 'test_roc_auc'),
):
    print(label, cv_result[result_key].mean())

Accuracy =  0.9666666666666668
F1 =  0.9665831244778612
Roc_auc =  0.9953333333333333
