################################################
# KNN
################################################

1. Exploratory Data Analysis
2. Data Preprocessing & Feature Engineering
3. Modeling & Prediction
4. Model Evaluation
5. Hyperparameter Optimization
6. Final Model

-------------------------------------------------------------

Importing classes and modules

In [1]:
import pandas as pd
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)

################################################
# 1. Exploratory Data Analysis
################################################

In [2]:
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

print(df.head())
print("-------")
print(df.shape)
print("-------")
print(df.describe().T)
print("-------")
df["Outcome"].value_counts()

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72             35        0  33.6                     0.627   50        1
1            1       85             66             29        0  26.6                     0.351   31        0
2            8      183             64              0        0  23.3                     0.672   32        1
3            1       89             66             23       94  28.1                     0.167   21        0
4            0      137             40             35      168  43.1                     2.288   33        1
-------
(768, 9)
-------
                          count        mean         std     min       25%       50%        75%     max
Pregnancies               768.0    3.845052    3.369578   0.000   1.00000    3.0000    6.00000   17.00
Glucose                   768.0  120.894531   31.972618   0.000  99.00000  117.0000  140.25000  199.00
BloodPressur

Outcome
0    500
1    268
Name: count, dtype: int64

################################################
# 2. Data Preprocessing & Feature Engineering
################################################

In [3]:
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

X_scaled = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)
print(X)

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  DiabetesPedigreeFunction       Age
0       0.639947  0.848324       0.149641       0.907270 -0.692891  0.204013                  0.468492  1.425995
1      -0.844885 -1.123396      -0.160546       0.530902 -0.692891 -0.684422                 -0.365061 -0.190672
2       1.233880  1.943724      -0.263941      -1.288212 -0.692891 -1.103255                  0.604397 -0.105584
3      -0.844885 -0.998208      -0.160546       0.154533  0.123302 -0.494043                 -0.920763 -1.041549
4      -1.141852  0.504055      -1.504687       0.907270  0.765836  1.409746                  5.484909 -0.020496
..           ...       ...            ...            ...       ...       ...                       ...       ...
763     1.827813 -0.622642       0.356432       1.722735  0.870031  0.115169                 -0.908682  2.532136
764    -0.547919  0.034598       0.046245       0.405445 -0.692891  0.610154                 -0.

################################################
# 3. Modeling & Prediction
################################################

In [4]:
knn_model = KNeighborsClassifier().fit(X, y)

random_user = X.sample(1, random_state=45)
print(random_user)
print("---------------")
print(knn_model.predict(random_user))

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  DiabetesPedigreeFunction       Age
195     0.342981  1.161295       0.770014       1.283638  1.130518  0.940144                 -0.232176 -0.360847
---------------
[1]


################################################
# 4. Model Evaluation
################################################

In [5]:
# Confusion matrix için y_pred:
y_pred = knn_model.predict(X)

# AUC için y_prob:
y_prob = knn_model.predict_proba(X)[:, 1]
print(classification_report(y, y_pred))
# acc 0.83
# f1 0.74

# AUC
print("AUC:" + str(roc_auc_score(y, y_prob)))
# 0.90

print("---------------------")
print("KROSS VALIDATION")

cv_results = cross_validate(knn_model, X, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])

print("Test Accuracy: " +str(cv_results['test_accuracy'].mean()))
print("Test f1: "+ str(cv_results['test_f1'].mean()))
print("Test ROC AUC: " + str(cv_results['test_roc_auc'].mean()))
# 0.73
# 0.59
# 0.78
print("---------------------")

knn_model.get_params()

              precision    recall  f1-score   support

           0       0.85      0.90      0.87       500
           1       0.79      0.70      0.74       268

    accuracy                           0.83       768
   macro avg       0.82      0.80      0.81       768
weighted avg       0.83      0.83      0.83       768

AUC:0.9017686567164179
---------------------
KROSS VALIDATION
Test Accuracy: 0.733112638994992
Test f1: 0.5905780011534191
Test ROC AUC: 0.7805279524807827
---------------------


{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

################################################
# 5. Hyperparameter Optimization
################################################

In [6]:
knn_model = KNeighborsClassifier()
knn_model.get_params()

knn_params = {"n_neighbors": range(2, 50)}

knn_gs_best = GridSearchCV(knn_model,
                           knn_params,
                           cv=5,
                           n_jobs=-1,
                           verbose=1).fit(X, y)
print("-------------------")

print(knn_gs_best.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
-------------------
{'n_neighbors': 17}


################################################
# 6. Final Model
################################################

In [7]:
knn_final = knn_model.set_params(**knn_gs_best.best_params_).fit(X, y)

cv_results = cross_validate(knn_final,
                            X,
                            y,
                            cv=5,
                            scoring=["accuracy", "f1", "roc_auc"])

cv_results['test_accuracy'].mean()
cv_results['test_f1'].mean()
cv_results['test_roc_auc'].mean()

random_user = X.sample(1)

knn_final.predict(random_user)

array([0])