# KNN Algoritması


In [23]:
import pandas as pd
import warnings
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [24]:
# Load the diabetes dataset and show the class balance of the target column.
# Only the last expression of a cell is rendered, so the bare df.head(),
# df.shape and df.describe().T lines that used to sit here computed values
# that were silently discarded; they have been dropped from this cell.
df = pd.read_csv("diabetes.csv")
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [25]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [26]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(768, 9)

In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [28]:
# Class balance of the target: number of rows per Outcome value.
df["Outcome"].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [35]:

# 2. Data Preprocessing & Feature Engineering


# Separate the target column (y) from the predictor columns (X).
# Note: this is a target/feature separation only; no train/test split is made here.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

In [36]:
# Standardize every predictor column, then rebuild a DataFrame so the original
# column names are kept (the scaler returns a plain array).
# NOTE(review): the scaler is fitted on ALL rows, and the same scaled X is later
# passed to cross-validation, so held-out folds contribute to the scaling
# statistics. Consider scaling inside a Pipeline if strict fold isolation matters.
X_scaled = StandardScaler().fit(X).transform(X)
X = pd.DataFrame(data=X_scaled, columns=list(X.columns))

In [37]:

# 3. Modeling & Prediction


# Build a k-NN classifier with no arguments (library defaults) and fit it on all rows.
knn_model = KNeighborsClassifier()
knn_model = knn_model.fit(X, y)

In [38]:
# Draw 5 rows to use as example inputs. A fixed random_state makes the sample
# (and the predictions shown for it) reproducible across kernel restarts.
random_user = X.sample(5, random_state=42)

In [39]:
# Predict the class label for each sampled row with knn_model.
knn_model.predict(random_user)

array([1, 0, 0, 1, 0], dtype=int64)

# Model Başarı Değerlendirme

In [40]:

# 4. Model evaluation


# y_pred for the confusion matrix:

# Predicted labels for every row; the confusion-matrix-based metrics
# (precision, recall, F1, accuracy) are all derived from these.
# NOTE: X here is the same data knn_model was fitted on, so scores computed
# from y_pred are in-sample (training-set) scores.
y_pred = knn_model.predict(X)

In [41]:
# y_prob for AUC:
# Takes column index 1 of predict_proba's output; presumably the probability of
# Outcome == 1, given the labels are 0 and 1 (confirm via knn_model.classes_).
y_prob = knn_model.predict_proba(X)[:, 1]

In [42]:
# Print per-class precision, recall, F1 and support for y_pred against the true labels y.
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.90      0.87       500
           1       0.79      0.70      0.74       268

    accuracy                           0.83       768
   macro avg       0.82      0.80      0.81       768
weighted avg       0.83      0.83      0.83       768



In [43]:
# ROC AUC of the predicted probabilities (y_prob) against the true labels (y).
roc_auc_score(y, y_prob)

0.9017686567164179

In [44]:
# 5-fold cross-validation of knn_model, recording accuracy, F1 and ROC AUC for each fold.
cv_results = cross_validate(
    estimator=knn_model,
    X=X,
    y=y,
    cv=5,
    scoring=["accuracy", "f1", "roc_auc"],
)

In [45]:
# Mean accuracy over the cross-validation folds.
cv_results['test_accuracy'].mean()

0.733112638994992

In [48]:
# Mean F1 over the cross-validation folds.
# Original author's note (translated): the F1 value is the strongest test result.
cv_results['test_f1'].mean()

0.5905780011534191

In [49]:
# Mean ROC AUC over the cross-validation folds.
cv_results['test_roc_auc'].mean()

0.7805279524807827

# Hyperparameter Optimization

In [50]:
# Rebind knn_model to a fresh, unfitted estimator (the previously fitted model is
# no longer reachable under this name) and list its hyperparameters with their
# current values, i.e. the candidates available for tuning.
knn_model = KNeighborsClassifier()
knn_model.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [51]:
# Search grid: try every neighbor count from 2 up to and including 49
# (range's upper bound of 50 is exclusive), i.e. 48 candidates.
knn_params = dict(n_neighbors=range(2, 50))

In [52]:
# Exhaustive 5-fold cross-validated search over every candidate in knn_params,
# run on all available cores (n_jobs=-1) with a progress summary (verbose=1).
# No `scoring` is passed, so candidates are ranked by the estimator's own score method.
knn_gs_best = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X, y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [53]:
# Best parameter combination found by the grid search
# (n_neighbors = 17 in the recorded run shown below).
knn_gs_best.best_params_

{'n_neighbors': 17}

# Final Model

In [54]:
################################################
# 6. Final Model
################################################

In [55]:
# Build the final model as a NEW estimator configured with the best grid-search
# parameters and fit it on all rows. set_params() mutates the estimator it is
# called on and returns that same object, so the former
# `knn_model.set_params(...).fit(X, y)` made knn_final an alias of knn_model and
# silently changed knn_model's configuration. knn_model was created with default
# arguments, so constructing from best_params_ alone yields the same settings.
knn_final = KNeighborsClassifier(**knn_gs_best.best_params_).fit(X, y)

In [56]:
# 5-fold cross-validation of the tuned model with the same three metrics.
# This rebinds cv_results, replacing any earlier cross-validation results held under that name.
cv_results = cross_validate(
    knn_final, X, y,
    cv=5,
    scoring=["accuracy", "f1", "roc_auc"],
)

In [57]:
# Mean accuracy of the tuned model over the cross-validation folds.
cv_results['test_accuracy'].mean()

0.7669892199303965

In [58]:
# Mean F1 of the tuned model over the cross-validation folds.
cv_results['test_f1'].mean()

0.6170909049720137

In [59]:
# Mean ROC AUC of the tuned model over the cross-validation folds.
cv_results['test_roc_auc'].mean()

0.8127938504542278

In [60]:
# Draw 5 rows to use as example inputs for the final model. A fixed random_state
# keeps the sample, and therefore the displayed predictions, reproducible.
random_user = X.sample(5, random_state=42)

In [61]:
# Predict the class label for each sampled row with the tuned model.
knn_final.predict(random_user)

array([0, 1, 0, 0, 1], dtype=int64)