In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import plotly.offline as pyo
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,roc_auc_score,classification_report,RocCurveDisplay
from sklearn.model_selection import train_test_split,cross_val_score
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
def outliers(df,variable):
    q1= df[variable].quantile(0.10)
    q3 = df[variable].quantile(0.90)
    iqr = q3 - q1
    lower_lim = q1 - 1.5*iqr
    upper_lim = q3 + 1.5*iqr
    return lower_lim,upper_lim

def check_outliers(df,variable):
    lower_lim,upper_lim = outliers(df,variable)
    if df.loc[(df[variable]<lower_lim) | (df[variable]>upper_lim)].any(axis=None):
        return True
    else:
        return False
    
def replace_outliers(df,variable):
    lower_lim, upper_lim = outliers(df,variable)
    df[variable].clip(lower=lower_lim, upper=upper_lim,inplace=True)

In [3]:
df["Glucose"] =df["Glucose"].replace(0,np.nan)
df["BloodPressure"] = df["BloodPressure"].replace(0,np.nan)
df["SkinThickness"] = df["SkinThickness"].replace(0,np.nan)
df["Insulin"] = df["Insulin"].replace(0,np.nan)
df["BMI"] = df["BMI"].replace(0,np.nan)
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [4]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
imp= IterativeImputer(
    estimator=XGBRegressor(n_estimators=300,max_depth=7),
    max_iter=30,
    random_state=0
).set_output(transform="pandas")

df2=imp.fit_transform(df)


[IterativeImputer] Early stopping criterion not reached.



In [5]:
X = df2.drop("Outcome",axis=1)
replace_outliers(df2,"SkinThickness")
replace_outliers(df2,"Insulin")
for col in list(X.keys()):
    print(col,check_outliers(df2,col))

Pregnancies False
Glucose False
BloodPressure False
SkinThickness False
Insulin False
BMI False
DiabetesPedigreeFunction True
Age False


In [7]:
y = df["Outcome"]
rb = RobustScaler().set_output(transform="pandas")
X = rb.fit_transform(X)

In [8]:
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.6,0.738095,0.000,0.457571,0.893217,0.148352,0.665359,1.235294
1,-0.4,-0.761905,-0.375,0.000000,-0.503854,-0.620879,-0.056209,0.117647
2,1.0,1.571429,-0.500,-0.735076,0.409470,-0.983516,0.783007,0.176471
3,-0.4,-0.666667,-0.375,-0.457571,-0.334765,-0.456044,-0.537255,-0.470588
4,-0.6,0.476190,-2.000,0.457571,0.353363,1.192308,5.007843,0.235294
...,...,...,...,...,...,...,...,...
763,1.4,-0.380952,0.250,1.448975,0.464952,0.071429,-0.526797,2.000000
764,-0.2,0.119048,-0.125,-0.152524,0.165538,0.500000,-0.084967,-0.117647
765,0.4,0.095238,0.000,-0.457571,-0.167383,-0.664835,-0.333333,0.058824
766,-0.4,0.214286,-0.750,-0.343494,-0.199081,-0.236264,-0.061438,1.058824


In [15]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
from sklearn.model_selection import cross_validate
cv_results= cross_validate(knn,X,y,cv=10,scoring=["accuracy","precision","recall","f1","roc_auc"])
cv_results

{'fit_time': array([0.00300074, 0.00200033, 0.00200057, 0.00095296, 0.00094914,
        0.00195074, 0.00100517, 0.00202608, 0.00203991, 0.00196075]),
 'score_time': array([0.01499963, 0.01303887, 0.0110085 , 0.01204801, 0.0120492 ,
        0.01104784, 0.01199818, 0.01192307, 0.01199985, 0.01204944]),
 'test_accuracy': array([0.71428571, 0.75324675, 0.76623377, 0.66233766, 0.72727273,
        0.74025974, 0.77922078, 0.72727273, 0.75      , 0.76315789]),
 'test_precision': array([0.5862069 , 0.7       , 0.65517241, 0.52      , 0.65      ,
        0.64      , 0.72727273, 0.6       , 0.64      , 0.65384615]),
 'test_recall': array([0.62962963, 0.51851852, 0.7037037 , 0.48148148, 0.48148148,
        0.59259259, 0.59259259, 0.66666667, 0.61538462, 0.65384615]),
 'test_f1': array([0.60714286, 0.59574468, 0.67857143, 0.5       , 0.55319149,
        0.61538462, 0.65306122, 0.63157895, 0.62745098, 0.65384615]),
 'test_roc_auc': array([0.78222222, 0.77925926, 0.83962963, 0.69074074, 0.80222222,
 

In [10]:
cv_results["test_accuracy"].mean()

0.7383287764866713

In [17]:
cv_results["test_precision"].mean()

0.6372498191463708

In [16]:
cv_results["test_recall"].mean()

0.5935897435897435

In [30]:
cv_results["test_roc_auc"].mean()

0.7997492877492878

In [11]:
random_user = X.sample(1,random_state=10)
random_user

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
568,0.2,0.880952,0.0,0.0,-0.037196,-0.104396,-0.090196,0.470588


In [19]:
knn.fit(X,y)

In [14]:
knn.predict(random_user)

array([0], dtype=int64)

In [23]:
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size =0.2,stratify=y)

In [27]:
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
y_prob = knn.predict_proba(X_test)[:,1]

In [28]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.85      0.83       100
           1       0.69      0.63      0.66        54

    accuracy                           0.77       154
   macro avg       0.75      0.74      0.74       154
weighted avg       0.77      0.77      0.77       154



In [29]:
roc_auc_score(y_test,y_prob)

0.797037037037037

In [31]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [32]:
from xgboost import XGBClassifier
xg = XGBClassifier()
xg.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [40]:
from sklearn.model_selection import GridSearchCV
knn_params = {"n_neighbors":range(2,20)}

gs = GridSearchCV(knn,param_grid=knn_params,cv=10,n_jobs=-1,verbose=1,scoring="recall").fit(X,y)
gs.best_params_

Fitting 10 folds for each of 18 candidates, totalling 180 fits


{'n_neighbors': 9}

In [41]:
knn = KNeighborsClassifier(**gs.best_params_)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
y_prob = knn.predict_proba(X_test)[:,1]

In [42]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       100
           1       0.72      0.61      0.66        54

    accuracy                           0.78       154
   macro avg       0.76      0.74      0.75       154
weighted avg       0.77      0.78      0.77       154



In [43]:
cv_results= cross_validate(knn,X,y,cv=10,scoring=["accuracy","precision","recall","f1","roc_auc"])
cv_results

{'fit_time': array([0.00151873, 0.00258017, 0.00212431, 0.00255132, 0.00204873,
        0.00208616, 0.00106168, 0.00202298, 0.00151658, 0.00206065]),
 'score_time': array([0.01827765, 0.01583433, 0.01520634, 0.01318479, 0.01240087,
        0.01262593, 0.01294756, 0.01317596, 0.01242423, 0.01236463]),
 'test_accuracy': array([0.72727273, 0.74025974, 0.75324675, 0.68831169, 0.75324675,
        0.75324675, 0.80519481, 0.79220779, 0.71052632, 0.81578947]),
 'test_precision': array([0.59375   , 0.70588235, 0.64285714, 0.55555556, 0.68181818,
        0.66666667, 0.77272727, 0.7037037 , 0.59090909, 0.75      ]),
 'test_recall': array([0.7037037 , 0.44444444, 0.66666667, 0.55555556, 0.55555556,
        0.59259259, 0.62962963, 0.7037037 , 0.5       , 0.69230769]),
 'test_f1': array([0.6440678 , 0.54545455, 0.65454545, 0.55555556, 0.6122449 ,
        0.62745098, 0.69387755, 0.7037037 , 0.54166667, 0.72      ]),
 'test_roc_auc': array([0.81740741, 0.78407407, 0.85407407, 0.73777778, 0.83666667,
 

In [44]:
cv_results["test_precision"].mean()

0.6663869967178792