In [None]:
!pip install lazypredict
from matplotlib import pyplot as plt
import numpy as np
import missingno as msno
import scipy as sc
import pandas as pd
import seaborn as sns

# Default (width, height) in inches for every matplotlib figure created below.
plt.rcParams['figure.figsize'] =  (20,7)

In [None]:
# Location of the diabetes CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/diabetes-dataset/diabetes.csv"
df = pd.read_csv(DATA_PATH)
# Peek at ten randomly chosen rows.
df.sample(n=10)

# EDA

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the data as loaded (zeros are not yet treated as missing).
df.describe()

In [None]:
# Measurement columns in which a 0 is treated as a missing value.
zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)

In [None]:
# Summary statistics again, now that zeros in the five measurement columns are NaN.
df.describe()

In [None]:
# Pairwise scatter plots of all columns, coloured by the Outcome class.
sns.pairplot(df, hue="Outcome")

In [None]:
def show_information(col,h = "Outcome",data=None):
    """Draw two side-by-side histograms of ``col`` split by ``h``.

    The left panel overlays one histogram (with a KDE curve) per value of
    ``h``; the right panel uses ``multiple="fill"`` so every bin is
    normalised to show the share of each ``h`` value.

    Parameters
    ----------
    col : str
        Column to put on the x axis.
    h : str, default "Outcome"
        Column used for the colour grouping.
    data : pandas.DataFrame, optional
        Frame to plot. When omitted, the notebook-level ``df`` is used, which
        is what the function always did before this parameter existed.
    """
    # Look the global up at call time so later reassignments of `df` are honoured.
    frame = df if data is None else data
    fig,axs = plt.subplots(nrows=1,ncols=2)
    axs[0].set_title(f"{col} colored by {h}")
    sns.histplot(data=frame,x=col,hue=h,kde=True,multiple="layer",ax=axs[0])
    # Title the second panel as well so it can be read on its own.
    axs[1].set_title(f"{col}: share of each {h} value per bin")
    sns.histplot(data=frame,x=col,hue=h,multiple="fill",ax=axs[1])

In [None]:
# One pair of histograms for every column of the frame.
for column_name in df.columns:
    show_information(column_name)

In [None]:
# Keep only rows strictly above the 3rd percentile of BloodPressure, then of Glucose.
# The Glucose threshold is computed on the frame already filtered by BloodPressure.
# Comparisons against NaN are False, so rows with a missing value in either column
# are removed here as well.
bp_floor = df["BloodPressure"].quantile(0.03)
df = df[df["BloodPressure"] > bp_floor]
glucose_floor = df["Glucose"].quantile(0.03)
df = df[df["Glucose"] > glucose_floor]


In [None]:
# Remove, in place, every row in which at least one of these columns lies above
# that column's 95th percentile.
outlier_columns = [
    'Pregnancies',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
upper_bounds = df[outlier_columns].quantile(0.95)
exceeds_any_bound = (df[outlier_columns] > upper_bounds).any(axis=1)
df.drop(index=df.index[exceeds_any_bound], inplace=True)

In [None]:
from sklearn.impute import KNNImputer

# Fill the remaining NaNs in the Glucose..DiabetesPedigreeFunction column range
# with a KNNImputer left at its default settings.
kn = KNNImputer()
imputed_block = kn.fit_transform(df.loc[:,"Glucose":"DiabetesPedigreeFunction"])
df.loc[:,"Glucose":"DiabetesPedigreeFunction"] = imputed_block

In [None]:
# Array of the current column names.
df.columns.values

In [None]:
# Binary flag: 1 where Pregnancies is at most 6, otherwise 0.
# NOTE(review): the column name suggests "no children", but the test is <= 6 - confirm intent.
df["no_children"] = df["Pregnancies"].le(6).astype("int64")

In [None]:
# Distribution of the new binary flag, split by Outcome.
show_information("no_children")

In [None]:
# Interaction feature: Insulin multiplied by BloodPressure raised to the power 1.
bp_exponent = 1
df["ft1"] = df["Insulin"].mul(np.power(df["BloodPressure"], bp_exponent))

In [None]:
# Distribution of the Insulin x BloodPressure feature, split by Outcome.
show_information("ft1")

In [None]:
# Ratio feature: Glucose divided by BloodPressure raised to the power 0.5.
bp_root = np.power(df["BloodPressure"], 0.5)
df["ft2"] = df["Glucose"].div(bp_root)

In [None]:
# Plot the derived ratio next to the two columns it is built from.
for feature_name in ("ft2", "Glucose", "BloodPressure"):
    show_information(feature_name)

# Model Preparation and Comparison

In [None]:
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer 
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,get_scorer_names
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score,RandomizedSearchCV,GridSearchCV,cross_validate,train_test_split
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
# Default (width, height) in inches for matplotlib figures in the modelling section.
plt.rcParams['figure.figsize'] =  (20,10)

In [None]:
# Reload the CSV so the modelling section starts from untouched data.
DATA_PATH = "/kaggle/input/diabetes-dataset/diabetes.csv"
df = pd.read_csv(DATA_PATH)
# Peek at ten randomly chosen rows.
df.sample(n=10)

In [None]:
# Remove, in place, every row in which at least one of these columns lies above
# that column's 95th percentile (computed on the freshly reloaded data).
outlier_columns = [
    'Pregnancies',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
upper_bounds = df[outlier_columns].quantile(0.95)
exceeds_any_bound = (df[outlier_columns] > upper_bounds).any(axis=1)
df.drop(index=df.index[exceeds_any_bound], inplace=True)

In [None]:
# Keep only rows strictly above the 3rd percentile of BloodPressure, then of Glucose.
# The Glucose threshold is computed on the frame already filtered by BloodPressure.
# Zeros have not been converted to NaN in this reloaded frame, so they take part in
# both quantiles.
bp_floor = df["BloodPressure"].quantile(0.03)
df = df[df["BloodPressure"] > bp_floor]
glucose_floor = df["Glucose"].quantile(0.03)
df = df[df["Glucose"] > glucose_floor]


In [None]:

class null_finder(BaseEstimator,TransformerMixin):
    """Stateless transformer that marks zeros as missing by replacing them with NaN.

    pandas input goes through ``.replace`` exactly as before and keeps its
    type. Any other array-like is copied into a float array first, so the
    transformer also works when an earlier pipeline step hands it a plain
    ndarray (which has no ``.replace`` method).
    """
    def fit(self,X,y=None):
        # Nothing to learn; present only to satisfy the estimator API.
        return self
    def transform(self,X,y=None):
        if isinstance(X,(pd.DataFrame,pd.Series)):
            return X.replace(0,np.nan)
        # np.array copies, so the caller's data is never modified.
        values = np.array(X,dtype="float64")
        values[values == 0] = np.nan
        return values
class custom_features(BaseEstimator,TransformerMixin):
    """Stateless transformer that builds two derived columns from its input.

    With ``c0``, ``c1`` and ``c2`` the first three input columns, the output
    has shape ``(n_samples, 2)`` and contains ``c0 / sqrt(c1)`` and
    ``c0 * c2``.
    NOTE(review): which features sit in positions 0-2 depends on the caller -
    confirm against the column transformer that feeds this class.
    """
    def fit(self,X,y=None):
        # Nothing to learn; present only to satisfy the estimator API.
        return self
    def transform(self,X,y=None):
        # Positional slicing such as X[:,0] is not valid on a DataFrame, so
        # coerce to an ndarray first; ndarray input is left untouched.
        X = np.asarray(X)
        ratio = X[:,0] / np.sqrt(X[:,1])
        product = X[:,0] * X[:,2]
        return np.column_stack([ratio,product])
class glucose_high(BaseEstimator,TransformerMixin):
    """Stateless transformer producing a 0/1 column: 1 where the value exceeds ``threshold``.

    Parameters
    ----------
    threshold : float, default 145
        Values strictly greater than this are flagged with 1. The default is
        the cut-off that used to be hard-coded.
    """
    def __init__(self,threshold=145):
        self.threshold = threshold
    def fit(self,X,y=None):
        # Nothing to learn; present only to satisfy the estimator API.
        return self
    def transform(self,X,y=None):
        # pandas objects have no .reshape, so compare on the underlying array.
        flags = np.asarray(X) > self.threshold
        return flags.astype("int64").reshape(-1,1)
class no_children(BaseEstimator,TransformerMixin):
    """Stateless transformer producing a 0/1 column: 1 where the value is at most ``threshold``.

    Parameters
    ----------
    threshold : float, default 6
        Values less than or equal to this are flagged with 1. The default is
        the cut-off that used to be hard-coded.
    """
    def __init__(self,threshold=6):
        self.threshold = threshold
    def fit(self,X,y=None):
        # Nothing to learn; present only to satisfy the estimator API.
        return self
    def transform(self,X,y=None):
        # pandas objects have no .reshape, so compare on the underlying array.
        flags = np.asarray(X) <= self.threshold
        return flags.astype("int64").reshape(-1,1)

In [None]:
# Stage 1: turn zeros into NaN in input columns 1-7 and pass column 0 through.
# A ColumnTransformer concatenates its outputs in the order the transformers are
# listed, so the original column 0 ends up last; columns that are not listed
# (anything beyond index 7) are dropped by the default remainder setting.
mark_missing = make_column_transformer(
    (null_finder(), list(range(1, 8))),
    ("passthrough", [0]),
)
# Stage 2: impute the NaNs over all eight remaining columns.
impute_missing = make_column_transformer((KNNImputer(), list(range(8))))
# Stage 3: forwards all eight columns unchanged.
keep_all = make_column_transformer(("passthrough", list(range(8))))
# Stage 4: standardise every column.
standardize = make_column_transformer((StandardScaler(), list(range(8))))

clean_data = make_pipeline(mark_missing, impute_missing, keep_all, standardize)
#0.84

In [None]:
# Feature matrix: the whole frame goes through the preprocessing pipeline, which is
# fitted on all rows here.
inputs = clean_data.fit_transform(df)
# Labels to predict.
target = df.loc[:, "Outcome"]

In [None]:
# Random forest: 300 trees, depth capped at 40, fixed seed.
rnd = RandomForestClassifier(
    n_estimators=300,
    max_depth=40,
    random_state=0,
)

In [None]:
# Cross-validate preprocessing and model as ONE pipeline. Scoring the forest on
# `inputs` would leak information: `inputs` was imputed and scaled using every row,
# including the rows of each held-out fold. Wrapping `clean_data` in the pipeline
# makes cross_validate refit the imputer and scaler on the training part of each fold.
rf_pipeline = make_pipeline(clean_data, rnd)
cv_rf = pd.DataFrame(
    cross_validate(
        rf_pipeline,
        df.drop(columns="Outcome"),  # raw features; column positions 0-7 are unchanged
        target,
        scoring=["f1", "roc_auc", "neg_log_loss", "precision"],
        cv=20,
        n_jobs=-1,
    )
)
# Average of every fit/score time and metric over the 20 folds.
cv_rf.mean()

In [None]:
# Gradient-boosted trees with a fixed set of hyper-parameters.
xgc = XGBClassifier(
    subsample=1.0,
    n_estimators=700,
    min_child_weight=10,
    max_depth=10,
    learning_rate=0.007,
    gamma=5,
    colsample_bytree=0.6,
)

In [None]:
# Cross-validate preprocessing and model as ONE pipeline. Scoring the booster on
# `inputs` would leak information: `inputs` was imputed and scaled using every row,
# including the rows of each held-out fold. Wrapping `clean_data` in the pipeline
# makes cross_validate refit the imputer and scaler on the training part of each fold.
xgc_pipeline = make_pipeline(clean_data, xgc)
cv_xgc = pd.DataFrame(
    cross_validate(
        xgc_pipeline,
        df.drop(columns="Outcome"),  # raw features; column positions 0-7 are unchanged
        target,
        scoring=["f1", "roc_auc", "neg_log_loss", "precision"],
        cv=20,
        n_jobs=-1,
    )
)
# Average of every fit/score time and metric over the 20 folds.
cv_xgc.mean()

In [None]:
from lazypredict.Supervised import LazyClassifier
res = []
# Benchmark the LazyClassifier model zoo on 100 different train/validation splits
# and collect one leaderboard per split.
for seed in range(100):
    lzc = LazyClassifier(verbose=False)
    # train_test_split returns (X_train, X_valid, y_train, y_valid). Seeding each split
    # with the iteration number keeps the 100 splits distinct but reproducible.
    X_train,X_valid,y_train,y_valid = train_test_split(inputs,target,random_state=seed)
    models,preds = lzc.fit(X_train,X_valid,y_train,y_valid)
    res.append(models)

    


In [None]:
# Stack the per-split leaderboards into a single frame. The guard keeps this cell
# re-runnable: once `res` is already a DataFrame, pd.concat would reject it.
if isinstance(res, list):
    res = pd.concat(res)

In [None]:
# Mean, median and standard deviation of every metric per model across the splits,
# ordered so the model with the best mean ROC AUC comes first.
per_model_stats = res.groupby("Model").agg(["mean", "median", "std"])
summ = per_model_stats.sort_values(by=("ROC AUC", "mean"), ascending=False)
summ