In [None]:
from matplotlib import pyplot as plt
import missingno as msno
import pandas as pd
import numpy as np
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height in inches) for plots that do not pass their own figsize.
plt.rcParams['figure.figsize'] = [10, 5]

In [None]:
# Load the raw dataset; the file is semicolon-delimited rather than comma-delimited.
df = pd.read_csv(
    "/kaggle/input/early-diabetes-classification/diabetes_data.csv",
    sep=";",
)

# Exploratory Data Analysis (EDA)

In [None]:
# Preview ten random rows; the fixed seed keeps the preview identical across re-runs.
df.sample(10, random_state=0)

In [None]:
# Print pandas' per-column summary of the frame (column names, non-null counts, dtypes).
df.info()

In [None]:
# Summary statistics; with default arguments pandas' describe() reports only the
# numeric columns of a mixed-dtype frame.
# The column index of this summary is reused by later cells as the list of numeric features.
num_cats = df.describe()
num_cats

In [None]:
# Describe whichever columns were left out of the numeric summary.
numeric_column_names = num_cats.columns.values
cat_cats = df.drop(columns=numeric_column_names).describe()
cat_cats

In [None]:

# One density histogram per numeric column, coloured by diabetes class.
# The grid is sized from the feature list itself, so it cannot index past the last
# axis (more features than panels) or leave unused blank panels (fewer features).
numeric_features = list(num_cats.columns)
fig, axs = plt.subplots(
    nrows=len(numeric_features),
    ncols=1,
    figsize=(20, 4 * len(numeric_features)),  # roughly 4 inches of height per panel
    squeeze=False,  # always return a 2-D axes array, even for a single feature
)
for ax, feature in zip(axs[:, 0], numeric_features):
    sns.histplot(data=df, x=feature, hue="class", ax=ax, stat="density", multiple="layer")

In [None]:

# Distribution of the per-row symptom total, split by class.
# NOTE(review): iloc[:, 2:-1] is positional - it assumes the first two columns are not
# symptom flags and the last column is the target; confirm against the column order.
upgraded = df.assign(all_symptoms=lambda d: d.iloc[:, 2:-1].sum(axis=1))

# Only one plot is drawn, so a single axis is created (no empty second panel).
fig, ax = plt.subplots(figsize=(20, 10))
sns.histplot(data=upgraded, x="all_symptoms", hue="class", ax=ax)



In [None]:
# Age distribution per class, with a kernel-density estimate drawn over the bars.
sns.histplot(
    data=df,
    x="age",
    hue="class",
    kde=True,
)

# Model Training

In [None]:
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer 
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_absolute_error,mean_squared_error,get_scorer_names
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score,RandomizedSearchCV,GridSearchCV,cross_validate
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Preprocessing: one-hot encode "gender"; every other column is passed through unchanged.
gender_encoder = make_column_transformer(
    (OneHotEncoder(), ["gender"]),
    remainder="passthrough",
)
main_pipeline = make_pipeline(gender_encoder)

In [None]:
# Separate the predictors from the label column.
features = df.drop(columns=["class"])
target = df["class"]

# Hold out part of the data for evaluation (train_test_split's default is 25 % of the rows).
# NOTE: despite the names, X_test / y_test hold the larger portion that the model is
# *fitted* on, and X_valid / y_valid hold the held-out portion used for evaluation.
# The names are kept because the other cells refer to them.
# random_state makes the split reproducible; stratify keeps the class ratio the same in both parts.
X_test, X_valid, y_test, y_valid = train_test_split(
    features,
    target,
    random_state=0,
    stratify=target,
)

## Randomized Hyperparameter Search

In [None]:
from scipy.stats import randint

# Randomized search over random-forest hyper-parameters, scored by cross-validated ROC AUC.
# Both the search and the forest are seeded: seeding only the search fixes which
# candidates are sampled, but an unseeded forest would still score them differently on each run.
rsv = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    {
        "n_estimators": randint(100, 500),  # integers drawn uniformly from [100, 500)
        'bootstrap': [True, False],
        'max_depth': [40, 60, 80, 120],
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 3, 5, 10],
    },
    random_state=0,
    n_iter=400,  # number of sampled candidates
    scoring="roc_auc",
    n_jobs=-1,  # use all available cores
)

In [None]:
# Fit the preprocessing on the training portion, then run the search on the encoded matrix.
X_test_encoded = main_pipeline.fit_transform(X_test)
rsv.fit(X_test_encoded, y_test)

In [None]:
# Keep the best candidate found by the search; with RandomizedSearchCV's default refit=True
# this estimator has been refitted on all the data passed to rsv.fit().
rnd = rsv.best_estimator_

In [None]:
# Show the ten best-ranked candidates rather than every sampled candidate in sampling order.
pd.DataFrame(rsv.cv_results_).sort_values("rank_test_score").head(10)

In [None]:
# Encode the held-out rows with the already-fitted preprocessing, then predict hard class labels.
X_valid_encoded = main_pipeline.transform(X_valid)
predictions = rnd.predict(X_valid_encoded)

# Evaluation

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the held-out split: rows are true classes, columns are predicted classes.
# fmt="d" prints whole counts; seaborn's default ".2g" would render a count of 100+ as "1e+02".
ax = sns.heatmap(confusion_matrix(y_valid, predictions), annot=True, fmt="d")
ax.set(xlabel="Predicted class", ylabel="True class");

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC measures ranking quality, so it needs a continuous score. Use the predicted
# probability of the positive class (column 1 of predict_proba, i.e. rnd.classes_[1]).
# Hard class labels would collapse the curve to a single threshold and understate the AUC.
# This also matches how the "roc_auc" scorer evaluated candidates during the search.
valid_scores = rnd.predict_proba(main_pipeline.transform(X_valid))[:, 1]
roc_auc_score(y_valid, valid_scores)

In [None]:
# from xgboost import XGBRFClassifier
# csv = RandomizedSearchCV(XGBRFClassifier(eval_metric="auc"),
#     {
#         "n_estimators": randint(100,1500), 
#         'learning_rate': [0.0001, 0.001, 0.007, 0.1, 0.15, 0.25],
#     },random_state=0,n_iter=70,scoring="roc_auc")

In [None]:
# csv.fit(main_pipeline.fit_transform(X_test),y_test)

In [None]:
# xgb = csv.best_estimator_

In [None]:
# xgb_predictions = xgb.predict(main_pipeline.transform(X_valid))

In [None]:
# sns.heatmap(confusion_matrix(y_valid,xgb_predictions),annot=True)