In [1]:
#importing necessary libraries

import pandas as pd 


from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

# for model evaluation
from sklearn import metrics # Scores the performance of the model

from sklearn.model_selection import cross_validate

from sklearn.ensemble import GradientBoostingClassifier

import optuna
import xgboost as xgb

In [2]:
 #reading the uploaded dataset into a dataframe
df = pd.read_csv('./Data/cleaned_diabetes2.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69057 entries, 0 to 69056
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Unnamed: 0            69057 non-null  int64
 1   Diabetes_binary       69057 non-null  int64
 2   HighBP                69057 non-null  int64
 3   HighChol              69057 non-null  int64
 4   CholCheck             69057 non-null  int64
 5   BMI                   69057 non-null  int64
 6   Smoker                69057 non-null  int64
 7   Stroke                69057 non-null  int64
 8   HeartDiseaseorAttack  69057 non-null  int64
 9   PhysActivity          69057 non-null  int64
 10  Fruits                69057 non-null  int64
 11  Veggies               69057 non-null  int64
 12  HvyAlcoholConsump     69057 non-null  int64
 13  AnyHealthcare         69057 non-null  int64
 14  NoDocbcCost           69057 non-null  int64
 15  GenHlth               69057 non-null  int64
 16  Ment

In [4]:
X = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

In [5]:
# dev_X, val_X, dev_y, val_y = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

In [7]:
def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
    
    params = {
    'n_estimators': trial.suggest_int('n_estimators', 50, 400),
        'max_depth': trial.suggest_int('max_depth', 2, 25),
        # 'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
         "gamma": trial.suggest_loguniform("lambda", 1e-8, 10.0),
          "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
          "objective": "binary:logistic"
 
}
 
 
      # Initialize and train the XGBoost model
    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)

    # Predict and calculate accuracy
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

In [8]:
# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Best hyperparameters
print('Best trial:', study.best_trial.params)

[I 2023-12-02 23:49:45,980] A new study created in memory with name: no-name-e0e6559c-7fb0-4d90-917b-9fdf47c454ba
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  "gamma": trial.suggest_loguniform("lambda", 1e-8, 10.0),
  "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
[I 2023-12-02 23:49:47,000] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 146, 'max_depth': 15, 'subsample': 0.8545927479963827, 'colsample_bytree': 0.5999730756334989, 'lambda': 0.0003131419510679231, 'min_child_weight': 394.0527873860699}. Best is trial 0 with value: 1.0.
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  "gamma": trial.suggest_loguniform("lambda", 1e-8, 10.0),
  "min_child_weight": trial.suggest_loguniform("min_child_weight", 10, 1000),
[I 2023-12-02 23:49:47,543] Trial 1 finished 

Best trial: {'n_estimators': 146, 'max_depth': 15, 'subsample': 0.8545927479963827, 'colsample_bytree': 0.5999730756334989, 'lambda': 0.0003131419510679231, 'min_child_weight': 394.0527873860699}


In [9]:
# Best hyperparameters
print('Best trial:', study.best_trial.params)

Best trial: {'n_estimators': 146, 'max_depth': 15, 'subsample': 0.8545927479963827, 'colsample_bytree': 0.5999730756334989, 'lambda': 0.0003131419510679231, 'min_child_weight': 394.0527873860699}


In [10]:

best_params = study.best_trial.params
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)

In [11]:
y_pred = final_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Add more evaluation metrics as necessary


Accuracy: 1.0


In [12]:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [13]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        tn, fp, fn, tp = confusion_matrix(y_train, pred).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print(f"Specificity: {specificity * 100:.2f}%")
        print(f"Sensitivity: {sensitivity * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix:\n{confusion_matrix(y_train, pred)}\n")
        
    elif not train:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
        specificity = tn / (tn + fp)
        sensitivity = tp / (tp + fn)
        print("Test Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print(f"Specificity: {specificity * 100:.2f}%")
        print(f"Sensitivity: {sensitivity * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix:\n{confusion_matrix(y_test, pred)}\n")


In [14]:
model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
  max_depth=1, random_state=0).fit(X_train, y_train)
model.fit(X_train, y_train)

print_score(model, X_train, y_train, X_test, y_test, train=True)
print_score(model, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
Specificity: 100.00%
Sensitivity: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0        1  accuracy  macro avg  weighted avg
precision      1.0      1.0       1.0        1.0           1.0
recall         1.0      1.0       1.0        1.0           1.0
f1-score       1.0      1.0       1.0        1.0           1.0
support    23840.0  24499.0       1.0    48339.0       48339.0
_______________________________________________
Confusion Matrix:
[[23840     0]
 [    0 24499]]

Test Result:
Accuracy Score: 100.00%
Specificity: 100.00%
Sensitivity: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
                 0        1  accuracy  macro avg  weighted avg
precision      1.0      1.0       1.0        1.0           1.0
recall         1.0      1.0       1.0        1.0           1.0
f1-score       1.0      1.0       1.0        1.0           1.0
support    10120.0  10598.0       

In [15]:
# Predictions
y_pred = model.predict(X_test)

In [16]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", report)


Accuracy: 1.0
Confusion Matrix:
 [[10120     0]
 [    0 10598]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     10120
           1       1.00      1.00      1.00     10598

    accuracy                           1.00     20718
   macro avg       1.00      1.00      1.00     20718
weighted avg       1.00      1.00      1.00     20718

