In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
# To ignore all warnings
warnings.filterwarnings("ignore")

# Or you can ignore specific types of warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
df = pd.read_csv(r"diabetes_binary_5050split_health_indicators_BRFSS2015.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Diabetes_binary       70692 non-null  float64
 1   HighBP                70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64
 4   BMI                   70692 non-null  float64
 5   Smoker                70692 non-null  float64
 6   Stroke                70692 non-null  float64
 7   HeartDiseaseorAttack  70692 non-null  float64
 8   PhysActivity          70692 non-null  float64
 9   Fruits                70692 non-null  float64
 10  Veggies               70692 non-null  float64
 11  HvyAlcoholConsump     70692 non-null  float64
 12  AnyHealthcare         70692 non-null  float64
 13  NoDocbcCost           70692 non-null  float64
 14  GenHlth               70692 non-null  float64
 15  MentHlth           

In [3]:
df= df.drop(columns = ['Income','Education'])

In [6]:
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0


In [4]:
# Selecting features and target
X = df.drop('Diabetes_binary', axis=1)
y = df['Diabetes_binary']

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

# Check the shapes of the splits
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(49484, 19) (21208, 19) (49484,) (21208,)


In [12]:
from sklearn.ensemble import RandomForestClassifier
# Initialize and train the model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Random Forest Classifier:
Accuracy: 0.7283572236891739
Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.69      0.72     10601
         1.0       0.71      0.76      0.74     10607

    accuracy                           0.73     21208
   macro avg       0.73      0.73      0.73     21208
weighted avg       0.73      0.73      0.73     21208



In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the model
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and model evaluation
print("Best Parameters:", grid_search.best_params_)
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
print("Accuracy with best parameters:", accuracy_score(y_test, y_pred))
print("Classification Report with best parameters:\n", classification_report(y_test, y_pred))


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Parameters: {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Accuracy with best parameters: 0.7538664654847227
Classification Report with best parameters:
               precision    recall  f1-score   support

         0.0       0.78      0.71      0.74     10601
         1.0       0.73      0.80      0.76     10607

    accuracy                           0.75     21208
   macro avg       0.76      0.75      0.75     21208
weighted avg       0.76      0.75      0.75     21208



In [14]:
import pickle
# Save the model to disk
filename = 'Random_Forest.pkl'
with open(filename, 'wb') as file:
    pickle.dump(best_rf_model, file)

print(f"Model saved as {filename}")

Model saved as Random_Forest.pkl


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
import xgboost as xgb



# Initialize models
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'LightGBM': lgb.LGBMClassifier(),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Train models and evaluate their performance
model_scores = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    model_scores[model_name] = accuracy
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print('-'*30)

# Select the best model based on accuracy
best_model_name = max(model_scores, key=model_scores.get)
best_model = models[best_model_name]

print(f"The best model is {best_model_name} with an accuracy of {model_scores[best_model_name]:.4f}")

# Train the best model on the entire dataset
best_model.fit(X, y)

# Save the best model
import joblib
joblib.dump(best_model, 'best_model.pkl')
print(f"Best model {best_model_name} saved as 'best_model.pkl'")


Logistic Regression Accuracy: 0.7480
              precision    recall  f1-score   support

         0.0       0.76      0.72      0.74     10601
         1.0       0.74      0.77      0.75     10607

    accuracy                           0.75     21208
   macro avg       0.75      0.75      0.75     21208
weighted avg       0.75      0.75      0.75     21208

------------------------------
Decision Tree Accuracy: 0.6602
              precision    recall  f1-score   support

         0.0       0.66      0.67      0.66     10601
         1.0       0.66      0.65      0.66     10607

    accuracy                           0.66     21208
   macro avg       0.66      0.66      0.66     21208
weighted avg       0.66      0.66      0.66     21208

------------------------------
Random Forest Accuracy: 0.7267
              precision    recall  f1-score   support

         0.0       0.75      0.69      0.72     10601
         1.0       0.71      0.77      0.74     10607

    accuracy         

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import joblib


# Initialize the LightGBM model
lgb_model = lgb.LGBMClassifier()

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'num_leaves': [31, 50, 70],
    'max_depth': [-1, 10, 20],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Set up GridSearchCV with cross-validation
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the model using grid search
grid_search.fit(X_train, y_train)

# Get the best estimator (model with the best hyperparameters)
best_lgb_model = grid_search.best_estimator_

print(f"Best LightGBM model parameters: {grid_search.best_params_}")

# Evaluate the best model on the test set
y_pred = best_lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized LightGBM Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the best model
joblib.dump(best_lgb_model, 'optimized_lightgbm_model.pkl')
print("Optimized LightGBM model saved as 'optimized_lightgbm_model.pkl'")


Fitting 5 folds for each of 972 candidates, totalling 4860 fits
[LightGBM] [Info] Number of positive: 24739, number of negative: 24745
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003304 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 49484, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499939 -> initscore=-0.000243
[LightGBM] [Info] Start training from score -0.000243
Best LightGBM model parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 10, 'min_child_samples': 50, 'n_estimators': 100, 'num_leaves': 31, 'subsample': 0.8}
Optimized LightGBM Accuracy: 0.7553
              precision    recall  f1-score   support

         0.0       0.78      0.71      0.74     10601
         1.0       0.73      0.80      0.77     10607



In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import joblib

from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'num_leaves': np.arange(20, 150, 10),
    'max_depth': np.arange(5, 50, 5),
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500, 1000],
    'min_child_samples': [10, 20, 30, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],   # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1.0],  # L2 regularization
    'min_split_gain': [0.0, 0.1, 0.2, 0.3],
    'min_child_weight': [0.001, 0.01, 0.1, 1]
}

# Initialize the LightGBM model
lgb_model = lgb.LGBMClassifier()

# Set up RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(
    estimator=lgb_model, 
    param_distributions=param_dist, 
    n_iter=100,  # Number of parameter settings that are sampled
    cv=5, 
    scoring='accuracy', 
    verbose=2, 
    n_jobs=-1,
    random_state=42
)

# Fit the model using random search
random_search.fit(X_train, y_train)

# Get the best estimator (model with the best hyperparameters)
best_lgb_model = random_search.best_estimator_

print(f"Best LightGBM model parameters: {random_search.best_params_}")

# Evaluate the best model on the test set
y_pred = best_lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized LightGBM Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the best model
joblib.dump(best_lgb_model, 'further_optimized_lightgbm_model.pkl')
print("Further optimized LightGBM model saved as 'further_optimized_lightgbm_model.pkl'")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[LightGBM] [Info] Number of positive: 24739, number of negative: 24745
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002877 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 49484, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499939 -> initscore=-0.000243
[LightGBM] [Info] Start training from score -0.000243
Best LightGBM model parameters: {'subsample': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 0.5, 'num_leaves': 140, 'n_estimators': 1000, 'min_split_gain': 0.2, 'min_child_weight': 1, 'min_child_samples': 50, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
Optimized LightGBM Accuracy: 0.7563
              precision    recall  f1-score   support

         0.0       0.78  

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib


# Define the parameter distribution for hyperparameter tuning
param_dist = {
    'num_leaves': np.arange(20, 150, 10),
    'max_depth': np.arange(5, 50, 5),
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 500, 1000],
    'min_child_samples': [10, 20, 30, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],   # L1 regularization
    'reg_lambda': [0, 0.1, 0.5, 1.0],  # L2 regularization
    'min_split_gain': [0.0, 0.1, 0.2, 0.3],
    'min_child_weight': [0.001, 0.01, 0.1, 1]
}

# Initialize the LightGBM model
lgb_model = lgb.LGBMClassifier()

# Set up Stratified K-Fold cross-validation to ensure balanced splits
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Set up RandomizedSearchCV with cross-validation
random_search = RandomizedSearchCV(
    estimator=lgb_model, 
    param_distributions=param_dist, 
    n_iter=200,  # Increased iterations for a more exhaustive search
    cv=skf, 
    scoring='accuracy', 
    verbose=2, 
    n_jobs=-1,
    random_state=42
)

# Fit the model using random search
random_search.fit(X_train, y_train)

# Get the best estimator (model with the best hyperparameters)
best_lgb_model = random_search.best_estimator_

print(f"Best LightGBM model parameters: {random_search.best_params_}")

# Evaluate the best model on the test set
y_pred = best_lgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized LightGBM Accuracy: {accuracy:.4f}")
print(classification_report(y_test, y_pred))

# Save the best model
joblib.dump(best_lgb_model, 'further_optimized_lightgbm_model.pkl')
print("Further optimized LightGBM model saved as 'lightgbm_model.pkl'")


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[LightGBM] [Info] Number of positive: 24739, number of negative: 24745
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002831 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 182
[LightGBM] [Info] Number of data points in the train set: 49484, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499939 -> initscore=-0.000243
[LightGBM] [Info] Start training from score -0.000243
Best LightGBM model parameters: {'subsample': 1.0, 'reg_lambda': 0.1, 'reg_alpha': 0.5, 'num_leaves': 20, 'n_estimators': 500, 'min_split_gain': 0.1, 'min_child_weight': 1, 'min_child_samples': 20, 'max_depth': 35, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
Optimized LightGBM Accuracy: 0.7562
              precision    recall  f1-score   support

         0.0       0.78   