In [None]:
import dash
import dash_html_components as html
from dash import dcc as dcc
import dash_bootstrap_components as dbc
import webbrowser
import pandas as pd
import pickle
from dash.dependencies import Input, Output, State
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# Step 1: Load the dataset
data = pd.read_csv("/content/diabetes_prediction_dataset.csv")

# Fail fast if the target column is missing; every later step depends on it.
if 'diabetes' not in data.columns:
    raise KeyError("Column 'diabetes' not found in the loaded dataset. Please check the column names in your CSV file.")

# Step 2: Exploratory Data Analysis (EDA)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() emitted an extra "None" line.
data.info()
print(data.describe())
print(data['diabetes'].value_counts())

# Step 3: Data Preprocessing
# Step 3.2: Outlier Observation Analysis
numeric_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Select only the desired features (the four numeric inputs plus the target).
selected_features = numeric_columns + ['diabetes']
# .copy() makes this an independent frame, so adding the temporary flag
# columns below does not assign onto a slice of the original DataFrame.
data = data[selected_features].copy()

# random_state pins the detector's internal shuffling so the same rows are
# dropped on every run and downstream metrics stay reproducible.
outlier_detector = EllipticEnvelope(contamination=0.05, random_state=42)
data['outlier'] = outlier_detector.fit_predict(data[numeric_columns])
print("Outlier Analysis:\n", data['outlier'].value_counts())

# Step 3.3: Local Outlier Factor (LOF)
lof = LocalOutlierFactor(contamination=0.05)
data['lof'] = lof.fit_predict(data[numeric_columns])
print("LOF Analysis:\n", data['lof'].value_counts())

# Remove outliers: keep only rows that neither detector flagged with -1.
data = data[(data['outlier'] != -1) & (data['lof'] != -1)]

# Drop temporary columns
data = data.drop(['outlier', 'lof'], axis=1)

# Step 5: One Hot Encoding
categorical_columns = []  # No categorical columns in the selected features
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Only select the desired features. Indexing with the explicit column list
# raises KeyError by itself if 'diabetes' (or any feature) were missing, so
# the repeated "column not found" checks that used to follow each step were
# unreachable and have been removed.
data = data[selected_features]

# Step 6: Base Models
# Split features/target 80/20 with a fixed seed, then fit and score every
# candidate classifier on the same split.
X = data.drop('diabetes', axis=1)
y = data['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: the 'Linear Regression' key actually holds a LogisticRegression
# classifier; the label is kept unchanged because it is used in printed output.
models = {
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(),
    # The default iteration cap is too low for these unscaled features and the
    # solver stops before converging; a higher cap gives it room to finish.
    'Linear Regression': LogisticRegression(max_iter=1000),
    # Seed the stochastic estimators so accuracies are reproducible.
    'Random Forest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'LightGBM': LGBMClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'{model_name} Accuracy: {accuracy * 100:.2f}%')

# Step 7: Model Tuning
# NOTE: no hyperparameter search is performed here. Each estimator below is
# simply fitted once on the training split; only the random forest receives
# an explicit parameter dictionary.
# Model Tuning for Naive Bayes
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Model Tuning for SVM
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Model Tuning for Logistic Regression
# A higher max_iter lets the solver converge instead of stopping at the
# default iteration cap.
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Model Tuning for K Nearest Neighbor (KNN)
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

# Model Tuning for Decision Tree
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

# Model Tuning for LightGBM
lgbm_classifier = LGBMClassifier()
lgbm_classifier.fit(X_train, y_train)

# Model Tuning for XGBoost
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Model Tuning for Random Forest
rf_params = {
    'n_estimators': 100,       # Adjust the number of trees in the forest
    'max_depth': None,         # Adjust the maximum depth of the tree
    'min_samples_split': 2,    # Adjust the minimum number of samples required to split an internal node
    'min_samples_leaf': 1      # Adjust the minimum number of samples required to be at a leaf node
}

rf_classifier = RandomForestClassifier(**rf_params, random_state=42)
# Fit the forest like every other classifier in this step; it was previously
# constructed but left unfitted, so any later use would have failed.
rf_classifier.fit(X_train, y_train)


# Step 8: Comparison of Final Models

best_accuracy = 0.0  # Best test accuracy seen so far
best_model_name = ""  # Name of the model that achieved it

for model_name, model in models.items():
    # Every estimator in `models` was already fitted on X_train in Step 6 of
    # this same script, so it is scored directly rather than retrained from
    # scratch a second time.
    predictions = model.predict(X_test)

    # Calculate accuracy within the loop
    accuracy = accuracy_score(y_test, predictions)

    # Print accuracy
    print(f'\n{model_name} Accuracy: {accuracy * 100:.2f}%')

    # Print classification report
    print(f'\n{model_name}:\n', classification_report(y_test, predictions))

    # Strict ">" keeps the first model in dictionary order when scores tie.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = model_name

# Stop here with a clear message instead of failing later on models[""].
if not best_model_name:
    raise RuntimeError("No model scored above 0% accuracy; nothing to report or save.")


# Step 9: Reporting
print(f"\nBest-performing model: {best_model_name} with accuracy {best_accuracy * 100:.2f}%")

# Save the best-performing model and its scaler to pickle files.
#
# The web app standardises every request with scaler.pkl before passing it to
# best_model.pkl (see preprocess_input / check_data). The models above were
# trained on raw, unscaled features, so the winner is retrained here on
# standardised features; otherwise the served model would receive inputs on a
# completely different scale from what it learned.
from sklearn.base import clone

# Fit the scaler on the training split only, so test rows do not leak into
# the scaling statistics.
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)
X_scaled = scaler.transform(X)

# clone() yields an unfitted copy with the same hyperparameters, leaving the
# estimator stored in `models` untouched.
best_model = clone(models[best_model_name])
best_model.fit(X_train_scaled, y_train)
scaled_accuracy = accuracy_score(y_test, best_model.predict(X_test_scaled))
print(f"{best_model_name} retrained on standardised features: accuracy {scaled_accuracy * 100:.2f}%")

with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

# Save the StandardScaler to a pickle file
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Dash Web Application
# Module-level Dash app object; the callback below registers against it and
# the __main__ guard assigns its title/layout and starts the server. The
# Bootstrap stylesheet is loaded so the dbc components render styled.
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])

def preprocess_input(age, bmi, hba1c, blood_glucose):
    """Build a one-row feature frame and scale it with the saved scaler.

    The four values are placed in a single-row DataFrame whose columns are
    'age', 'bmi', 'HbA1c_level' and 'blood_glucose_level' (in that order).
    The scaler is unpickled from 'scaler.pkl' in the working directory on
    every call and its ``transform`` output is returned as a DataFrame with
    the same column names.
    """
    feature_values = {
        'age': age,
        'bmi': bmi,
        'HbA1c_level': hba1c,
        'blood_glucose_level': blood_glucose,
    }
    # Each value is wrapped in a list so the frame has exactly one row.
    raw_frame = pd.DataFrame({name: [value] for name, value in feature_values.items()})

    with open('scaler.pkl', 'rb') as handle:
        fitted_scaler = pickle.load(handle)

    scaled_values = fitted_scaler.transform(raw_frame)
    return pd.DataFrame(scaled_values, columns=raw_frame.columns)

def check_data(age, bmi, hba1c, blood_glucose):
    """Classify one patient with fixed thresholds and the saved model.

    Returns a 4-tuple ``(age_group, diabetic_status, prediabetic_status,
    prediction)`` where the first three are threshold-based labels computed
    from the raw inputs and ``prediction`` is whatever the model unpickled
    from 'best_model.pkl' returns for the scaled input row.
    """
    with open('best_model.pkl', 'rb') as handle:
        classifier = pickle.load(handle)

    # Rule-based labels derived directly from the raw inputs.
    if age >= 18:
        age_group = 'Adult'
    else:
        age_group = 'Child'

    if hba1c > 6.0 or blood_glucose > 140:
        diabetic_status = 'Diabetic'
    else:
        diabetic_status = 'Non-Diabetic'

    if 5.7 < hba1c <= 6.0 or 100 < blood_glucose <= 140:
        prediabetic_status = 'Pre-Diabetic'
    else:
        prediabetic_status = 'Non-Diabetic'

    # Model-based prediction on the scaled feature row.
    scaled_row = preprocess_input(age, bmi, hba1c, blood_glucose)
    prediction = classifier.predict(scaled_row)

    return age_group, diabetic_status, prediabetic_status, prediction

def UI_main():
    """Assemble and return the page layout for the prediction form.

    The layout is a dbc.Container wrapping one html.Div that holds, in order:
    a heading alert, four numeric label/input/line-break groups, the submit
    button and a disabled "Result" button that the callback later updates.
    """
    # (label text, component id, placeholder) for each numeric field.
    field_specs = (
        ("Age", "age_input", "Age of the patient"),
        ("BMI", "BMI_input", "BMI of the patient"),
        ("HbA1c Level", "HbA1c_input", "HbA1c Level of the patient"),
        ("Blood Glucose Level", "BloodGlucose_input", "Blood Glucose Level of the patient"),
    )

    children = [
        dbc.Alert(children="Diabetes Prediction Checker", id="Main_head", color="success", style={'fontSize': '24px'}),
    ]
    for label_text, input_id, hint in field_specs:
        children.append(dbc.Label(label_text))
        children.append(dbc.Input(id=input_id, placeholder=hint, type="number", min=0))
        children.append(html.Br())

    children.append(
        dbc.Button("Submit", id="button", color='secondary', className="mt-3", style={'width': '100%', 'fontSize': '24px'})
    )
    children.append(
        dbc.Button(children="Result", id='result', color='light', className="mt-3", disabled=True,
                   style={'width': '100%', 'fontSize': '20px'})
    )

    form = html.Div(children, className="p-5", style={'marginTop': '20px'})
    return dbc.Container(form)

@app.callback(
    Output('result', 'children'),
    Output('result', 'color'),
    Output('result', 'disabled'),
    [
        # Only the button triggers the callback ...
        Input('button', 'n_clicks'),
    ],
    [
        # ... the field values are read as State, so typing in a field no
        # longer re-runs the prediction after the first click.
        State('age_input', 'value'),
        State('BMI_input', 'value'),
        State('HbA1c_input', 'value'),
        State('BloodGlucose_input', 'value'),
    ]
)
def review_update(n_clicks, age, bmi, hba1c, blood_glucose, n_clicks_previous=None):
    """Update the result button when the submit button is clicked.

    Args:
        n_clicks: Click count of the submit button (None before any click).
        age, bmi, hba1c, blood_glucose: Current values of the four numeric
            inputs; None while a field is empty.
        n_clicks_previous: Unused. Kept with a default only for backward
            compatibility; the button has no such property, so the State
            that used to supply it was dropped.

    Returns:
        A ``(children, color, disabled)`` tuple for the 'result' button:
        the neutral placeholder before any click, a warning when a field is
        empty, otherwise the patient summary coloured 'success' for a
        predicted class of 0 and 'danger' for any other class.
    """
    print("Callback function called.")
    print(f"Inputs: n_clicks={n_clicks}, age={age}, bmi={bmi}, hba1c={hba1c}, blood_glucose={blood_glucose}, n_clicks_previous={n_clicks_previous}")

    # Initial page load (or no click yet): show the neutral placeholder.
    if not n_clicks:
        return "", "light", True

    # check_data compares the values numerically, which raises on None.
    if any(value is None for value in (age, bmi, hba1c, blood_glucose)):
        return "Please fill in all four fields before submitting.", "warning", True

    age_group, diabetic_status, prediabetic_status, response = check_data(age, bmi, hba1c, blood_glucose)
    print("Response:", response)

    if response is None or len(response) == 0:
        return "", "light", True

    # Read the predicted class explicitly. Truth-testing the prediction array
    # itself is False for array([0]), which previously made the "success"
    # branch unreachable and hid every negative prediction.
    predicted_class = int(response[0])
    message = f'The patient is {age_group} and {diabetic_status} ({prediabetic_status})'
    color = 'success' if predicted_class == 0 else 'danger'
    return message, color, False

if __name__ == '__main__':
    from threading import Timer

    app.title = "Diabetes Prediction"
    app.layout = UI_main()
    # app.run_server() blocks until the server is shut down, so a browser
    # call placed after it never runs while the app is up. Schedule the
    # browser on a short timer first, then start the server.
    Timer(1, webbrowser.open_new, args=["http://127.0.0.1:8050/"]).start()
    app.run_server()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB
None
                 age  hypertension  heart_disease            bmi  \
count  100000.000000  100000.00000  100000.000000  100000.000000   
mean       41.885856       0.07485       0.039420      27.320767   
std        22.516840       0.26315       0.194593       6.6

<IPython.core.display.Javascript object>

In [None]:
pip install dash_bootstrap_components

Collecting dash_bootstrap_components
  Downloading dash_bootstrap_components-1.5.0-py3-none-any.whl (221 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m221.2/221.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dash>=2.0.0 (from dash_bootstrap_components)
  Downloading dash-2.16.1-py3-none-any.whl (10.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
Collecting dash-html-components==2.0.0 (from dash>=2.0.0->dash_bootstrap_components)
  Downloading dash_html_components-2.0.0-py3-none-any.whl (4.1 kB)
Collecting dash-core-components==2.0.0 (from dash>=2.0.0->dash_bootstrap_components)
  Downloading dash_core_components-2.0.0-py3-none-any.whl (3.8 kB)
Collecting dash-table==5.0.0 (from dash>=2.0.0->dash_bootstrap_components)
  Downloading dash_table-5.0.0-py3-none-any.whl (3.9 kB)
Collecting retrying (from dash>=2.0.0->dash_bootstrap_components)
  Downloading retrying-1.3.4-py

In [None]:
pip install dash_html_components



In [None]:
pip install dash



In [None]:
# pickle is part of the Python standard library, so there is no package to
# install from PyPI; `import pickle` works as-is.

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0m

In [None]:
!pip install pandas numpy scikit-learn lightgbm xgboost Flask




In [None]:
# pickle is part of the Python standard library, so there is no package to
# install from PyPI; `import pickle` works as-is.

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0m

In [None]:

import pickle

# Persist the estimators held in the existing `models` dictionary. The
# dictionary is deliberately NOT rebuilt here: constructing fresh estimators
# in this cell replaced the trained ones with unfitted instances, so the
# pickle files contained models that could not predict.
try:
    trained_models = models
except NameError as err:
    raise RuntimeError(
        "'models' is not defined; run the model-training cell before saving."
    ) from err

# Save each model to a pickle file named '<model name>_model.pkl'.
for model_name, model in trained_models.items():
    with open(f'{model_name}_model.pkl', 'wb') as file:
        pickle.dump(model, file)

print("Models saved as pickle files.")


Models saved as pickle files.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

In [None]:
# Step 1: Read the diabetes CSV into a DataFrame and display it
data = pd.read_csv('/content/diabetes_prediction_dataset.csv')
print(data)

# Abort early when the target column is absent from the file.
if not data.columns.isin(['diabetes']).any():
    raise KeyError("Column 'diabetes' not found in the loaded dataset.")


In [None]:
# Step 2: Exploratory Data Analysis (EDA)

# DataFrame.info() prints its report itself and returns None, so it is called
# bare; print(data.info()) emitted an extra "None" line after the report.
data.info()
print(data.describe())
print(data['diabetes'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB
None
                 age  hypertension  heart_disease            bmi  \
count  100000.000000  100000.00000  100000.000000  100000.000000   
mean       41.885856       0.07485       0.039420      27.320767   
std        22.516840       0.26315       0.194593       6.6

In [None]:
# Step 3: Data Preprocessing

# Step 3.1: Missing Observation Analysis

# Count nulls per column (reported only; nothing is imputed in this step).
missing_values = data.isnull().sum()
print("Missing Values:\n", missing_values)

# Step 3.2: Outlier Observation Analysis

numeric_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
# random_state pins the detector's internal shuffling so the same rows are
# flagged on every run; without it the filtered dataset, and every metric
# computed from it, can change between executions.
outlier_detector = EllipticEnvelope(contamination=0.05, random_state=42)
data['outlier'] = outlier_detector.fit_predict(data[numeric_columns])
print("Outlier Analysis:\n", data['outlier'].value_counts())

# Step 3.3: Local Outlier Factor (LOF)

from sklearn.neighbors import LocalOutlierFactor
lof = LocalOutlierFactor(contamination=0.05)
data['lof'] = lof.fit_predict(data[numeric_columns])
print("LOF Analysis:\n", data['lof'].value_counts())

# Remove outliers: both detectors mark outliers with -1.
data = data[data['outlier'] != -1]
data = data[data['lof'] != -1]

# Drop temporary columns
data = data.drop(['outlier', 'lof'], axis=1)

if 'diabetes' not in data.columns:
    raise KeyError("Column 'diabetes' not found after preprocessing.")

Missing Values:
 gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64
Outlier Analysis:
  1    95000
-1     5000
Name: outlier, dtype: int64
LOF Analysis:
  1    95000
-1     5000
Name: lof, dtype: int64


In [None]:
# Step 4: One Hot Encoding
# Expand the listed columns into indicator columns, dropping the first level
# of each so the encoded columns are not redundant.
categorical_columns = ['gender', 'hypertension', 'heart_disease', 'smoking_history']
data = pd.get_dummies(data=data, columns=categorical_columns, drop_first=True)

# The target must survive encoding untouched.
if not data.columns.isin(['diabetes']).any():
    raise KeyError("Column 'diabetes' not found after one-hot encoding.")


In [None]:
# Step 5: Base Models
# Split features/target 80/20 with a fixed seed, then fit and score every
# candidate classifier on the same split.
X = data.drop('diabetes', axis=1)
y = data['diabetes']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: the 'Linear Regression' key actually holds a LogisticRegression
# classifier; the label is kept unchanged because it is used in printed output.
models = {
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(),
    # The default iteration cap triggered "TOTAL NO. of ITERATIONS REACHED
    # LIMIT" on these unscaled features; a higher cap gives the solver room
    # to converge.
    'Linear Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'LightGBM': LGBMClassifier(),
    'XGBoost': XGBClassifier()
}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'{model_name} Accuracy: {accuracy * 100:.2f}%')

Naive Bayes Accuracy: 85.98%
SVM Accuracy: 95.90%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Linear Regression Accuracy: 96.08%
Random Forest Accuracy: 97.11%
KNN Accuracy: 96.21%
Decision Tree Accuracy: 95.34%
[LightGBM] [Info] Number of positive: 3378, number of negative: 69190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 72568, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.046549 -> initscore=-3.019573
[LightGBM] [Info] Start training from score -3.019573
LightGBM Accuracy: 97.35%
XGBoost Accuracy: 97.28%


In [None]:
# Step 6: Model Tuning

# NOTE: no hyperparameter search is performed here. Each estimator below is
# simply fitted once on the training split; only the random forest receives
# an explicit parameter dictionary.
# Model Tuning for Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Model Tuning for SVM
from sklearn.svm import SVC
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

# Model Tuning for Logistic Regression
# A higher max_iter addresses the "TOTAL NO. of ITERATIONS REACHED LIMIT"
# convergence warning raised with the default iteration cap.
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(max_iter=1000)
lr_classifier.fit(X_train, y_train)

# Model Tuning for K Nearest Neighbor (KNN)
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

# Model Tuning for Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

# Model Tuning for LightGBM
from lightgbm import LGBMClassifier
lgbm_classifier = LGBMClassifier()
lgbm_classifier.fit(X_train, y_train)

# Model Tuning for XGBoost
from xgboost import XGBClassifier
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train)

# Model Tuning for Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_params = {
    'n_estimators': 100,       # Adjusting the number of trees in the forest
    'max_depth': None,         # Adjusting the maximum depth of the tree
    'min_samples_split': 2,    # Adjusting the minimum number of samples required to split an internal node
    'min_samples_leaf': 1      # Adjusting the minimum number of samples required to be at a leaf node
}

rf_classifier = RandomForestClassifier(**rf_params, random_state=42)
rf_classifier.fit(X_train, y_train)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 3378, number of negative: 69190
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008868 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 72568, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.046549 -> initscore=-3.019573
[LightGBM] [Info] Start training from score -3.019573


In [None]:
# Step 7: Comparison of Final Models

# Refit each candidate on the training split and print its per-class
# precision/recall/F1 report for the held-out split.
for model_name, model in models.items():
    predictions = model.fit(X_train, y_train).predict(X_test)
    report = classification_report(y_test, predictions)
    print(f'\n{model_name}:\n', report)



Naive Bayes:
               precision    recall  f1-score   support

           0       0.98      0.87      0.92     17344
           1       0.19      0.65      0.29       799

    accuracy                           0.86     18143
   macro avg       0.58      0.76      0.61     18143
weighted avg       0.95      0.86      0.89     18143


SVM:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     17344
           1       1.00      0.07      0.13       799

    accuracy                           0.96     18143
   macro avg       0.98      0.53      0.55     18143
weighted avg       0.96      0.96      0.94     18143



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Linear Regression:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     17344
           1       0.63      0.26      0.37       799

    accuracy                           0.96     18143
   macro avg       0.80      0.63      0.67     18143
weighted avg       0.95      0.96      0.95     18143


Random Forest:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99     17344
           1       0.83      0.43      0.57       799

    accuracy                           0.97     18143
   macro avg       0.90      0.72      0.78     18143
weighted avg       0.97      0.97      0.97     18143


KNN:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     17344
           1       0.68      0.27      0.38       799

    accuracy                           0.96     18143
   macro avg       0.82      0.63      0.68     18143
weighted avg       0.95      

In [None]:
# Step 8: Reporting

# Refit and score every model on the held-out split, recording its accuracy.
accuracy_by_model = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_by_model[model_name] = accuracy

# Pick the top scorer; the strict comparison keeps the earliest model when
# accuracies tie, and leaves the name empty if nothing beats zero.
best_model_name = ""
best_model_accuracy = 0
for candidate_name, candidate_accuracy in accuracy_by_model.items():
    if candidate_accuracy > best_model_accuracy:
        best_model_name = candidate_name
        best_model_accuracy = candidate_accuracy

print("\n Reporting")
print(f"In conclusion, we explored the dataset, handled missing values, identified and removed outliers, performed feature engineering, and applied one-hot encoding.")
print(f"We trained various machine learning models, including Naive Bayes, SVM, Linear Regression, Random Forest, KNN, Decision Tree, LightGBM, and XGBoost.")

if not best_model_name:
    print("No best-performing model found. Please review the results of each model.")
else:
    print(f"After tuning hyperparameters and evaluating the models, we observed that {best_model_name} achieved the highest accuracy of {best_model_accuracy * 100:.2f}%.")
    print(f"This model can be considered for further deployment and usage in predicting diabetes based on the given features.")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 3378, number of negative: 69190
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005849 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 72568, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.046549 -> initscore=-3.019573
[LightGBM] [Info] Start training from score -3.019573

 Reporting
In conclusion, we explored the dataset, handled missing values, identified and removed outliers, performed feature engineering, and applied one-hot encoding.
We trained various machine learning models, including Naive Bayes, SVM, Linear Regression, Random Forest, KNN, Decision Tree, LightGBM, and XGBoost.
After tuning hyperparameters and evaluating the models, we observed that LightGBM achieved the highest accuracy of 97.35%.
Th