# Optimization for the Gradient Boosting model

## Loading and running the Gradient Boosting model

In [None]:
# Import the dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Location of the diabetes health-indicators CSV (BRFSS 2015 per the file name),
# given relative to the directory this notebook is run from
file_path = Path("../Data/diabetes_binary_health_indicators_BRFSS2015.csv")

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Preview the first rows; as the last expression of the cell it is what gets displayed
df.head()

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna()

In [None]:
# Target: the binary diabetes label; features: every remaining column
y = df['Diabetes_binary']
X = df.drop(columns=['Diabetes_binary'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the previously saved gradient boosting model from disk.
# NOTE(review): this reads 'gradient_boosting.pkl' from the working directory, while the
# final cell of this notebook writes to 'Models/gradient_boosting.pkl' — confirm which
# location is intended.
# joblib.load unpickles the file, so only load model files from a trusted source.
model = joblib.load('gradient_boosting.pkl')

In [None]:
# Make predictions on the test set with the model loaded from disk.
# (The loaded estimator is bound to `model`; `gb_clf` is only created later,
# in the optimization section, so it cannot be used here.)
y_pred = model.predict(X_test)

In [None]:
# Score the current predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the whole report in one call, one item per line
print(
    f'Accuracy: {accuracy}',
    'Confusion Matrix:',
    conf_matrix,
    'Classification Report:',
    class_report,
    sep='\n',
)

## Optimizing the model

In [None]:
# Hyperparameter values to search: 3 x 3 x 3 = 27 candidate combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

In [None]:
# Initialize the base estimator that the grid search will clone and tune.
# A fixed random_state (the same seed used for the train/test split) keeps the
# fitted trees, and therefore the search results, reproducible between runs.
gb_clf = GradientBoostingClassifier(random_state=42)

In [None]:
# Set up an exhaustive, 5-fold cross-validated search over param_grid,
# using all available cores and printing per-fit progress
grid_search = GridSearchCV(
    estimator=gb_clf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split. Every parameter combination is
# cross-validated (27 combinations x 5 folds as configured above), so this
# cell can take a long time to finish.
grid_search.fit(X_train, y_train)

In [None]:
# Retrieve the parameter combination the search selected as best, and print it
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

In [None]:
# Reuse the estimator that GridSearchCV already refit with the best parameters.
# With the default refit=True, best_estimator_ is a GradientBoostingClassifier
# configured with best_params_ and fitted on the whole training split, so
# training an identical model a second time would only repeat that work.
optimized_gb_clf = grid_search.best_estimator_

In [None]:
# Predict test-set labels with the tuned model (rebinds the earlier y_pred)
y_pred = optimized_gb_clf.predict(X_test)

In [None]:
# Score the tuned model's predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the whole report in one call, one item per line
print(
    f'Accuracy: {accuracy}',
    'Confusion Matrix:',
    conf_matrix,
    'Classification Report:',
    class_report,
    sep='\n',
)

In [None]:
# Overwrite the saved model only if the tuned model scores better than the
# original model on the test set.
# Both accuracies are recomputed here because the `accuracy` variable is reused
# by both evaluation cells and no longer holds the original model's score.
baseline_accuracy = accuracy_score(y_test, model.predict(X_test))
optimized_accuracy = accuracy_score(y_test, optimized_gb_clf.predict(X_test))

if optimized_accuracy > baseline_accuracy:
    model_path = 'Models/gradient_boosting.pkl'
    # joblib.dump does not create missing directories, so make sure it exists
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(optimized_gb_clf, model_path)
    print(
        f'Saved optimized model to {model_path} '
        f'(accuracy {optimized_accuracy} > {baseline_accuracy})'
    )
else:
    print(
        f'Kept the original model '
        f'(optimized accuracy {optimized_accuracy} <= {baseline_accuracy})'
    )