In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# Remote CSV holding the diabetes dataset used throughout this notebook.
url = "https://raw.githubusercontent.com/4GeeksAcademy/decision-tree-project-tutorial/main/diabetes.csv"

# Read the whole file into a DataFrame (requires network access).
diabetes_data = pd.read_csv(filepath_or_buffer=url)

# Plain-text preview of the first five rows as a quick sanity check.
print(diabetes_data.head(n=5))

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [4]:
# Target is the 'Outcome' column; features are every remaining column
# except 'BloodPressure', which is excluded from the model inputs.
y = diabetes_data['Outcome']
X = diabetes_data.drop(columns=['Outcome', 'BloodPressure'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Baseline: XGBoost classifier with library-default hyperparameters and a
# fixed seed, trained on the training split (fit returns the estimator).
model = XGBClassifier(random_state=42).fit(X_train, y_train)

# Baseline predictions on the held-out split, scored in the next cell.
y_pred = model.predict(X_test)

In [6]:
# Score the baseline predictions against the held-out labels.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_f1 = f1_score(y_test, y_pred)

print("Accuracy:", baseline_accuracy)
print("F1 Score:", baseline_f1)

Accuracy: 0.7142857142857143
F1 Score: 0.6271186440677966


In [32]:
# Search space for the grid search: seven hyperparameters with three
# candidate values each, i.e. 3**7 = 2187 combinations in total.
param_grid = {
    'n_estimators':     [25, 50, 75],
    'learning_rate':    [0.01, 0.1, 0.8],
    'max_depth':        [1, 3, 5],
    'min_child_weight': [1, 2, 3],
    'gamma':            [0, 0.1, 0.2],
    'subsample':        [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
}


In [33]:
# Exhaustive search over every combination in `param_grid`, scoring each
# candidate with 3-fold cross-validation on the training split only.
# `n_jobs=-1` runs the fits in parallel on all available processors.
# `verbose=1` keeps the one-line "Fitting K folds for each of N candidates"
# summary but suppresses the per-fit "[CV] END ..." lines, which would
# otherwise add thousands of log lines to the notebook output.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 2187 candidates, totalling 6561 fits
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=1, n_estimators=25, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=1, n_estimators=25, subsample=0.7; total time=   0.0s
[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=1, n_estimators=25, subsample=0.8; total time=   0.0s[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=1, n_estimators=25, subsample=0.7; total time=   0.0s

[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=1, n_estimators=25, subsample=0.9; total time=   0.0s[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=1, min_child_weight=1, n_estimators=25, subsample=0.8; total time=   0.0s

[CV] END colsample_bytree=0.7, gamma=0, learning_rate=0.01, max_depth=1

In [35]:
# Take the winning estimator from the grid search and predict on the
# held-out split. Note: this rebinds `y_pred`, replacing the baseline
# predictions computed earlier.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute the summary values first, then print them in one place.
tuned_params = grid_search.best_params_
tuned_accuracy = accuracy_score(y_test, y_pred)
tuned_report = classification_report(y_test, y_pred)

print("Best Parameters:", tuned_params)
print("Accuracy:", tuned_accuracy)
print("Classification Report:\n", tuned_report)


Best Parameters: {'colsample_bytree': 0.9, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.9}
Accuracy: 0.7402597402597403
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.79      0.80        99
           1       0.63      0.65      0.64        55

    accuracy                           0.74       154
   macro avg       0.72      0.72      0.72       154
weighted avg       0.74      0.74      0.74       154



In [None]:
import pickle

# Destination file for the tuned classifier (written to the working directory).
saved_model = 'optimized_XGBoost_model.pkl'

# Persist the tuned estimator selected by the grid search.
# GridSearchCV fits clones of the estimator it is given, so `model` is still
# the baseline classifier trained with default hyperparameters; the optimized
# model that the file name refers to is `best_model`.
with open(saved_model, 'wb') as file:
    pickle.dump(best_model, file)