# XGboost Project

In the two previous projects we saw how we could use a decision tree and then a random forest to improve the prediction of diabetes. We have reached a point where we need to improve. Can boosting be the best alternative to optimize the results?

Boosting is a sequential composition of models (usually decision trees) in which the new model aims to correct the errors of the previous one. This view may be useful in this data set, since several of the assumptions studied in the module are met.

In [18]:
import pandas as pd
import optuna


In [8]:
# Loading data of Diabetes prediction dataset with and without outliers

outliers_data = pd.read_excel("/workspaces/xgboost-project/data/processed/outliers_data.xlsx")
no_outliers_data = pd.read_excel("/workspaces/xgboost-project/data/processed/no_outliers_data.xlsx")

outliers_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777,0.471401
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [9]:
no_outliers_data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,2.959184,122.627551,70.596939,29.066327,130.660714,32.697704,0.491561,29.589286,0.331633
std,2.744605,30.860781,11.306056,10.376823,71.308664,6.310185,0.264094,8.639242,0.471401
min,0.0,56.0,38.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,113.0,32.9,0.4495,26.0,0.0
75%,4.0,143.0,78.0,36.25,172.0,36.8,0.687,34.0,1.0
max,11.0,198.0,102.0,60.0,342.0,49.7,1.292,55.0,1.0


In [12]:
# split both datasets into train and test

from sklearn.model_selection import train_test_split

#data with outliers 
predictors = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target = 'Outcome'

X = outliers_data[predictors]
y = outliers_data[target]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


#data without outliers

predictors1 = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target1 = 'Outcome'

X = no_outliers_data[predictors1]
y = no_outliers_data[target1]

#data with outliers no scaled
X_train_no_out, X_test_no_out, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
Mejores parámetros:  {'n_estimators': 218, 'learning_rate': 0.0899598553283333, 'subsample': 0.6126950362885215, 'max_depth': 6, 'gamma': 0.0012937020191838994, 'min_child_weight': 9.179599573247735, 'colsample_bylevel': 0.6714388804457142}


### Hyperparameter Rendering (Optuna)

In [28]:
import xgboost as xgb


def objective(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.3),
        'objective': 'binary:logistic',
        'subsample': trial.suggest_loguniform('subsample', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 10.0),
        'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.5, 1.0),
        'random_state': 42
    }
    
    # Crear y entrenar el modelo
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train)
    
    # Predecir y calcular la precisión
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    
    return accuracy

In [29]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print('Número de pruebas: ', len(study.trials))
print('Mejores parámetros: ', study.best_params)
print('Mejor precisión: ', study.best_value)

[I 2024-05-15 18:47:13,671] A new study created in memory with name: no-name-a155fa8f-6a14-421f-80f0-a3b89e32e203
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.3),
  'subsample': trial.suggest_loguniform('subsample', 0.5, 1.0),
  'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
  'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 10.0),
  'colsample_bylevel': trial.suggest_loguniform('colsample_bylevel', 0.5, 1.0),
[I 2024-05-15 18:47:13,837] Trial 0 finished with value: 0.7341772151898734 and parameters: {'n_estimators': 237, 'learning_rate': 0.02715502868554827, 'subsample': 0.6558275985657962, 'max_depth': 7, 'gamma': 0.007049039210042, 'min_child_weight': 0.0017425348304262417, 'colsample_bylevel': 0.7038518097196931}. Best is trial 0 with value: 0.7341772151898734.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.3),
  'subsample': trial.suggest_loguniform('subsample', 0.5, 1.0),
  'gamma': trial.suggest_loguni

Número de pruebas:  100
Mejores parámetros:  {'n_estimators': 289, 'learning_rate': 0.14197104731466412, 'subsample': 0.5819998851844587, 'max_depth': 6, 'gamma': 7.689364666271964e-06, 'min_child_weight': 0.00012105325271544954, 'colsample_bylevel': 0.7425979993167775}
Mejor precisión:  0.7848101265822784


#### XGboost Model with outliers

In [33]:
from xgboost import XGBClassifier

model_outliers = XGBClassifier(n_estimators = 200, learning_rate = 0.0899598553283333, subsample = 0.6126950362885215, max_depth = 6, gamma = 0.0012937020191838994, min_child_weight = 9.179599573247735, colsample_bylevel = 0.6714388804457142, random_state = 42)
model_outliers.fit(X_train, y_train)

In [34]:
# prediction

y_pred = model_outliers.predict(X_test)
y_pred

array([0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0])

In [35]:
# accuracy model

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.810126582278481

In [37]:
# save the model

from pickle import dump

dump(model_outliers, open("/workspaces/xgboost-project/models/XGboost_rseed_42.sav", "wb"))