# Model Training

In [1]:
import random
import numpy as np

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

## Load and prepare data

In [2]:
import pandas as pd

df = pd.read_csv("data/adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le


In [4]:
X = df.drop('income', axis=1)
y = df['income']

In [6]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=SEED, stratify=y)

In [15]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

## Model training/ hyperparameter optimization

### Set up Optuna objective for hyperparameter tuning

In [16]:
import mlflow
from sklearn.metrics import accuracy_score
import xgboost as xgb

def objective(trial):
    with mlflow.start_run(nested=True):
        # Hyperparameters
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'use_label_encoder': False,
            'eval_metric': 'logloss',
            'random_state': SEED
        }

        # Model training
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_balanced, y_train_balanced)

        # Prediction
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # MLFlow logging
        mlflow.log_params(params)
        mlflow.log_metric('accuracy', accuracy)

    return accuracy

### Start MLFlow Session

In [17]:
import optuna
import warnings
from datetime import datetime


warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.ERROR)

run_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}_XGBoost"


with mlflow.start_run(run_name=run_name):
    study_name = 'XGBoost'
    study = optuna.create_study(study_name=study_name, direction='maximize')
    study.optimize(objective, n_trials=50)

    best_trial = study.best_trial
    best_params = best_trial.params
    best_accuracy = best_trial.value

    mlflow.log_params(best_params)
    mlflow.log_metric('best_accuracy', best_accuracy)
    mlflow.set_tag('study_name', study_name)

    print(f"Best trial: {best_trial}")
    print(f"Best params: {best_params}")
    print(f"Best accuracy: {best_accuracy}")
    print(f"Study name: {study_name}")
    print(f"MLFlow run_id: {mlflow.active_run().info.run_id}")
    print(f"MLFlow experiment_id: {mlflow.active_run().info.experiment_id}")

  from .autonotebook import tqdm as notebook_tqdm


Best trial: FrozenTrial(number=31, state=1, values=[0.86764254273723], datetime_start=datetime.datetime(2025, 3, 1, 12, 52, 4, 548674), datetime_complete=datetime.datetime(2025, 3, 1, 12, 52, 5, 160830), params={'n_estimators': 145, 'max_depth': 7, 'learning_rate': 0.23338981369622516, 'subsample': 0.9489765994924921, 'colsample_bytree': 0.6813256602690655, 'gamma': 4.733553592549725e-05, 'min_child_weight': 1}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=200, log=False, low=50, step=1), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.3, log=True, low=0.01, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.5, step=None), 'gamma': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1)}, trial