In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import xgboost as xgb
import wandb
import os
import optuna
import joblib

# Downloading file from Wandb 

In [29]:
run = wandb.init(project="risk_credit", job_type="EDA")

In [30]:
WANDB_API_KEY=os.environ.get('1d620fa1eff54f2f0ba01b14c81969f4ce70bd6c')
!wandb login --relogin 1d620fa1eff54f2f0ba01b14c81969f4ce70bd6c

wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\PC\_netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin


In [31]:
artifact_X_train_new = wandb.use_artifact('risk_credit/X_train_new.csv:latest', type='Feature engineering')
artifact_X_test_new = wandb.use_artifact('risk_credit/X_test_new.csv:latest', type='Feature engineering')
artifact_y_train_new = wandb.use_artifact('risk_credit/y_train_new.csv:latest', type='Feature engineering')
artifact_y_test_new = wandb.use_artifact('risk_credit/y_test_new.csv:latest', type='Feature engineering')

X_train_new_dir = artifact_X_train_new.download()
X_test_new_dir = artifact_X_test_new.download()
y_train_new_dir = artifact_y_train_new.download()
y_test_new_dir = artifact_y_test_new.download()

X_train_new_path = os.path.join(X_train_new_dir, "X_train_new.csv")
X_test_new_path = os.path.join(X_test_new_dir, "X_test_new.csv")
y_train_new_path = os.path.join(y_train_new_dir, "y_train_new.csv")
y_test_new_path = os.path.join(y_test_new_dir, "y_test_new.csv")

X_train = pd.read_csv(X_train_new_path)
X_test = pd.read_csv(X_test_new_path)
y_train = pd.read_csv(y_train_new_path)
y_test = pd.read_csv(y_test_new_path)

[34m[1mwandb[0m: Downloading large artifact X_train_new.csv:latest, 370.88MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:2.0
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


# I. Model

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)

params = {
    'objective': 'binary:logistic',  
    'max_depth': 5,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
    'eval_metric': 'logloss'  
}

def f1_metric(y_pred, dtrain):
    y_true = dtrain.get_label()
    y_pred_binary = (y_pred > 0.5).astype(int)  
    f1 = f1_score(y_true, y_pred_binary, average='weighted')  
    return 'f1', f1

cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    nfold=5,
    early_stopping_rounds=10,
    feval=f1_metric,    
    maximize=True,     
    as_pandas=True,
    seed=42
)

  evals: Optional[Sequence[Tuple[DMatrix, str]]] = None,


    train-logloss-mean  train-logloss-std  ...  test-f1-mean  test-f1-std
0             0.648894           0.000261  ...      0.333334     0.001484
1             0.613051           0.000648  ...      0.333334     0.001484
2             0.582499           0.000482  ...      0.534393     0.007394
3             0.556470           0.000444  ...      0.611587     0.016646
4             0.532914           0.000982  ...      0.684368     0.014713
..                 ...                ...  ...           ...          ...
95            0.224377           0.000297  ...      0.904382     0.000729
96            0.223812           0.000262  ...      0.904574     0.000793
97            0.223324           0.000288  ...      0.904694     0.000852
98            0.222809           0.000368  ...      0.904775     0.000733
99            0.222244           0.000460  ...      0.904941     0.000697

[100 rows x 8 columns]
Best F1 Score: 0.9049


# II. Optimization 

In [34]:
def objective(trial):
    param = {
        'objective': 'binary:logistic',    
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),  
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),    
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'eval_metric': 'logloss',
        'seed': 42,
        'verbosity': 0
    }

    dtrain = xgb.DMatrix(X_train, label=y_train)

    cv_results = xgb.cv(
        params=param,
        dtrain=dtrain,
        num_boost_round=100,
        nfold=5,
        stratified=True,    
        early_stopping_rounds=10,
        feval=f1_metric,  
        maximize=True,
        as_pandas=True,
        seed=42
    )

    return cv_results['test-f1-mean'].max()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best parameters found: ", study.best_params)

[I 2025-04-27 21:56:23,760] A new study created in memory with name: no-name-67ac7d9c-6dbd-40ea-98e9-4c0607c4ee3d
  evals: Optional[Sequence[Tuple[DMatrix, str]]] = None,
[I 2025-04-27 21:57:47,402] Trial 0 finished with value: 0.9141222000000001 and parameters: {'max_depth': 7, 'learning_rate': 0.1802994420956606, 'subsample': 0.983137312560146, 'colsample_bytree': 0.8588711963856961, 'gamma': 2.5266536576026555, 'lambda': 0.11192814272790903, 'alpha': 6.048819057060116, 'min_child_weight': 4}. Best is trial 0 with value: 0.9141222000000001.
  evals: Optional[Sequence[Tuple[DMatrix, str]]] = None,
[I 2025-04-27 21:59:03,985] Trial 1 finished with value: 0.8842301999999999 and parameters: {'max_depth': 5, 'learning_rate': 0.045329873648981424, 'subsample': 0.9046893492835408, 'colsample_bytree': 0.6613864449364464, 'gamma': 4.031963706379152, 'lambda': 2.9642477387553726e-06, 'alpha': 0.05669729637519211, 'min_child_weight': 5}. Best is trial 0 with value: 0.9141222000000001.
  evals: 

Best parameters found:  {'max_depth': 9, 'learning_rate': 0.29657245912968044, 'subsample': 0.923977221480187, 'colsample_bytree': 0.5433451887033703, 'gamma': 0.6742139265636637, 'lambda': 4.4000011182175415e-07, 'alpha': 2.0735422839625516e-06, 'min_child_weight': 3}


In [37]:
model_optimized = xgb.XGBClassifier(**study.best_params)
model_optimized.fit(X_train, y_train)

y_pred_test = model_optimized.predict(X_test)

f1 = f1_score(y_test, y_pred_test, average='weighted')
acc = accuracy_score(y_test, y_pred_test)

print(f"Optimized Model Test F1 Score: {f1:.4f}")
print(f"Optimized Model Test Accuracy: {acc:.4f}")

Optimized Model Test F1 Score: 0.8363
Optimized Model Test Accuracy: 0.8622


# Uploading model, best parameters, metrics to Wandb

In [38]:
run = wandb.init(project="risk_credit", job_type="model-training", name="xgboost_optuna_best_model")

wandb.config.update(study.best_params)
wandb.log({
    "f1_score": f1,
    "accuracy": acc,
})

model_filename = "best_xgb_model.pkl"
joblib.dump(model_optimized, model_filename)

artifact = wandb.Artifact(
    name="xgb_optuna_model",
    type="model",
    description="Best XGBoost model after Optuna tuning"
)
artifact.add_file(model_filename)
wandb.log_artifact(artifact)

run.finish()

0,1
accuracy,▁
f1_score,▁

0,1
accuracy,0.86225
f1_score,0.83627
