**Binary Prediction with a Rainfall Dataset - Kaggle Competition**

Author: Tihoc Andrei

In [16]:
import pandas as pd
import optuna
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

In [17]:
# Load dataset
def load_data():
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    X = train.drop(columns="rainfall")
    y = train["rainfall"]
    return X, y, test

# Preprocessing pipeline for numeric data
def create_preprocessor(X):
    numeric_features = X.columns.tolist()
    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_features)
    ])
    return preprocessor

In [18]:
# Objective function for XGBoost
def objective_xgb(trial, X_train, X_valid, y_train, y_valid, preprocessor):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(**params, random_state=RANDOM_STATE))
    ])
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_prob)

In [19]:
# Objective function for RandomForest
def objective_rf(trial, X_train, X_valid, y_train, y_valid, preprocessor):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
    }
    model = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(**params, random_state=RANDOM_STATE))
    ])
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, y_prob)

In [21]:
def main():
    X, y, test = load_data()
    preprocessor = create_preprocessor(X)
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

    # Optimize XGB
    study_xgb = optuna.create_study(direction="maximize")
    study_xgb.optimize(lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid, preprocessor), n_trials=30)
    print("Best AUC XGBoost:", study_xgb.best_value)
    print("Best Params XGBoost:", study_xgb.best_params)

    # Optimize RF
    study_rf = optuna.create_study(direction="maximize")
    study_rf.optimize(lambda trial: objective_rf(trial, X_train, X_valid, y_train, y_valid, preprocessor), n_trials=30)
    print("Best AUC RandomForest:", study_rf.best_value)
    print("Best Params RF:", study_rf.best_params)

    # Final models with best params
    xgb_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", XGBClassifier(**study_xgb.best_params, random_state=RANDOM_STATE))
    ])

    rf_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("classifier", RandomForestClassifier(**study_rf.best_params, random_state=RANDOM_STATE))
    ])

    xgb_pipeline.fit(X_train, y_train)
    rf_pipeline.fit(X_train, y_train)

    # Evaluation
    print("XGBoost Accuracy:", accuracy_score(y_valid, xgb_pipeline.predict(X_valid)))
    print("RandomForest Accuracy:", accuracy_score(y_valid, rf_pipeline.predict(X_valid)))

    # Predict and export submission using XGBoost
    predictions = xgb_pipeline.predict(test)
    submission = pd.DataFrame({"id":test["id"],"rainfall": predictions})
    submission.to_csv("submission.csv", index=False)

if __name__ == "__main__":
   main()

[I 2025-04-03 16:12:32,088] A new study created in memory with name: no-name-f746e3d0-c1ac-4eda-9fe9-d5147200b7b7
[I 2025-04-03 16:12:32,369] Trial 0 finished with value: 0.8552988593556545 and parameters: {'n_estimators': 399, 'max_depth': 7, 'learning_rate': 0.24564329800820853, 'subsample': 0.8247800229896708, 'colsample_bytree': 0.7805849972042482, 'gamma': 2.182597884296194}. Best is trial 0 with value: 0.8552988593556545.
[I 2025-04-03 16:12:33,101] Trial 1 finished with value: 0.8596190827428151 and parameters: {'n_estimators': 683, 'max_depth': 11, 'learning_rate': 0.14239797830155998, 'subsample': 0.6900802888929873, 'colsample_bytree': 0.7041532656102787, 'gamma': 2.2990634527674803}. Best is trial 1 with value: 0.8596190827428151.
[I 2025-04-03 16:12:33,412] Trial 2 finished with value: 0.8677590158320381 and parameters: {'n_estimators': 614, 'max_depth': 10, 'learning_rate': 0.22339850118798815, 'subsample': 0.6642337582623175, 'colsample_bytree': 0.8172905477371281, 'gamma

Best AUC XGBoost: 0.8714206685809119
Best Params XGBoost: {'n_estimators': 841, 'max_depth': 15, 'learning_rate': 0.014210654474443803, 'subsample': 0.5034664777092748, 'colsample_bytree': 0.5113870605676322, 'gamma': 3.6199328896171785}


[I 2025-04-03 16:12:44,871] Trial 0 finished with value: 0.8690234714575485 and parameters: {'n_estimators': 174, 'max_depth': 21, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': None, 'bootstrap': True}. Best is trial 0 with value: 0.8690234714575485.
[I 2025-04-03 16:12:48,753] Trial 1 finished with value: 0.8709201548958141 and parameters: {'n_estimators': 852, 'max_depth': 28, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_features': 'log2', 'bootstrap': True}. Best is trial 1 with value: 0.8709201548958141.
[I 2025-04-03 16:12:52,895] Trial 2 finished with value: 0.8713152972787861 and parameters: {'n_estimators': 959, 'max_depth': 26, 'min_samples_split': 6, 'min_samples_leaf': 9, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 2 with value: 0.8713152972787861.
[I 2025-04-03 16:13:03,840] Trial 3 finished with value: 0.86509839045336 and parameters: {'n_estimators': 837, 'max_depth': 13, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 

Best AUC RandomForest: 0.8735544374489607
Best Params RF: {'n_estimators': 274, 'max_depth': 22, 'min_samples_split': 6, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'bootstrap': True}
XGBoost Accuracy: 0.867579908675799
RandomForest Accuracy: 0.8515981735159818
