In [1]:
import pandas as pd
import numpy as np

from src.utils import load_processed_data
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import optuna

In [2]:
# Load the processed dataset through the project helper imported from src.utils.
# NOTE(review): `use_relative=True` presumably makes the helper resolve its data path
# relative to this notebook — confirm against src.utils.load_processed_data.
kick_clean = load_processed_data(use_relative = True)

In [3]:
def drop_final_irrelevant_columns_and_encode_target(df: pd.DataFrame) -> pd.DataFrame:
    """Strip lookup-only columns and turn ``state`` into a boolean target.

    The ``index``, ``name`` and ``blurb`` columns help a human look a row up but
    are not used as model inputs, so they are removed. The ``state`` column is
    replaced by ``is_successful``, which is True exactly where ``state`` equals
    ``"successful"``.

    Args:
        df: Frame containing at least ``index``, ``name``, ``blurb`` and ``state``.

    Returns:
        A new frame; the input frame is left untouched.

    Raises:
        KeyError: If any of the columns named above is missing.
    """
    lookup_only_columns = ["index", "name", "blurb"]
    return (
        df.drop(columns=lookup_only_columns)
        # Encode target
        .assign(is_successful=lambda frame: frame["state"] == "successful")
        .drop(columns=["state"])
    )

def handle_datetime_features(
    df: pd.DataFrame,
    reference_time: "pd.Timestamp | None" = None,
) -> pd.DataFrame:
    """Replace the raw datetime columns with a single ``age_days`` feature.

    ``age_days`` is the whole number of days between ``reference_time`` and each
    row's ``launched_at``. The ``created_at``, ``deadline`` and ``launched_at``
    columns are then dropped.

    Args:
        df: Frame containing ``created_at``, ``deadline`` and ``launched_at``.
            ``launched_at`` must be a datetime column (the ``.dt`` accessor is
            applied to the difference).
        reference_time: Moment the age is measured against. Defaults to
            ``pd.Timestamp.now()`` when omitted; pass a fixed timestamp to get
            the same feature values on every run.

    Returns:
        A new frame with the datetime columns removed and ``age_days`` added.
        The input frame is not modified.

    Raises:
        KeyError: If any of the three datetime columns is missing.
    """
    datetime_columns = [
        "created_at",
        "deadline",
        "launched_at"
    ]

    if reference_time is None:
        reference_time = pd.Timestamp.now()

    # Computed from the caller's frame but never written back into it, so the
    # frame passed in does not silently gain an `age_days` column.
    age_days = (reference_time - df["launched_at"]).dt.days

    return df.drop(columns=datetime_columns).assign(age_days=age_days)

def machine_ready_preprocessing(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Run the fold-independent preprocessing steps on the cleaned dataset.

    Applies ``handle_datetime_features`` and then
    ``drop_final_irrelevant_columns_and_encode_target``. Encoders and scalers
    are not fitted here; they are fitted per fold inside ``preprocess_data``.

    Args:
        df: Cleaned dataset as accepted by the two helpers above.

    Returns:
        The transformed DataFrame (a single frame, not a tuple).
    """
    df = handle_datetime_features(df)
    df = drop_final_irrelevant_columns_and_encode_target(df)
    return df

# Model-ready frame (datetime columns replaced, lookup columns dropped, target encoded);
# every CV / optimisation cell below reads this global.
kick_transformed = machine_ready_preprocessing(kick_clean)

In [None]:
# ====================================
#               CONFIG
# ====================================
# Number of folds handed to StratifiedKFold(n_splits=...) in the CV helpers.
K_splits = 5
# Seed handed to StratifiedKFold(random_state=...) so fold assignment is repeatable.
random_state = 1337

def preprocess_data(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    categorical_cols,
    numerical_cols,
    onehot_max_cardinality: int = 100,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Encode categoricals and scale numerics, fitting on the training split only.

    Every encoder / scaler is fitted on ``X_train`` and then applied to both
    splits, so no statistic of ``X_test`` influences the transformation.

    Args:
        X_train: Training features.
        X_test: Held-out features with the same columns as ``X_train``.
        categorical_cols: Names of the categorical columns to encode.
        numerical_cols: Names of the numerical columns to standardise. May be empty.
        onehot_max_cardinality: Columns with fewer than this many distinct
            training values are one-hot encoded; all others are frequency
            encoded. Defaults to 100.

    Returns:
        ``(X_train, X_test)`` as new, transformed frames. The frames passed in
        are never modified.
    """
    # Work on private copies: the frequency-encoding and scaling steps assign
    # columns, which would otherwise write into (slices of) the caller's data.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # ====================================
    #        CATEGORICAL ENCODING
    # ====================================
    for cat_col in categorical_cols:
        if X_train[cat_col].nunique() < onehot_max_cardinality:  # OneHotEncoding for low cardinality
            oh_encoder = OneHotEncoder(
                handle_unknown='ignore',  # categories unseen in training become all-zero rows
                sparse_output=False
            )
            oh_encoder.fit(X_train[[cat_col]])
            oh_columns = [f"{cat_col}__{cat}" for cat in oh_encoder.categories_[0]]

            # Transform and join back to dataframe
            X_train_oh = pd.DataFrame(
                oh_encoder.transform(X_train[[cat_col]]),
                columns=oh_columns,
                index=X_train.index
            )
            X_test_oh = pd.DataFrame(
                oh_encoder.transform(X_test[[cat_col]]),
                columns=oh_columns,
                index=X_test.index
            )

            X_train = X_train.drop(columns=[cat_col]).join(X_train_oh)
            X_test = X_test.drop(columns=[cat_col]).join(X_test_oh)
        else:  # FrequencyEncoding for high cardinality
            freq_encoding = X_train[cat_col].value_counts(normalize=True)
            # Values never seen in training get a "rare" frequency of one
            # occurrence across both splits.
            rare_frequency = 1 / (len(X_train) + len(X_test))
            X_train[cat_col] = X_train[cat_col].map(freq_encoding).astype("float64")
            X_test[cat_col] = (
                X_test[cat_col].map(freq_encoding).fillna(rare_frequency).astype("float64")
            )

    # ====================================
    #         NUMERICAL SCALING
    # ====================================
    numerical_cols = list(numerical_cols)
    if numerical_cols:  # StandardScaler cannot be fitted on zero columns
        num_scaler = StandardScaler()
        num_scaler.fit(X_train[numerical_cols])
        X_train[numerical_cols] = num_scaler.transform(X_train[numerical_cols])
        X_test[numerical_cols] = num_scaler.transform(X_test[numerical_cols])

    return X_train, X_test

def perform_CV(
        data: pd.DataFrame, 
        model_class, 
        model_params, 
        verbose = False
    ) -> list[dict[str, float]]:
    """Run stratified K-fold cross-validation for one model configuration.

    For every fold the features are preprocessed with ``preprocess_data``
    (fitted on the training part only), a fresh ``model_class(**model_params)``
    is trained, and accuracy / precision / recall / F1 are computed on the
    held-out part. Folds come from ``StratifiedKFold`` configured with the
    notebook-level ``K_splits`` and ``random_state``.

    Args:
        data: Frame holding the features plus the ``is_successful`` target.
        model_class: Estimator class exposing ``fit`` and ``predict``.
        model_params: Keyword arguments used to instantiate ``model_class``.
        verbose: When True, print the fold number and its metrics.

    Returns:
        One dict per fold with the keys ``accuracy``, ``precision``,
        ``recall`` and ``f1``.
    """
    # `drop` returns a new frame, so the caller's data is never modified.
    features = data.drop(columns=["is_successful"])
    target = data["is_successful"]

    # Detect column kinds on the feature frame only, so the target can never
    # end up in either list regardless of its dtype.
    categorical_cols = features.select_dtypes(include=["object"]).columns
    numerical_cols = features.select_dtypes(include=["number"]).columns

    # Ensure numerical columns are float64
    if len(numerical_cols):
        features[numerical_cols] = features[numerical_cols].astype("float64")

    # ====================================
    #        CROSS-VALIDATION LOOP
    # ====================================
    fold_scores = []
    sk_fold = StratifiedKFold(n_splits=K_splits, shuffle=True, random_state=random_state)
    for i, (train_idx, test_idx) in enumerate(sk_fold.split(features, target)):
        if verbose:
            print(f"Fold {i+1}")

        # Explicit copies: the preprocessing step assigns columns, and doing
        # that on an `iloc` slice triggers pandas chained-assignment warnings.
        X_train, X_test = features.iloc[train_idx].copy(), features.iloc[test_idx].copy()
        y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

        # APPLY PREPROCESSING
        X_train, X_test = preprocess_data(X_train, X_test, categorical_cols, numerical_cols)

        # ====================================
        #         TRAIN AND EVALUATE
        # ====================================
        model = model_class(**model_params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        fold_result = {
            "accuracy": accuracy_score(y_test, y_pred),
            # zero_division=0: score 0 instead of warning when a fold has no
            # positive predictions / positives.
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1": f1_score(y_test, y_pred, zero_division=0),
        }
        fold_scores.append(fold_result)

        if verbose:
            print(
                f"\taccuracy={fold_result['accuracy']:.4f}, \n"
                f"\tprecision={fold_result['precision']:.4f}, \n"
                f"\trecall={fold_result['recall']:.4f}, \n"
                f"\tf1={fold_result['f1']:.4f}"
            )
    return fold_scores


Optimization

In [None]:
def objective_rf(trial: optuna.Trial) -> float:
    """Optuna objective: mean cross-validated F1 of a random forest.

    Samples ``n_estimators``, ``max_depth`` and ``min_samples_split`` from the
    trial, evaluates the configuration with ``perform_CV`` on
    ``kick_transformed`` and returns the mean F1 over the folds.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.

    Returns:
        Mean F1 across the CV folds (the study maximises this).
    """
    model_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 100),
        "max_depth": trial.suggest_int("max_depth", 2, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        # "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        # Fixed, not tuned: seed the forest's bootstrap / feature sampling so
        # the score depends on the hyper-parameters only, not on random noise.
        "random_state": random_state,
    }

    fold_scores = perform_CV(
        data=kick_transformed,
        model_class=RandomForestClassifier,
        model_params=model_params,
        verbose=False,
    )

    mean_f1 = float(np.mean([fold["f1"] for fold in fold_scores]))
    return mean_f1
optuna.logging.set_verbosity(optuna.logging.INFO)
# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every Restart & Run All.
study_rf = optuna.create_study(
    direction="maximize",
    study_name="rf_f1",
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
study_rf.optimize(objective_rf, n_trials=50)

print("RF best F1:", study_rf.best_value)
print("RF best params:", study_rf.best_params)

[I 2025-11-15 13:06:49,960] A new study created in memory with name: rf_f1


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-15 13:07:37,543] Trial 0 finished with value: 0.8561253620589275 and parameters: {'n_estimators': 73, 'max_depth': 16, 'min_samples_split': 19}. Best is trial 0 with value: 0.8561253620589275.
[I 2025-11-15 13:08:39,559] Trial 1 finished with value: 0.8561951365244977 and parameters: {'n_estimators': 96, 'max_depth': 16, 'min_samples_split': 5}. Best is trial 1 with value: 0.8561951365244977.
[I 2025-11-15 13:08:49,278] Trial 2 finished with value: 0.7929651720133478 and parameters: {'n_estimators': 42, 'max_depth': 3, 'min_samples_split': 13}. Best is trial 1 with value: 0.8561951365244977.
[I 2025-11-15 13:09:58,568] Trial 3 finished with value: 0.8639876440169967 and parameters: {'n_estimators': 73, 'max_depth': 30, 'min_samples_split': 5}. Best is trial 3 with value: 0.8639876440169967.
[I 2025-11-15 13:10:19,882] Trial 4 finished with value: 0.8537609609619728 and parameters: {'n_estimators': 29, 'max_depth': 16, 'min_samples_split': 14}. Best is trial 3 with value: 0.8

In [8]:
def objective_xgb(trial: optuna.Trial) -> float:
    """Optuna objective: mean cross-validated F1 of an XGBoost classifier.

    Draws ``n_estimators``, ``max_depth``, ``learning_rate`` (log scale) and
    ``gamma`` from the trial, scores the configuration with ``perform_CV`` on
    ``kick_transformed`` and returns the fold-averaged F1.
    """
    # Draw order is kept as n_estimators -> max_depth -> learning_rate -> gamma.
    n_estimators = trial.suggest_int("n_estimators", 50, 400)
    max_depth = trial.suggest_int("max_depth", 2, 12)
    learning_rate = trial.suggest_float("learning_rate", 1e-3, 0.3, log=True)
    gamma = trial.suggest_float("gamma", 0.0, 5.0)

    cv_results = perform_CV(
        data=kick_transformed,
        model_class=XGBClassifier,
        model_params=dict(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            gamma=gamma,
            eval_metric="logloss",
        ),
        verbose=False,
    )

    f1_per_fold = [result["f1"] for result in cv_results]
    return float(np.mean(f1_per_fold))

optuna.logging.set_verbosity(optuna.logging.INFO)
# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every Restart & Run All.
study_xgb = optuna.create_study(
    direction="maximize",
    study_name="xgb_f1",
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
study_xgb.optimize(objective_xgb, n_trials=50)

print("XGB best F1:", study_xgb.best_value)
print("XGB best params:", study_xgb.best_params)

# Persist every trial of the global XGB study; a later cell reads this file
# back to recover the best parameters.
df_xgb_cat = study_xgb.trials_dataframe()
df_xgb_cat.to_csv("../results/optuna_xgb_trials.csv", index=False)

[I 2025-11-16 10:16:31,187] A new study created in memory with name: xgb_f1
[I 2025-11-16 10:16:39,105] Trial 0 finished with value: 0.8508975429288291 and parameters: {'n_estimators': 316, 'max_depth': 7, 'learning_rate': 0.009961344023082274, 'gamma': 0.6409944022450786}. Best is trial 0 with value: 0.8508975429288291.
[I 2025-11-16 10:16:48,158] Trial 1 finished with value: 0.8703135545847209 and parameters: {'n_estimators': 361, 'max_depth': 12, 'learning_rate': 0.06058812302453238, 'gamma': 1.8448862663032084}. Best is trial 1 with value: 0.8703135545847209.
[I 2025-11-16 10:16:57,427] Trial 2 finished with value: 0.8697249577107012 and parameters: {'n_estimators': 261, 'max_depth': 12, 'learning_rate': 0.04202422869214421, 'gamma': 2.478330462923914}. Best is trial 1 with value: 0.8703135545847209.
[I 2025-11-16 10:17:06,125] Trial 3 finished with value: 0.8388865250622054 and parameters: {'n_estimators': 277, 'max_depth': 8, 'learning_rate': 0.0033877565384754075, 'gamma': 0.334

XGB best F1: 0.871592369171208
XGB best params: {'n_estimators': 370, 'max_depth': 9, 'learning_rate': 0.0706357630618904, 'gamma': 0.11828426674429565}


In [13]:
def objective_lr(trial: optuna.Trial) -> float:
    """Optuna objective: mean cross-validated F1 of a logistic regression.

    Draws the penalty type (``l1`` / ``l2``) and the inverse regularisation
    strength ``C`` (log scale) from the trial, scores the configuration with
    ``perform_CV`` on ``kick_transformed`` and returns the fold-averaged F1.
    """
    # Draw order is kept as penalty -> C.
    chosen_penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    inverse_reg_strength = trial.suggest_float("C", 1e-3, 100.0, log=True)

    lr_kwargs = dict(
        C=inverse_reg_strength,
        penalty=chosen_penalty,
        solver="liblinear",  # supports both l1 and l2
        max_iter=1000,
    )

    cv_results = perform_CV(
        data=kick_transformed,
        model_class=LogisticRegression,
        model_params=lr_kwargs,
        verbose=False,
    )

    f1_per_fold = [result["f1"] for result in cv_results]
    return float(np.mean(f1_per_fold))

optuna.logging.set_verbosity(optuna.logging.INFO)
# Seed the sampler so the sequence of suggested hyper-parameters is the same
# on every Restart & Run All.
study_lr = optuna.create_study(
    direction="maximize",
    study_name="lr_f1",
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
study_lr.optimize(objective_lr, n_trials=50, show_progress_bar=True)

print("LR best F1:", study_lr.best_value)
print("LR best params:", study_lr.best_params)

[I 2025-11-15 15:55:15,462] A new study created in memory with name: lr_f1


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-15 15:55:49,565] Trial 0 finished with value: 0.8039321655012264 and parameters: {'penalty': 'l2', 'C': 3.203430277457991}. Best is trial 0 with value: 0.8039321655012264.
[I 2025-11-15 15:56:38,493] Trial 1 finished with value: 0.804152703664686 and parameters: {'penalty': 'l1', 'C': 0.14138073711070656}. Best is trial 1 with value: 0.804152703664686.
[I 2025-11-15 15:57:23,174] Trial 2 finished with value: 0.8040577586809678 and parameters: {'penalty': 'l1', 'C': 7.65503165689389}. Best is trial 1 with value: 0.804152703664686.
[I 2025-11-15 15:57:30,494] Trial 3 finished with value: 0.7954142533791833 and parameters: {'penalty': 'l2', 'C': 0.0019168861539635826}. Best is trial 1 with value: 0.804152703664686.
[I 2025-11-15 15:57:59,011] Trial 4 finished with value: 0.8031096344239298 and parameters: {'penalty': 'l2', 'C': 1.7295219197797516}. Best is trial 1 with value: 0.804152703664686.
[I 2025-11-15 15:58:13,234] Trial 5 finished with value: 0.7978365493619296 and para

Optimization (XGBoost), one study per parent category

In [29]:
optuna.logging.set_verbosity(optuna.logging.WARN)

# One Optuna study per parent category: each study tunes an XGBoost model on
# the rows of that category only and stores its full trial history as CSV.
best_xgb_cat_results = []
for cat_parent_name in kick_transformed['cat_parent_name'].unique():
    print(f"Optimizing for category: {cat_parent_name}")

    # Identical for all trials of this category, so slice once instead of
    # re-filtering the full frame inside every trial.
    kick_cat = kick_transformed[kick_transformed['cat_parent_name'] == cat_parent_name]

    def objective_cat(trial: optuna.Trial, data: pd.DataFrame = kick_cat) -> float:
        """Mean cross-validated F1 of XGBoost on the current category's rows.

        ``data`` is bound as a default argument so the objective keeps the
        subset of the iteration it was defined in.
        """
        model_params = {
            "n_estimators": trial.suggest_int("n_estimators", 100, 400),
            "max_depth": trial.suggest_int("max_depth", 2, 12),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "gamma": trial.suggest_float("gamma", 0.0, 5.0),
            "eval_metric": "logloss",
        }

        fold_scores = perform_CV(
            data=data,
            model_class=XGBClassifier,
            model_params=model_params,
            verbose=False,
        )

        mean_f1 = float(np.mean([fold["f1"] for fold in fold_scores]))
        return mean_f1

    # Seeded sampler: the search for each category is repeatable across runs.
    study_cat = optuna.create_study(
        direction="maximize",
        study_name=f"xgb_f1_{cat_parent_name}",
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study_cat.optimize(objective_cat, n_trials=100)

    print(f"XGB best F1 for {cat_parent_name}:", study_cat.best_value)
    print(f"XGB best params for {cat_parent_name}:", study_cat.best_params)
    best_xgb_cat_results.append({
        "cat_parent_name": cat_parent_name,
        "best_f1": study_cat.best_value,
        "best_params": study_cat.best_params
    })
    
    # XGBoost study
    # Written to the `xgb_categorical/` sub-folder, which is where
    # perform_CV_xgb_ensemble reads the per-category trials from.
    df_xgb_cat = study_cat.trials_dataframe()
    df_xgb_cat.to_csv(f"../results/model_results/xgb_categorical/optuna_{cat_parent_name}_xgb_trials.csv", index=False)

Optimizing for category: Theater
XGB best F1 for Theater: 0.8092362504895695
XGB best params for Theater: {'n_estimators': 293, 'max_depth': 6, 'learning_rate': 0.0034021548922418725, 'gamma': 2.0946971222798467}
Optimizing for category: Technology
XGB best F1 for Technology: 0.8730187080472896
XGB best params for Technology: {'n_estimators': 348, 'max_depth': 9, 'learning_rate': 0.01877675526370727, 'gamma': 2.671432995667378}
Optimizing for category: Dance
XGB best F1 for Dance: 0.8038695078740385
XGB best params for Dance: {'n_estimators': 114, 'max_depth': 3, 'learning_rate': 0.13188114596703313, 'gamma': 4.1070555608529915}
Optimizing for category: Film & Video
XGB best F1 for Film & Video: 0.8979781407635491
XGB best params for Film & Video: {'n_estimators': 355, 'max_depth': 3, 'learning_rate': 0.23874158155646538, 'gamma': 0.9203490652325381}
Optimizing for category: Music
XGB best F1 for Music: 0.8963609845152269
XGB best params for Music: {'n_estimators': 356, 'max_depth': 5,

In [17]:
# Reload the trial history of the global XGB study from the CSV written by the
# `study_xgb` cell, so the best parameters are available without re-running it.
xgb_results_df = pd.read_csv("../results/optuna_xgb_trials.csv")

# Extract parameters of the best trial (row with the highest `value`, i.e. mean F1).
# Values come back from CSV as floats/strings, hence the explicit int()/float() casts.
best_trial = xgb_results_df.loc[xgb_results_df['value'].idxmax()]
xgb_best_params = {
    'n_estimators': int(best_trial['params_n_estimators']),
    'max_depth': int(best_trial['params_max_depth']),
    'learning_rate': float(best_trial['params_learning_rate']),
    'gamma': float(best_trial['params_gamma']),
    'eval_metric': 'logloss',
}
# NOTE(review): `xgb_model` is not referenced anywhere else in the visible notebook;
# perform_CV builds its own estimator from `xgb_best_params`.
xgb_model = XGBClassifier(**xgb_best_params)

# Use perform_CV on categorical subsets using best params:
# the globally tuned parameters are cross-validated on each parent category
# separately and the mean F1 per category is printed.
for cat_parent_name in kick_transformed['cat_parent_name'].unique():
    print(f"Evaluating best XGB model for category: {cat_parent_name}")
    fold_scores = perform_CV(
        data=kick_transformed[kick_transformed['cat_parent_name'] == cat_parent_name],
        model_class=XGBClassifier,
        model_params=xgb_best_params,
        verbose=False,
    )
    mean_f1 = float(np.mean([fold["f1"] for fold in fold_scores]))
    print(f"Mean F1 for {cat_parent_name} with best XGB params: {mean_f1:.4f}")

Evaluating best XGB model for category: Theater
Mean F1 for Theater with best XGB params: 0.7886
Evaluating best XGB model for category: Technology
Mean F1 for Technology with best XGB params: 0.8672
Evaluating best XGB model for category: Dance
Mean F1 for Dance with best XGB params: 0.7775
Evaluating best XGB model for category: Film & Video
Mean F1 for Film & Video with best XGB params: 0.8935
Evaluating best XGB model for category: Music
Mean F1 for Music with best XGB params: 0.8928
Evaluating best XGB model for category: Comics
Mean F1 for Comics with best XGB params: 0.9228
Evaluating best XGB model for category: Publishing
Mean F1 for Publishing with best XGB params: 0.8610
Evaluating best XGB model for category: Food
Mean F1 for Food with best XGB params: 0.7682
Evaluating best XGB model for category: None
Mean F1 for None with best XGB params: 0.9828
Evaluating best XGB model for category: Journalism
Mean F1 for Journalism with best XGB params: 0.6016
Evaluating best XGB mode

In [30]:
def perform_CV_xgb_ensemble(
        data: pd.DataFrame,
        verbose = False
    ) -> list[dict[str, float]]:
    """Cross-validate an ensemble of per-category XGBoost models.

    Inside every stratified fold one XGBoost model is trained per value of
    ``cat_parent_name``, using that category's best Optuna trial read from
    ``../results/model_results/xgb_categorical/optuna_<category>_xgb_trials.csv``.
    Each model predicts only the test rows of its own category; the combined
    predictions are scored against the whole test fold.

    Args:
        data: Frame holding the features (including ``cat_parent_name``) and
            the ``is_successful`` target.
        verbose: When True, print the fold number and its metrics.

    Returns:
        One dict per fold with the keys ``accuracy``, ``precision``,
        ``recall`` and ``f1``.

    Raises:
        FileNotFoundError: If a category's trials CSV is missing.
        ValueError: If some test rows received no prediction (for example
            rows whose ``cat_parent_name`` is missing).
    """
    # `drop` returns a new frame, so the caller's data is never modified.
    features = data.drop(columns=["is_successful"])
    target = data["is_successful"]

    # Detect column kinds on the feature frame only, so the target can never
    # end up in either list regardless of its dtype.
    categorical_cols = features.select_dtypes(include=["object"]).columns
    numerical_cols = features.select_dtypes(include=["number"]).columns

    if len(numerical_cols):
        features[numerical_cols] = features[numerical_cols].astype("float64")

    category_names = data['cat_parent_name'].unique()

    # The tuned parameters do not depend on the fold: read every category's
    # trials file once instead of once per fold.
    best_params_by_category = {}
    for cat_parent_name in category_names:
        best_params_df = pd.read_csv(f"../results/model_results/xgb_categorical/optuna_{cat_parent_name}_xgb_trials.csv")
        best_trial = best_params_df.loc[best_params_df['value'].idxmax()]
        best_params_by_category[cat_parent_name] = {
            'n_estimators': int(best_trial['params_n_estimators']),
            'max_depth': int(best_trial['params_max_depth']),
            'learning_rate': float(best_trial['params_learning_rate']),
            'gamma': float(best_trial['params_gamma']),
            'eval_metric': 'logloss',
        }

    # ====================================
    #        CROSS-VALIDATION LOOP
    # ====================================
    fold_scores = []
    skf = StratifiedKFold(n_splits=K_splits, shuffle=True, random_state=random_state)
    for i, (train_idx, test_idx) in enumerate(skf.split(features, target)):
        if verbose:
            print(f"\nFold {i+1}")

        X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
        y_train, y_test = target.iloc[train_idx], target.iloc[test_idx]

        # container for predictions on the full test fold; NaN marks rows that
        # no category model has filled yet
        y_pred_full = pd.Series(np.nan, index=X_test.index, dtype="float64")

        # ====================================
        #  TRAIN AND EVALUATE (ONE PER CATEGORY PARENT)
        # ====================================
        for cat_parent_name in category_names:
            # get data for this category
            train_mask = (X_train['cat_parent_name'] == cat_parent_name)
            test_mask = (X_test['cat_parent_name'] == cat_parent_name)

            # Folds are stratified on the target only, so a small category may
            # have no rows in this test fold: nothing to predict, skip it.
            if not test_mask.any():
                continue

            X_train_cat = X_train[train_mask]
            y_train_cat = y_train[train_mask]
            X_test_cat = X_test[test_mask]

            # preprocess per category (fit on cat-train, apply to cat-test)
            X_train_proc, X_test_proc = preprocess_data(
                X_train_cat,
                X_test_cat,
                categorical_cols=categorical_cols,
                numerical_cols=numerical_cols,
            )

            model = XGBClassifier(**best_params_by_category[cat_parent_name])
            model.fit(X_train_proc, y_train_cat)

            y_pred_cat = model.predict(X_test_proc)

            # write predictions back into the global y_pred_full
            y_pred_full.loc[test_mask] = y_pred_cat

        # Fail with an explicit message rather than an opaque NaN error from
        # the metric functions.
        if y_pred_full.isna().any():
            raise ValueError(
                f"{int(y_pred_full.isna().sum())} test rows in fold {i+1} received no "
                "prediction; check for missing 'cat_parent_name' values."
            )

        # Now evaluate ensemble on this fold
        fold_result = {
            "accuracy": accuracy_score(y_test, y_pred_full),
            "precision": precision_score(y_test, y_pred_full, zero_division=0),
            "recall": recall_score(y_test, y_pred_full, zero_division=0),
            "f1": f1_score(y_test, y_pred_full, zero_division=0),
        }
        fold_scores.append(fold_result)

        if verbose:
            print(
                f"\taccuracy={fold_result['accuracy']:.4f}, \n"
                f"\tprecision={fold_result['precision']:.4f}, \n"
                f"\trecall={fold_result['recall']:.4f}, \n"
                f"\tf1={fold_result['f1']:.4f}"
            )

    return fold_scores

# Cross-validate the per-category XGBoost ensemble on the full dataset.
fold_scores = perform_CV_xgb_ensemble(kick_transformed, verbose=True)
# Average the per-fold F1 scores into a single number for the ensemble.
mean_f1 = float(np.mean([fold["f1"] for fold in fold_scores]))
print(f"Average F1 across all data with ensemble XGB: {mean_f1:.4f}")


Fold 1
	accuracy=0.8408, 
	precision=0.8736, 
	recall=0.8692, 
	f1=0.8714

Fold 2
	accuracy=0.8417, 
	precision=0.8721, 
	recall=0.8730, 
	f1=0.8725

Fold 3
	accuracy=0.8395, 
	precision=0.8691, 
	recall=0.8728, 
	f1=0.8710

Fold 4
	accuracy=0.8372, 
	precision=0.8669, 
	recall=0.8716, 
	f1=0.8692

Fold 5
	accuracy=0.8379, 
	precision=0.8700, 
	recall=0.8685, 
	f1=0.8693
Average F1 across all data with ensemble XGB: 0.8707


In [31]:
# Baseline for the ensemble: a single XGBoost model with the globally tuned
# parameters (`xgb_best_params`, defined in the cell that reloads the trials CSV),
# cross-validated on the full dataset.
fold_scores = perform_CV(kick_transformed, XGBClassifier, xgb_best_params, verbose=True)
# Average the per-fold F1 scores into a single number for the global model.

mean_f1 = float(np.mean([fold["f1"] for fold in fold_scores]))
print(f"Average F1 across all data with best XGB params: {mean_f1:.4f}")

Fold 1
	accuracy=0.8403, 
	precision=0.8718, 
	recall=0.8708, 
	f1=0.8713
Fold 2
	accuracy=0.8417, 
	precision=0.8679, 
	recall=0.8787, 
	f1=0.8733
Fold 3
	accuracy=0.8392, 
	precision=0.8676, 
	recall=0.8744, 
	f1=0.8710
Fold 4
	accuracy=0.8373, 
	precision=0.8661, 
	recall=0.8728, 
	f1=0.8694
Fold 5
	accuracy=0.8380, 
	precision=0.8681, 
	recall=0.8715, 
	f1=0.8698
Average F1 across all data with best XGB params: 0.8710
