In [1]:
import os
import numpy as np
import pickle
import matplotlib.pyplot as plt
from helpers import *

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from utils import (
    split_data,
    get_cross_val_scores,
    prep_hyperparam_search,
    now_str,
    accuracy,
    f1,
)

## Get data

In [3]:
# Load the cleaned dataset bundle and unpack the entries used below.
# `x`/`y`/`ids` and `x_final`/`ids_final` are read by key from the returned mapping.
all_data = load_clean_data()
x, x_final, y, ids, ids_final = (
    all_data[key] for key in ("x", "x_final", "y", "ids", "ids_final")
)

## Run methods

In [15]:
import os
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from utils import (
    split_data,
    get_cross_val_scores,
    prep_hyperparam_search,
    now_str,
    accuracy,
    f1,
)

#### Set configuration

In [16]:
# Experiment configuration.
#   seed       - passed to seed_all() and used as each model's random_state
#   verbose    - toggles the per-model verbosity flags in the search grids
#   scoring_fn - metric used for model selection and reporting
#   train.cv / train.holdout - validation strategy; cv is preferred when it is not None
cfg = dict(
    seed=0,
    verbose=False,
    scoring_fn=f1,
    train=dict(
        # To enable cross-validation, replace None with e.g.
        #   {"k_folds": 5, "shuffle": False}
        cv=None,
        holdout=dict(
            split_frac=0.2,
            seed=0,
        ),
    ),
)

# The CV settings are splatted into get_cross_val_scores(), so the metric has to
# travel with them.
if cfg["train"].get("cv") is not None:
    cfg["train"]["cv"]["scoring_fn"] = cfg["scoring_fn"]

In [17]:
# Model zoo: one entry per model family, keyed by display name. Each entry holds the
# estimator class ("model_cls") and a grid of candidate values per hyperparameter
# ("hyperparam_search"). Single-element lists pin a value; the commented "wider grid"
# notes record alternatives that were tried/considered.
runs = {
    model_name: {"model_cls": model_cls, "hyperparam_search": grid}
    for model_name, model_cls, grid in [
        (
            "Logistic Regression",
            LogisticRegression,
            {
                "C": [0.5, 5],
                "penalty": ["l2"],
                "max_iter": [500],
                "class_weight": [None, "balanced"],
                "verbose": [int(bool(cfg["verbose"]))],
                "random_state": [cfg["seed"]],
            },
        ),
        (
            "Decision Tree",
            DecisionTreeClassifier,
            {
                "max_depth": [5, 10, 30],
                "class_weight": [None, "balanced"],
                "random_state": [cfg["seed"]],
            },
        ),
        (
            "Random Forest",
            RandomForestClassifier,
            {
                "n_estimators": [80],  # wider grid: [20, 80]
                "max_depth": [3],  # wider grid: [3, 10]
                "class_weight": [None, "balanced"],
                "n_jobs": [-1],
                "verbose": [int(bool(cfg["verbose"]))],
                "random_state": [cfg["seed"]],
            },
        ),
        (
            "Gradient Boosting",
            GradientBoostingClassifier,
            {
                "n_estimators": [40],  # wider grid: [40, 100]
                "max_depth": [3],
                "verbose": [int(bool(cfg["verbose"]))],
                "random_state": [cfg["seed"]],
            },
        ),
        (
            "XGBoost",
            XGBClassifier,
            {
                "n_estimators": [60],  # wider grid: [20, 60, 120]
                "max_depth": [3],  # wider grid: [3, 10]
                "n_jobs": [-1],
                "verbosity": [int(bool(cfg["verbose"]))],
                "random_state": [cfg["seed"]],
            },
        ),
        (
            "MLP",
            MLPClassifier,
            {
                "hidden_layer_sizes": [(64,)],  # wider grid: [(64,), (32, 32), (16, 32, 16)]
                "alpha": [5e-5],  # wider grid: [5e-5, 1e-3]
                "early_stopping": [True],
                "max_iter": [300],
                "verbose": [int(bool(cfg["verbose"]))],
                "random_state": [cfg["seed"]],
            },
        ),
    ]
}

#### Run methods

In [18]:
# Model-selection driver.
#
# For every entry in `runs`:
#   1. expand its hyperparameter grid with prep_hyperparam_search();
#   2. pick the best combination - by mean cross-validation score if
#      cfg["train"]["cv"] is set, else by score on a holdout split if
#      cfg["train"]["holdout"] is set, else simply the first combination;
#   3. refit the chosen configuration on all of (x, y);
#   4. pickle the run dict into a timestamped directory;
#   5. keep track of the model with the highest validation score.
#
# A run without a validation score (val_score is None) is reported as "n/a" and is
# never a candidate for "best": formatting None with ":.4f" or comparing it with ">"
# raises TypeError, which previously made the no-validation fallback unusable.
dir_name = os.path.join(CLEAN_DATA_PATH, "runs", now_str())
os.makedirs(dir_name, exist_ok=True)
score_name = cfg["scoring_fn"].__name__
# Start from None rather than 0 so a model scoring exactly 0 can still be selected.
best = {"model_name": None, "val_score": None}
for name, run_dict in runs.items():
    print("---" * 50)
    print(f"{name}")

    ### hyperparam search
    hyperparam_search = prep_hyperparam_search(run_dict["hyperparam_search"])
    print(f"  Searching hyperparameters among {len(hyperparam_search)} options...")
    seed_all(cfg["seed"])
    if cfg["train"].get("cv", None) is not None:
        ### cross-validation
        print(f"  Cross-validating with {cfg['train']['cv']['k_folds']}-fold cross-validation...")
        cv_scores = []
        for hp_comb in hyperparam_search:
            # Reseed so every candidate is evaluated under the same RNG state.
            seed_all(cfg["seed"])
            model = run_dict["model_cls"](**hp_comb)
            hp_scores = get_cross_val_scores(model, x, y, **cfg["train"]["cv"])
            cv_scores.append(np.mean(hp_scores))
        best_hp_comb_idx = np.argmax(cv_scores)
        run_dict["hyperparams"] = hyperparam_search[best_hp_comb_idx]
        run_dict["val_score"] = cv_scores[best_hp_comb_idx]
    elif cfg["train"].get("holdout", None) is not None:
        ### validation holdout
        print(f"  Holdout validation with {cfg['train']['holdout']['split_frac']} split...")
        x_train, x_val, y_train, y_val = split_data(x, y, **cfg["train"]["holdout"])
        val_scores = []
        for hp_comb in hyperparam_search:
            model = run_dict["model_cls"](**hp_comb)
            model.fit(x_train, y_train)
            y_val_pred = model.predict(x_val)
            val_score = cfg["scoring_fn"](y_val, y_val_pred)
            val_scores.append(val_score)
        best_hp_comb_idx = np.argmax(val_scores)
        run_dict["hyperparams"] = hyperparam_search[best_hp_comb_idx]
        run_dict["val_score"] = val_scores[best_hp_comb_idx]
    else:
        ### just train on all data
        print("  No validation method specified. Training on all data with the first hyperparameter combination.")
        run_dict["hyperparams"] = hyperparam_search[0]
        run_dict["val_score"] = None
    hp_summary = " ".join(f"{k}={v}" for k, v in run_dict["hyperparams"].items())
    print(f"  Hyperparameters selected: {hp_summary}")

    ### train on all data
    print("  Training on all data with best hyperparameters...")
    model = run_dict["model_cls"](**run_dict["hyperparams"])
    model.fit(x, y)
    run_dict["model"] = model
    # Note: this is the score on the data the model was fitted on.
    run_dict["train_score"] = cfg["scoring_fn"](y, model.predict(x))
    val_text = "n/a" if run_dict["val_score"] is None else f"{run_dict['val_score']:.4f}"
    print(
        "  Results:"
        f"\n    Training {score_name}: {run_dict['train_score']:.4f}"
        f"\n    Validation {score_name}: {val_text}"
        f"\n    Hyperparameters: {hp_summary}"
    )

    ### save run
    file_name = f"{name.replace(' ', '_')}.pkl"
    with open(os.path.join(dir_name, file_name), "wb") as f:
        pickle.dump(run_dict, f)

    ### save best
    # Only runs that were actually validated can compete for "best".
    if run_dict["val_score"] is not None and (
        best["val_score"] is None or run_dict["val_score"] > best["val_score"]
    ):
        best["model_name"] = name
        best["val_score"] = run_dict["val_score"]

print("---" * 50)
print(f"Best model: {best['model_name']}")
best_val_text = "n/a" if best["val_score"] is None else f"{best['val_score']:.4f}"
print(f"Validation {score_name}: {best_val_text}")

------------------------------------------------------------------------------------------------------------------------------------------------------
Logistic Regression
  Searching hyperparameters among 4 options...
  Holdout validation with 0.2 split...
  Hyperparameters selected: C=5 penalty=l2 max_iter=500 class_weight=balanced verbose=0 random_state=0
  Training on all data with best hyperparameters...
  Results:
    Training f1: 0.3428
    Validation f1: 0.3420
    Hyperparameters: C=5 penalty=l2 max_iter=500 class_weight=balanced verbose=0 random_state=0
------------------------------------------------------------------------------------------------------------------------------------------------------
Decision Tree
  Searching hyperparameters among 6 options...
  Holdout validation with 0.2 split...
  Hyperparameters selected: max_depth=10 class_weight=balanced random_state=0
  Training on all data with best hyperparameters...
  Results:
    Training f1: 0.3591
    Validation 