In [None]:
import numpy as np 
import pandas as pd 
import pickle

import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import brier_score_loss

from lib import util
from lib import experiment_config as ec

from tqdm import tqdm

In [None]:
# Reset to matplotlib's stock style, then switch the PDF and PostScript
# backends to font type 42 (TrueType embedding per the matplotlib rcParams
# documentation) for every figure saved from this notebook.
plt.style.use("default")
matplotlib.rcParams.update({"pdf.fonttype": 42, "ps.fonttype": 42})

In [None]:
# Seed NumPy's global RNG once, up front. The forests trained below are
# constructed without an explicit random_state, so (per scikit-learn's
# documented behaviour for random_state=None) they draw from this global
# generator; a fresh Restart & Run All therefore reproduces the same models.
np.random.seed(seed=4284222)

In [None]:
# Maps each dataset identifier (the key used in result file paths and in
# ec.target_names) to the human-readable name stored in the "dataset" column
# of the result records, which is what the figures use as panel titles.
dataset_names = dict([
    # regression datasets
    ("california-housing", "California Housing"),
    ("abalone", "Abalone"),
    ("insurance", "Insurance"),
    ("ACS2018", "ACS 2018"),
    # classification datasets
    ("german-credit", "German Credit"),
    ("adult", "Adult"),
    ("breast-cancer", "Breast Cancer"),
])

def record(repeat_ind, model_repeat_ind, dataset, n_estimators, mse_test, mse_est):
    """Assemble one result row for the results DataFrame.

    Parameters
    ----------
    repeat_ind : int
        Index of the train/test split repeat the measurement came from.
    model_repeat_ind : int
        Index of the repeated model training on that split.
    dataset : str
        Dataset identifier; must be a key of ``dataset_names``. The row
        stores the corresponding display name, not the identifier.
    n_estimators : int
        Number of trees in the forest that was evaluated.
    mse_test : float
        Error measured on the test set.
    mse_est : float
        Error predicted for this forest size.

    Returns
    -------
    dict
        A flat mapping with keys ``repeat_ind``, ``model_repeat_ind``,
        ``dataset``, ``n_estimators``, ``mse_test`` and ``mse_est``.

    Raises
    ------
    KeyError
        If ``dataset`` is not present in ``dataset_names``.
    """
    row = dict(
        repeat_ind=repeat_ind,
        model_repeat_ind=model_repeat_ind,
        dataset=dataset_names[dataset],
        n_estimators=n_estimators,
        mse_test=mse_test,
        mse_est=mse_est,
    )
    return row

def train_forest(X_train, y_train, n_estimators, is_classification):
    """Fit a random forest with ``n_estimators`` trees and return it.

    Parameters
    ----------
    X_train, y_train
        Training features and targets, passed straight to ``fit``.
    n_estimators : int
        Number of trees in the forest.
    is_classification : bool
        If truthy a ``RandomForestClassifier`` is trained, otherwise a
        ``RandomForestRegressor``.

    Returns
    -------
    The fitted estimator.

    Notes
    -----
    Only ``n_estimators`` is set; every other hyperparameter, including
    ``random_state``, is left at the scikit-learn default.
    """
    forest_cls = RandomForestClassifier if is_classification else RandomForestRegressor
    forest = forest_cls(n_estimators=n_estimators)
    forest.fit(X_train, y_train)
    return forest

# --- Experiment configuration -------------------------------------------------
regression_datasets = ["california-housing", "abalone", "insurance", "ACS2018"]
classification_datasets = ["german-credit", "adult", "breast-cancer"]
n_repeats = 3        # number of train/test split repeats read from disk
n_model_repeats = 3  # number of independent forest trainings per split
n_estimator_values = [1, 2, 4, 8, 16, 32]  # larger sizes (64, 128, 256) currently disabled
base_filename = "results/{}-datasets/train-test-splits/{}/repeat_{}/preprocessed-{}.csv"
base_orig_cols_filename = "results/{}-datasets/orig_cols/{}/repeat_{}/orig_cols.p"

# The predicted error curve below is anchored on the errors measured with
# exactly one tree and exactly two trees, which are read from the first two
# positions of the measurement list. Fail loudly if the grid is edited in a
# way that would silently invalidate that extrapolation.
assert n_estimator_values[:2] == [1, 2], (
    "n_estimator_values must start with [1, 2]: the predicted curve is "
    "anchored on the 1-tree and 2-tree measurements"
)

regression_records = []
classification_records = []

# --- Measure and extrapolate --------------------------------------------------
for task in ["regression", "classification"]:
    is_classification = task == "classification"
    datasets = classification_datasets if is_classification else regression_datasets
    # All rows produced for this task go to the same list.
    records = classification_records if is_classification else regression_records

    for dataset in datasets:
        print()
        print(dataset)
        target_name = ec.target_names[dataset]
        for repeat_ind in range(n_repeats):
            # NOTE(security): pickle.load can execute arbitrary code. Only load
            # orig_cols files that were produced by this project's own pipeline.
            with open(base_orig_cols_filename.format(task, dataset, repeat_ind), "rb") as orig_cols_file:
                orig_cols = pickle.load(orig_cols_file)

            train_df = pd.read_csv(base_filename.format(task, dataset, repeat_ind, "train"), index_col=False)
            train_oh_df = util.get_oh_df(train_df, target_name, orig_cols)
            X_train, y_train = util.get_X_y(train_oh_df, target_name)

            test_df = pd.read_csv(base_filename.format(task, dataset, repeat_ind, "test"), index_col=False)
            test_oh_df = util.get_oh_df(test_df, target_name, orig_cols)
            X_test, y_test = util.get_X_y(test_oh_df, target_name)

            for model_repeat_ind in range(n_model_repeats):
                # Measured test error for each forest size, in the order of
                # n_estimator_values. For classification the "MSE" is the
                # Brier score of the probability in column 1 of predict_proba.
                mses_test = []
                for n_estimators in tqdm(n_estimator_values):
                    model = train_forest(X_train, y_train, n_estimators, is_classification)

                    if is_classification:
                        preds_test = model.predict_proba(X_test)
                        mses_test.append(brier_score_loss(y_test, preds_test[:, 1]))
                    else:
                        preds_test = model.predict(X_test)
                        mses_test.append(np.mean((preds_test - y_test)**2))

                # Extrapolate from the 1-tree and 2-tree errors:
                #     mse(n) = mse(1) - (1 - 1/n) * max_reduction
                # max_reduction = 2 * (mse(1) - mse(2)) makes the curve pass
                # through the measured 2-tree error, and is the reduction the
                # formula approaches as n grows.
                mse1, mse2 = mses_test[:2]
                max_reduction = 2 * (mse1 - mse2)
                pred_mses = [mse1 - (1 - 1 / n_estimators) * max_reduction for n_estimators in n_estimator_values]

                for n_estimators, mse_test, mse_est in zip(n_estimator_values, mses_test, pred_mses):
                    records.append(
                        record(repeat_ind, model_repeat_ind, dataset, n_estimators, mse_test, mse_est)
                    )

classification_df = pd.DataFrame.from_records(classification_records)
regression_df = pd.DataFrame.from_records(regression_records)

In [None]:
# Regression results: one panel per dataset, measured test MSE against the
# curve extrapolated from the 1-tree and 2-tree measurements. sharey=False
# gives every dataset its own y-axis scale.
g = (
    sns.FacetGrid(regression_df, col="dataset", sharey=False)
    .map_dataframe(
        sns.lineplot,
        x="n_estimators", y="mse_test",
        label="Measured", color="C0", marker="o",
    )
    .map_dataframe(
        sns.lineplot,
        x="n_estimators", y="mse_est",
        label="Prediction", err_style="band", color="C1", marker="X",
    )
    .set_ylabels("MSE")
    .set_xlabels("Num. Trees")
    .set_titles("{col_name}", fontweight="bold")
)
g.add_legend(ncols=3, loc="upper right", bbox_to_anchor=(0.5, 0))
# Save before show(); bbox_inches="tight" fits the saved bounding box to the
# drawn artists.
plt.savefig("figures/random-forest-mse-prediction-regression.pdf", bbox_inches="tight")
plt.show()

In [None]:
# Classification results: one panel per dataset, measured test Brier score
# against the curve extrapolated from the 1-tree and 2-tree measurements.
# sharey=False gives every dataset its own y-axis scale.
g = (
    sns.FacetGrid(classification_df, col="dataset", sharey=False)
    .map_dataframe(
        sns.lineplot,
        x="n_estimators", y="mse_test",
        label="Measured", color="C0", marker="o",
    )
    .map_dataframe(
        sns.lineplot,
        x="n_estimators", y="mse_est",
        label="Prediction", err_style="band", color="C1", marker="X",
    )
    .set_ylabels("Brier Score")
    .set_xlabels("Num. Trees")
    .set_titles("{col_name}", fontweight="bold")
)
g.add_legend(ncols=3, loc="upper right", bbox_to_anchor=(0.5, 0))
# Save before show(); bbox_inches="tight" fits the saved bounding box to the
# drawn artists.
plt.savefig("figures/random-forest-mse-prediction-classification.pdf", bbox_inches="tight")
plt.show()