In [1]:
import sys
sys.path.append('../')
sys.path.append('../../')

import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import pickle
from simulation_utils import run_std_approach, run_my_lin
from DataGenerator import DataGenerator
from constants_sim import RANDOM_SEED, PARAMS

import torch
import torch.nn as nn
import os

from constants import CV_FOLDS, OPTUNA__N_TRIALS, OPTUNA__N_JOBS, XGB__NTHREAD, \
    N_ESTIMATORS, EARLY_STOPPING_ROUNDS, OBJECTIVE, XGB_HP
from utils import aggregate_results, aggregate_feature_importances, \
    aggregate_residuals
from plotting import plot_reg_perf, feature_importance_plot, approach_comparison_plot
from residual_regressor import ErrorRegressor
from xgb.experiment import XGBoostExperiment
from xgb.final_model import FinalXGB

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.manual_seed(152)
net_residual = nn.Sequential(
    nn.Linear(PARAMS['n_informative']*3, 16),
    nn.BatchNorm1d(16),
    nn.Tanh(),
    nn.Linear(16, 82),
    nn.BatchNorm1d(82),
    nn.Sigmoid(),
    nn.Linear(82, 32),
    nn.BatchNorm1d(32),
    nn.Tanh(),
    nn.Linear(32, 16),
    nn.BatchNorm1d(16),
    nn.Sigmoid(),
    nn.Linear(16, 8),
    nn.BatchNorm1d(8),
    nn.Tanh(),
    nn.Linear(8, 4),
    nn.BatchNorm1d(4),
    nn.Sigmoid(),
    nn.Linear(4, 1),
    nn.BatchNorm1d(1),
    nn.Tanh()
)

### Linear GT, non-linear res

In [3]:
np.random.seed(RANDOM_SEED)

dg = DataGenerator(random_seed=RANDOM_SEED)
X_train, y_train, X_test, y_test, y_test_no_coi, res_proportion, coefs, res_coefs = dg.generate_linear_data(**PARAMS)

fig_name = f'form_linnonlin_{PARAMS["effective_rank"]}rank_{PARAMS["n_common"]}common_{PARAMS["noise"]}noise_{PARAMS["n_informative"]}_\
informative_res_coef{PARAMS["res_coef_value"]}_gt_coef{PARAMS["gt_coef_value"]}'
data_save_path = "../../../sent/simulated/sim_lin_nonlin"

X_test_nn = torch.tensor(X_test.to_numpy(), dtype=torch.float32)
res_proportion_linnonlin = net_residual(X_test_nn).detach().numpy().flatten() * 3
y_test = y_test_no_coi + res_proportion

pd.DataFrame([y_test_no_coi, res_proportion, y_test])

Unnamed: 0,330,331,332,333,334,335,336,337,338,339,...,990,991,992,993,994,995,996,997,998,999
0,-6.635899,-16.447328,-14.088288,-24.337356,-4.757166,-7.864151,2.866112,30.917436,-0.461558,11.083579,...,-0.373478,5.55074,21.187584,0.679776,18.509377,10.013308,-3.57515,-10.795736,10.410479,-10.569935
1,-2.326742,0.863992,-2.042409,-20.567818,6.033498,2.843033,-16.894558,-15.807082,-11.185714,1.452902,...,-2.958734,-35.921324,-16.43189,25.095278,-8.339024,-14.883707,22.814836,7.157325,-12.045748,4.198076
2,-8.962641,-15.583336,-16.130696,-44.905174,1.276332,-5.021118,-14.028446,15.110354,-11.647272,12.536481,...,-3.332212,-30.370583,4.755694,25.775054,10.170353,-4.870399,19.239686,-3.638411,-1.635269,-6.371859


# Standard Approach

In [4]:
std_results = run_std_approach(X_train, y_train, X_test, y_test, combine=True)

# My approach Linear Model

In [5]:
mylin_results = run_my_lin(X_train, X_test, y_train, y_test, with_correction=True)

In [7]:
pickle.dump(res_proportion, open(f'{data_save_path}/res_proportion.p', 'wb'))
pickle.dump(std_results, open(f'{data_save_path}/std_results.p', 'wb'))
pickle.dump(mylin_results, open(f'{data_save_path}/mylin_results.p', 'wb'))

# XGBoost

#### Cross-Validation

In [7]:
OPTUNA__N_TRIALS = 50
REPEATS = 3
NUMERICAL = X_train.columns

# Create the directory for the xgboost results
xgb_save_path = f"../../../sent/simulated/sim_lin_nonlin/xgboost_exp"
os.makedirs(xgb_save_path, exist_ok=True)

exp = XGBoostExperiment(xgb_save_path, X_train, y_train, NUMERICAL, [], CV_FOLDS,
                        OBJECTIVE, XGB_HP, OPTUNA__N_TRIALS, OPTUNA__N_JOBS,
                        XGB__NTHREAD, N_ESTIMATORS, EARLY_STOPPING_ROUNDS)
exp.run(repeats=REPEATS)

# aggregate the results across repeats
mean_result = aggregate_results(xgb_save_path)
mean_result.to_csv(f"{xgb_save_path}/mean_results.out")
res_results = aggregate_residuals(xgb_save_path)
res_results.to_csv(f"{xgb_save_path}/residuals.out")
feat_imps = aggregate_feature_importances(xgb_save_path)
# plots
feature_importance_plot(feat_imps, xgb_save_path)
plot_reg_perf(mean_result, xgb_save_path)

## Train final model on test data

In [8]:
xgb_save_path = f"../../../sent/simulated/sim_lin_nonlin/xgboost_exp" #use the same HPs as in the all-linear experiment
xgbfinal_save_path = f"../../../sent/simulated/sim_lin_nonlin/xgb_final"
os.makedirs(xgbfinal_save_path, exist_ok=True)

final = FinalXGB(xgbfinal_save_path, X_train, y_train, X_train.columns, [], CV_FOLDS, 
                OBJECTIVE, XGB_HP, OPTUNA__N_TRIALS, OPTUNA__N_JOBS,
                XGB__NTHREAD, N_ESTIMATORS, EARLY_STOPPING_ROUNDS, repeats=5)
final.train(exp_glob_pattern=f"{xgb_save_path}/repeat-*/params.json")
res_test, _ = final.test(X_test, y_test)

  0%|          | 0/5 [00:00<?, ?it/s]

 20%|██        | 1/5 [00:00<00:01,  3.87it/s]

Done training model 0


 40%|████      | 2/5 [00:00<00:01,  2.87it/s]

Done training model 1


 60%|██████    | 3/5 [00:00<00:00,  3.06it/s]

Done training model 2


 80%|████████  | 4/5 [00:01<00:00,  3.20it/s]

Done training model 3


100%|██████████| 5/5 [00:01<00:00,  3.21it/s]

Done training model 4





## XGBoost final

#### Residual correction

In [9]:
# Build error correction model
train_res = aggregate_residuals(xgb_save_path)
residual_regressor = ErrorRegressor(numerical=X_train.columns, imp_iter=25)
residual_regressor.fit(X_train, train_res["Residual_mean"])

## Load test data
corr_residuals_table = residual_regressor.clean_residuals(X_test, res_test["Residual_mean"])

In [8]:
pickle.dump(corr_residuals_table, open(f'{data_save_path}/res_test.p', 'wb'))

In [4]:
std_results = pickle.load(open(f'{data_save_path}/std_results.p', 'rb'))
mylin_results = pickle.load(open(f'{data_save_path}/mylin_results.p', 'rb'))
corr_residuals_table = pickle.load(open(f'{data_save_path}/res_test.p', 'rb'))
res_proportion = pickle.load(open(f'{data_save_path}/res_prop.p', 'rb'))

In [5]:

approach_comparison_plot(res_proportion, std_results[0], mylin_results[0], corr_residuals_table, 
                         "True vs extracted residuals (linear gt, non-lin res)",
                         save_path=f"{data_save_path}/form_linnonlin.png")
