In [None]:
""" This notebook serves two purposes:
    1. Build the super-naive baseline
    2. Run simple, basic regression models against every target y
"""

import os
import re
import json
from pprint import pprint
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.preprocessing import StandardScaler, Normalizer, scale, normalize
from sklearn.model_selection import (
    train_test_split,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    RandomizedSearchCV,
    KFold
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    make_scorer,
)
from sklearn.pipeline import make_pipeline

from dotenv import load_dotenv

from tc_data import TopCoder
from final_model_selection import (
    kfold_predict_validate_gradient_boosting,
    kfold_predict_validate_neural_network,
    train_gb_for_production,
    build_sequential_neural_network,
    mre,
    tfmre,
)

# Load environment variables via python-dotenv before any project code needs them.
load_dotenv()
# Raise the pandas display limit so long result tables are shown in full (up to 800 rows).
pd.set_option('display.max_rows', 800)

## Prepare the training data

In [None]:
# Project data helper (imported from tc_data); used below to build the modelling datasets.
tc = TopCoder()

In [None]:
# Build one (features, target) pair per prediction target.
# `X` is rebound by every call, so only the feature frame returned by the
# last call is kept.
# NOTE(review): presumably all three calls return the same features — confirm.
X, y_score = tc.build_final_dataset('avg_score')
X, y_nreg = tc.build_final_dataset('number_of_registration')
X, y_subra = tc.build_final_dataset('sub_reg_ratio')

# Price target: work on a copy so that popping 'total_prize' leaves `X` intact;
# the popped column becomes the target and is removed from the price features.
X_price = X.copy()
y_price = X_price.pop('total_prize')

## Iterate over the regression models

In [None]:
def iterate_train_regressor(X, y, test_size=0.3, cv=10, random_state=42):
    """Cross-validate every baseline regressor (default hyper-parameters) on one target.

    The data is split with ``train_test_split`` and only the training part is
    used for cross-validation; the held-out part is not touched by this
    function. Each regressor is placed in a pipeline behind a
    ``StandardScaler`` and scored with ``cross_validate`` on two metrics:
    ``mre`` (imported from ``final_model_selection``) and mean absolute error.

    Args:
        X: Feature table whose split result exposes ``to_numpy()``
            (e.g. a pandas DataFrame).
        y: Target values aligned with ``X`` (e.g. a pandas Series).
        test_size: Passed to ``train_test_split`` as ``test_size``.
        cv: Passed to ``cross_validate`` as ``cv``.
        random_state: Seed used for the split and for every regressor that
            has a ``random_state`` parameter, so repeated runs give the
            same numbers.

    Returns:
        dict: ``{regressor class name: {key: mean of absolute values}}`` with
        one inner entry per array returned by ``cross_validate`` (timings as
        well as the train/test scores of both metrics).
    """
    model_lst = [
        LinearRegression,
        Ridge,
        Lasso,
        ElasticNet,
        BayesianRidge,
        SVR,
        GaussianProcessRegressor,
        DecisionTreeRegressor,
        RandomForestRegressor,
        AdaBoostRegressor,
        GradientBoostingRegressor,
        KNeighborsRegressor,
    ]

    # Only the training portion is needed here; the held-out portion is
    # discarded explicitly instead of being bound to unused names.
    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_arr, y_arr = X_train.to_numpy(), y_train.to_numpy()

    # Both scorers are created with greater_is_better=False, so the raw
    # cross_validate scores are negated; np.abs() below undoes that.
    scoring = {
        'mre': make_scorer(mre, greater_is_better=False),
        'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    }

    res = {}
    for reg_cls in model_lst:
        print(f'Training {reg_cls.__name__}...', ' ' * 50, end='\r')

        reg = reg_cls()
        # Seed the stochastic estimators (trees, ensembles, GP) so the
        # baseline numbers are reproducible across kernel restarts.
        if 'random_state' in reg.get_params(deep=False):
            reg.set_params(random_state=random_state)

        est = make_pipeline(StandardScaler(), reg)
        cv_res = cross_validate(
            est, X_arr, y_arr, cv=cv, return_train_score=True, scoring=scoring
        )

        res[reg_cls.__name__] = {k: np.mean(np.abs(v)) for k, v in cv_res.items()}

    return res

In [None]:
# Evaluate every baseline regressor against each prediction target.
#
# The loop variables are deliberately NOT called `X` / `y`: rebinding the
# notebook-level `X` would leave it pointing at `X_price` after the loop, so
# re-running this cell (or any later cell using `X`) would silently train on
# the wrong feature matrix.
#
# Target labels match the keys used by the rest of the notebook
# ('number_of_registration', 'sub_reg_ratio').
training_sets = [
    ('avg_score', X, y_score),
    ('number_of_registration', X, y_nreg),
    ('sub_reg_ratio', X, y_subra),
    ('price', X_price, y_price),
]

df_dct = {}
for name, X_target, y_target in training_sets:
    print(f'\nTraining {name}...')
    df_dct[name] = pd.DataFrame.from_dict(iterate_train_regressor(X_target, y_target), orient='index')

# Keep only the cross-validation test-fold metrics and give them short column names.
df_dct = {k: df.reindex(['test_mae', 'test_mre'], axis=1).rename(columns={'test_mae': 'mae', 'test_mre': 'mre'}) for k, df in df_dct.items()}

In [None]:
# Convert every per-target result frame into plain nested dicts.
simple_reg_dct = {
    target: scores.to_dict()
    for target, scores in df_dct.items()
}

In [None]:
# with open('./result/simple_regression/simple_reg_result.json', 'w') as f:
#     json.dump(simple_reg_dct, f, indent=4)

In [None]:
# Every candidate regressor class, in the order used throughout the notebook.
model_lst = [
    LinearRegression, Ridge, Lasso, ElasticNet, BayesianRidge,
    SVR,
    GaussianProcessRegressor,
    DecisionTreeRegressor,
    RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor,
    KNeighborsRegressor,
]

# Look-up table: class name -> regressor class.
model_dct = dict((reg_cls.__name__, reg_cls) for reg_cls in model_lst)

In [None]:
# For each target keep the four regressors with the smallest MRE
# (rounded to six decimals before ranking), as {regressor name: mre}.
top4_reg_dct = {
    target: scores['mre'].round(6).sort_values().head(4).to_dict()
    for target, scores in df_dct.items()
}
top4_reg_dct

In [None]:
# with open('./result/simple_regression/top4_reg_dct.json', 'w') as f:
#     json.dump(top4_reg_dct, f, indent=4)

## Check the random search CV results

In [None]:
# Directory containing the random-search result files (JSON) that are parsed below.
rs_path = './result/random_search_res/'

In [None]:
# Collect the best score and best hyper-parameters of every random-search run.
#
# File names are expected to look like `<target>_<algo>_<suffix>.json`, where
# `<target>` may itself contain underscores (e.g. `sub_reg_ratio`) and the
# last token is ignored.
rs_score = defaultdict(dict)
rs_param = defaultdict(dict)
for fn in sorted(os.listdir(rs_path)):  # sorted -> deterministic processing order
    stem, ext = os.path.splitext(fn)
    if ext.lower() != '.json':
        # Skip stray directory entries (.DS_Store, .ipynb_checkpoints, ...)
        # instead of crashing in json.load.
        continue

    tokens = stem.split('_')
    if len(tokens) < 3:
        # Not a `<target>_<algo>_<suffix>` name; there is no target to file it under.
        continue
    target = '_'.join(tokens[:-2])
    algo = tokens[-2]

    with open(os.path.join(rs_path, fn)) as f:
        res_dct = json.load(f)

    # The sign of the stored score is discarded; only its magnitude is kept.
    rs_score[target][algo] = abs(res_dct['best_score_in_rs'])
    rs_param[target][algo] = res_dct['best_params']

In [None]:
# Reload the saved top-4 table from disk; this replaces any in-memory version.
with open('./result/simple_regression/top4_reg_dct.json') as f:
    top4_reg_dct = json.load(f)

# The 'price' target is excluded from the comparison that follows.
# NOTE(review): presumably because no random-search results exist for it — confirm.
del top4_reg_dct['price']

In [None]:
# One frame per target with the simple-regression score ('simple') next to the
# random-search score ('rs'), indexed by regressor name.
rs_result_comparison = {}
for target, simple_scores in top4_reg_dct.items():
    score_columns = {
        'simple': simple_scores,
        'rs': rs_score[target],
    }
    rs_result_comparison[target] = pd.DataFrame.from_dict(score_columns)

In [None]:
# Simple-regression vs. random-search scores for the 'avg_score' target.
rs_result_comparison['avg_score']

In [None]:
# Simple-regression vs. random-search scores for the 'number_of_registration' target.
rs_result_comparison['number_of_registration']

In [None]:
# Simple-regression vs. random-search scores for the 'sub_reg_ratio' target.
rs_result_comparison['sub_reg_ratio']

In [None]:
# Best hyper-parameters recorded by the random search for RandomForestRegressor on 'avg_score'.
rs_param['avg_score']['RandomForestRegressor']

In [None]:
# Random forest for 'avg_score', configured with the best random-search
# hyper-parameters plus fixed runtime settings (all cores, verbose, seeded).
rf_best_params = rs_param['avg_score']['RandomForestRegressor']
rfs = RandomForestRegressor(
    n_jobs=-1,
    verbose=1,
    random_state=42,
    bootstrap=True,
    **rf_best_params
)

In [None]:
# Display the configured estimator (it has not been fitted in this notebook).
rfs