In [None]:
""" Just a work bench"""
import os
import json
import re
from typing import List
from pprint import pprint
from datetime import datetime
from collections import defaultdict

from dotenv import load_dotenv

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import PolynomialFeatures, scale, StandardScaler, Normalizer
# from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (
    cross_val_score,
    cross_val_predict, 
    cross_validate, 
    learning_curve,
    validation_curve,
    KFold, 
    train_test_split,
    GridSearchCV
)
from sklearn.metrics import (
    make_scorer, 
    mean_absolute_error,
    median_absolute_error,
    mean_squared_error, 
    r2_score, 
    explained_variance_score
)
from sklearn.manifold import TSNE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from smogn import smoter

from gensim.models.keyedvectors import KeyedVectors

from tc_data import TopCoder
from run_nn_classification import build_dataset
from model_tcpm_distilbert import (
    TCPMDistilBertClassification,
    build_tcpm_model_distilbert_classification,
    build_tcpm_model_distilbert_regression
)
from imbalanced_regression_metrics import PrecisionRecallFscoreForRegression
from boosting_learn import EnsembleTrainer

# Load key/value pairs from a local .env file into the process environment.
load_dotenv()
# Raise pandas' row display cap so frames of up to 800 rows render untruncated.
pd.set_option('display.max_rows', 800)


# New Word2Vec & Document Vectors

In [None]:
# Word lists used to highlight vocabulary in the plots of this section;
# the cells visible here only read the 'positive' entry.
vocab_path = 'result/word2vec/test_vocab.json'
with open(vocab_path) as vocab_file:
    testing_points = json.load(vocab_file)

In [None]:
# os.listdir returns names in arbitrary order, so the listing is only used for
# membership checks; ordering comes from the sweep grid below.
fn_lst = os.listdir('result/word2vec/model_selections')

# Every file name the hyper-parameter sweep could have produced, enumerated
# epochs-major, then window, then initial learning rate.
possible_fn = [
    f'w2v-epochs{e}-window{w}-init_lr{lr}.json'
    for e in range(5, 51, 5)
    for w in (5, 10, 15, 20)
    for lr in (0.025, 0.02, 0.01, 0.002)
]

# Keep the candidates that exist on disk, preserving the grid order.
sorted_fn = []
for candidate in possible_fn:
    if candidate in fn_lst:
        sorted_fn.append(candidate)

# Pattern that recovers the three hyper-parameters from a result file name.
reg_str = r'w2v-epochs(?P<epochs>\d+)-window(?P<window>\d+)-init_lr(?P<lr>\d\.\d+)\.json'

In [None]:
# Number of sweep result files that were actually found on disk.
len(sorted_fn)

In [None]:
# Draw one 4x4 figure per epoch setting (5, 10, ..., 50). Each panel is the 2-D
# projection stored in one sweep result file: words listed under
# testing_points['positive'] are drawn large and red over the rest of the vocabulary.
positive_words = testing_points['positive']
for batch in range(10):
    epoch = (batch + 1) * 5
    # Pick this epoch's files by name instead of slicing 16 at a time: a
    # positional slice silently mixes epochs as soon as one result file is
    # missing from disk. The trailing '-' keeps 'epochs5' from matching 'epochs50'.
    epoch_fns = [fn for fn in sorted_fn if fn.startswith(f'w2v-epochs{epoch}-')]
    with sns.axes_style('darkgrid'):
        fig, axes = plt.subplots(4, 4, figsize=(20, 20), dpi=200)
        for i, fn in enumerate(epoch_fns):
            ax = axes[i // 4, i % 4]
            hyper_param = re.match(reg_str, fn).groupdict()
            wv_df = pd.read_json(f'result/word2vec/model_selections/{fn}', orient='index')
            # Membership mask computed once and reused for both layers.
            is_positive = wv_df.label.isin(positive_words)

            marksize = 5
            # Background layer: every word that is not in the positive list.
            sns.scatterplot(
                data=wv_df.loc[~is_positive],
                x='x',
                y='y',
                alpha=0.5,
                linewidth=0.2,
                s=marksize,
                ax=ax
            )
            # Foreground layer: the positive words, larger and in red.
            sns.scatterplot(
                data=wv_df.loc[is_positive],
                x='x',
                y='y',
                linewidth=0.2,
                s=20,
                color='red',
                ax=ax
            )
            ax.set_title(' | '.join([f'{k}: {v}' for k, v in hyper_param.items()]))
            # Fixed limits keep the panels comparable with one another.
            ax.set_xlim(-100, 100)
            ax.set_ylim(-100, 100)

        fig.tight_layout()
#         fig.savefig(f'result/img/w2v_2d_epochs{epoch}.png', dpi='figure')

In [None]:
# Hand-picked sweep configurations to compare side by side, stored as
# (epochs, window, init_lr) tuples. Despite the `_dct` suffix this is a list.
handpick_fn_dct = [
    (5, 20, 0.025),
    (5, 5, 0.002),
    (10, 5, 0.002),
    (15, 20, 0.02),
    (20, 20, 0.02),
    (25, 5, 0.002),
    (30, 10, 0.025),
    (50, 5, 0.01)
]

In [None]:
# 3x3 grid with one panel per hand-picked configuration, showing only the words
# listed under testing_points['positive']; any panel without a configuration
# stays empty.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(3, 3, figsize=(15, 15), dpi=200)
    for i, (e, w, lr) in enumerate(handpick_fn_dct):
        row, col = divmod(i, 3)
        ax = axes[row, col]
        wv_df = pd.read_json(f'result/word2vec/model_selections/w2v-epochs{e}-window{w}-init_lr{lr}.json', orient='index')
        keyword_rows = wv_df.loc[wv_df.label.isin(testing_points['positive'])]

        marksize = 5  # not used by the single layer drawn here
        sns.scatterplot(data=keyword_rows, x='x', y='y', linewidth=0.2, s=20, color='red', ax=ax)

        ax.set_title(f'epochs: {e} | window: {w} | lr: {lr}')
        # Fixed limits keep the panels comparable with one another.
        ax.set_xlim(-100, 100)
        ax.set_ylim(-100, 100)

    fig.tight_layout()

In [None]:
# Single-panel view of one configuration (epochs=10, window=5, init_lr=0.002):
# only the words listed under testing_points['positive'], each annotated with
# its label text.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 8), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    wv_df = pd.read_json(f'result/word2vec/model_selections/w2v-epochs10-window5-init_lr0.002.json', orient='index')
    data = wv_df.loc[wv_df.label.isin(testing_points['positive'])]
    # Optional zoom window (disabled):
#     data = data.loc[(data.x >= 20) & (data.x <= 60) & (data.y >= 20) & (data.y <= 60)]
    
    sns.scatterplot(
        data=data,
        x='x',
        y='y',
        linewidth=0.5,
        s=30,
        color='red',
        alpha=0.75,
        label='technology keyword',
        ax=ax
    )
    # itertuples() yields (index, *columns); the four-way unpacking therefore
    # requires exactly three columns — presumably label, x, y in that order
    # (TODO confirm against the JSON layout).
    for idx, label, x, y in data.itertuples():
        ax.text(
            x=x,
            y=y,
            s=label,
            ha='right'
        )

    ax.set_title('Decomposed Word2Vec keyworded vector')
    # Axis limits matching the optional zoom window above (disabled):
#     ax.set_xlim(15, 65)
#     ax.set_ylim(15, 65)


In [None]:
# Overlay the words listed under testing_points['positive'] (red) on the rest of
# the vocabulary for one configuration, then write the figure to result/img/.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 8), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    wv_df = pd.read_json(f'result/word2vec/model_selections/w2v-epochs10-window5-init_lr0.002.json', orient='index')
    keyword_mask = wv_df.label.isin(testing_points['positive'])
    data = wv_df.loc[keyword_mask]

    background_style = dict(linewidth=0.1, s=15, alpha=0.6, label='other vocab')
    keyword_style = dict(linewidth=0.5, s=30, color='red', alpha=0.75, label='technology keyword')

    # Background first so the keyword markers end up on top of it.
    sns.scatterplot(data=wv_df.loc[~keyword_mask], x='x', y='y', ax=ax, **background_style)
    sns.scatterplot(data=data, x='x', y='y', ax=ax, **keyword_style)

    fig.savefig('result/img/word2vec_2d.png', dpi='figure')

# Showcase the data

In [None]:
# Project data accessor; the following cells query challenge info and feature
# tables through it.
tc = TopCoder()

In [None]:
# Challenge-level table; later cells read the columns total_prize, avg_score,
# number_of_registration and sub_reg_ratio from it.
cha_info = tc.get_filtered_challenge_info()

In [None]:
# Meta-data features joined with the prize column and the three candidate targets.
joined_cols = ['total_prize', 'avg_score', 'number_of_registration', 'sub_reg_ratio']
meta_features = tc.get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True)
data_df = meta_features.join(cha_info.reindex(joined_cols, axis=1))


In [None]:
# The three candidate regression targets; reindex keeps just these columns.
new_targets = cha_info.reindex(['avg_score', 'number_of_registration', 'sub_reg_ratio'], axis=1)
# Show a random five-row sample rather than the whole frame.
new_targets.sample(n=5)#.describe()

In [None]:
# For each candidate target, contrast how many challenges fall on one side of a
# threshold with how many fall in "the rest". The three panels share the y axis.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(1, 3, figsize=(6, 4), dpi=200, sharey=True)

    avg_score_vc = new_targets['avg_score'].value_counts()
    num_of_reg_vc = new_targets['number_of_registration'].value_counts()
    sub_reg_vc = new_targets['sub_reg_ratio'].value_counts()

    # (panel title, label of the first bar, count in the first bar, count in the rest)
    panel_specs = [
        (
            'avg_score',
            '>= 90',
            avg_score_vc[avg_score_vc.index >= 90].sum(),
            avg_score_vc[avg_score_vc.index < 90].sum(),
        ),
        (
            'number_of_registration',
            '<= 30',
            num_of_reg_vc[num_of_reg_vc.index <= 30].sum(),
            num_of_reg_vc[num_of_reg_vc.index > 30].sum(),
        ),
        (
            'sub_reg_ratio',
            '<= 0.25',
            sub_reg_vc[sub_reg_vc.index <= 0.25].sum(),
            sub_reg_vc[sub_reg_vc.index > 0.25].sum(),
        ),
    ]
    for panel_ax, (title, first_label, first_count, rest_count) in zip(axes, panel_specs):
        sns.barplot(x=[first_label, 'the rest'], y=[first_count, rest_count], ax=panel_ax)
        panel_ax.set_title(title)

    # Only the left-most panel carries the y label.
    axes[0].set_ylabel('count')

    fig.tight_layout()

## Imbalance metrics for regression

In [None]:
# Feature matrix (meta-data features plus total_prize) and the three target
# series analysed in this section.
meta_features = tc.get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True)
feature_df = meta_features.join(cha_info.reindex(['total_prize'], axis=1))
num_of_reg, avg_score, sub_reg_ratio = (
    cha_info[col] for col in ('number_of_registration', 'avg_score', 'sub_reg_ratio')
)


In [None]:
# One wide box plot per target, each with its own x-tick grid.
boxplot_specs = (
    (avg_score, list(range(10, 110, 10))),
    (num_of_reg, list(range(0, 310, 10))),
    (sub_reg_ratio, np.linspace(0, 10, 11) / 10),
)
for series, ticks in boxplot_specs:
    with sns.axes_style('whitegrid'):
        fig = plt.figure(figsize=(16, 3), dpi=200)
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        sns.boxplot(x=series, ax=ax)
        ax.set_xticks(ticks)

In [None]:
# Per-value occurrence counts of each target, ordered by the value itself
# (avg_score from high to low, the other two from low to high).
as_vcount = avg_score.value_counts().sort_index(ascending=False)
nr_vcount = num_of_reg.value_counts().sort_index()
sr_vcount = sub_reg_ratio.value_counts().sort_index()

In [None]:
# Collapse the per-value counts into coarse bins for the annotated bar charts.

# avg_score: one bin for scores >= 90, then half-open 10-point bins
# [low, low + 10) from [80, 90) down to [0, 10).
as_bin_lows = range(80, -1, -10)
as_vcount_agg = pd.Series(
    [
        as_vcount[as_vcount.index >= 90].sum(),
        *[
            as_vcount[(as_vcount.index >= low) & (as_vcount.index < low + 10)].sum()
            for low in as_bin_lows
        ],
    ],
    # The labels spell out the half-open interval that is actually counted. The
    # earlier '[89, 80' form lacked its closing bracket and hid that values
    # strictly between 89 and 90 land in this bin too.
    index=['>= 90', *[f'[{low}, {low + 10})' for low in as_bin_lows]],
)

# number_of_registration: '<= 30', closed bins [31, 40] ... [141, 150], '> 150'.
nr_bin_highs = range(40, 151, 10)
nr_vcount_agg = pd.Series(
    [
        nr_vcount[nr_vcount.index <= 30].sum(),
        *[
            nr_vcount[(nr_vcount.index >= high - 9) & (nr_vcount.index <= high)].sum()
            for high in nr_bin_highs
        ],
        nr_vcount[nr_vcount.index > 150].sum(),
    ],
    index=['<= 30', *[f'[{high - 9}, {high}]' for high in nr_bin_highs], '> 150'],
)

# sub_reg_ratio: '<= 0.25', then left-open bins of width 0.05 up to 1.0.
# Ratios above 1.0 are not counted in any bin.
sr_bin_highs = range(30, 101, 5)
sr_vcount_agg = pd.Series(
    [
        sr_vcount[sr_vcount.index <= 0.25].sum(),
        *[
            sr_vcount[(sr_vcount.index > (high - 5) / 100) & (sr_vcount.index <= high / 100)].sum()
            for high in sr_bin_highs
        ],
    ],
    index=['<= 0.25', *[f'({(high - 5) / 100}, {high / 100}]' for high in sr_bin_highs]],
)

In [None]:
# Sanity check: the two totals are equal only if every ratio fell into a bin.
sr_vcount.sum(), sr_vcount_agg.sum()

In [None]:
# One annotated bar chart per binned target: the integer count is written just
# above each bar.
for agg_counts in (as_vcount_agg, nr_vcount_agg, sr_vcount_agg):
    with sns.axes_style('white'):
        fig = plt.figure(figsize=(10, 3), dpi=200)
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        sns.barplot(x=agg_counts.index, y=agg_counts, ax=ax)

        for p in ax.patches:
            bar_height = p.get_height()
            text = int(bar_height)
            # Centre the label horizontally on the bar and lift it slightly.
            x = p.get_x() + 0.5 * p.get_width()
            y = bar_height + 0.3
            ax.text(x, y, text, ha='center')


In [None]:
# Scorer for avg_score, built with c=90 and extreme='low'. What each argument
# controls is defined in imbalanced_regression_metrics, which is not visible here.
prf_score = PrecisionRecallFscoreForRegression(
    tE=0.6,
    tL=3,
    c=90,
    extreme='low',
    decay=0.1,
    delta=1e-4
)
# Spot-check phi just above, at, and just below c.
prf_score.phi(91), prf_score.phi(90), prf_score.phi(89)

In [None]:
# Top: prf_score.phi evaluated at every avg_score value, with a red guide line
# at 0.6. Bottom: box plot of avg_score for comparison.
with sns.axes_style('white'):
    fig, axes = plt.subplots(2, 1, figsize=(16, 7), dpi=200)
    
    sns.scatterplot(x=avg_score.to_numpy(), y=prf_score.phi(avg_score.to_numpy()), ax=axes[0], s=12)
    # NOTE(review): 0.6 equals the tE value the scorers in this notebook are built with — confirm that is the intent.
    axes[0].axhline(0.6, color='red', alpha=0.75, linewidth=0.5)
    sns.boxplot(x=avg_score, ax=axes[1])
    axes[1].set_xticks(list(range(10, 110, 10)))

In [None]:
# Scorer for number_of_registration, built with c=30 and extreme='high'.
# Argument semantics live in imbalanced_regression_metrics (not visible here).
prf_reg = PrecisionRecallFscoreForRegression(
    tE=0.6,
    tL=8,
    c=30,
    extreme='high',
    decay=0.5
)
# Spot-check phi just above c.
prf_reg.phi(31)

In [None]:
# Top: prf_reg.phi evaluated at every number_of_registration value.
# Bottom: box plot of number_of_registration for comparison.
with sns.axes_style('white'):
    fig, axes = plt.subplots(2, 1, figsize=(16, 7), dpi=200)
    
    sns.scatterplot(x=num_of_reg.to_numpy(), y=prf_reg.phi(num_of_reg.to_numpy()), ax=axes[0], s=10)
    axes[0].set_xticks(list(range(0, 360, 10)))
    sns.boxplot(x=num_of_reg, ax=axes[1])

In [None]:
# Scorer for sub_reg_ratio, built with c=0.25 and extreme='high'; decay and
# delta are left at whatever defaults the class defines.
prf_sub = PrecisionRecallFscoreForRegression(
    tE=0.6,
    tL=0.07,
    c=0.25,
    extreme='high'
)
# Spot-check phi just above c.
prf_sub.phi(0.3)

In [None]:
# Top: prf_sub.phi evaluated at every sub_reg_ratio value.
# Bottom: box plot of sub_reg_ratio for comparison.
with sns.axes_style('white'):
    fig, axes = plt.subplots(2, 1, figsize=(16, 7), dpi=200)
    
    sns.scatterplot(x=sub_reg_ratio.to_numpy(), y=prf_sub.phi(sub_reg_ratio.to_numpy()), ax=axes[0], s=10)
#     axes[0].set_xticks(list(range(0, 360, 10)))
    sns.boxplot(x=sub_reg_ratio, ax=axes[1])

## Use SMOGN/SMOTER for over-sampling

Done in the `.py` files. There is an implementation available in Python.

In [None]:
# List the files in the learning-data directory in sorted order.
sorted(os.listdir('result/boosting_learn/learning_data/'))

## Model selection (unfinished)

In [None]:
# Directory holding the model-selection result JSON files read below.
res_path = os.path.join('result', 'boosting_learn', 'model_selection')

In [None]:
# Load the stored avg_score model-selection result for every (regressor, dv)
# pair, keyed by that pair. A missing file raises here.
avg_score_res = {}
for reg in ('gradientboostingregressor', 'randomforestregressor'):
    for dv in (0, 1):
        result_file = os.path.join(res_path, f'avg_score_{reg}_dv{dv}.json')
        with open(result_file) as f:
            avg_score_res[(reg, dv)] = json.load(f)

In [None]:
# Load the stored number_of_registration results ('_rs' files) for every
# (regressor, dv) pair. A file that cannot be opened is reported and skipped,
# so the dict may end up with fewer than four entries.
num_of_reg_res = {}
for reg in ('gradientboostingregressor', 'randomforestregressor'):
    for dv in (0, 1):
        result_file = os.path.join(res_path, f'number_of_registration_{reg}_dv{dv}_rs.json')
        try:
            with open(result_file) as f:
                num_of_reg_res[(reg, dv)] = json.load(f)
        except OSError as e:
            print(e)

In [None]:
# Tabulate the stored results with one column per (regressor, dv) pair,
# leaving out the nested 'best_params' entry.
num_of_reg_res_df = pd.DataFrame.from_dict({
    model_key: {name: value for name, value in result.items() if name != 'best_params'}
    for model_key, result in num_of_reg_res.items()
})
num_of_reg_res_df.columns.names = ['regressor', 'dv']
num_of_reg_res_df

In [None]:
# Tabulate the stored results with one column per (regressor, dv) pair,
# leaving out the nested 'best_params' entry.
avg_score_df = pd.DataFrame.from_dict({
    model_key: {name: value for name, value in result.items() if name != 'best_params'}
    for model_key, result in avg_score_res.items()
})
avg_score_df.columns.names = ['regressor', 'dv']
avg_score_df


In [None]:
# Fit the tuned random forest on the resampled avg_score training split and
# report R^2, MAE, MSE and the precision / recall / F-score computed by
# PrecisionRecallFscoreForRegression on both splits.
X_train, y_train = EnsembleTrainer.read_dataset('avg_score', 'train_resample', 0)
X_test, y_test = EnsembleTrainer.read_dataset('avg_score', 'test', 0)

prf_score = PrecisionRecallFscoreForRegression(tE=0.6, tL=3, c=90, extreme='low', decay=0.1)

# NOTE(review): the datasets above are read with a trailing 0 (presumably the
# dv index) while the hyper-parameters are the ones stored under dv=1 —
# confirm this pairing is intended.
rfreg = RandomForestRegressor(**avg_score_res[('randomforestregressor', 1)]['best_params'], n_jobs=-1)
rfreg.fit(X_train, y_train)

y_train_pred = rfreg.predict(X_train)
y_pred = rfreg.predict(X_test)

print(f'r2 score: {rfreg.score(X_test, y_test)}')
print(f'train mae: {mean_absolute_error(y_train, y_train_pred)}\ntest mae: {mean_absolute_error(y_test, y_pred)}')
print(f'train mse: {mean_squared_error(y_train, y_train_pred)}\ntest mse: {mean_squared_error(y_test, y_pred)}')
print(f'regression precision:\n\ttrain: {prf_score.precision(y_train, y_train_pred)}\n\ttest: {prf_score.precision(y_test, y_pred)}')
print(f'regression recall:\n\ttrain: {prf_score.recall(y_train, y_train_pred)}\n\ttest: {prf_score.recall(y_test, y_pred)}')
# This line reports the F-score; it used to be printed under the label "precision".
print(f'regression fscore:\n\ttrain: {prf_score.fscore(y_train, y_train_pred)}\n\ttest: {prf_score.fscore(y_test, y_pred)}')



In [None]:
# Same evaluation as for the datasets read with a trailing 0, this time on the
# ones read with a trailing 1: fit the tuned random forest on the resampled
# training split and report R^2, MAE, MSE and the precision / recall / F-score
# computed by PrecisionRecallFscoreForRegression on both splits.
X_train, y_train = EnsembleTrainer.read_dataset('avg_score', 'train_resample', 1)
X_test, y_test = EnsembleTrainer.read_dataset('avg_score', 'test', 1)

prf_score = PrecisionRecallFscoreForRegression(tE=0.6, tL=3, c=90, extreme='low', decay=0.1)

rfreg = RandomForestRegressor(**avg_score_res[('randomforestregressor', 1)]['best_params'], n_jobs=-1)
rfreg.fit(X_train, y_train)

y_train_pred = rfreg.predict(X_train)
y_pred = rfreg.predict(X_test)

print(f'r2 score: {rfreg.score(X_test, y_test)}')
print(f'train mae: {mean_absolute_error(y_train, y_train_pred)}\ntest mae: {mean_absolute_error(y_test, y_pred)}')
print(f'train mse: {mean_squared_error(y_train, y_train_pred)}\ntest mse: {mean_squared_error(y_test, y_pred)}')
print(f'regression precision:\n\ttrain: {prf_score.precision(y_train, y_train_pred)}\n\ttest: {prf_score.precision(y_test, y_pred)}')
print(f'regression recall:\n\ttrain: {prf_score.recall(y_train, y_train_pred)}\n\ttest: {prf_score.recall(y_test, y_pred)}')
# This line reports the F-score; it used to be printed under the label "precision".
print(f'regression fscore:\n\ttrain: {prf_score.fscore(y_train, y_train_pred)}\n\ttest: {prf_score.fscore(y_test, y_pred)}')



In [None]:
# Learning curve for the tuned random forest on the resampled avg_score
# training split: 5-fold CV at ten training-set fractions from 10% to 100%.
# The test split is loaded as well but is not used by learning_curve.
X_train, y_train = EnsembleTrainer.read_dataset('avg_score', 'train_resample', 0)
X_test, y_test = EnsembleTrainer.read_dataset('avg_score', 'test', 0)

rf_best_params = avg_score_res[('randomforestregressor', 1)]['best_params']
rfreg = RandomForestRegressor(n_jobs=-1, **rf_best_params)

size_fractions = np.linspace(0.1, 1, 10)
train_sizes, train_score, test_score = learning_curve(
    rfreg, X_train, y_train, cv=5, train_sizes=size_fractions, verbose=1
)



In [None]:
# Plot the fold-averaged score against training-set size for the training and
# validation folds held in train_score / test_score, then save the figure.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 3), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    for curve_label, score in (('train_score', train_score), ('val_score', test_score)):
        mean = score.mean(axis=1)
        std = score.std(axis=1)  # computed but not drawn

        sns.lineplot(x=train_sizes, y=mean, label=curve_label, marker='o', markersize=5, ax=ax)

    ax.set_xticks(train_sizes)
    ax.set_ylim(-0.5, 1.2)

    fig.savefig('temp_fig_lrcurve_avg_score_dv0.png', dpi=200)

In [None]:
# Fit the tuned gradient-boosting regressor on the resampled avg_score training
# split and report R^2, MAE, MSE and the precision / recall / F-score computed
# by PrecisionRecallFscoreForRegression on both splits.
X_train, y_train = EnsembleTrainer.read_dataset('avg_score', 'train_resample', 0)
X_test, y_test = EnsembleTrainer.read_dataset('avg_score', 'test', 0)

prf_score = PrecisionRecallFscoreForRegression(tE=0.6, tL=3, c=90, extreme='low', decay=0.1)

# The model must be bound to `gbreg`, the name used below. It was previously
# assigned to `c`, so on a fresh kernel this cell raised NameError, and
# otherwise it evaluated whatever stale `gbreg` was left in the namespace.
# NOTE(review): the datasets above are read with a trailing 0 (presumably the
# dv index) while the hyper-parameters are the ones stored under dv=1 —
# confirm this pairing is intended.
gbreg = GradientBoostingRegressor(**avg_score_res[('gradientboostingregressor', 1)]['best_params'])
gbreg.fit(X_train, y_train)

y_train_pred = gbreg.predict(X_train)
y_pred = gbreg.predict(X_test)

print(f'r2 score: {gbreg.score(X_test, y_test)}')
print(f'train mae: {mean_absolute_error(y_train, y_train_pred)}\ntest mae: {mean_absolute_error(y_test, y_pred)}')
print(f'train mse: {mean_squared_error(y_train, y_train_pred)}\ntest mse: {mean_squared_error(y_test, y_pred)}')
print(f'regression precision:\n\ttrain: {prf_score.precision(y_train, y_train_pred)}\n\ttest: {prf_score.precision(y_test, y_pred)}')
print(f'regression recall:\n\ttrain: {prf_score.recall(y_train, y_train_pred)}\n\ttest: {prf_score.recall(y_test, y_pred)}')
# This line reports the F-score; it used to be printed under the label "precision".
print(f'regression fscore:\n\ttrain: {prf_score.fscore(y_train, y_train_pred)}\n\ttest: {prf_score.fscore(y_test, y_pred)}')


In [None]:
# Learning curve for the tuned gradient-boosting regressor: 5-fold CV at ten
# training-set fractions from 10% to 100%. X_train / y_train are whatever an
# earlier cell left in the namespace.
gb_best_params = avg_score_res[('gradientboostingregressor', 1)]['best_params']
gbreg = GradientBoostingRegressor(**gb_best_params)

size_fractions = np.linspace(0.1, 1, 10)
train_sizes_gbr, train_score_gbr, test_score_gbr = learning_curve(
    gbreg, X_train, y_train, cv=5, train_sizes=size_fractions, verbose=1
)


In [None]:
# Plot the fold-averaged score against training-set size for the training and
# validation folds held in train_score_gbr / test_score_gbr, then save the figure.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 3), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    for curve_label, score in (('train_score', train_score_gbr), ('val_score', test_score_gbr)):
        mean = score.mean(axis=1)
        std = score.std(axis=1)  # computed but not drawn

        sns.lineplot(x=train_sizes_gbr, y=mean, label=curve_label, marker='o', markersize=4, ax=ax)

    ax.set_xticks(train_sizes_gbr)
    ax.set_ylim(-0.75, 1.2)

    fig.savefig('temp_fig_avg_score_gbg_dv0.png', dpi='figure')

In [None]:
# Display the stored best hyper-parameters for gradient boosting (dv=1 entry).
avg_score_res[('gradientboostingregressor', 1)]['best_params']

## Build new feature

In [None]:
# Rebuild the TopCoder accessor and print how long construction takes.
started_at = datetime.now()
tc = TopCoder()
elapsed = datetime.now() - started_at

print(elapsed)

### Testing run for different models

In [None]:
def mre(y_true, y_pred):
    """Return the mean relative error between targets and predictions.

    Computes ``mean(|y_true - y_pred| / |y_true|)``.

    Args:
        y_true: Array-like of ground-truth values (list, tuple, ndarray, ...).
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
            Inputs are compared positionally.

    Returns:
        The mean relative error as a NumPy float. A zero in ``y_true`` is not
        special-cased: the result is ``inf`` (or ``nan`` when the matching
        prediction is zero as well).
    """
    # Coerce up front so plain Python sequences work, not only ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Divide by the magnitude so a negative target contributes a non-negative
    # error instead of cancelling against the positive ones.
    return np.mean(np.abs(y_true - y_pred) / np.abs(y_true))

In [None]:
# Features and target for the number_of_registration task. Later cells call
# .to_numpy() on the target splits, so y is presumably a pandas object.
X, y = tc.build_final_dataset('number_of_registration')

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Show the shapes of the two feature splits.
X_train.shape, X_test.shape

In [None]:
# Fit the scaler on the training split only, so no test statistics leak into it.
scaler = StandardScaler().fit(X_train)

In [None]:
# Apply the training-fitted scaler to both splits.
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

In [None]:
# Gradient-boosting test run on the standardized features. With
# n_iter_no_change set, scikit-learn holds out validation_fraction of the
# training data and may stop before all 2000 stages are fitted.
# NOTE(review): loss='ls' is the legacy name of the squared-error loss; newer
# scikit-learn releases only accept 'squared_error' — confirm the pinned version.
y_train_arr = y_train.to_numpy()
y_test_arr = y_test.to_numpy()

gbreg = GradientBoostingRegressor(
    n_estimators=2000,
    loss='ls',
    n_iter_no_change=5,
    tol=0.01,
    validation_fraction=0.2,
    random_state=42,
)
gbreg.fit(X_train_std, y_train_arr)
y_pred = gbreg.predict(X_test_std)
y_train_pred = gbreg.predict(X_train_std)

# Report R^2 (estimator.score), MAE, MSE and mean relative error for both splits.
print(f'===== 2000 | ls =====')
print(f'TEST\t score: {gbreg.score(X_test_std, y_test_arr)}')
print(f'Train\t score: {gbreg.score(X_train_std, y_train_arr)}')
print(f'TEST\tmae: {mean_absolute_error(y_test_arr, y_pred)} | mse: {mean_squared_error(y_test_arr, y_pred)} | mre: {mre(y_test_arr, y_pred)}')
print(f'TRAIN\tmae: {mean_absolute_error(y_train_arr, y_train_pred)} | mse: {mean_squared_error(y_train_arr, y_train_pred)} | mre: {mre(y_train_arr, y_train_pred)}')


In [None]:
# Random-forest test run on the standardized features, using every CPU core.
y_train_arr = y_train.to_numpy()
y_test_arr = y_test.to_numpy()

rdreg = RandomForestRegressor(
    n_estimators=2000,
    min_samples_leaf=5,
    min_impurity_decrease=0.5,
    random_state=42,
    n_jobs=-1,
)
rdreg.fit(X_train_std, y_train_arr)
y_pred = rdreg.predict(X_test_std)
y_train_pred = rdreg.predict(X_train_std)

# Report R^2 (estimator.score), MAE and MSE for both splits.
print(f'===== RESULT | mid =====')
print(f'Actuall estimator: {rdreg.n_estimators}')
print(f'TEST\tscore: {rdreg.score(X_test_std, y_test_arr)}')
print(f'Train\tscore: {rdreg.score(X_train_std, y_train_arr)}')
print(f'TEST\tmae: {mean_absolute_error(y_test_arr, y_pred)} | mse: {mean_squared_error(y_test_arr, y_pred)}')
print(f'TRAIN\tmae: {mean_absolute_error(y_train_arr, y_train_pred)} | mse: {mean_squared_error(y_train_arr, y_train_pred)}')


In [None]:
# AdaBoost test run over depth-3 regression trees on the standardized features.
# NOTE(review): newer scikit-learn releases name this argument `estimator`
# instead of `base_estimator` — confirm the pinned version.
y_train_arr = y_train.to_numpy()
y_test_arr = y_test.to_numpy()

weak_learner = DecisionTreeRegressor(max_depth=3)
adreg = AdaBoostRegressor(
    base_estimator=weak_learner,
    loss='square',
    learning_rate=0.1,
    n_estimators=1000,
    random_state=42,
)

adreg.fit(X_train_std, y_train_arr)
y_pred = adreg.predict(X_test_std)
y_train_pred = adreg.predict(X_train_std)

# Report R^2 (estimator.score), MAE and MSE for both splits.
print(f'===== RESULT =====')
print(f'TEST\tscore: {adreg.score(X_test_std, y_test_arr)}')
print(f'Train\t score: {adreg.score(X_train_std, y_train_arr)}')
print(f'TEST\tmae: {mean_absolute_error(y_test_arr, y_pred)} | mse: {mean_squared_error(y_test_arr, y_pred)}')
print(f'TRAIN\tmae: {mean_absolute_error(y_train_arr, y_train_pred)} | mse: {mean_squared_error(y_train_arr, y_train_pred)}')
