In [None]:
""" Just a work bench"""
import os
import json
import re
from typing import List
from pprint import pprint
from datetime import datetime

from dotenv import load_dotenv

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# import xgboost as xgb
from transformers import BertTokenizer, AutoTokenizer, AutoConfig, TFDistilBertModel, TFBertModel, TFTrainingArguments
import tensorflow as tf
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.preprocessing import PolynomialFeatures, scale, StandardScaler, Normalizer
# from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (
    cross_val_score,
    cross_val_predict, 
    cross_validate, 
    KFold, 
    train_test_split,
    GridSearchCV
)
from sklearn.metrics import (
    make_scorer, 
    mean_absolute_error,
    median_absolute_error,
    mean_squared_error, 
    r2_score, 
    explained_variance_score
)
from sklearn.manifold import TSNE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from smogn import smoter

from gensim.models.keyedvectors import KeyedVectors

from tc_data import TopCoder
from run_classification import build_dataset
from model_tcpm_distilbert import (
    TCPMDistilBertClassification,
    build_tcpm_model_distilbert_classification,
    build_tcpm_model_distilbert_regression
)
from imbalanced_regression_metrics import PrecisionRecallFscoreForRegression
from boosting_learn import EnsembleTrainer

# Load variables from a local .env file into the process environment.
load_dotenv()
# Let pandas display up to 800 rows before truncating a frame's output.
pd.set_option('display.max_rows', 800)


# New Word2Vec & Document Vectors

In [None]:
# Load the test vocabulary. Its 'positive' entry is used by the plotting cells
# below to select the word vectors that are highlighted in red.
with open('result/word2vec/test_vocab.json') as f:
    testing_points = json.load(f)

In [None]:
# Names of the files currently present in the word2vec result folder.
# os.listdir() returns them in arbitrary order, so a set loses nothing and
# makes the membership test below O(1); display order comes from `possible_fn`.
available_fn = set(os.listdir('result/word2vec/'))
# The test vocabulary lives in the same folder but is not a projection file.
# discard() does not raise if the file is absent (list.remove() would).
available_fn.discard('test_vocab.json')
fn_lst = list(available_fn)  # kept as a list for any cell that still reads it

# Every hyper-parameter combination in display order:
# epochs (outer) -> window -> initial learning rate (inner).
possible_fn = [f'w2v-epochs{e}-window{w}-init_lr{lr}.json'
               for e in range(5, 51, 5)
               for w in [5, 10, 15, 20]
               for lr in [0.025, 0.02, 0.01, 0.002]]

# Only the combinations that actually exist on disk, in the order above.
sorted_fn = [fn for fn in possible_fn if fn in available_fn]

# Recovers the hyper-parameters from a file name as named groups.
reg_str = r'w2v-epochs(?P<epochs>\d+)-window(?P<window>\d+)-init_lr(?P<lr>\d\.\d+)\.json'

In [None]:
# Number of result files found (160 when every hyper-parameter combination exists).
len(sorted_fn)

In [None]:
# One 4x4 figure per epoch setting; each panel shows one (window, init_lr)
# combination with the 'positive' test words highlighted in red.
positive_words = testing_points['positive']

for epoch in range(5, 51, 5):
    # Select files by their epoch tag instead of slicing `sorted_fn` into fixed
    # batches of 16: slicing silently shifts files into the wrong figure (and
    # the wrong output file name) as soon as one result file is missing.
    epoch_fns = [fn for fn in sorted_fn if fn.startswith(f'w2v-epochs{epoch}-')]
    if not epoch_fns:
        continue

    with sns.axes_style('darkgrid'):
        fig, axes = plt.subplots(4, 4, figsize=(20, 20), dpi=200)
        for i, fn in enumerate(epoch_fns):
            ax = axes[i // 4, i % 4]
            hyper_param = re.match(reg_str, fn).groupdict()
            wv_df = pd.read_json(f'result/word2vec/{fn}', orient='index')
            is_positive = wv_df.label.isin(positive_words)

            # Background: every word that is not a test word.
            sns.scatterplot(
                data=wv_df.loc[~is_positive],
                x='x',
                y='y',
                alpha=0.5,
                linewidth=0.2,
                s=5,
                ax=ax
            )
            # Foreground: the test words, larger and in red.
            sns.scatterplot(
                data=wv_df.loc[is_positive],
                x='x',
                y='y',
                linewidth=0.2,
                s=20,
                color='red',
                ax=ax
            )
            ax.set_title(' | '.join([f'{k}: {v}' for k, v in hyper_param.items()]))
            ax.set_xlim(-100, 100)
            ax.set_ylim(-100, 100)

        # Hide panels that have no file (only happens with missing results).
        for unused_ax in axes.flat[len(epoch_fns):]:
            unused_ax.set_visible(False)

        fig.tight_layout()
        fig.savefig(f'result/img/w2v_2d_epochs{epoch}.png', dpi='figure')

    # Render the figure now and release it, so ten 20x20in/200dpi figures are
    # not all kept in memory until the cell finishes.
    plt.show()
    plt.close(fig)

In [None]:
# Hand-picked hyper-parameter settings as (epochs, window, init_lr) tuples.
# Despite the `_dct` suffix this is a list; the next cell unpacks each tuple
# to build the matching result file name.
handpick_fn_dct = [
    (5, 20, 0.025),
    (5, 5, 0.002),
    (10, 5, 0.002),
    (15, 20, 0.02),
    (20, 20, 0.02),
    (25, 5, 0.002),
    (30, 10, 0.025),
    (50, 5, 0.01)
]

In [None]:
# 3x3 overview of the hand-picked settings, showing only the 'positive' test
# words of each projection.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(3, 3, figsize=(15, 15), dpi=200)
    for i, (e, w, lr) in enumerate(handpick_fn_dct):
        ax = axes[i // 3, i % 3]
        wv_df = pd.read_json(f'result/word2vec/w2v-epochs{e}-window{w}-init_lr{lr}.json', orient='index')

        sns.scatterplot(
            data=wv_df.loc[wv_df.label.isin(testing_points['positive'])],
            x='x',
            y='y',
            linewidth=0.2,
            s=20,
            color='red',
            ax=ax
        )

        ax.set_title(f'epochs: {e} | window: {w} | lr: {lr}')
        ax.set_xlim(-100, 100)
        ax.set_ylim(-100, 100)

    # The grid has nine panels; hide those without a setting so the saved
    # image does not contain empty axes.
    for unused_ax in axes.flat[len(handpick_fn_dct):]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    fig.savefig('result/img/w2v_2d_hp0.png', dpi='figure')

In [None]:
# Single labelled view of the test words for one setting
# (epochs=10, window=5, init_lr=0.002).
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 8), dpi=200)
    ax = fig.add_axes([0.05, 0.05, 0.9, 0.9])
    wv_df = pd.read_json('result/word2vec/w2v-epochs10-window5-init_lr0.002.json', orient='index')
    data = wv_df.loc[wv_df.label.isin(testing_points['positive'])]

    sns.scatterplot(
        data=data,
        x='x',
        y='y',
        linewidth=0.2,
        s=20,
        color='red',
        ax=ax
    )
    # Select the columns by name before unpacking, so the annotation does not
    # depend on the column order (or on extra columns) of the JSON file.
    for label, x, y in data[['label', 'x', 'y']].itertuples(index=False, name=None):
        ax.text(
            x=x,
            y=y,
            s=label,
            ha='right'
        )

    ax.set_title('epochs: 10 | window: 5 | lr: 0.002')
    # Uncomment to zoom into a region of interest:
#     ax.set_xlim(20, 40)
#     ax.set_ylim(0, 20)

# Showcase the data

In [None]:
# Project data-access object (from tc_data); the challenge tables and feature
# matrices used below are all obtained through it.
tc = TopCoder()

In [None]:
# Challenge table; later cells read its score, prize, registration and
# sub_reg_ratio columns.
cha_info = tc.get_filtered_challenge_info()

In [None]:
# Keep only the four score summary columns. reindex() fills any column that is
# missing from cha_info with NaN instead of raising.
score_rep = cha_info.reindex(['max_score', 'min_score', 'avg_score', 'std_score'], axis=1)

In [None]:
# Meta-data features joined (on the index) with the prize and the three
# outcome columns, for inspection.
data_df = (
    tc.get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True)
    .join(
        cha_info.reindex(
            columns=['total_prize', 'avg_score', 'number_of_registration', 'sub_reg_ratio']
        )
    )
)


In [None]:
# Summary statistics of the prize and of the three columns used later as
# regression targets.
cha_info.reindex(['total_prize', 'avg_score', 'number_of_registration', 'sub_reg_ratio'], axis=1).describe()

In [None]:
# For each score column, compare how many challenges score >= 90 with how many
# score below 90.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(3, 1, figsize=(8, 6), dpi=200)
    for ax, col in zip(axes, ['max_score', 'min_score', 'avg_score']):
        data = score_rep[col].value_counts().sort_values(ascending=False)
        n_high = data[data.index >= 90].sum()
        n_rest = data[data.index < 90].sum()
        sns.barplot(x=[n_high, n_rest], y=['>= 90', 'the rest'], ax=ax)
        # 'max_score' -> 'Max Score'
        ax.set_title(' '.join(word.capitalize() for word in col.split('_')))
    fig.tight_layout()

## Use original data for learning

In [None]:
# Feature matrix: meta-data features joined with the total prize.
feature_df = tc.get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True).join(cha_info.reindex(['total_prize'], axis=1))
# The three candidate regression targets, one Series each.
num_of_reg = cha_info['number_of_registration']
avg_score = cha_info['avg_score']
sub_reg_ratio = cha_info['sub_reg_ratio']


In [None]:
# One wide box plot per target, each with its own x tick positions.
boxplot_specs = [
    (avg_score, list(range(10, 110, 10))),
    (num_of_reg, list(range(0, 310, 10))),
    (sub_reg_ratio, np.linspace(0, 10, 11) / 10),
]

for series, ticks in boxplot_specs:
    with sns.axes_style('whitegrid'):
        fig = plt.figure(figsize=(16, 3), dpi=200)
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        sns.boxplot(x=series, ax=ax)
        ax.set_xticks(ticks)

In [None]:
# Frequency of each distinct target value, ordered by the value itself
# (descending for avg_score, ascending for the other two).
as_vcount = avg_score.value_counts().sort_index(ascending=False)
nr_vcount = num_of_reg.value_counts().sort_index()
sr_vcount = sub_reg_ratio.value_counts().sort_index()

In [None]:
# Average score: one bin for >= 90, then half-open bins of width 10 from
# [80, 90) down to [0, 10). The labels spell out the real interval; they were
# previously written as '[89, 80' (no closing bracket, and wrong for
# fractional scores such as 89.5).
as_vcount_agg = pd.Series(
    [
        as_vcount[as_vcount.index >= 90].sum(),
        *[as_vcount[(as_vcount.index >= lo) & (as_vcount.index < lo + 10)].sum()
          for lo in range(80, -1, -10)]
    ],
    index=['>= 90', *[f'[{lo}, {lo + 10})' for lo in range(80, -1, -10)]]
)

# Number of registrations: <= 30, closed bins [31, 40] ... [141, 150], > 150.
nr_vcount_agg = pd.Series([
    nr_vcount[nr_vcount.index <= 30].sum(),
    *[nr_vcount[(nr_vcount.index >= l) & (nr_vcount.index <= h)].sum() for l, h in zip(range(31, 142, 10), range(40, 151, 10))],
    nr_vcount[nr_vcount.index > 150].sum()
], index=['<= 30', *[f'[{i - 9}, {i}]' for i in range(40, 151, 10)], '> 150'])

# Submission/registration ratio: <= 0.25, then bins (0.25, 0.30] ... (0.95, 1.0].
# Ratios above 1.0 fall into no bin.
sr_vcount_agg = pd.Series([
    sr_vcount[sr_vcount.index <= 0.25].sum(),
    *[sr_vcount[(sr_vcount.index > (l / 100)) & (sr_vcount.index <= h / 100)].sum() for l, h in zip(range(25, 100, 5), range(30, 101, 5))]
], index=['<= 0.25', *[f'({(i - 5) / 100}, {i / 100}]' for i in range(30, 101, 5)]])

In [None]:
# Sanity check: the two totals are equal when every ratio landed in a bin,
# i.e. no ratio exceeds 1.0.
sr_vcount.sum(), sr_vcount_agg.sum()

In [None]:
def _plot_annotated_bars(counts):
    """Draw `counts` as a bar chart and write each bar's height above it."""
    with sns.axes_style('white'):
        fig = plt.figure(figsize=(10, 3), dpi=200)
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        sns.barplot(x=counts.index, y=counts, ax=ax)

        for bar in ax.patches:
            height = bar.get_height()
            # Centre the label horizontally, slightly above the bar top.
            ax.text(bar.get_x() + 0.5 * bar.get_width(), height + 0.3, int(height), ha='center')


# Same chart for the three binned target distributions.
for binned_counts in (as_vcount_agg, nr_vcount_agg, sr_vcount_agg):
    _plot_annotated_bars(binned_counts)



In [None]:
# Precision/recall/F-score helper for regression (project module
# imbalanced_regression_metrics), used below with the avg_score target.
# NOTE(review): the meaning of tE, tL, c, extreme, decay and delta is defined
# in that module, not here — presumably c=90 is the relevance threshold and
# extreme='low' treats scores below it as the rare case; confirm.
prf_score = PrecisionRecallFscoreForRegression(
    tE=0.6,
    tL=3,
    c=90,
    extreme='low',
    decay=0.1,
    delta=1e-4
)
# Spot-check phi at a score of 88.
prf_score.phi(88)

In [None]:
# Top: phi evaluated at every average score. Bottom: box plot of the scores.
with sns.axes_style('white'):
    fig, (phi_ax, box_ax) = plt.subplots(2, 1, figsize=(16, 7), dpi=200)

    score_values = avg_score.to_numpy()
    sns.scatterplot(x=score_values, y=prf_score.phi(score_values), ax=phi_ax, s=10)

    sns.boxplot(x=avg_score, ax=box_ax)
    box_ax.set_xticks(list(range(10, 110, 10)))

In [None]:
# Same helper, used below with the number-of-registrations target.
# NOTE(review): presumably c=30 is the relevance threshold and extreme='high'
# treats counts above it as the rare case — confirm in
# imbalanced_regression_metrics.
prf_reg = PrecisionRecallFscoreForRegression(
    tE=0.6,
    tL=8,
    c=30,
    extreme='high',
    decay=0.5
)
# Spot-check phi just above c.
prf_reg.phi(31)

In [None]:
# Top: phi evaluated at every registration count. Bottom: box plot of the counts.
with sns.axes_style('white'):
    fig, (phi_ax, box_ax) = plt.subplots(2, 1, figsize=(16, 7), dpi=200)

    reg_values = num_of_reg.to_numpy()
    sns.scatterplot(x=reg_values, y=prf_reg.phi(reg_values), ax=phi_ax, s=10)
    phi_ax.set_xticks(list(range(0, 360, 10)))
    sns.boxplot(x=num_of_reg, ax=box_ax)

In [None]:
# Same helper, used below with the sub_reg_ratio target and as the scorer for
# the grid search.
# NOTE(review): presumably c=0.25 is the relevance threshold and
# extreme='high' treats ratios above it as the rare case — confirm in
# imbalanced_regression_metrics.
prf_sub = PrecisionRecallFscoreForRegression(
    tE=0.6,
    tL=0.07,
    c=0.25,
    extreme='high'
)
# Spot-check phi at a ratio of 0.3.
prf_sub.phi(0.3)

In [None]:
# Top: phi evaluated at every submission/registration ratio. Bottom: box plot.
with sns.axes_style('white'):
    fig, (phi_ax, box_ax) = plt.subplots(2, 1, figsize=(16, 7), dpi=200)

    ratio_values = sub_reg_ratio.to_numpy()
    sns.scatterplot(x=ratio_values, y=prf_sub.phi(ratio_values), ax=phi_ax, s=10)
    sns.boxplot(x=sub_reg_ratio, ax=box_ax)

In [None]:
# Rows left for training once the 954-row test set (636 + 318, sampled below)
# is held out.
len(avg_score) - 954

In [None]:
# Number of challenges on each side of the 0.25 ratio threshold.
srr_vc = sub_reg_ratio.value_counts()
srr_vc[srr_vc.index <= 0.25].sum(), srr_vc[srr_vc.index > 0.25].sum()

In [None]:
# num_of_reg.shape, num_of_reg[num_of_reg <= 30].shape, num_of_reg[num_of_reg > 30].shape

In [None]:
# Preview of the features joined with the contents of data/new_docvec.json
# (presumably document vectors — confirm). The result is only displayed,
# not stored.
feature_df.join(pd.read_json('data/new_docvec.json', orient='index'))

In [None]:
# Per target, the number of challenges beyond its threshold
# (avg_score < 90, registrations > 30, ratio > 0.25).
avg_score[avg_score < 90].count(), num_of_reg[num_of_reg > 30].count(), sub_reg_ratio[sub_reg_ratio > 0.25].count()

In [None]:
# Hold-out index: 636 challenges with ratio <= 0.25 and 318 with ratio > 0.25.
# The draws are seeded so that Restart & Run All reproduces the same split
# (unseeded, every run evaluated the models on a different test set).
test_index = pd.concat([
    sub_reg_ratio[sub_reg_ratio <= 0.25].sample(n=636, random_state=42),
    sub_reg_ratio[sub_reg_ratio > 0.25].sample(n=318, random_state=42)
]).index

In [None]:
# Split features and target by membership in the hold-out index; sorting by
# index keeps X and y rows in the same order.
is_test_X = feature_df.index.isin(test_index)
is_test_y = sub_reg_ratio.index.isin(test_index)

X_train = feature_df.loc[~is_test_X].sort_index()
X_test = feature_df.loc[is_test_X].sort_index()
y_train = sub_reg_ratio[~is_test_y].sort_index()
y_test = sub_reg_ratio[is_test_y].sort_index()

In [None]:
# Fit the standardiser on the training features only, so the test set does
# not influence the scaling.
standard_scalar = StandardScaler()
standard_scalar.fit(X_train.to_numpy())

In [None]:
# Standardise both splits with the training-set statistics. The scaler was
# fitted on a plain ndarray, so pass ndarrays here as well: handing it a
# DataFrame makes newer scikit-learn warn about feature names that were not
# seen during fit.
X_train_stded = standard_scalar.transform(X_train.to_numpy())
X_test_stded = standard_scalar.transform(X_test.to_numpy())

In [None]:
# Hyper-parameter grid for the gradient-boosting regressor: 3 x 4 = 12 combinations.
param_grid={
#     'loss': ['ls', 'lad'],
    'n_estimators': [1500, 2000, 4000],
    'learning_rate': [1e-4, 2e-4, 2e-5, 1e-5]
}

# Multi-metric grid search scored with the relevance-aware precision, recall
# and F-score of `prf_sub`; the best F-score configuration is refitted.
# `cv` is left unset, so GridSearchCV uses its default number of folds.
gridscv = GridSearchCV(
    GradientBoostingRegressor(verbose=1),
    param_grid=param_grid,
#     cv=3,
#     cv=10,
    scoring={
        'precision': make_scorer(prf_sub.precision),
        'recall': make_scorer(prf_sub.recall),
        'fscore': make_scorer(prf_sub.fscore)
    },
    refit='fscore'
)
# The search itself is run in a separate cell below.
# gridscv.fit(X_train_stded, y_train.to_numpy())

In [None]:
# Baseline gradient-boosting model on the original (not over-sampled) data.
# `loss` is left at its default, which is squared error: the explicit alias
# 'ls' was removed in newer scikit-learn releases, while the default works on
# every version.
# NOTE: validation_fraction only takes effect when n_iter_no_change is set,
# so as written no early stopping happens.
gbr = GradientBoostingRegressor(n_estimators=4000, learning_rate=2e-4, validation_fraction=0.15, random_state=42, verbose=1)
gbr.fit(X_train_stded, y_train.to_numpy())


In [None]:
# Run the grid search: every parameter combination is fitted once per CV
# fold, so this cell is slow.
gridscv.fit(X_train_stded, y_train.to_numpy())

In [None]:
# Hold-out score of the refitted best estimator, using the refit metric ('fscore').
gridscv.score(X_test_stded, y_test.to_numpy())

In [None]:
# Default hyper-parameters of the estimator, for reference.
GradientBoostingRegressor().get_params()

In [None]:
# R^2 on the training and on the test split; a large gap indicates over-fitting.
gbr.score(X_train_stded, y_train.to_numpy()), gbr.score(X_test_stded, y_test.to_numpy())

Score for `avg_score`:

```python
gbr.score(X_train_stded, y_train.to_numpy()), gbr.score(X_test_stded, y_test.to_numpy())
# (0.46776686536131373, 0.3382982717201233)
```

A little overfitted

Shape of `y_train` for `avg_score`:

```python
y_train[y_train >= 95].count(), y_train[y_train < 95].count()
# (2878, 937)
```

Shape of `y_pred_train` for `avg_score`:

```python
y_pred_trained[y_pred_trained < 95].shape, y_pred_trained[y_pred_trained >= 95].shape
# ((1363,), (2452,))
```

In [None]:
# Median absolute error of the baseline model on the training split.
y_pred_trained = gbr.predict(X_train_stded)
median_absolute_error(y_train.to_numpy(), y_pred_trained)

In [None]:
# Median absolute error of the baseline model on the test split.
y_pred_test = gbr.predict(X_test_stded)
median_absolute_error(y_test.to_numpy(), y_pred_test)

MAE of `y_pred_trained`: 2.659448356352616  
MAE of `y_pred_test`: 3.271299264475617

In [None]:
# Number of test predictions on each side of 30.
# NOTE(review): 30 is the threshold used for number_of_registration, while
# y_train/y_test above are built from sub_reg_ratio (threshold 0.25) —
# confirm which target this cell is meant for.
y_pred_test[y_pred_test <= 30].shape, y_pred_test[y_pred_test > 30].shape

Shape of `y_pred_test`: `(543, 411)`  
Shape of `y_test`: `(636, 318)`

In [None]:
# Mean relative error of the baseline predictions on the test split.
# The division is by the true value, so any zero in y_test yields inf/nan.
np.mean(np.abs(y_test.to_numpy() - y_pred_test) / y_test.to_numpy())

MRE of `avg_score`: 0.04

## Use SMOGN for over-sampling

In [None]:
# Training table for SMOGN: standardised features x0..x{n-1} plus the target y.
# The feature count comes from the data instead of a hard-coded 36, so the
# cell keeps working when the feature set changes.
n_features = X_train_stded.shape[1]
Xy_stded = pd.concat([pd.DataFrame(X_train_stded), y_train.reset_index(drop=True)], axis=1, ignore_index=True)
Xy_stded.columns = [*[f'x{i}' for i in range(n_features)], 'y']

In [None]:
# The same features-plus-target table as a plain ndarray (displayed only).
np.concatenate((X_train_stded, y_train.to_numpy().reshape(-1, 1)), axis=1)

In [None]:
# Inspect the table that is handed to SMOGN.
Xy_stded

In [None]:
# Over-sample the training table with smogn's `smoter`, with 'y' as the
# target column. NOTE(review): k=3 is presumably the neighbour count used for
# interpolation — confirm against the smogn documentation.
oversampled_Xy = smoter(
    data=Xy_stded,
    y='y',
    k=3
#     rel_xtrm_type='high'
)

In [None]:
# Split the over-sampled table back into a feature matrix and a target vector.
# Every column except 'y' is a feature; deriving the count from the table
# replaces the hard-coded 36, and reindex still enforces the x0..x{n-1} order.
feature_cols = [f'x{i}' for i in range(oversampled_Xy.shape[1] - 1)]
oversampled_X, oversampled_y = oversampled_Xy.reindex(feature_cols, axis=1).to_numpy(), oversampled_Xy['y'].to_numpy()

oversampled_X.shape, oversampled_y.shape

Oversample shape of `avg_score`:
- X: (2984, 36)
- y: (2984,)

In [None]:
# Gradient-boosting model trained on the SMOGN over-sampled data.
# `loss` is left at its default, which is squared error: the explicit alias
# 'ls' was removed in newer scikit-learn releases, while the default works on
# every version.
# NOTE: validation_fraction only takes effect when n_iter_no_change is set,
# so as written no early stopping happens.
new_gbr = GradientBoostingRegressor(n_estimators=5000, learning_rate=0.0002, validation_fraction=0.2, random_state=42, verbose=1)
new_gbr.fit(oversampled_X, oversampled_y)

In [None]:
# Test-set predictions of the model trained on over-sampled data.
y_pred_os = new_gbr.predict(X_test_stded)

In [None]:
# Mean absolute error on the test split (the baseline cells above report the
# median absolute error instead).
mean_absolute_error(y_test.to_numpy(), y_pred_os)

MAE of `y_pred_os` for `avg_score`: 3.6620754791688954

In [None]:
# R^2 of the over-sampled model on the test split.
new_gbr.score(X_test_stded, y_test.to_numpy())

r2 score of `new_gbr` for `avg_score`: 0.39

In [None]:
# Number of over-sampled-model predictions on each side of 30.
# NOTE(review): 30 is the number_of_registration threshold — confirm it
# matches the target currently held in y_train/y_test.
y_pred_os[y_pred_os <= 30].shape, y_pred_os[y_pred_os > 30].shape

Shape of `y_pred_os` for `avg_score`: (519, 435)

In [None]:
# Mean relative error of the over-sampled model on the test split.
# The division is by the true value, so any zero in y_test yields inf/nan.
np.mean(np.abs(y_test.to_numpy() - y_pred_os) / y_test.to_numpy())

MRE of `y_pred_os` for `avg_score`: 0.043480902545495526

In [None]:
# Predicted vs. true values, side by side, for the baseline model and the
# model trained on SMOGN-processed data.
with sns.axes_style('dark'):
    titles = ['Prediction with original data', 'Prediction with SMOGN processed data']
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=200)
    observed = y_test.to_numpy()

    for ax, title, y_pred in zip(axes, titles, (y_pred_test, y_pred_os)):
        sns.scatterplot(
            x=y_pred,
            y=observed,
            s=15,
            color='orange',
            linewidth=0.25,
            alpha=0.75,
            ax=ax
        )
        ax.set_title(title)
        ax.set_xlabel('y predict')
        ax.set_ylabel('y true')
        # White guides at 30 on both axes split the panel into quadrants.
        ax.axhline(y=30, color='white', linestyle='-')
        ax.axvline(x=30, color='white', linestyle='-')
        # Dashed diagonal: perfect prediction.
        ax.plot([0, 100], [0, 100], linestyle='--')

    fig.tight_layout()
    fig.savefig('result/img/num_of_reg_prediction.png', dpi='figure')

In [None]:
# Distinct (max, min, avg, std) score combinations.
score_rep.drop_duplicates()

In [None]:
# Project the four score columns to 2-D with t-SNE (PCA initialisation,
# seeded for reproducibility).
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne = TSNE(n_components=2, init='pca', random_state=42, perplexity=50, n_iter=5000)
dcomp_score_rep = tsne.fit_transform(score_rep.to_numpy())

In [None]:
# The 2-D embedding as a frame with columns x / y, indexed like score_rep.
dcomp_score_df = pd.DataFrame(
    {'x': dcomp_score_rep[:, 0], 'y': dcomp_score_rep[:, 1]},
    index=score_rep.index,
)

In [None]:
# Scatter plot of the 2-D t-SNE embedding of the score columns.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(4, 4), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
    
    sns.scatterplot(
        x=dcomp_score_rep[:, 0],
        y=dcomp_score_rep[:, 1],
        alpha=0.5,
        linewidth=0,
        s=5,
        ax=ax
    )

In [None]:
# Mean of each score column per distinct total prize; plotted as green dots
# in the next cell.
prz_score_df = cha_info.reindex(['total_prize', 'max_score', 'min_score', 'avg_score', 'std_score'], axis=1)
ps_mean = prz_score_df.groupby(by='total_prize').mean()
print(f'unique prize: {len(ps_mean)}')

In [None]:
# Total prize vs. each score column: raw points, per-prize means (green), a
# linear fit (orange) and a degree-2 polynomial fit (red).
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(2, 2, figsize=(8, 8), dpi=200)
    axes = list(axes.ravel())

    # Shared by every panel: the regressor input and the grid the fitted
    # curves are evaluated on.
    prize = cha_info['total_prize'].to_numpy().reshape(-1, 1)
    x_plot = np.linspace(0, 2500, 2500)  # x grid for the polynomial regression line
    x_grid = x_plot.reshape(-1, 1)

    for ax, col in zip(axes, ('max_score', 'min_score', 'avg_score', 'std_score')):
        target = cha_info[col].to_numpy()

        reg = LinearRegression()
        reg.fit(prize, target)
        y_line = reg.predict(x_grid)

        poly_reg = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
        poly_reg.fit(prize, target)
        y_curve = poly_reg.predict(x_grid)

        sns.scatterplot(
            data=cha_info,
            x='total_prize',
            y=col,
            s=8,
            alpha=0.5,
            linewidth=0,
            ax=ax
        )
        sns.scatterplot(
            x=ps_mean.index,
            y=ps_mean[col],
            color='green',
            linewidth=0.2,
            s=35,
            ax=ax,
        )
        for y_fit, colour in ((y_line, 'orange'), (y_curve, 'red')):
            sns.lineplot(
                x=x_plot,
                y=y_fit,
                linewidth=1.5,
                alpha=0.75,
                color=colour,
                ax=ax,
            )

        ax.set_title(' '.join(word.capitalize() for word in col.split('_')))

    fig.tight_layout()

In [None]:
# Mean registration count and submission ratio per distinct total prize;
# plotted as green dots in the next cell.
prz_reg_df = cha_info.reindex(['total_prize', 'number_of_registration', 'sub_reg_ratio'], axis=1)
pr_mean = prz_reg_df.groupby(by='total_prize').mean()

In [None]:
# Total prize vs. registration count and submission ratio: raw points, a
# linear fit (orange), a degree-3 polynomial fit (red) and per-prize means (green).
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=200)
    x_plot = np.linspace(0, 2500, 2500)
    x_grid = x_plot.reshape(-1, 1)
    prize = cha_info['total_prize'].to_numpy().reshape(-1, 1)

    for ax, col in zip(axes, ('number_of_registration', 'sub_reg_ratio')):
        target = cha_info[col].to_numpy()

        reg = LinearRegression()
        reg.fit(prize, target)
        y_line = reg.predict(x_grid)

        poly_reg = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
        poly_reg.fit(prize, target)
        y_curve = poly_reg.predict(x_grid)

        sns.scatterplot(
            data=cha_info,
            x='total_prize',
            y=col,
            s=8,
            alpha=0.5,
            linewidth=0,
            ax=ax
        )
        for y_fit, colour in ((y_line, 'orange'), (y_curve, 'red')):
            sns.lineplot(
                x=x_plot,
                y=y_fit,
                linewidth=1.5,
                alpha=0.75,
                color=colour,
                ax=ax,
            )
        sns.scatterplot(
            x=pr_mean.index,
            y=pr_mean[col],
            color='green',
            linewidth=0.2,
            s=35,
            ax=ax
        )

        ax.set_title(' '.join(word.capitalize() for word in col.split('_')))

    fig.tight_layout()

In [None]:
# Feature table for the score-vector model: five columns taken as-is from
# cha_info plus the subtrack category encoded as its integer category code.
X = cha_info.reindex([
    'number_of_platforms',
    'number_of_technologies',
    'project_id',
    'challenge_duration',
    'total_prize'
], axis=1).join(cha_info['subtrack_category'].cat.codes.rename('subtrack_category'))

In [None]:
# Multi-output target: the four score summary columns.
y = cha_info.reindex(['max_score', 'min_score', 'avg_score', 'std_score'], axis=1)

In [None]:
def mre(y_true, y_pred, sample_weight=None):
    """Mean relative error: the average of ``|y_true - y_pred| / |y_true|``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values with matching shapes. Lists are accepted.
    sample_weight : array-like of shape (n_samples,), optional
        Per-sample weights. When given, the relative errors of each sample
        (row) are averaged first and the rows are then combined as a weighted
        average. When omitted, all elements are averaged with equal weight.

    Returns
    -------
    float
        The (weighted) mean relative error. A zero in ``y_true`` produces
        ``inf`` (or ``nan`` for an exact 0/0) because the error is measured
        relative to the true value.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Divide by the magnitude of the target so negative targets do not yield
    # negative "errors" that cancel out in the mean.
    rel_err = np.atleast_1d(np.abs(y_true - y_pred) / np.abs(y_true))
    if sample_weight is None:
        return np.mean(rel_err)
    # One relative error per sample, so 1-D weights also work for 2-D targets.
    per_sample = rel_err.reshape(rel_err.shape[0], -1).mean(axis=1)
    return np.average(per_sample, weights=sample_weight)

In [None]:
# Seeded 80/20 split for the score-vector model.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which held the
# sub_reg_ratio split earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=42)

In [None]:
# model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
model = RandomForestRegressor()
model.fit(X_train, y_train)

In [None]:
# Predicted score vectors (one row per test challenge).
pred = model.predict(X_test)

In [None]:
def cosine_similarity(y_true, y_pred):
    """Cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n,)
        The two vectors to compare.

    Returns
    -------
    float
        ``dot(y_true, y_pred) / (||y_true|| * ||y_pred||)``. Returns ``nan``
        when either vector has zero norm, because the angle is undefined; this
        is the value the plain division produced, without the runtime warning.
    """
    denom = np.linalg.norm(y_true) * np.linalg.norm(y_pred)
    if denom == 0:
        return np.nan
    return np.dot(y_true, y_pred) / denom

In [None]:
# Cosine similarity between the true and the predicted score vector of each
# test challenge.
csim = np.array([cosine_similarity(yt, yp) for yt, yp in zip(y_test, pred)])

In [None]:
# Average similarity over the test set (1.0 would mean every predicted vector
# points in exactly the true direction).
csim.mean()

In [None]:
# Feature importances of the fitted forest, in the column order of X.
model.feature_importances_