In [None]:
""" Just a work bench"""
import os
import json
from typing import List
from pprint import pprint
from datetime import datetime

from dotenv import load_dotenv

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from transformers import BertTokenizer, AutoTokenizer, AutoConfig, TFDistilBertModel, TFBertModel, TFTrainingArguments
import tensorflow as tf
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
# from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
from sklearn.manifold import TSNE
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

from tc_data import TopCoder
from run_classification import build_dataset
from model_tcpm_distilbert import (
    TCPMDistilBertClassification,
    build_tcpm_model_distilbert_classification,
    build_tcpm_model_distilbert_regression
)

# Load environment variables via python-dotenv before any project helper runs.
load_dotenv()
# Allow up to 800 rows when pandas renders a frame in the notebook.
pd.set_option('display.max_rows', 800)


In [None]:
# Project-local helper imported from tc_data; later cells call it to fetch
# challenge info and meta-data features.
tc = TopCoder()

In [None]:
# Challenge table that every later cell slices columns from.
# NOTE(review): the filtering rules live in tc_data and are not visible here.
cha_info = tc.get_filtered_challenge_info()

In [None]:
# The four per-challenge score statistics, in a fixed column order.
score_rep = cha_info.reindex(
    columns=['max_score', 'min_score', 'avg_score', 'std_score']
)

In [None]:
# Meta-data features joined with the prize / outcome columns of cha_info.
data_df = (
    tc.get_meta_data_features(encoded_tech=True, softmax_tech=True, return_df=True)
    .join(
        cha_info.reindex(
            columns=['total_prize', 'avg_score', 'number_of_registration', 'sub_reg_ratio']
        )
    )
)


In [None]:
# Summary statistics of the prize and outcome columns (displayed as the cell output).
cha_info.reindex(['total_prize', 'avg_score', 'number_of_registration', 'sub_reg_ratio'], axis=1).describe()

In [None]:
# For each score statistic, compare how many rows carry the single most
# frequent value against how many rows carry any other value.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(4, 1, figsize=(8, 6), dpi=200)
    for i, col in enumerate(['max_score', 'min_score', 'avg_score', 'std_score']):
        ax = axes[i]
        # value_counts() already orders by descending frequency, so row 0 is
        # the most frequent value and no extra sort is needed.
        data = score_rep[col].value_counts()
        sns.barplot(
            x=[data.iloc[0], data.iloc[1:].sum()],
            # Label the first bar with the actual most frequent value instead
            # of a hard-coded '100', which mislabels any column whose mode is
            # a different number. ':g' renders 100.0 as '100'.
            y=[f'{data.index[0]:g}', 'the rest'],
            ax=ax
        )
#         ax.set_xlabel('Count of unique values')
#         ax.set_ylabel('Cat of values')
        ax.set_title(' '.join([w.capitalize() for w in col.split('_')]))
    fig.tight_layout()

In [None]:
# Show the distinct combinations of the four score statistics.
score_rep.drop_duplicates()

In [None]:
# Embed the score-statistics table into 2-D with t-SNE (PCA init, fixed seed).
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` for
# TSNE — confirm against the installed version before upgrading.
tsne = TSNE(n_components=2, init='pca', random_state=42, perplexity=50, n_iter=5000)
dcomp_score_rep = tsne.fit_transform(score_rep.to_numpy())

In [None]:
# Wrap the two embedding coordinates in a frame that shares score_rep's index.
dcomp_score_df = pd.DataFrame(
    {'x': dcomp_score_rep[:, 0], 'y': dcomp_score_rep[:, 1]},
    index=score_rep.index,
)

In [None]:
# Scatter of the 2-D t-SNE embedding; small translucent markers keep dense
# regions readable.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(4, 4), dpi=200)
    ax = fig.add_axes((0.1, 0.1, 0.8, 0.8))

    sns.scatterplot(
        x=dcomp_score_rep[:, 0], y=dcomp_score_rep[:, 1],
        s=5, alpha=0.5, linewidth=0,
        ax=ax,
    )

In [None]:
# Prize column plus the four score statistics, then the per-prize mean of each.
prz_score_df = cha_info.reindex(
    columns=['total_prize', 'max_score', 'min_score', 'avg_score', 'std_score']
)
ps_mean = prz_score_df.groupby('total_prize').mean()
print(f'unique prize: {len(ps_mean)}')

In [None]:
# One panel per score statistic: raw points, per-prize means (green), a linear
# fit (orange) and a degree-2 polynomial fit (red) against total prize.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(2, 2, figsize=(8, 8), dpi=200)
    axes = [*axes[0], *axes[1]]

    # Shared by every panel: the evaluation grid for the fitted curves and the
    # single-column design matrix built from the prize values.
    x_plot = np.linspace(0, 2500, 2500)
    x_grid = x_plot.reshape(-1, 1)
    prize = cha_info['total_prize'].to_numpy().reshape(-1, 1)

    for i, col in enumerate(('max_score', 'min_score', 'avg_score', 'std_score')):
        target = cha_info[col].to_numpy()

        # Straight-line fit.
        reg = LinearRegression()
        reg.fit(X=prize, y=target)
        y_line = reg.predict(x_grid)

        # Quadratic fit.
        poly_reg = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
        poly_reg.fit(X=prize, y=target)
        y_curve = poly_reg.predict(x_grid)

        ax = axes[i]
        sns.scatterplot(data=cha_info, x='total_prize', y=col, s=8, alpha=0.5, linewidth=0, ax=ax)
        sns.scatterplot(x=ps_mean.index, y=ps_mean[col], color='green', linewidth=0.2, s=35, ax=ax)
        sns.lineplot(x=x_plot, y=y_line, linewidth=1.5, alpha=0.75, color='orange', ax=ax)
        sns.lineplot(x=x_plot, y=y_curve, linewidth=1.5, alpha=0.75, color='red', ax=ax)

        ax.set_title(col.replace('_', ' ').title())

    fig.tight_layout()

In [None]:
# Prize column plus the two registration outcomes, then their per-prize means.
prz_reg_df = cha_info.reindex(
    columns=['total_prize', 'number_of_registration', 'sub_reg_ratio']
)
pr_mean = prz_reg_df.groupby('total_prize').mean()

In [None]:
# One panel per registration outcome: raw points, a linear fit (orange), a
# degree-3 polynomial fit (red) and the per-prize means (green).
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(1, 2, figsize=(8, 4), dpi=200)
    x_plot = np.linspace(0, 2500, 2500)

    # Reused by both panels.
    x_grid = x_plot.reshape(-1, 1)
    prize = cha_info['total_prize'].to_numpy().reshape(-1, 1)

    for i, col in enumerate(('number_of_registration', 'sub_reg_ratio')):
        target = cha_info[col].to_numpy()

        # Straight-line fit.
        reg = LinearRegression()
        reg.fit(X=prize, y=target)
        y_line = reg.predict(x_grid)

        # Cubic fit.
        poly_reg = make_pipeline(PolynomialFeatures(degree=3), LinearRegression())
        poly_reg.fit(X=prize, y=target)
        y_curve = poly_reg.predict(x_grid)

        ax = axes[i]
        sns.scatterplot(data=cha_info, x='total_prize', y=col, s=8, alpha=0.5, linewidth=0, ax=ax)
        sns.lineplot(x=x_plot, y=y_line, linewidth=1.5, alpha=0.75, color='orange', ax=ax)
        sns.lineplot(x=x_plot, y=y_curve, linewidth=1.5, alpha=0.75, color='red', ax=ax)
        sns.scatterplot(x=pr_mean.index, y=pr_mean[col], color='green', linewidth=0.2, s=35, ax=ax)

        ax.set_title(col.replace('_', ' ').title())

    fig.tight_layout()

In [None]:
# Feature matrix: five columns taken as-is from cha_info plus the integer
# category codes of `subtrack_category`.
X = cha_info.reindex(
    columns=[
        'number_of_platforms',
        'number_of_technologies',
        'project_id',
        'challenge_duration',
        'total_prize',
    ]
).join(
    cha_info['subtrack_category'].cat.codes.rename('subtrack_category')
)

In [None]:
# Multi-output regression target: the four score statistics per row.
y = cha_info.reindex(['max_score', 'min_score', 'avg_score', 'std_score'], axis=1)

In [None]:
def mre(y_true, y_pred, sample_weight=None):
    """Mean relative error: mean of ``|y_true - y_pred| / |y_true|``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values, 1-D or ``(n_samples, n_outputs)``.
    y_pred : array-like
        Predictions with the same shape as ``y_true``.
    sample_weight : array-like of shape (n_samples,), optional
        Per-sample weights. When given, the errors are weight-averaged over
        the sample axis and then averaged uniformly over the outputs.

    Returns
    -------
    float
        The (weighted) mean relative error. Entries where ``y_true`` is 0
        divide by zero and yield ``inf``/``nan``; filter them beforehand if
        the target can be zero.
    """
    # Coerce to arrays so plain lists work and the result is always a scalar.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Divide by the magnitude of the target so negative targets cannot
    # produce a negative "error".
    rel_err = np.abs(y_true - y_pred) / np.abs(y_true)
    if sample_weight is None:
        return np.mean(rel_err)
    # Weighted average over the sample axis, then a uniform mean over outputs.
    return np.mean(np.average(rel_err, axis=0, weights=sample_weight))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), test_size=0.2, random_state=42)

In [None]:
# model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
# Seed the forest so the fitted model, its predictions and its feature
# importances are repeatable across runs, matching the random_state=42 used
# for the t-SNE embedding and the train/test split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predictions of the fitted model on the held-out rows.
pred = model.predict(X_test)

In [None]:
def cosine_similarity(y_true, y_pred):
    """Cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    y_true, y_pred : array-like
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(y_true, y_pred) / (||y_true|| * ||y_pred||)``. When either
        vector has zero norm the similarity is undefined; 0.0 is returned
        instead of NaN so that one degenerate row cannot turn an aggregate
        such as a mean into NaN.
    """
    norm_product = np.linalg.norm(y_true) * np.linalg.norm(y_pred)
    if norm_product == 0:
        return 0.0
    return np.dot(y_true, y_pred) / norm_product

In [None]:
# Row-wise cosine similarity between each true score vector and its prediction.
csim = np.array(list(map(cosine_similarity, y_test, pred)))

In [None]:
# Average cosine similarity over the held-out rows (displayed as the cell output).
csim.mean()

In [None]:
# Importance scores of the fitted model, one per column of X in column order.
model.feature_importances_