In [None]:
import os
import re
import json
import pickle

import pandas as pd
import numpy as np
from collections import defaultdict

import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.transforms as mtrans
import seaborn as sns

from sklearn.linear_model import LinearRegression

# Let pandas show up to 150 rows of a frame/series before it truncates the display.
pd.set_option('display.max_rows', 150)

In [None]:
# Artifacts of pricing model 0; every container is addressed as container[track][dimension].
measure_dfs = defaultdict(dict)
ml_models = defaultdict(dict)
document_vec = defaultdict(dict)

for track in ('all', 'develop', 'design'):
    # Each track keeps its artifacts under ./pricing_model_0/<track>_track/.
    track_dir = os.path.join(os.curdir, 'pricing_model_0', f'{track}_track')

    for dimension in range(100, 1100, 100):
        measure_path = os.path.join(track_dir, 'measures', f'measure_{dimension}D.json')
        model_path = os.path.join(track_dir, 'ml_models', f'ml_model_{dimension}D')
        vector_path = os.path.join(track_dir, 'document_vec', f'document_vec_{dimension}D.json')

        # Measure table: JSON records, re-indexed by their 'index' field.
        with open(measure_path) as f:
            measure_dfs[track][dimension] = pd.read_json(f, orient='records').set_index('index')

        # NOTE(review): pickle.load can execute arbitrary code embedded in the file;
        # only point this at model files that come from a trusted source.
        with open(model_path, 'rb') as f:
            ml_models[track][dimension] = pickle.load(f)

        # Document vectors are kept as the plain parsed JSON structure.
        with open(vector_path) as f:
            document_vec[track][dimension] = json.load(f)


In [None]:
# Inspect the unpickled model object stored for the 'all' track at 200 dimensions.
ml_models['all'][200]

Duration - vector
Type
Technology -> numeric



In [None]:
# Display the measure table loaded for the 'all' track at 200 dimensions.
measure_dfs['all'][200]

In [None]:
# (method label, error column of the measure table) pairs that get averaged below.
mre_columns = (('similarity', 'MRE'), ('linear_regression', 'LR_MRE'))

# One record per (track, dimension, method) holding the mean of that method's error column.
MMRE = []
for track in ('all', 'develop', 'design'):
    for dimension in range(100, 1100, 100):
        measures = measure_dfs[track][dimension]
        for method, column in mre_columns:
            MMRE.append({
                'track': track,
                'dimension': dimension,
                'mmre': measures[column].mean(),
                'method': method,
            })

mmre_df = pd.DataFrame(MMRE)

In [None]:
# Show the MMRE summary table (one row per track, dimension and method).
mmre_df

In [None]:
# MMRE of the similarity method for every track, as a function of document-vector size.
with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 3), dpi=200)
    ax = fig.add_axes([0.15, 0.15, 0.7, 0.7])

    sns.lineplot(
        data=mmre_df.loc[mmre_df.method == 'similarity'],
        x='dimension',
        y='mmre',
        size='track',
        hue='track',
        style='track',
        style_order=['develop', 'all', 'design'],
        # Widths are mapped by track name. A positional list would follow the
        # order in which tracks appear in the data (not `style_order`), which
        # would put the thick line on the wrong track.
        sizes={'develop': 0.75, 'all': 2, 'design': 0.75},
        markers=['o'] * 3,
        markersize=2.8,
        ax=ax
    )

    ax.set_ylim(0, 6)
    # One tick per trained dimensionality (100, 200, ..., 1000).
    ax.set_xticks(list(range(100, 1100, 100)))
    ax.set_xticklabels(labels=list(range(100, 1100, 100)))

    ax.set_xlabel('Dimensionality of document vectors')
    ax.set_ylabel('Mean MRE')
    ax.set_title('Pricing model 0 accuracy by MMRE and vector size')

# Export is disabled; uncomment to write the figure into the presentation folder.
#     fig.savefig(os.path.join(os.pardir, os.pardir, 'presentation', 'presentation1', 'pm0_summary.png'), dpi='figure')

In [None]:

# One panel per track, comparing the MMRE of both methods across vector sizes.
with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(9, 8), dpi=200)

    # Tick positions double as tick labels: one per trained dimensionality.
    dimension_ticks = list(range(100, 1100, 100))

    for idx, track in enumerate(mmre_df['track'].unique()):
        ax = axes[idx]
        track_rows = mmre_df.loc[mmre_df['track'] == track]

        sns.lineplot(
            data=track_rows,
            x='dimension',
            y='mmre',
            hue='method',
            style='method',
            marker='o',
            markersize=4,
            ax=ax,
        )

        ax.set_title(f'MMRE of {track} challenge data set')
        ax.set_xticks(dimension_ticks)
        ax.set_xticklabels(labels=dimension_ticks)

    fig.tight_layout()

    # Written next to the notebook, at the figure's own dpi.
    fig.savefig('MMRE by dimensionality.png', dpi='figure')

In [None]:
def _mre_by_dimension(track):
    """Return a frame whose columns are the MRE series of `track`, keyed by vector dimensionality."""
    return pd.DataFrame.from_dict(
        {dimension: measure_dfs[track][dimension]['MRE'] for dimension in range(100, 1100, 100)}
    )


mre_by_dimension_all = _mre_by_dimension('all')
mre_by_dimension_dev = _mre_by_dimension('develop')
mre_by_dimension_des = _mre_by_dimension('design')

In [None]:
# List the columns available in a measure table (here: 'all' track, 100 dimensions).
measure_dfs['all'][100].columns

In [None]:
# Summary statistics of MRE per dimensionality for the 'all' track, rounded to 3 decimals.
mre_by_dimension_all.describe().round(decimals=3)

In [None]:
# Histogram of total prize (capped at `max_prize`) with the count printed above each bar.
fig = plt.figure(figsize=(11, 6), dpi=200)

max_prize = 5000
interval = 100

# The same edges drive both the bins and the x ticks, so every bar spans exactly
# one tick interval. An integer bin count would instead split the data's own
# [min, max] range and leave the bars misaligned with the ticks.
bin_edges = list(range(0, max_prize + 1, interval))

total_prize = measure_dfs['all'][100].total_prize
prizes_in_range = total_prize.loc[total_prize <= max_prize]

with sns.axes_style('dark', {'xtick.bottom': True}):
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # Plain matplotlib histogram (raw counts); replaces the deprecated sns.distplot
    # call that was used with kde=False, rug=False. alpha matches distplot's default.
    ax.hist(prizes_in_range, bins=bin_edges, alpha=0.4)

    ax.set_xticks(bin_edges)
    ax.set_xticklabels(labels=bin_edges, rotation=315, rotation_mode='anchor', ha='left')

    ax.set_xlabel('Total prize')
    ax.set_ylabel('Number of challenges')
    ax.set_title(f'Distribution of total prize (<= {max_prize}) in bins of {interval}')

    # Label every non-empty bar with its count, centred just above the bar.
    for hist in ax.patches:
        count = int(hist.get_height())
        if count == 0:
            continue
        x = hist.get_x() + hist.get_width() / 2
        y = hist.get_height()
        ax.annotate(
            f'{count}',
            xy=(x, y),
            xytext=(0, 3),
            horizontalalignment='center',
            verticalalignment='bottom',
            textcoords='offset points'
        )

The prize frequency distribution can reflect the sub-tracks within a track.

- similarity in the same **sub-track**
- Take all metadata that impact the challenge complexity into consideration
- Meta data difference/distance between 2 challenges.
- BERT? -> Pair up the challenges to increase the size of the data set

---

Given a new challenge,

- X: Construct multi-dimensional vectors -> `word2vec` dimensions + **metadata features**
- y: the difference between the new challenge and the historical challenges

Readability analysis

> Dynamic pricing?  
> Track the activity in the community - 

Focus on the accuracy of the prediction model. Build new ML models.

1. Model variation -> predict the number of submissions & predict the number of registrations
    - multi-task learning

In [None]:
# Box plot of the MRE distribution ('all' track) for every trained word-vector dimensionality.
fig = plt.figure(figsize=(11, 6), dpi=200)

with sns.axes_style('darkgrid'):
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # Two-colour palette: boxes and outliers use 'primary', the mean line 'secondary'.
    colors = dict(primary='#E93C4F', secondary='#FFC24A')

    meanlineprops = dict(linestyle='--', linewidth=0.5, color=colors['secondary'])
    flierprops = dict(
        marker='o',
        markerfacecolor=colors['primary'],
        markeredgewidth=0.5,
        markersize=2.5,
    )

    sns.boxplot(
        data=mre_by_dimension_all,
        color=colors['primary'],
        width=0.618,
        linewidth=0.8,
        # Draw the mean as a dashed line across each box.
        showmeans=True,
        meanline=True,
        meanprops=meanlineprops,
        flierprops=flierprops,
        ax=ax,
    )

    # Fixed y-range, shown on a logarithmic axis.
    ax.set_ylim((10**-2, 10**3.5))
    ax.set_yscale('log')

    ax.set_xlabel('Trained word vectors dimensionality')
    ax.set_ylabel('MRE')
    ax.set_title('Distribution of challenges MRE by trained word vector dimensionality')

fig.savefig('MRE_distro_by_dimension.png', dpi='figure')