In [None]:
""" KNN for pricing model prediction
    X: doc-vec appending metadata
    y: prize
"""

import os
import json
from collections import defaultdict

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tc_main import TopCoder

In [None]:
# Instantiate the project-local TopCoder helper imported from tc_main.
# NOTE(review): `topcoder` is not referenced by the other cells visible here - confirm it is still needed.
topcoder = TopCoder()

- Subtrack `code`: remove extreme prizes - a prize that is too high may be mislabeled.
- Subtrack `first-2-finish`: remove extreme prizes above 1000.

Handpick an F2F task - one that is overprized may be mislabeled.

```python
# Maps measure-file index -> long-format DataFrame with columns track / dimension / model / mmre.
knn_measures = {}

# Only the first three measure files (indices 0, 1, 2) are read here.
for i in range(3):
    with open(os.path.join(os.curdir, 'pricing_model_3', f'knn_pricing_model_measure_{i}.json')) as f:
        # Keep just the 'Mean_MRE' value per track and dimension; JSON keys are strings, so cast the dimension to int.
        knn_measure_dct = {track: {int(dimension): result['Mean_MRE'] for dimension, result in d.items()} for track, d in json.load(f).items()}
        
        # Flatten the nested {track: {dimension: mmre}} mapping into one record per (track, dimension).
        knn_measures[i] = pd.DataFrame([dict(track=track, dimension=dimension, model=i, mmre=mmre) for track, d in knn_measure_dct.items() for dimension, mmre in d.items()])
        
```

- Clean out prizes that are too high - keep only prize < 5000.

- **BERT to detect context representation.** 
- LSTM - softmax
  - fit the context by group of challenges / project of challenges | size of data

In [None]:
# Label for each pricing_model_3 measure file; position k labels knn_pricing_model_measure_{k}.json.
meta_data_type = ['all', 'subtrack', 'number_of_platform', 'number_of_technologies', 'challenge_duration']

# --- KNN (pricing_model_3) measures --------------------------------------------
# One record per (measure file, dimension), taken from the 'develop' entry of each file.
knn_measure_lst = []
for model, meta_data_name in enumerate(meta_data_type):
    knn_measure_path = os.path.join(os.curdir, 'pricing_model_3', f'knn_pricing_model_measure_{model}.json')
    with open(knn_measure_path) as f:
        mmre_by_dim = json.load(f)['develop']
    knn_measure_lst += [
        {'dimension': dim, 'mmre': mmre, 'meta_data': meta_data_name}
        for dim, mmre in mmre_by_dim.items()
    ]

knn_measure_df = pd.DataFrame(knn_measure_lst)

# --- pricing_model_0 measures ----------------------------------------------------
# One per-record measure frame for each dimension 100, 200, ..., 1000, indexed by its 'index' column.
pm0_dev_measure_dfs = {}
for dimension in range(100, 1100, 100):
    pm0_measure_path = os.path.join(
        os.curdir, 'pricing_model_0', 'develop_track', 'measures', f'measure_{dimension}D.json'
    )
    with open(pm0_measure_path) as f:
        pm0_dev_measure_dfs[dimension] = pd.read_json(f, orient='records').set_index('index')

# Collapse each frame to the mean of its 'MRE' column and tag every row as 'pricing_model_0'
# so the result shares the dimension / mmre / meta_data layout of knn_measure_df.
pm0_mmre = [
    {'dimension': pm0_dimension, 'mmre': pm0_measure_df['MRE'].mean()}
    for pm0_dimension, pm0_measure_df in pm0_dev_measure_dfs.items()
]
pm0_mmre_df = pd.DataFrame(pm0_mmre).assign(meta_data='pricing_model_0')
        

In [None]:
# Check the column dtypes of the combined KNN + pricing_model_0 frame after casting
# (dimension arrives as JSON string keys on the KNN side and as ints on the pricing_model_0 side).
(
    pd.concat([knn_measure_df, pm0_mmre_df], ignore_index=True)
    .astype({'dimension': int, 'mmre': float, 'meta_data': str})
    .dtypes
)

In [None]:
# Combined long-format frame: KNN measures followed by the pricing_model_0 series, with uniform dtypes.
mmre_plot_df = (
    pd.concat([knn_measure_df, pm0_mmre_df], ignore_index=True)
    .astype({'dimension': int, 'mmre': float, 'meta_data': str})
)
# Tick positions double as tick labels: 100, 200, ..., 1000.
dimension_ticks = list(range(100, 1100, 100))

with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(10, 5), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # One line per meta_data value; line width is also keyed on meta_data.
    # NOTE(review): the first and last widths are larger, presumably to emphasise 'all' and
    # 'pricing_model_0' - this relies on seaborn ordering the levels by appearance; confirm.
    sns.lineplot(
        x='dimension',
        y='mmre',
        hue='meta_data',
        size='meta_data',
        sizes=[2.75, 0.75, 0.75, 0.75, 0.75, 2.75],
        marker='o',
        markersize=4,
        data=mmre_plot_df,
        ax=ax,
    )

    # Rebuild the legend from seaborn's entries with a smaller font.
    legend_handles, legend_labels = ax.get_legend_handles_labels()
    ax.legend(legend_handles, legend_labels, prop={'size': 8})

    ax.set_xticks(dimension_ticks)
    ax.set_xticklabels(labels=dimension_ticks)

    ax.set(
        xlabel='Dimension',
        ylabel='Mean MRE',
        title='Pricing model 3 result - DEVELOP track',
    )

#     fig.savefig(os.path.join(os.pardir, os.pardir, 'presentation', 'presentation1', 'pm_models3_result.png'), dpi='figure')