In [None]:
""" KNN for pricing model prediction
    X: doc-vec appending metadata
    y: prize
"""

import os
import json
from collections import defaultdict

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from tc_main import TopCoder

In [None]:
# Instantiate the project's TopCoder helper; later cells read its
# `challenge_basic_info` table for filtering and plotting.
topcoder = TopCoder()

In [None]:
# Keep DEVELOP-track challenges whose total prize is below 5000.
challenge_info = topcoder.challenge_basic_info
is_below_prize_cap = challenge_info.total_prize < 5000
is_develop_track = challenge_info.track == 'DEVELOP'
dev_df = challenge_info.loc[is_below_prize_cap & is_develop_track]

# The five most frequent subtracks among those challenges, most frequent first.
subtrack_counts = dev_df.subtrack.value_counts()
idices = list(subtrack_counts.head(5).index)

In [None]:
# Show how many of the filtered DEVELOP challenges fall in each subtrack.
dev_df['subtrack'].value_counts()

In [None]:

# Violin plot of total prize per subtrack, restricted to the subtracks listed
# in `idices` and to prizes strictly between 0 and 5000.
#
# The rows to plot are selected once and `x` is given as a column name, so the
# x and y values always come from exactly the same rows (no reliance on
# seaborn aligning a separately filtered Series with `data` by index).
challenge_info = topcoder.challenge_basic_info
violin_df = (
    challenge_info
    .loc[
        (0 < challenge_info.total_prize)
        & (challenge_info.total_prize < 5000)
        & (challenge_info.subtrack.isin(idices))
    ]
    # Same cast as before: plot the subtrack as plain strings.
    .assign(subtrack=lambda df: df.subtrack.astype('string'))
)

with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(6, 6), dpi=200)
    ax = fig.add_axes([0.2, 0.2, 0.8, 0.8])

    sns.violinplot(
        data=violin_df,
        x='subtrack',
        y='total_prize',
        linewidth=0.8,
        ax=ax
    )
    sns.despine(ax=ax)

    # Break the tick labels on underscores so long subtrack names wrap.
    labels = ['\n'.join(i.get_text().lower().split('_')) for i in ax.get_xticklabels()]
    # Pin the tick positions before relabelling; `set_xticklabels` is only
    # reliable once the ticks are fixed.
    ax.set_xticks(ax.get_xticks())
    ax.set_xticklabels(labels=labels)
    

- the subtrack `code`: remove extreme prizes - a challenge whose prize is too high may be mislabeled.
- the subtrack `first-2-finish`: remove extreme prizes above 1000

Hand-pick an F2F task - one that is overpriced may be mislabeled

The KNN model results are stored in files named with the pattern `knn_pricing_model_measure_*.json`. For now, there are 3 different results, depending on how the training vectors are constructed.

- 0: The document vector from `pricing_model_0` is used directly as input `X`
- 1: The document vector from `pricing_model_0` appending the vector [`number_of_platforms`, `number_of_technologies`, `challenge_duration`] is used as input `X`
- 2: The document vector from `pricing_model_0` appending the vector [`track`, `subtrack`, `number_of_platforms`, `number_of_technologies`, `challenge_duration`] is used as input `X`

In [None]:
# Load the stored KNN evaluation results, one JSON file per model variant
# (0, 1, 2), into long-form DataFrames keyed by model index.
# Each file maps track -> dimension -> result; only `Mean_MRE` is kept.
knn_measures = {}

for model_idx in range(3):
    measure_path = os.path.join(os.curdir, 'pricing_model_3', f'knn_pricing_model_measure_{model_idx}.json')
    with open(measure_path) as measure_file:
        raw_measures = json.load(measure_file)

    # One record per (track, dimension) pair; JSON keys are strings, so the
    # dimension is converted to int.
    records = [
        dict(track=track, dimension=int(dimension), model=model_idx, mmre=result['Mean_MRE'])
        for track, per_dimension in raw_measures.items()
        for dimension, result in per_dimension.items()
    ]
    knn_measures[model_idx] = pd.DataFrame(records)
        
        

In [None]:
# Plot MMRE against `dimension` for every model variant, one panel per track.
tracks = ('all', 'develop', 'design')

with sns.axes_style('darkgrid'):
    fig, axes = plt.subplots(len(tracks), 1, figsize=(9, 9), dpi=200)

    for ax, track in zip(axes, tracks):
        # Stack the rows of this track from every model's result table.
        track_df = pd.concat([df.loc[df.track == track] for df in knn_measures.values()])

        sns.lineplot(
            data=track_df,
            x='dimension',
            y='mmre',
            hue='model',
            style='model',
            palette='deep',
            # Same circle marker for every model; one entry per loaded model
            # so the list stays long enough if more variants are added.
            markers=['o'] * len(knn_measures),
            markersize=4,
            ax=ax
        )

        ax.set_title(f'KNN model training results - {track.upper()} track')

    fig.tight_layout()

- Clean the prizes that are too high - keep only prize < 5000

- **BERT to detect context representation.**
- LSTM - softmax
  - fit the context by group of challenges / project of challenges | size of data