In [None]:
""" This notebook is the formal place to train and analyze the word embedding data
    (some ugly code temporarily lives here, of course - but it will be cleaned up eventually!)
"""
import os
import re
import json
import itertools
from collections import defaultdict
from pprint import pprint
from datetime import datetime

import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

from gensim.models import KeyedVectors

from tc_main import TopCoder
from tc_pricing_models import train_word2vec_model, reduce_wv_dimensions, plot_word2vec, cosine_similarity, doc_vector_from_word_vectors

# Raise pandas' display limits so that long/wide frames are shown in cell outputs.
for _option_name, _option_value in (('display.max_rows', 300), ('display.width', 1000)):
    pd.set_option(_option_name, _option_value)

# Location of the saved word vectors, relative to the current directory:
# pricing_model_0/all_track/models/model_200D
_trained_wv_path_parts = ('pricing_model_0', 'all_track', 'models', 'model_200D')
TRAINED_WV_PATH = os.path.join(os.curdir, *_trained_wv_path_parts)

In [None]:
# Instantiate the project's TopCoder helper; later cells read the corpus and challenge data from it.
topcoder = TopCoder()

In [None]:
# Display the rows of the corpus section similarity table whose `score` and
# `freq` both reach their thresholds (0.5 and 0.75 respectively).
sec_sim = topcoder.corpus_section_similarity
score_is_high = sec_sim.score >= 0.5
freq_is_high = sec_sim.freq >= 0.75
sec_sim.loc[score_is_high & freq_is_high]

In [None]:
# Summary statistics of the challenges' registration start dates.
topcoder.challenge_basic_info.registration_start_date.describe()

In [None]:
# `trained_wv` comes from exactly one of the two statements below:
# either train a fresh model or load the vectors saved at TRAINED_WV_PATH.

# Un-comment the following line if training a new model
# trained_wv = train_word2vec_model(sentences=topcoder.corpus.get_challenge_req_sentences(), size=200)

# Un-comment the following line if using a trained model
trained_wv = KeyedVectors.load(TRAINED_WV_PATH)

In [None]:
# Requirement text of every challenge, as a DataFrame
challenge_req = topcoder.corpus.get_challenge_req_sentences(as_dataframe=True)

# Drop the challenges whose requirement text is the empty string
has_requirements = challenge_req.requirements != ''
cleaned_challenge_req = challenge_req.loc[has_requirements]

# Vector representation of each remaining challenge, keyed by the frame's index label
req_by_cha_id = cleaned_challenge_req.to_dict(orient='index')
cha_vec_dct = {
    cha_id: doc_vector_from_word_vectors(row['requirements'], trained_wv)
    for cha_id, row in req_by_cha_id.items()
}

# A result that is not an ndarray comes from non-empty requirements
# which don't contain any meaningful words
zero_vec = {
    cha_id: vec
    for cha_id, vec in cha_vec_dct.items()
    if not isinstance(vec, np.ndarray)
}

# Everything else is a usable document vector
cleaned_cha_vec_dct = {
    cha_id: vec
    for cha_id, vec in cha_vec_dct.items()
    if isinstance(vec, np.ndarray)
}

While building the challenges' document vectors (`cha_vec_dct`) and calculating the cosine similarity of every pair of document vectors, I encountered a warning from `numpy` indicating that there are _invalid values in true_divide_. This usually means that a _zero divided by zero_ is happening somewhere during the calculation.

I went back to the challenge requirement corpus (`challenge_req` DataFrame) and found that there are _empty corpora_ (`''`) and corpora with no meaningful word in them (`any(word for word in corpus if word in trained_wv.vocab) == False`). The results of the document vector calculation for these corpora are all **integer 0**, instead of a 200-dimension row vector.

Cosine similarity is calculated as the dot product of the two vectors divided by the product of their norms:

```python
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """ Cosine similarity."""
    return np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a) * np.linalg.norm(vec_b))
```

With this formula, a zero document vector in `cha_vec_dct` makes the denominator zero, so the resulting similarity is `NaN`.

In [None]:
# Challenges dropped for having empty requirements, followed by the ones
# whose document vector turned out not to be an ndarray.
empty_req_ids = set(challenge_req.index) - set(cleaned_challenge_req.index)
removed_challenge_ids = list(empty_req_ids) + list(zero_vec)
print(f'Removed {len(removed_challenge_ids)} challenges which produce invalid vectors')

In [None]:
# Show the sectioned requirements of the removed challenges; the challenge id
# is matched against the second level of the frame's index.
sec_req = topcoder.corpus.sectioned_requirements
is_removed_challenge = sec_req.index.get_level_values(1).isin(removed_challenge_ids)
sec_req.loc[is_removed_challenge]

> The DataFrame above shows the challenge requirement corpora that are empty or don't have any meaningful content

---

Instead of directly sorting & selecting from the DOK (dictionary of keys), I build a DataFrame from it, as the pandas implementation outperforms Python's built-in `dict` A LOT

In [None]:
# Calculate the cosine similarity for every unordered pair of challenges
# (self-pairs included) and store it in a DOK-format dictionary keyed by
# (challenge_a, challenge_b).
# The (id, vector) items of `cleaned_cha_vec_dct` are paired directly, so the
# ids and the vectors always come from the same mapping instead of iterating
# one dictionary and looking the vectors up in another.
cha_cos_sim_dok = {
    (cha_a, cha_b): cosine_similarity(vec_a, vec_b)
    for (cha_a, vec_a), (cha_b, vec_b) in itertools.combinations_with_replacement(cleaned_cha_vec_dct.items(), 2)
}

In [None]:
# Turn the DOK into a DataFrame to take advantage of pandas' performance:
# the pair keys become a two-level index ('l0', 'l1') and the values fill
# the single 'similarity' column.
pair_index = pd.MultiIndex.from_tuples(list(cha_cos_sim_dok.keys()), names=['l0', 'l1'])
cha_cos_sim_df = pd.DataFrame({'similarity': list(cha_cos_sim_dok.values())}, index=pair_index)

In [None]:
# challenge_estimated_prize = {}
# challenge_actual_prize = topcoder.challenge_prize_avg_score.total_prize

# for cha_id in cleaned_cha_vec_dct:
#     all_cha_sim = cha_cos_sim_df.loc[(cha_cos_sim_df.index.get_level_values(0) == cha_id) | (cha_cos_sim_df.index.get_level_values(1) == cha_id)]
#     all_cha_sim.index = all_cha_sim.index.map(lambda ids: ids[0] if ids[0] != cha_id else ids[1])
#     top10_most_similar_cha = all_cha_sim.similarity.sort_values(ascending=False).iloc[1: 11].index
    
#     challenge_estimated_prize[cha_id] = challenge_actual_prize[challenge_actual_prize.index.isin(top10_most_similar_cha)].mean()
    

In [None]:
# cha_est_prz_df = pd.DataFrame.from_dict(challenge_estimated_prize, orient='index')
# cha_est_prz_df.columns = ['estimated_total_prize']

In [None]:
# pricing_model_measurement_df = cha_est_prz_df.join(challenge_actual_prize)
# pricing_model_measurement_df = pricing_model_measurement_df.loc[pricing_model_measurement_df.total_prize != 0]
# pricing_model_measurement_df['MRE'] = (pricing_model_measurement_df.total_prize - pricing_model_measurement_df.estimated_total_prize).abs() / pricing_model_measurement_df.total_prize

# pricing_model_measurement_df.MRE.mean()

In [None]:
# with open('test.json', 'w') as f:
#     pricing_model_measurement_df.reset_index().to_json(f, orient='records', indent=4, index=True)

In [None]:
# s = cha_cos_sim_df.loc[(cha_cos_sim_df.index.get_level_values(0) == 30121730) | (cha_cos_sim_df.index.get_level_values(1) == 30121730)]
# s.index = s.index.map(lambda cid_pair: cid_pair[0] if cid_pair[0] != 30121730 else cid_pair[1])
# s.similarity.sort_values(ascending=False).iloc[1:11].index

## Pricing model 0

Based on a text mining and analogy estimation approach

- ✅ calculate Document2Vector of challenges
- ✅ calculate similarity between each pair of challenges
- ✅ for each task
  1. select the 10 most similar tasks
  2. pricing strategies:
      - use the average prize of the 10 tasks as the estimated prize of the given task
      - use mid prize of ...
  3. calculate the estimation error (MRE, magnitude of relative error) based on the actual prize
  4. repeat
- ✅ on the entire dataset, calculate the mean MRE -> see how big it is, **this is the measure of accuracy of the model**

The pricing model 0's mean MRE is **5.248539251177451**. Tragic 🤦🏻‍♂️

## Pricing model 1

Once we get the MRE of all tasks, we can use any machine learning technique to analyze the error, e.g. logistic regression

Each ML approach produces one model

## Pricing model 2

Take the meta data of challenges/projects into consideration

- Types of challenges (aggregation & sub-model)

## BERT

> Nice to have application  
> Not only give an estimated price, but also reasons related to the recommended prize considering the dynamic context.

Take a specific vector space, encode the hidden context, find the hidden factors of uncertainty of a given challenge.

In [None]:
# Load the per-start-date technology data from data/tech_by_start_date.json.
# The encoding is given explicitly so that decoding the JSON file does not
# depend on the platform's default locale.
with open(os.path.join(os.curdir, 'data', 'tech_by_start_date.json'), encoding='utf-8') as f:
    tech_by_start_dt = json.load(f)

# One row per top-level key (the start date), one column per nested key;
# missing entries become 0 and the 'other' column is discarded.
df_tech_by_dt = pd.DataFrame.from_dict(tech_by_start_dt, orient='index').fillna(0).astype(int).drop(columns='other')
df_tech_by_dt.index = pd.to_datetime(df_tech_by_dt.index)

# Re-assign rather than sort in place, so the statement reads as a plain transformation.
df_tech_by_dt = df_tech_by_dt.sort_index()

In [None]:
# Inspect the column labels; a later cell filters them into the list of technology names.
df_tech_by_dt.columns

In [None]:
# label_lst = ['java', 'python', 'javascript', 'php', 'mysql', 'api', 'design', 'prototype', 'ui', 'data', 'science']
# Reduce the dimensions of the trained word vectors with the project helper.
# NOTE(review): presumably yields a frame with 'word', 'x' and 'y' columns, as later cells use them - confirm.
wv_2D = reduce_wv_dimensions(trained_wv)

# plot_word2vec(wv_2D, list(df_tech_by_dt.columns))

In [None]:
# Keep only the column labels that are 3 to 19 characters long, then flag the
# words of the reduced vectors that appear in that list.
tech_lst = [tech for tech in df_tech_by_dt.columns if len(tech) > 2 and len(tech) < 20]
wv_2D['is_tech'] = wv_2D['word'].isin(tech_lst)

In [None]:
fig = plt.figure(figsize=(8, 8), dpi=200)

# Scatter of the reduced word vectors, coloured by the `is_tech` flag
# (two-colour palette: grey and red).
scatter_kwargs = dict(
    x='x',
    y='y',
    hue='is_tech',
    palette=['#8a8a8a', '#FF0000'],
    size=1,
    alpha=0.5,
    linewidth=0.2,
)

with sns.axes_style('darkgrid'):
    ax = fig.add_axes([0.2, 0.2, 0.8, 0.8])
    sns.scatterplot(data=wv_2D, ax=ax, **scatter_kwargs)

    ax.set_title('All track 200D word vectors after dimension reduction')

    # Title, axis labels and ticks are all rendered in white.
    for text_artist in (ax.title, ax.xaxis.label, ax.yaxis.label):
        text_artist.set_color('white')
    ax.tick_params(axis='both', colors='white')

    # Rebuild the legend without its last entry.
    # NOTE(review): presumably that entry comes from the constant `size=1` - confirm.
    legend_handles, legend_labels = ax.get_legend_handles_labels()
    ax.legend(legend_handles[:-1], legend_labels[:-1], prop={'size': 8})
    