In [None]:
""" Experiment script for pairing develop challenges"""
import os
import json
import itertools
import random
from math import gcd
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# from tc_main import TopCoder
from doc_pair_training_data import CHALLENGE_ID_COMBINATION, DATA_PATH, TOPCODER, FILTERED_CHALLENGE_INFO, SUBTRACK_COMB, TECH_COMB, SUBTRACK_DEDUCTED_COMB, TECH_CAT_COMB, render_vector

# Notebook-wide display settings: show up to 500 rows and render floats with three decimals.
pd.options.display.max_rows = 500
pd.options.display.float_format = '{:.3f}'.format

In [None]:
# Keep an independent copy of the `subtrack` attribute of FILTERED_CHALLENGE_INFO
# (presumably a pandas column of per-challenge subtrack names — confirm in doc_pair_training_data).
subtrack_sr = FILTERED_CHALLENGE_INFO.subtrack.copy()

In [None]:
# Give every distinct subtrack an integer code, numbered in the order `unique()` returns them.
unique_subtracks = FILTERED_CHALLENGE_INFO.subtrack.unique()
subtrack_dct = dict(zip(unique_subtracks, range(len(unique_subtracks))))
subtrack_dct

In [None]:
# Translate each subtrack to its integer code from subtrack_dct, then count occurrences per code.
subtrack_codes = FILTERED_CHALLENGE_INFO.subtrack.map(lambda name: subtrack_dct[name])
subtrack_codes.value_counts()

In [None]:
# Labels of the five most frequent subtracks, most common first.
subtrack_freq = subtrack_sr.value_counts().sort_values(ascending=False)
top5_sub_track = subtrack_freq.index[:5].tolist()

In [None]:
# Load the per-challenge technology lists from ./data/tech_by_challenge.json.
tech_json_path = os.path.join(os.curdir, 'data', 'tech_by_challenge.json')
with open(tech_json_path) as fp:
    tech_by_cha = json.load(fp)

# Tally how often each lower-cased technology name occurs among the challenges present in
# FILTERED_CHALLENGE_INFO; any name containing "angular" is folded into the 'angularjs' bucket.
tech_count = defaultdict(int)
for record in tech_by_cha:
    if record['challenge_id'] not in FILTERED_CHALLENGE_INFO.index:
        continue
    for raw_name in record['tech_lst']:
        name = raw_name.lower()
        tech_count['angularjs' if 'angular' in name else name] += 1

In [None]:
# Tabulate the tallies as (tech_name, tech_count) rows, most frequent first,
# and leave out the catch-all 'other' label.
tech_count_df = (
    pd.Series(tech_count)
    .sort_values(ascending=False)
    .rename_axis('tech_name')
    .reset_index(name='tech_count')
    .loc[lambda frame: frame['tech_name'] != 'other']
)

In [None]:
# Preview the 30 most frequent technologies.
tech_count_df.iloc[:30]

In [None]:
# Hand-assigned groupings of technology keywords. A keyword may sit in several groups
# (e.g. 'swift' is listed under front end, back end and programming language).
front_end = (
    'javascript', 'angularjs', 'css', 'html', 'reactjs', 'html5',
    'jquery', 'swift', 'bootstrap', 'jsp', 'ajax',
)
back_end = (
    'node.js', 'java', 'swift', 'c#', 'spring', 'apex', 'python',
)
database = (
    'postgresql', 'mongodb', 'sql', 'sql server',
)
framework_library = (
    'angularjs', 'reactjs', '.net', 'jquery', 'spring', 'bootstrap', 'jsp',
)
programming_language = (
    'javascript', 'java', 'swift', 'c#', 'apex', 'python',
)
other = (
    'ios', 'android', 'docker', 'rest', 'api', 'elasticsearch', 'qa', 'other',
)

In [None]:
# Horizontal bar chart of the 30 most frequent technologies, each bar labelled with its count.
top30_tech = tech_count_df.head(30)

with sns.axes_style('dark'):
    fig = plt.figure(figsize=(11.5, 8), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    sns.barplot(data=top30_tech, x='tech_count', y='tech_name', ax=ax)

    ax.set(
        xlabel='Frequency of technology keyword appearance',
        ylabel='Technology',
        title='Top 30 most popular technologies in selected challenges',
    )

    # Write the integer count 3 points to the right of each bar's end, vertically centred on the bar.
    for bar in ax.patches:
        bar_length = bar.get_width()
        bar_mid_y = bar.get_y() + bar.get_height() * 0.5
        ax.annotate(
            f'{int(bar_length)}',
            xy=(bar_length, bar_mid_y),
            xytext=(3, 0),
            textcoords='offset points',
            ha='left',
            va='center',
        )

In [None]:
# Show the first 30 entries of whatever TOPCODER.get_tech_popularity() returns
# (presumably a technology ranking to compare against tech_count_df — TODO confirm).
TOPCODER.get_tech_popularity().head(30)

In [None]:
# Category name -> technology keywords assigned to it. Categories overlap on purpose:
# a keyword such as 'java' appears under both 'backend' and 'language'.
TECH_CAT_DCT = dict(
    frontend=(
        'javascript', 'angularjs', 'css', 'html', 'reactjs', 'html5',
        'jquery', 'swift', 'bootstrap', 'jsp', 'ajax',
    ),
    backend=('node.js', 'java', 'swift', 'c#', 'spring', 'apex', 'python'),
    database=('postgresql', 'mongodb', 'sql', 'sql server'),
    framework=('angularjs', 'reactjs', '.net', 'jquery', 'spring', 'bootstrap', 'jsp'),
    language=('javascript', 'java', 'swift', 'c#', 'apex', 'python'),
    other=('ios', 'android', 'docker', 'rest', 'api', 'elasticsearch', 'qa', 'other'),
)

In [None]:
# Display the imported TECH_CAT_COMB so its entries can be inspected before they are counted.
TECH_CAT_COMB

- categorize by workload
- data stat

In [None]:
def _hashable_comb(comb):
    """Return `comb` as a tuple when it is exactly a list, otherwise unchanged, so it can key a dict."""
    return tuple(comb) if type(comb) is list else comb


# One zero-initialised counter per entry of TECH_CAT_COMB.
tech_cat_count = dict.fromkeys(map(_hashable_comb, TECH_CAT_COMB), 0)

# Walk the training-data shards numbered 1..162. Every index listed in a record's
# 'comb_idx_lst' points into TECH_CAT_COMB and bumps that combination's counter.
for shard_no in range(1, 163):
    with open(f'pricing_model_6/training_data/tech_cat_comb_{shard_no}.json') as shard_file:
        print('.', end='', flush=True)  # progress dot, one per shard
        for record in json.load(shard_file):
            for comb_idx in record['comb_idx_lst']:
                tech_cat_count[_hashable_comb(TECH_CAT_COMB[comb_idx])] += 1

In [None]:
# Turn tuple keys into ' | '-joined labels and order the counts from largest to smallest.
tech_cat_labels = {}
for comb, freq in tech_cat_count.items():
    label = ' | '.join(comb) if isinstance(comb, tuple) else comb
    tech_cat_labels[label] = freq
tech_cat_sr = pd.Series(tech_cat_labels).sort_values(ascending=False)

In [None]:
# Horizontal bar chart of how often each technology-category combination occurs
# (values and labels both come from tech_cat_sr), each bar labelled with its count.
with sns.axes_style('dark'):
    fig = plt.figure(figsize=(11.5, 8), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    sns.barplot(
        x=tech_cat_sr,
        y=tech_cat_sr.index,
        ax=ax
    )

    # Labels describe what is actually plotted here: category combinations counted over the
    # training-data shards. The earlier text was carried over from the "Top 30 technologies"
    # chart and mislabelled this figure.
    ax.set_xlabel('Frequency of technology category combination appearance')
    ax.set_ylabel('Technology category combination')
    ax.set_title('Frequency of technology category combinations in the training data')

    # Write the integer count 3 points to the right of each bar's end, vertically centred on the bar.
    for p in ax.patches:
        count = int(p.get_width())
        x = p.get_width()
        y = p.get_height() * 0.5 + p.get_y()
        ax.annotate(
            f'{count}',
            xy=(x, y),
            xytext=(3, 0),
            ha='left',
            va='center',
            textcoords='offset points'
        )

In [None]:
# gigantic_md_df = pd.concat([pd.read_json(f'pricing_model_6/training_data/meta_data_diff_{i}.json', orient='records').set_index(['l0', 'l1']) for i in range(1, 163)])


In [None]:
# (gigantic_md_df['prz_diff'] < 20).astype(int).value_counts()

In [None]:
# gigantic_md_df#.dura_diff.value_counts().sort_index()

### Pairing challenges
1. Decrease threshold
2. Sample down the training data - **SMOTE**
3. Add one more dimension -> same project: 1, cross project: 0

### Single instance - RF REGRESSION
- absolute data values -> prize
- pick F2F & CODE out for training



In [None]:
# Stack the round-1 `y_<n>.json` shards (n = 1..162) into one frame indexed by the (l0, l1) pair.
y_shards = [
    pd.read_json(f'pricing_model_6/round1/y_{shard_no}.json', orient='records')
    for shard_no in range(1, 163)
]
y = pd.concat(y_shards).set_index(['l0', 'l1'])


In [None]:
# Stack the round-1 `X_<n>.json` shards (n = 1..162) into one frame indexed by the (l0, l1) pair.
X_shards = [
    pd.read_json(f'pricing_model_6/round1/X_{shard_no}.json', orient='records')
    for shard_no in range(1, 163)
]
X = pd.concat(X_shards).set_index(['l0', 'l1'])

In [None]:
# Bind `y` to the contents of the single y_gigantic.json file, indexed by the (l0, l1) pair.
# NOTE(review): this overwrites any `y` assembled earlier from the round1 shards — confirm which
# source is the intended one.
y_gigantic = pd.read_json('pricing_model_6/y_gigantic.json', orient='records')
y = y_gigantic.set_index(['l0', 'l1'])


In [None]:
# Shuffle the challenge ids reproducibly (fixed random_state) and cut them into 10 near-equal folds.
cha_id_sr = pd.Series(FILTERED_CHALLENGE_INFO.index)
shuffled_cha_id_sr = cha_id_sr.sample(frac=1, random_state=0)

# Split integer positions rather than the Series itself: handing a pandas object to
# np.array_split goes through Series.swapaxes, which pandas deprecated in 2.1 (FutureWarning).
# Taking each chunk with .iloc gives the same folds (same sizes, order and index) without it.
fold_positions = np.array_split(np.arange(len(shuffled_cha_id_sr)), 10)
sp = [shuffled_cha_id_sr.iloc[positions] for positions in fold_positions]
split_idx_lst = [sr.to_list() for sr in sp]

In [None]:
# Fold 5 is held out; training keeps only the pairs where neither member belongs to it.
test_cha_id = split_idx_lst[5]
l0_in_test = y.index.get_level_values(0).isin(test_cha_id)
l1_in_test = y.index.get_level_values(1).isin(test_cha_id)
y_train = y.loc[~(l0_in_test | l1_in_test)]

In [None]:
# Test pairs: exactly one of the two index levels holds a held-out id (exclusive or),
# so pairs made of two held-out ids are left out as well.
first_in_test = y.index.get_level_values(0).isin(test_cha_id)
second_in_test = y.index.get_level_values(1).isin(test_cha_id)
y_test = y.loc[first_in_test ^ second_in_test]