In [None]:
""" Experiment script for pairing develop challenges"""
import os
import json
import itertools
import random
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tc_main import TopCoder
from doc_pair_training_data import TOPCODER, FILTERED_CHALLENGE_INFO

# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [None]:
# Every unordered pair of distinct `subtrack` values (a value may pair with
# itself), with the two members of each pair put in sorted order.
[
    tuple(sorted(subtrack_pair))
    for subtrack_pair in itertools.combinations_with_replacement(
        FILTERED_CHALLENGE_INFO.subtrack.unique(),
        2,
    )
]

In [None]:

# Load ./data/tech_by_challenge.json (path is relative to the current working
# directory). Later cells iterate over the result and read the 'challenge_id'
# and 'tech_lst' keys of each entry.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default, which `open()` would otherwise fall back to.
tech_json_path = os.path.join(os.curdir, 'data', 'tech_by_challenge.json')
with open(tech_json_path, encoding='utf-8') as f:
    tech_by_cha = json.load(f)


In [None]:
# Count technology keywords (case-insensitively) over the challenges whose id
# appears in FILTERED_CHALLENGE_INFO.index. Any keyword containing "angular"
# is folded into the single 'angularjs' bucket.
tech_count = defaultdict(int)
for cha in tech_by_cha:
    # Skip challenges that are not part of the filtered set.
    if cha['challenge_id'] not in FILTERED_CHALLENGE_INFO.index:
        continue
    for tech in cha['tech_lst']:
        tech_key = tech.lower()
        tech_count['angularjs' if 'angular' in tech_key else tech_key] += 1

In [None]:
# Turn the keyword tally into a two-column frame (tech_name, tech_count),
# ordered from the most to the least frequent keyword, with a fresh
# 0..n-1 integer index.
tech_count_df = (
    pd.Series(tech_count)
    .sort_values(ascending=False)
    .rename_axis('tech_name')
    .reset_index(name='tech_count')
)

In [None]:
# Horizontal bar chart of the first 30 rows of `tech_count_df`, with the
# integer count written just to the right of each bar.
with sns.axes_style('dark'):
    fig = plt.figure(figsize=(11.5, 8), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    sns.barplot(data=tech_count_df.head(30), x='tech_count', y='tech_name', ax=ax)

    ax.set(
        xlabel='Frequency of technology keyword appearance',
        ylabel='Technology',
        title='Top 30 most popular technologies in selected challenges',
    )

    for bar in ax.patches:
        # For a horizontal bar the patch width is the plotted value.
        bar_length = bar.get_width()
        count = int(bar_length)
        ax.annotate(
            f'{count}',
            # Anchor at the bar's right end, vertically centred on the bar.
            xy=(bar_length, bar.get_height() * 0.5 + bar.get_y()),
            # Nudge the label 3 points to the right of the anchor.
            xytext=(3, 0),
            textcoords='offset points',
            ha='left',
            va='center',
        )

In [None]:
# Read the numbered meta-data diff files (suffixes 1 through 162), stack them
# into one frame with a fresh row index, then index the result by the
# ('l0', 'l1') column pair.
gigantic_df = pd.concat(
    [
        pd.read_json(f'pricing_model_6/training_data/meta_data_diff_{i}.json', orient='records')
        for i in range(1, 163)
    ],
    ignore_index=True,
).set_index(['l0', 'l1'])

In [None]:
# Summary statistics of the combined frame, keeping only the listed rows
# of the `describe()` table, in this order.
gigantic_df.describe().loc[
    ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
]