In [None]:
""" EDA 2nd round for challenge selecting"""

import os
import json
from collections import defaultdict

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import KeyedVectors

from tc_main import TopCoder

# Allow up to 500 rows in displayed pandas objects before the output is truncated.
pd.set_option('display.max_rows', 500)

In [None]:
# Instantiate the project-local TopCoder helper (imported from tc_main);
# the cells below read challenge data through this object.
topcoder = TopCoder()

In [None]:
# Number of challenges under each project, sorted ascending.
# Rows with project_id == -1 are left out (presumably a "no project"
# placeholder -- TODO confirm against tc_main).
cbi = topcoder.challenge_basic_info
challenges_with_project = cbi.loc[cbi.project_id != -1].reset_index()
cha_distro = (
    challenges_with_project
    .groupby(by='project_id')['challenge_id']
    .count()
    .sort_values()
)

# Bucket the per-project counts into half-open bins [0, 10), [10, 20), ..., [200, 210)
# and tally how many projects land in each bin, listed in bin order.
cha_distro_vc = (
    pd.cut(cha_distro, bins=np.arange(0, 220, 10), right=False)
    .value_counts()
    .sort_index()
)


In [None]:
# Display how many projects fall into each challenge-count bin.
cha_distro_vc

In [None]:

# Histogram of how many challenges each project has.
#
# The bin edges are fixed multiples of 10 (0, 10, ..., 220 -> 22 bins), so every
# bar sits exactly between two x ticks and uses the same width-10 buckets as the
# `pd.cut` summary in `cha_distro_vc`. Counts above 220 would fall outside the
# bins; the fixed tick range already assumes the data stays below that.
challenge_count_bins = np.arange(0, 230, 10)

with sns.axes_style('darkgrid'):
    # A single axes: only the histogram is drawn in this cell.
    fig, ax0 = plt.subplots(figsize=(8.5, 3), dpi=200)

    # Plain matplotlib histogram plus a seaborn rug gives the same picture as
    # `sns.distplot(..., kde=False, rug=True)` without the deprecated call.
    bar_color = sns.color_palette()[0]
    ax0.hist(cha_distro, bins=challenge_count_bins, color=bar_color, alpha=0.4)
    sns.rugplot(cha_distro, color=bar_color, ax=ax0)

    ax0.set_title('Number of challenges under one project\nDistribution - histogram')
    ax0.set_ylabel('Frequency')
    ax0.set_xlabel('Number of challenges')

    tick_positions = list(range(10, 220, 10))
    ax0.set_xticks(tick_positions)
    ax0.set_xticklabels(labels=tick_positions, rotation=315)

    # Write the number of projects on top of every non-empty bar.
    for bar in ax0.patches:
        count = int(bar.get_height())
        if count == 0:
            continue
        ax0.annotate(
            f'{count}',
            xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
            xytext=(0, 3),
            horizontalalignment='center',
            verticalalignment='bottom',
            textcoords='offset points'
        )
    
    

In [None]:
# Fetch the result of get_filtered_challenge_basic_info() and preview its first rows.
# What the filter keeps is defined in tc_main and is not visible in this notebook.
filt_cha = topcoder.get_filtered_challenge_basic_info()
filt_cha.head()

In [None]:
# Show only the requirement rows whose index is among the ids returned by
# get_handpick_dev_cha_id().
cha_req = topcoder.get_challenge_req()
handpicked_ids = topcoder.get_handpick_dev_cha_id()
is_handpicked = cha_req.index.isin(handpicked_ids)
cha_req.loc[is_handpicked]

In [None]:
# Requirements fetched with track='DEVELOP'; the cell displays how many rows came back.
cr_dev_df = topcoder.get_challenge_req(track='DEVELOP')
len(cr_dev_df)

In [None]:
# Rows of the DEVELOP requirements whose index is among the ids returned by
# get_handpick_dev_cha_id().
handpicked_ids = topcoder.get_handpick_dev_cha_id()
is_handpicked = cr_dev_df.index.isin(handpicked_ids)
cr_dev_df.loc[is_handpicked]

In [None]:
# Keep the challenges that appear in the index of get_challenge_req('develop')
# and whose total prize lies in (0, 5000].
# NOTE(review): another cell calls get_challenge_req(track='DEVELOP'); confirm the
# helper treats 'develop' and 'DEVELOP' the same way.
cbi_df = topcoder.challenge_basic_info
m_index = topcoder.get_challenge_req('develop').index

in_develop_req = cbi_df.index.isin(m_index)
prize_in_range = (cbi_df.total_prize > 0) & (cbi_df.total_prize <= 5000)
filtered_cbi = cbi_df.loc[in_develop_req & prize_in_range]

In [None]:
# Prize distribution per subtrack for the filtered challenges.
# CONCEPTUALIZATION is excluded from this figure.
total_prize = filtered_cbi.reindex(['track', 'subtrack', 'total_prize'], axis=1)
total_prize = total_prize.loc[total_prize.subtrack != 'CONCEPTUALIZATION']
# Subtracks ordered from most to fewest challenges; this is the left-to-right violin order.
subtrack_lst = list(total_prize.subtrack.value_counts().sort_values(ascending=False).index)

with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(16, 5), dpi=200)
    ax = fig.add_axes([0.1, 0.2, 0.8, 0.7])

    sns.violinplot(
        data=total_prize,
        y='total_prize',
        x='subtrack',
        order=subtrack_lst,
        width=1.5,
        linewidth=1,
        ax=ax
    )

    # Red markers at the two bounds stored in `develop_challenge_prize_range`
    # for each subtrack. `axhline` takes xmin/xmax as fractions of the axes
    # width, so every subtrack owns a slot of width 1/n and the marker spans
    # the middle 80% of that slot.
    n_subtracks = len(subtrack_lst)
    for idx, subtrack_name in enumerate(subtrack_lst):
        y_bottom, y_top = topcoder.develop_challenge_prize_range[subtrack_name]
        xmin = (idx + 0.1) / n_subtracks
        xmax = (idx + 0.9) / n_subtracks
        for bound in (y_bottom, y_top):
            ax.axhline(y=bound, xmin=xmin, xmax=xmax, color='red')

    # Tick text such as 'BUG_HUNT' becomes 'Bug\nHunt' (one capitalised word per line).
    ax.set_xticklabels(labels=['\n'.join([w.capitalize() for w in i.get_text().split('_')]) for i in ax.get_xticklabels()])
    ax.set_yticks(list(range(0, 5500, 500)))

    ax.set_xlabel('Sub-track', labelpad=8)
    ax.set_ylabel('Total Prize')
#     ax.set_title('Prize distro by subtrack - DEVELOP track')

#     fig.savefig('img/prize_distribution_by_subtrack', dpi='figure')

In [None]:
# Challenge count per subtrack in the filtered challenge table, largest first.
filtered_challenges = topcoder.get_filtered_challenge_basic_info()
subtrack = filtered_challenges.subtrack.value_counts().sort_values(ascending=False)

with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(16, 5), dpi=200)
    ax = fig.add_axes([0.1, 0.2, 0.8, 0.7])

    sns.barplot(x=subtrack.index, y=subtrack, ax=ax)

    ax.set_title('DEVELOP challenges by subtrack')
    ax.set_ylabel('Number of Challenges')

    # Tick text such as 'BUG_HUNT' becomes 'Bug\nHunt' (one capitalised word per line).
    pretty_labels = []
    for tick in ax.get_xticklabels():
        words = tick.get_text().split('_')
        pretty_labels.append('\n'.join([word.capitalize() for word in words]))
    ax.set_xticklabels(labels=pretty_labels)

    # Print the count just above each bar, centred horizontally on the bar.
    for bar in ax.patches:
        bar_top = bar.get_height()
        bar_center = bar.get_width() * 0.5 + bar.get_x()
        ax.annotate(
            f'{int(bar_top)}',
            xy=(bar_center, bar_top),
            xytext=(0, 3),
            ha='center',
            va='baseline',
            textcoords='offset points'
        )
#     fig.savefig('img/filtered_dev_cha_by_subtrack.png', dpi='figure')

In [None]:
# List the row labels (statistic names) that describe() reports for the total_prize column.
total_prize.total_prize.describe().index

In [None]:
# Prize distribution for the most frequent subtracks of the filtered challenges.
TOP_N_SUBTRACKS = 5  # how many of the most frequent subtracks to show

total_prize = filtered_cbi.reindex(['track', 'subtrack', 'total_prize'], axis=1)
# Subtracks ordered from most to fewest challenges.
subtrack_lst = list(total_prize.subtrack.value_counts().sort_values(ascending=False).index)
top_subtracks = subtrack_lst[:TOP_N_SUBTRACKS]

with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(8, 4), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    sns.violinplot(
        data=total_prize.loc[total_prize.subtrack.isin(top_subtracks)],
        y='total_prize',
        x='subtrack',
        order=top_subtracks,
        width=0.9,
        linewidth=1,
        ax=ax
    )

    # Tick text such as 'BUG_HUNT' becomes 'Bug\nHunt' (one capitalised word per line).
    ax.set_xticklabels(labels=['\n'.join([w.capitalize() for w in i.get_text().split('_')]) for i in ax.get_xticklabels()])
    ax.set_yticks(list(range(0, 5500, 500)))

    ax.set_xlabel('Subtrack', labelpad=8)
    ax.set_ylabel('Total Prize')
    ax.set_title('Prize distro by subtrack - DEVELOP track')

    # Red markers at the two bounds stored in `develop_challenge_prize_range`
    # for each subtrack. `axhline` takes xmin/xmax as fractions of the axes
    # width; the slot width is derived from the number of violins actually
    # drawn, so the markers stay aligned when fewer than TOP_N_SUBTRACKS exist.
    n_shown = len(top_subtracks)
    for idx, subtrack_name in enumerate(top_subtracks):
        y_bottom, y_top = topcoder.develop_challenge_prize_range[subtrack_name]
        xmin = (idx + 0.1) / n_shown
        xmax = (idx + 0.9) / n_shown
        for bound in (y_bottom, y_top):
            ax.axhline(y=bound, xmin=xmin, xmax=xmax, color='red')

# fig.savefig(os.path.join(os.pardir, os.pardir, 'presentation', 'presentation1', 'prz_distro_dev_t5.png'), dpi='figure')
    

Prize range per subtrack, given as `[lower, upper]`:

- FIRST_2_FINISH: `[0, 600]`
- CODE: `[250, 2500]`
- ASSEMBLY_COMPETITION: `[750, 2750]`
- BUG_HUNT: `[0, 750]`
- UI_PROTOTYPE_COMPETITION: `[1250, 2750]`
- ARCHITECTURE: `[1500, 3000]`
- DEVELOP_MARATHON_MATCH: `[1000, 1750]`
- COPILOT_POSTING: `[150, 300]`
- TEST_SUITES: `[500, 2000]`
- TEST_SCENARIOS: `[500, 2000]`
- SPECIFICATION: `[1500, 3000]`
- CONTENT_CREATION: `[500, 2000]`
- CONCEPTUALIZATION: `[1500, 2000]`

In [None]:
# Prize distribution of a single subtrack.
focus_subtrack = 'ARCHITECTURE'

with sns.axes_style('darkgrid'):
    fig = plt.figure(figsize=(4, 4), dpi=200)
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])

    # `x` must be given for `order` to name a categorical level; it also gives
    # the violin a tick label that says which subtrack is plotted.
    sns.violinplot(
        data=total_prize.loc[total_prize.subtrack == focus_subtrack],
        x='subtrack',
        y='total_prize',
        order=[focus_subtrack],
        width=0.618,
        linewidth=0.618,
        ax=ax
    )

    ax.set_xticklabels(labels=[i.get_text() for i in ax.get_xticklabels()], rotation=345, ha='center')
    ax.set_yticks(list(range(0, 5500, 500)))
    ax.set_yticklabels(labels=list(range(0, 5500, 500)))

    ax.set_xlabel('Subtrack', labelpad=8)
    ax.set_ylabel('Total Prize')
    ax.set_title(f'Prize distro - {focus_subtrack}')