In [None]:
import os
import re
import json
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Let pandas print up to 150 rows before it starts truncating displayed frames.
pd.options.display.max_rows = 150

In [None]:
def _load_json(filename):
    """Parse one JSON file from the ``data`` directory next to the notebook.

    Parameters
    ----------
    filename : str
        File name (not a path) inside ``./data``.

    Returns
    -------
    The decoded JSON document, as returned by ``json.load``.
    """
    # Decode explicitly as UTF-8: without ``encoding`` the result of ``open``
    # depends on the platform locale, which can mis-decode non-ASCII content.
    with open(os.path.join(os.curdir, 'data', filename), encoding='utf-8') as f:
        return json.load(f)


# Raw JSON inputs; each one is turned into a DataFrame in a later cell.
tech_by_start_dt = _load_json('tech_by_start_date.json')
num_of_track_by_dt = _load_json('number_of_track_by_date.json')
prize_of_track_by_dt = _load_json('prize_of_track_by_date.json')
num_of_dev_subtrack_by_dt = _load_json('number_of_dev_subtrack_by_dt.json')
prize_of_dev_subtrack_by_dt = _load_json('prize_of_dev_subtrack_by_dt.json')
num_of_challenges_by_project = _load_json('number_of_challenges_by_project.json')


In [None]:
# Technology table: the outer keys of the mapping become the row index; cells
# that are missing after alignment are filled with 0 and everything is cast to int.
df_tech_by_dt = (
    pd.DataFrame.from_dict(tech_by_start_dt, orient='index')
    .fillna(value=0)
    .astype(int)
)

# Challenge counts, indexed by the `project_id` column.
df_cha_by_proj = (
    pd.DataFrame(data=num_of_challenges_by_project)
    .set_index(keys='project_id')
)

# Track-level tables, indexed by their `date` column.
df_prz_of_track_by_dt = (
    pd.DataFrame(data=prize_of_track_by_dt)
    .set_index(keys='date')
)
df_num_of_track_by_dt = (
    pd.DataFrame(data=num_of_track_by_dt)
    .set_index(keys='date')
)

# Development sub-track tables, indexed by `date`; the count table is cast to int.
df_prz_of_dev_subtrack_by_dt = (
    pd.DataFrame(data=prize_of_dev_subtrack_by_dt)
    .set_index(keys='date')
)
df_num_of_dev_subtrack_by_dt = (
    pd.DataFrame(data=num_of_dev_subtrack_by_dt)
    .set_index(keys='date')
    .astype(int)
)

In [None]:
# Run every date-indexed frame's index through pd.to_datetime so the frames
# share a datetime index. The project-level frame is indexed by project id
# and is deliberately left out.
for _dated_frame in (
    df_tech_by_dt,
    df_prz_of_track_by_dt,
    df_num_of_track_by_dt,
    df_prz_of_dev_subtrack_by_dt,
    df_num_of_dev_subtrack_by_dt,
):
    _dated_frame.index = pd.to_datetime(_dated_frame.index)

In [None]:
# Distribution of the number of challenges per project, restricted to projects
# with at least MIN_CHALLENGES challenges. Both panels draw from the same
# filtered column so the histogram and the boxplot describe the same projects
# (previously the boxplot re-filtered with `> 10` and was handed the whole
# DataFrame instead of the `number_of_challenges` column).
MIN_CHALLENGES = 10

df_filtered_proj = (
    df_cha_by_proj
    .loc[df_cha_by_proj.number_of_challenges >= MIN_CHALLENGES]
    .sort_values(by='number_of_challenges', ascending=False)
)

fig = plt.figure(figsize=(8, 5), dpi=200)

# --- Top panel: histogram with a rug of the individual projects -------------
with sns.axes_style('dark', {'xtick.bottom': True}):
    ax0 = fig.add_subplot(2, 1, 1)

    # NOTE(review): sns.distplot is deprecated in recent seaborn releases
    # (histplot/rugplot replace it). Kept here because the installed seaborn
    # version is not visible from this notebook -- migrate once confirmed.
    sns.distplot(df_filtered_proj.number_of_challenges, bins=20, kde=False, rug=True, ax=ax0)

    ax0.set_title('Number of challenges under one project\nDistribution - histogram')
    ax0.set_ylabel('Frequency')
    ax0.set_xlabel('Number of challenges')

    hist_ticks = list(range(10, 220, 10))
    ax0.set_xticks(hist_ticks)
    ax0.set_xticklabels(labels=hist_ticks, rotation=315)
    ax0.set_ylim(bottom=0, top=200)

    ax0.grid(True, axis='y')

    # Write the count above every non-empty bar.
    for hist in ax0.patches:
        count = int(hist.get_height())
        if count == 0:
            continue
        x = hist.get_x() + hist.get_width() / 2
        y = hist.get_height()
        ax0.annotate(
            f'{count}',
            xy=(x, y),
            xytext=(0, 3),  # nudge the label 3 points above the bar top
            horizontalalignment='center',
            verticalalignment='bottom',
            textcoords='offset points'
        )

# --- Bottom panel: boxplot of the same filtered column ----------------------
with sns.axes_style('darkgrid'):
    ax1 = fig.add_subplot(2, 1, 2)

    colors = {'primary': '#E93C4F', 'secondary': '#FFC24A'}

    # The mean is drawn as a thin dashed line; outliers as small filled dots.
    meanlineprops = {'linestyle': '--', 'linewidth': 0.5, 'color': colors['secondary']}
    flierprops = {'marker': 'o', 'markerfacecolor': colors['primary'], 'markeredgewidth': 0.5, 'markersize': 2.5}
    sns.boxplot(
        x=df_filtered_proj.number_of_challenges,
        showmeans=True,
        meanline=True,

        color=colors['primary'],
        meanprops=meanlineprops,
        flierprops=flierprops,

        linewidth=0.8,
        width=0.618,

        ax=ax1
    )

    ax1.set_title('Number of challenges under one project\nDistribution - boxplot')
    ax1.set_xlabel('Number of challenges')

    box_ticks = list(range(20, 220, 20))
    ax1.set_xticks(box_ticks)
    ax1.set_xticklabels(labels=box_ticks)

fig.tight_layout()