# Feature engineering related analysis

This notebook holds the code and logic for feature engineering. It includes some visualization as well as pre-processing of the data.

In [None]:
%matplotlib inline
import os
import re
import pathlib
import itertools
from pprint import pprint
from collections import defaultdict
from datetime import datetime

import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.colors as mcolors

import topcoder_mongo as DB
import topcoder_ml as TML
import static_var as S
import util as U
from topcoder_feature_engineering import (
    get_challenge_tag_combination_count,
    get_tag_combination_softmax,
    compute_tag_feature,
    train_challenge_desc_doc2vec,
    compute_challenge_metadata,
    compute_competing_challenges,
)

# Notebook-wide dark theme: near-black figure/axes backgrounds with white text,
# titles, axis labels and ticks.
sns.set(
    rc={
        'axes.facecolor':'#121212',
        'figure.facecolor':'#121212',
        'text.color': 'white',
        'axes.titlecolor': 'white',
        'axes.labelcolor': 'white',
        'xtick.color': 'white',
        'ytick.color': 'white',
        'figure.autolayout': True,
    },
)

# Allow up to 500 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

In [None]:
# Project-scale summary from the DB helper; the list argument is forwarded unchanged.
project_scale = pd.DataFrame.from_records(
    DB.TopcoderMongo.get_project_scale([0, 10, 25, 75, 125])
)

# Long-format lookup: one row per entry of `challenge_lst`, labelled with the
# `tag` of the row it came from.
challenge_wproj_tag = pd.concat(
    [
        pd.DataFrame({'id': challenge_ids, 'proj_scale_tag': scale_tag})
        for scale_tag, challenge_ids in (
            project_scale
            .reindex(columns=['tag', 'challenge_lst'])
            .itertuples(index=False, name=None)
        )
    ],
    ignore_index=True,
)

# Keep every column except the per-row challenge lists.
project_scale = project_scale.loc[:, project_scale.columns != 'challenge_lst']

In [None]:
# Display the project-scale summary (the `challenge_lst` column was dropped above).
project_scale

## Top2 Prize

Training target: the sum of the top two placement prizes of each challenge.

In [None]:
# Aggregation pipeline: for each 'placement' prize set of a scoped challenge,
# emit the challenge id and the sum of its first two prize values.
query = [
    *DB.TopcoderMongo.scoped_challenge_with_text_query,
    # One document per prize set, then keep only the 'placement' sets.
    {'$unwind': '$prize_sets'},
    {'$match': {'prize_sets.type': 'placement'}},
    {'$project': {
        '_id': False,
        'id': True,
        # Sum of the first two entries of prizes.value
        # (presumably 1st and 2nd place; verify the prize ordering in the data).
        'top2_prize': {'$sum': {'$slice': ['$prize_sets.prizes.value', 2]}},
    }},
]

# Resulting frame has the columns `id` and `top2_prize`.
challenge_top2_prize = pd.DataFrame.from_records(DB.TopcoderMongo.run_challenge_aggregation(query))

## Tag

The challenge tags are encoded with the following steps:

1. Count the occurrences of single tags and of 2-tag, 3-tag and 4-tag combinations.
2. Pick the top 25 tags/combinations of each of the 4 groups.
3. Compute the log of the counts and convert each group into a softmax distribution (4 softmax functions in total).
4. Summarise the softmax scores of each of the 4 groups of tag combinations, giving a 4-element feature array.
5. Binary-encode the presence of the selected tag combinations, giving a 100-element feature array.

In [None]:
# %%script false --no-raise-error # Time consuming code
# The helper's result is unpacked into exactly four values, one per variable.
(
    tag_softmax1,
    tag_softmax2,
    tag_softmax3,
    tag_softmax4,
) = get_tag_combination_softmax()

In [None]:
# %%script false --no-raise-error # Time consuming code
# The helper's result is unpacked into exactly four values
# (presumably the counts for 1- to 4-tag combinations; verify in topcoder_feature_engineering).
tag_comb1, tag_comb2, tag_comb3, tag_comb4 = get_challenge_tag_combination_count()

In [None]:
# Preview the first 25 rows of the fourth result.
tag_comb4.head(25)

In [None]:
# %%script false --no-raise-error # Time consuming code
# Build the tag feature frame from the records produced by the helper and preview it.
feature_df = pd.DataFrame.from_records(compute_tag_feature())
feature_df.head(3)

### Check out the new tags

> A very random thought: can we compare the similarity of the tag arrays within a project?
> 
> And across projects as well?

In [None]:
%%script false --no-raise-error # This block of code is kinda useless, so skipping
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
from sklearn.linear_model import LinearRegression

# Three results from the project ML helper; only `word2vec` is used in the rest of this cell.
tag_count, challenge_tags, word2vec = TML.challenge_tag_word2vec()

def reduce_dimensions(model):
    """Project the model's word vectors to 2-D and fit a trend line through the top tags.

    Parameters
    ----------
    model
        Object exposing its word vectors through ``model.wv`` (the caller passes
        the ``word2vec`` value returned by ``TML.challenge_tag_word2vec``).

    Returns
    -------
    (wv, reg)
        ``wv``: DataFrame indexed by tag, sorted by ``count`` descending, with the
        t-SNE coordinates ``x``/``y``, the columns joined from ``tag_count_df``
        and a boolean ``top30`` flag marking the first 30 rows.
        ``reg``: ``LinearRegression`` of ``y`` on ``x`` fitted on the ``top30`` rows.

    Notes
    -----
    Reads the notebook-level ``tag_count_df`` frame (columns ``tag`` and
    ``count``); it must exist in the kernel before this function is called.
    """
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)
    top_n = 30          # number of most frequent tags flagged as `top30`

    # gensim 4 replaced `wv.vocab` with `wv.key_to_index`; fall back to the
    # old attribute so the function works with either API.
    vocab = getattr(model.wv, 'key_to_index', None)
    if vocab is None:
        vocab = model.wv.vocab

    # Keep the words alongside their vectors so the points can be labelled later.
    labels = list(vocab)
    vectors = np.asarray([model.wv[word] for word in labels])

    # Reduce to 2-D with t-SNE (fixed seed for reproducible layouts).
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    wv = pd.DataFrame.from_records(vectors)
    wv.columns = ['x', 'y']
    wv['label'] = np.asarray(labels)

    # Inner join: only tags present in `tag_count_df` survive.
    wv = wv.set_index('label').join(tag_count_df.set_index('tag'), how='inner').sort_values('count', ascending=False)

    # Positional flag for the first `top_n` rows. Unlike concatenating
    # `[True] * 30 + [False] * (len - 30)`, this also works with fewer than 30 rows.
    wv['top30'] = np.arange(len(wv)) < top_n

    X, y = wv.loc[wv.top30].x.to_numpy().reshape(-1, 1), wv.loc[wv.top30].y.to_numpy()

    # Trend line through the top tags; used to decide on which side labels go.
    reg = LinearRegression()
    reg.fit(X, y)

    return wv, reg

# 2-D tag coordinates plus the trend line used to place the labels.
wv, reg = reduce_dimensions(word2vec)

fig, ax = plt.subplots(figsize=(12, 12), dpi=200)
size_norm = mcolors.LogNorm(vmin=200, vmax=1500)
size_range = (50, 1000)

sns.scatterplot(
    # Fixed: plot the frame returned by `reduce_dimensions`; the previously used
    # name `word2vec_with_count` is never defined in this notebook.
    # `wv` is sorted by count descending, so reversing puts the most frequent tags last.
    data=wv[::-1],
    x='x',
    y='y',
    size='count',
    sizes=size_range,
    size_norm=size_norm,
    hue='top30',
    ax=ax,
    linewidth=0.1,
    alpha=0.75,
)

sns.despine(ax=ax, left=True, bottom=True)
ax.grid(True, color='white', alpha=0.25)
ax.set_xlim(-15, 12)
ax.set_ylim(-15, 12)
# Alternative zoom:
# ax.set_xlim(-6, 12)
# ax.set_ylim(-15, 6)


def _annotate_tags(ax, reg, rows, step_for, alpha):
    """Label every tag in ``rows`` with a leader line pointing away from the trend line.

    ``step_for(i)`` returns the positive label offset for the i-th row of ``rows``.
    Points above the regression line get their label moved up/right, all other
    points down/left, so labels spread out on both sides of the line.
    """
    for row in rows.reset_index().itertuples():
        above = row.y > reg.predict([[row.x]])[0]
        ha, va = ('left', 'bottom') if above else ('right', 'top')
        step = step_for(row.Index) if above else -step_for(row.Index)

        ax.annotate(
            row.index, xy=(row.x, row.y), xytext=(row.x + step, row.y + step),
            arrowprops=dict(arrowstyle='-', alpha=0.25), alpha=alpha, ha=ha, va=va,
        )


# Label the 30 most frequent tags, alternating the offset between 1 and 2.
_annotate_tags(ax, reg, wv.loc[wv.top30], lambda i: i % 2 + 1, alpha=0.85)

# Manually label a few remaining tags of interest (fainter, offset 1 or 2.5).
for keyword in ('angular', 'ios', 'sql', 'react'):
    keyword_mask = ~wv.top30 & wv.index.to_series().str.lower().str.contains(keyword)
    _annotate_tags(ax, reg, wv.loc[keyword_mask], lambda i: 2.5 ** (i % 2), alpha=0.5)


In [None]:
%%script false --no-raise-error
# Count how many scoped challenges carry each tag.
query = [
    *DB.TopcoderMongo.scoped_challenge_with_text_query,
    {'$unwind': '$tags'},
    {'$group': {'_id': {'tag': '$tags'}, 'count': {'$sum': 1}}},
    {'$replaceRoot': {'newRoot': {'$mergeObjects': ['$_id', {'count': '$count'}]}}}
]

# Most frequent tags first; tags with fewer than 5 occurrences are dropped.
tag_count_df = (
    pd.DataFrame.from_records(data=DB.TopcoderMongo.run_challenge_aggregation(query))
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
    .loc[lambda counts: counts['count'] >= 5]
)


In [None]:
# Number of tags attached to each scoped challenge.
query = [
    *DB.TopcoderMongo.scoped_challenge_with_text_query,
    {'$project': {'id': True, 'num_of_tags': {'$size': '$tags'}, '_id': False}},
]

challenge_tag_count = pd.DataFrame.from_records(DB.TopcoderMongo.run_challenge_aggregation(query))

fig, ax = plt.subplots(figsize=(12, 5), dpi=200)

# How many challenges have 0, 1, 2, ... tags (sorted by number of tags).
num_of_tag_freq = (
    challenge_tag_count.num_of_tags
    .value_counts()
    .sort_index()
    .rename_axis('num_of_tag')
    .reset_index(name='count')
)

sns.barplot(
    x='num_of_tag',
    y='count',
    data=num_of_tag_freq,
    orient='v',
    linewidth=0,
    ax=ax,
)
sns.despine(ax=ax, left=True)
ax.set(
    title='Number of Tags Distribution',
    xlabel='Number of Tags',
    ylabel='Count',
    ylim=(0, 1500),
)
ax.yaxis.grid(True, color='white', alpha=0.5)

# Write each bar's count just above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() * 0.5

    ax.annotate(
        int(bar_height),
        xy=(bar_center, bar_height),
        xytext=(bar_center, bar_height + 25),
        ha='center',
        alpha=0.85,
    )


In [None]:
# Attach the project-scale tag to every challenge's tag count (inner merge on id).
challenge_tag_count_wproj = challenge_tag_count.merge(challenge_wproj_tag, on='id')

# Number of challenges for every (number of tags, project scale) pair.
tag_freq_by_proj = (
    challenge_tag_count_wproj
    .groupby(['num_of_tags', 'proj_scale_tag'])
    .size()
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)

sns.barplot(
    x='num_of_tags',
    y='count',
    hue='proj_scale_tag',
    data=tag_freq_by_proj,
    orient='v',
    linewidth=0,
    ax=ax,
)
sns.despine(ax=ax, left=True)
ax.set(
    title='Number of Tags Distribution',
    xlabel='Number of Tags',
    ylabel='Count',
    ylim=(0, 500),
)
ax.yaxis.grid(True, color='white', alpha=0.5)


In [None]:
# Column subset of the project-scale summary.
# NOTE(review): `reindex` does not raise on unknown column names; it adds them as
# all-NaN columns. Both `num_of_challenge` and `num_of_Challenge` are requested
# here, so confirm these names against the output of get_project_scale.
ps_tChallenge = project_scale.reindex(['tag', 'num_of_project', 'num_of_challenge', 'num_of_completed', 'num_of_Challenge', 'num_of_completed_Challenge'], axis=1)

In [None]:
# Number of challenges with a top-2 prize row per project-scale tag,
# as a frame with the columns `proj_scale_tag` and `count`.
scoped_count = (
    challenge_top2_prize
    .set_index('id')
    .join(challenge_wproj_tag.set_index('id'), how='inner')
    .proj_scale_tag
    .value_counts()
    # Name the index and the values explicitly instead of renaming whatever
    # columns `reset_index()` produces: from pandas 2.0 on, `value_counts()`
    # already names the result `count` and keeps the column name on the index,
    # so the previous rename of `index`/`proj_scale_tag` produced two `count`
    # columns there.
    .rename_axis('proj_scale_tag')
    .rename('count')
    .sort_index()
    .reset_index()
)

## Challenge requirement

Build challenge requirement document vector using Doc2Vec

First, a quick look at the similarity between challenge descriptions.

In [None]:
# Challenge descriptions from the DB helper; the `processed_paragraph` column is tokenized below.
challenge_desc = pd.DataFrame.from_records(DB.TopcoderMongo.get_challenge_description())


In [None]:
# Display the description frame (row output is capped by the `display.max_rows` option).
challenge_desc

In [None]:
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Tokenize every processed description with gensim's `simple_preprocess`,
# then record how many tokens each description produced.
challenge_desc['tokenized_desc'] = challenge_desc['processed_paragraph'].map(simple_preprocess)
challenge_desc['tokens_len'] = challenge_desc['tokenized_desc'].map(len)

In [None]:
# Histogram of description length (number of tokens per challenge).
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)

sns.histplot(
    x=challenge_desc.tokens_len,
    ax=ax
)
sns.despine(ax=ax, left=True)
ax.xaxis.grid(False)
# Limit the view to 0-5000 tokens, with a major tick every 100 tokens.
ax.set_xlim(0, 5000)
ax.xaxis.set_major_locator(mticker.MultipleLocator(100))
# Rotates the x tick labels (presumably to keep the dense ticks readable).
fig.autofmt_xdate()

In [None]:
# Token counts of the descriptions with more than 50 tokens.
challenge_desc.tokens_len[challenge_desc.tokens_len > 50]

In [None]:
# The helper returns the Doc2Vec model and the corpus it was built from;
# later cells read each corpus document's `tags` and look vectors up in `model.docvecs`.
model, corpus = train_challenge_desc_doc2vec()

In [None]:
# Map each document's first tag (used as the challenge id below) to its trained vector.
challenge_docvecs = {
    doc_tag: model.docvecs[doc_tag]
    for doc_tag in (doc.tags[0] for doc in corpus)
}

In [None]:
# Spot check: nearest neighbours of one hard-coded challenge id, queried by its own vector.
model.docvecs.most_similar([challenge_docvecs['0003dce9-9420-489b-b56f-2e19c793a641']])

In [None]:
def most_similar_docvec(topn=2):
    """Yield the nearest document-vector neighbours of every challenge.

    For each entry of the notebook-level ``challenge_docvecs`` mapping, the
    ``topn`` most similar vectors are looked up in ``model.docvecs``. A result
    whose id equals the query challenge id is skipped, so a challenge is never
    paired with itself.

    Parameters
    ----------
    topn : int
        Number of neighbours requested per challenge (before the self-match is removed).

    Yields
    ------
    dict
        ``{'id': <challenge id>, 'match_id': <neighbour id>, 'sim': <similarity>}``
    """
    for cha_id, vec in challenge_docvecs.items():
        neighbours = model.docvecs.most_similar([vec], topn=topn)
        yield from (
            {'id': cha_id, 'match_id': match_id, 'sim': sim}
            for match_id, sim in neighbours
            if match_id != cha_id
        )

In [None]:
# Request 11 neighbours per challenge; the generator skips the self-match.
gen = most_similar_docvec(11)
# One row per (challenge, neighbour) pair with the columns id, match_id, sim.
df = pd.DataFrame.from_records(gen)

In [None]:
# Legacy sub-track of every scoped challenge (columns: id, legacy_sub_track).
query = [
    *DB.TopcoderMongo.scoped_challenge_with_text_query,
    {'$project': {'_id': False, 'id': True, 'legacy_sub_track': '$legacy.sub_track'}},
]
challenge_sub_track = pd.DataFrame.from_records(DB.TopcoderMongo.run_challenge_aggregation(query))
# Display the frame.
challenge_sub_track

In [None]:
# Same sub-track? Look up the legacy sub-track of the challenge (`id`) and of its
# neighbour (`match_id`) with two left merges and compare them row by row.
# A pair where either side has no sub-track compares as False (NaN != NaN).
src_sub_track = df.merge(challenge_sub_track, how='left', on='id')['legacy_sub_track']
match_sub_track = df.merge(
    challenge_sub_track, how='left', left_on='match_id', right_on='id'
)['legacy_sub_track']
df['from_same_st'] = src_sub_track == match_sub_track

# NOTE(review): `challenge_wproj_tag` only carries `id` and `proj_scale_tag`, so
# this flag is True when both challenges belong to projects with the same scale
# tag, not necessarily to the same project. Confirm this is what is intended.
src_scale_tag = df.merge(challenge_wproj_tag, how='left', on='id')['proj_scale_tag']
match_scale_tag = df.merge(
    challenge_wproj_tag, how='left', left_on='match_id', right_on='id'
)['proj_scale_tag']
df['from_same_proj'] = src_scale_tag == match_scale_tag

In [None]:
# Turn the boolean flags into 0/1 so they can be summed per challenge.
df_same = df[['id', 'from_same_st', 'from_same_proj']].astype(
    {'from_same_st': int, 'from_same_proj': int}
)

# Per challenge: how many of its neighbours share the sub-track / the project-scale
# tag, with the challenge's own scale tag attached (inner join on id).
top10_most_similar_from_same = (
    df_same
    .groupby('id')
    .sum()
    .join(challenge_wproj_tag.set_index('id'), how='inner')
)

In [None]:
# Display the per-challenge neighbour counts.
top10_most_similar_from_same

In [None]:
%%script false --no-raise-error 
# Violin plot of `from_same_proj` per project scale.
# Fixed: read from `top10_most_similar_from_same`, the frame built earlier in the
# notebook; the previously used name `top10_most_similar_from_same_proj` is never defined.
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)
colors = ['#006594', '#7969b6', '#da5ea5', '#ff6c67', '#ffa600']

sns.violinplot(
    data=top10_most_similar_from_same, x='proj_scale_tag', y='from_same_proj',
    order=sorted(top10_most_similar_from_same['proj_scale_tag'].unique()),
    palette=colors,
    inner='quartile', bw=0.05,
    boxprops={'edgecolor': 'white'},
    ax=ax,
)
sns.despine(ax=ax, left=True, bottom=True)
ax.set_ylim(0, 10)
ax.set_xlabel('Project Scale')
# Fixed typo in the label ("Numbero").
ax.set_ylabel('Number of challenges in 10 most similar from same project')
ax.set_title('Top10 most similar challenge origin')

# Draw the quartile lines and the violin outlines in white on the dark theme.
for l in ax.lines:
    l.set_color('white')
    l.set_alpha(0.85)

for p in ax.collections:
    p.set_ec('white')
    p.set_lw(0.5)

For the top 10 most similar document vectors of a given document vector, how many of them are from the same project/sub-track as the given document vector?

In [None]:
# Which per-challenge count to plot; switch the value to compare sub-tracks instead.
group_key = 'from_same_proj' # 'from_same_proj' | 'from_same_st'
# Number of challenges for every (count value, project scale) pair.
data = top10_most_similar_from_same.groupby([group_key, 'proj_scale_tag']).size().to_frame().reset_index().rename(columns={0: 'count'})
# One panel per project scale; the 1x5 grid and the 5 colors assume at most 5 scale tags.
fig, axes = plt.subplots(1, 5, figsize=(16, 7.68), dpi=200, sharey=True)
colors = ['#006594', '#7969b6', '#da5ea5', '#ff6c67', '#ffa600']

for i, scale in enumerate(sorted(data.proj_scale_tag.unique())):
    ax = axes.flat[i]
    d = data.loc[data.proj_scale_tag == scale].sort_values(group_key, ascending=False)
    # Horizontal bars, with the categories fixed to 10 down to 0 (top to bottom).
    sns.barplot(
        data=d, x='count', y=group_key, orient='h',
        order=list(range(11))[::-1], color=colors[i],
        linewidth=0,
        ax=ax
    )
    sns.despine(ax=ax, left=True, bottom=True)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True, color='white', alpha=0.5)
    ax.set_xticklabels([])
    # Fixed x range: 0-300 for the project view, 0-500 for the sub-track view.
    ax.set_xlim(0, (300 if group_key == 'from_same_proj' else 500))
    ax.set_title(f'Project Scale: {scale}')
    # Only the first panel carries the shared y label.
    ax.set_ylabel(f'Number of challenge from same {group_key}' if i == 0 else '')
    ax.set_xlabel('Count')
    
    # Restyle every bar: fixed thickness, re-centred on its row, and centred
    # horizontally on the middle of the x range (150 of 300, or 250 of 500).
    target_h = 0.33
    for p in ax.patches:
        curr_h, curr_y = p.get_height(), p.get_y()
        p.set_height(target_h)
        p.set_y(curr_y + (curr_h - target_h) / 2)
        p.set_x((150 if group_key == 'from_same_proj' else 250) - p.get_width() / 2)
        
        # Write the count (the bar's width) at the centre of the bar.
        x = p.get_x() + p.get_width() / 2
        y = p.get_y() + p.get_height() / 2
        cnt = p.get_width()
        ax.annotate(int(cnt), xy=(x, y), xytext=(x, y), ha='center', va='center', color='white', alpha=0.85)


## Metadata

Metadata feature includes:

- challenge duration
- project id (encoded)
- legacy sub-track

In [None]:
# Number of scoped challenges per legacy sub-track, sorted by sub-track name.
query = [
    *DB.TopcoderMongo.scoped_challenge_with_text_query,
    {'$group': {
        '_id': {'legacy_sub_track': '$legacy.sub_track'},
        'count': {'$sum': 1},
    }},
    {'$replaceRoot': {'newRoot': {'$mergeObjects': ['$_id', {'count': '$count'}]}}},
    {'$sort': {'legacy_sub_track': pymongo.ASCENDING}},
]

sub_track_count = pd.DataFrame.from_records(DB.TopcoderMongo.run_challenge_aggregation(query))

# Display name: split on underscores, capitalize every part, join with spaces.
sub_track_count['st_fmt_name'] = [
    ' '.join(map(str.capitalize, raw_name.split('_')))
    for raw_name in sub_track_count.legacy_sub_track
]

In [None]:
# Bar chart of the number of challenges per legacy sub-track.
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)

sns.barplot(
    x='st_fmt_name',
    y='count',
    data=sub_track_count,
    lw=0,
    ax=ax,
)
sns.despine(ax=ax, left=True)
ax.yaxis.grid(True, color='white', alpha=0.5)
ax.set(title='Number of Challenges by Legacy Sub Track', ylabel='Count')
ax.set_xlabel('Legacy Sub Track', labelpad=5)

# Write each bar's count slightly above the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2

    ax.annotate(
        int(bar_height),
        xy=(bar_center, bar_height),
        xytext=(bar_center, bar_height + 50),
        color='white', alpha=0.85, ha='center', va='center',
    )

In [None]:
# Metadata features from the project helper; the `duration` column is plotted below.
challenge_metadata = compute_challenge_metadata()

In [None]:
# Display the metadata features.
challenge_metadata

In [None]:
# Challenge duration: histogram on `ax`, density curve on a twin y-axis `axt`.
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)
axt = ax.twinx()

sns.histplot(x=challenge_metadata.duration, lw=0.5, bins=106, ax=ax)
sns.kdeplot(x=challenge_metadata.duration, color='#ffa600', alpha=0.85, ax=axt)

sns.despine(ax=ax, left=True)
sns.despine(ax=axt, left=True)

# The twin axis only carries the density curve: hide its grid, ticks and label.
axt.grid(False)
axt.set_yticks([])
axt.set_ylabel('')

# Horizontal grid lines only; x tick every 5 units, y tick every 100 challenges.
ax.tick_params(axis='y', length=0,)
ax.xaxis.grid(False)
ax.yaxis.grid(True, color='white', alpha=0.5)
ax.xaxis.set_major_locator(mticker.MultipleLocator(5))
ax.yaxis.set_major_locator(mticker.MultipleLocator(100))
ax.set_xlim(-1, 107)

## Interaction between challenges

Definition of competing challenges: **challenges whose durations overlap with each other.**

The similarity of challenges should be considered as well:

1. Docvec similarity
2. Tag similarity

In [None]:
# Keep only the challenge id and its stored number of competing challenges.
query = [
    {'$project': {'_id': False, 'id': True, 'num_of_competing_challenges': True}},
]

# Runs directly against `DB.TopcoderMongo.feature`
# (presumably the collection holding precomputed features; verify in topcoder_mongo).
competing_challenges = pd.DataFrame.from_records(DB.TopcoderMongo.feature.aggregate(query))

In [None]:
# Summary statistics of the competing-challenge counts.
competing_challenges.describe()

In [None]:
# Number of competing challenges: histogram on `ax`, density curve on a twin y-axis `axt`.
fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)
axt = ax.twinx()

sns.histplot(x=competing_challenges.num_of_competing_challenges, lw=0.5, bins=48, ax=ax)
sns.kdeplot(x=competing_challenges.num_of_competing_challenges, color='#ffa600', alpha=0.85, ax=axt)

sns.despine(ax=ax, left=True)
sns.despine(ax=axt, left=True)

# The twin axis only carries the density curve: hide its grid, ticks and label.
axt.grid(False)
axt.set_yticks([])
axt.set_ylabel('')

# View limited to 0-500; x tick every 10, y tick every 100, horizontal grid lines only.
ax.set_xlim(0, 500)
ax.tick_params(axis='y', length=0)
ax.xaxis.grid(False)
ax.yaxis.grid(True, color='white', alpha=0.5)
ax.xaxis.set_major_locator(mticker.MultipleLocator(10))
ax.yaxis.set_major_locator(mticker.MultipleLocator(100))

ax.set_xlabel('Number of Competing Challenges')

# Annotate only the short bars in the tail (x >= 200, height in (0, 50]),
# whose counts are hard to read off the axis.
for p in ax.patches:
    if p.get_x() >= 200 and 0 < p.get_height() <= 50:
        cnt = p.get_height()
        x = p.get_x() + p.get_width() / 2
        y = p.get_height()
        ax.annotate(int(cnt), xy=(x, y), xytext=(x, y + 25), ha='center', color='white', alpha=0.85)