# Feature engineering related analysis

This notebook is meant for the code & logic for feature engineering. It will include some visualization as well as pre-processing for the data.

In [None]:
%matplotlib inline
import os
import re
import pathlib
import itertools
from pprint import pprint
from collections import defaultdict
from datetime import datetime

import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import matplotlib.colors as mcolors

import topcoder_mongo as DB
import topcoder_ml as TML
import static_var as S
import util as U
from topcoder_feature_engineering import get_challenge_tag_combination_count, get_tag_combination_softmax, compute_tag_feature

# Dark theme shared by every figure in this notebook: a near-black canvas
# with white text, labels and ticks.
_BACKGROUND = '#121212'
_FOREGROUND = 'white'

sns.set(rc={
    **{key: _BACKGROUND for key in ('axes.facecolor', 'figure.facecolor')},
    **{key: _FOREGROUND for key in (
        'text.color',
        'axes.titlecolor',
        'axes.labelcolor',
        'xtick.color',
        'ytick.color',
    )},
    'figure.autolayout': True,
})

# Let pandas print up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [None]:
# Project-scale summary fetched from MongoDB. The list passed in presumably
# holds the bucket boundaries -- TODO confirm against topcoder_mongo.
project_scale = pd.DataFrame.from_records(
    DB.TopcoderMongo.get_project_scale([0, 10, 25, 75, 125])
)

# Long-format lookup: one row per challenge id, labelled with the `tag` of the
# project-scale row whose `challenge_lst` contains it.
challenge_wproj_tag = pd.concat([
    pd.DataFrame({'id': challenge_ids, 'proj_scale_tag': scale_tag})
    for scale_tag, challenge_ids in (
        project_scale
        .reindex(columns=['tag', 'challenge_lst'])
        .itertuples(index=False, name=None)
    )
])

# Drop the per-row challenge lists now that the lookup table exists.
project_scale = project_scale.reindex(
    columns=[name for name in project_scale.columns if name != 'challenge_lst']
)

## Top2 Prize

Training target: the sum of the top 2 placement prizes of each challenge.

In [None]:
# Aggregation pipeline: for every scoped challenge, add up the first two
# placement prize values.
query = list(DB.TopcoderMongo.scoped_challenge_with_text_query) + [
    # one document per prize set, keeping only the placement prize sets
    {'$unwind': '$prize_sets'},
    {'$match': {'prize_sets.type': 'placement'}},
    # `$slice` keeps the first two prize values, `$sum` adds them up
    {'$project': dict(
        _id=False,
        id=True,
        top2_prize={'$sum': {'$slice': ['$prize_sets.prizes.value', 2]}},
    )},
]

challenge_top2_prize = pd.DataFrame.from_records(
    data=DB.TopcoderMongo.run_challenge_aggregation(query)
)

## Tag

The challenge tags are encoded as follows:
1. Count the occurrences of single tags and of 2-tag, 3-tag and 4-tag combinations.
2. Pick the top 25 tags/combinations of each of the 4 groups.
3. Take the log of the counts and turn each group into a softmax function (4 softmax functions in total).
4. Summarize the softmax scores of the 4 groups of tag combinations into a 4-element feature array.
5. Binary-encode the selected tag combinations into a 100-element feature array (4 groups × 25 combinations).

In [None]:
%%script false --no-raise-error # Time consuming code
# Cell is disabled by the `%%script false` magic above; remove that line to run it.
# The helper's result is unpacked into exactly four values.
tag_softmax1, tag_softmax2, tag_softmax3, tag_softmax4 = get_tag_combination_softmax()

In [None]:
%%script false --no-raise-error # Time consuming code
# Cell is disabled by the `%%script false` magic above; remove that line to run it.
# The helper's result is unpacked into exactly four values.
(
    tag_comb1,
    tag_comb2,
    tag_comb3,
    tag_comb4,
) = get_challenge_tag_combination_count()

In [None]:
%%script false --no-raise-error # Time consuming code
# Cell is disabled by the `%%script false` magic above; remove that line to run it.
# Build the tag feature table and preview its first three rows.
feature_df = pd.DataFrame.from_records(data=compute_tag_feature())
feature_df.head(n=3)

### Check out the new tags

> A very random thought: can we compare the similarity of the tag arrays within a project?
> 
> And across projects as well?

In [None]:
%%script false --no-raise-error # This block of code is kinda useless, so skipping
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
from sklearn.linear_model import LinearRegression

# Only `word2vec` is referenced again in the visible part of this notebook;
# `tag_count` and `challenge_tags` are unpacked but not used further here.
(tag_count,
 challenge_tags,
 word2vec) = TML.challenge_tag_word2vec()

def reduce_dimensions(model, tag_counts=None):
    """Project the tag embeddings to 2-D and fit a reference line through the top tags.

    Parameters
    ----------
    model
        Embedding model exposing ``model.wv`` (presumably a gensim Word2Vec --
        TODO confirm against ``TML.challenge_tag_word2vec``).
    tag_counts : pandas.DataFrame, optional
        Frame with a ``tag`` column and a ``count`` column. When omitted, the
        notebook-level ``tag_count_df`` is used, which keeps existing
        one-argument calls working. Note that ``tag_count_df`` must already
        exist in the kernel in that case.

    Returns
    -------
    wv : pandas.DataFrame
        Indexed by tag, sorted by ``count`` descending, with the t-SNE
        coordinates ``x``/``y``, the columns joined from ``tag_counts`` and a
        boolean ``top30`` flag marking the 30 most frequent tags.
    reg : sklearn.linear_model.LinearRegression
        Line ``y ~ x`` fitted on the ``top30`` points only.
    """
    num_dimensions = 2  # final num dimensions; the x/y column names below assume 2
    top_n = 30          # number of most frequent tags flagged in `top30`

    if tag_counts is None:
        tag_counts = tag_count_df

    keyed_vectors = model.wv
    # If this is a gensim model: 4.x exposes the vocabulary as `key_to_index`
    # and removed `vocab`; 3.x only has `vocab`. Support both.
    vocabulary = getattr(keyed_vectors, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    # keep the words alongside their vectors so the points can be labelled later
    labels = list(vocabulary)
    vectors = np.asarray([keyed_vectors[word] for word in labels])

    # reduce using t-SNE (fixed seed so the layout is reproducible)
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    wv = pd.DataFrame(embedded, columns=['x', 'y'])
    wv['label'] = labels

    wv = (
        wv.set_index('label')
        .join(tag_counts.set_index('tag'), how='inner')
        .sort_values('count', ascending=False)
    )
    # Positional flag for the first `top_n` rows; unlike a fixed-length list
    # this also works when fewer than `top_n` tags survive the join.
    wv['top30'] = np.arange(len(wv)) < top_n

    top_tags = wv.loc[wv['top30']]

    # The fitted line is only used to decide on which side of a point its
    # label should be drawn.
    reg = LinearRegression()
    reg.fit(top_tags[['x']].to_numpy(), top_tags['y'].to_numpy())

    return wv, reg

def _annotate_tag(ax, reg, label, x, y, offset, alpha):
    """Label one point, pushing the text away from the regression line.

    Points above the line get their label shifted up-right by `offset`,
    points on or below it get it shifted down-left by the same amount.
    """
    above_line = y > reg.predict([[x]])[0]
    ha, va = ('left', 'bottom') if above_line else ('right', 'top')
    shift = offset if above_line else -offset

    ax.annotate(
        label,
        xy=(x, y),
        xytext=(x + shift, y + shift),
        arrowprops=dict(arrowstyle='-', alpha=0.25),
        alpha=alpha,
        ha=ha,
        va=va,
    )


wv, reg = reduce_dimensions(word2vec)

fig, ax = plt.subplots(figsize=(12, 12), dpi=200)
size_norm = mcolors.LogNorm(vmin=200, vmax=1500)
size_range = (50, 1000)

sns.scatterplot(
    # `wv` holds the x/y/count/top30 columns plotted here (the previously
    # referenced `word2vec_with_count` is not defined anywhere visible).
    # Reversed so the most frequent tags are drawn last.
    data=wv[::-1],
    x='x',
    y='y',
    size='count',
    sizes=size_range,
    size_norm=size_norm,
    hue='top30',
    ax=ax,
    linewidth=0.1,
    alpha=0.75,
)

sns.despine(ax=ax, left=True, bottom=True)
ax.grid(True, color='white', alpha=0.25)
ax.set_xlim(-15, 12)
ax.set_ylim(-15, 12)
# alternative, tighter view:
# ax.set_xlim(-6, 12)
# ax.set_ylim(-15, 6)

# Label the most frequent tags. Naming the index explicitly makes the tag
# available as `row.tag` regardless of what the joined index was called.
# The label offset alternates between 1 and 2 to reduce overlap.
for row in wv.loc[wv.top30].rename_axis('tag').reset_index().itertuples():
    _annotate_tag(ax, reg, row.tag, row.x, row.y, offset=row.Index % 2 + 1, alpha=0.85)

# manually annotate a few additional tags of interest (fainter labels,
# offset alternating between 1 and 2.5)
for keyword in ('angular', 'ios', 'sql', 'react'):
    keyword_tags = wv.loc[~wv.top30 & wv.index.to_series().str.lower().str.contains(keyword)]
    for row in keyword_tags.rename_axis('tag').reset_index().itertuples():
        _annotate_tag(ax, reg, row.tag, row.x, row.y, offset=2.5 ** (row.Index % 2), alpha=0.5)


In [None]:
%%script false --no-raise-error
# Cell is disabled by the `%%script false` magic above. It is the only visible
# definition of `tag_count_df`, which `reduce_dimensions` reads.
# Count how many scoped challenges carry each tag.
query = list(DB.TopcoderMongo.scoped_challenge_with_text_query) + [
    {'$unwind': '$tags'},
    {'$group': {'_id': {'tag': '$tags'}, 'count': {'$sum': 1}}},
    # flatten {_id: {tag}, count} into a {tag, count} document
    {'$replaceRoot': {'newRoot': {'$mergeObjects': ['$_id', {'count': '$count'}]}}},
]

# Most frequent tags first; tags seen fewer than 5 times are discarded.
tag_count_df = (
    pd.DataFrame.from_records(data=DB.TopcoderMongo.run_challenge_aggregation(query))
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
    .loc[lambda counts: counts['count'] >= 5]
)


In [None]:
# Number of tags attached to each scoped challenge (`$size` of the tags array).
query = list(DB.TopcoderMongo.scoped_challenge_with_text_query) + [
    {'$project': dict(id=True, num_of_tags={'$size': '$tags'}, _id=False)},
]

challenge_tag_count = pd.DataFrame.from_records(
    data=DB.TopcoderMongo.run_challenge_aggregation(query)
)

# Frequency table: how many challenges have 0, 1, 2, ... tags.
num_of_tag_freq = (
    challenge_tag_count['num_of_tags']
    .value_counts()
    .sort_index()
    .rename_axis('num_of_tag')
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(12, 5), dpi=200)

sns.barplot(
    data=num_of_tag_freq,
    x='num_of_tag',
    y='count',
    orient='v',
    linewidth=0,
    ax=ax,
)
sns.despine(ax=ax, left=True)
ax.set(
    title='Number of Tags Distribution',
    xlabel='Number of Tags',
    ylabel='Count',
    ylim=(0, 1500),
)
ax.yaxis.grid(True, color='white', alpha=0.5)

# Write each bar's count just above it, centred on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() * 0.5

    ax.annotate(
        int(bar_height),
        xy=(bar_center, bar_height),
        xytext=(bar_center, bar_height + 25),
        ha='center',
        alpha=0.85,
    )


In [None]:
# Attach the project-scale tag to every challenge's tag count, then count
# challenges per (number of tags, project scale) pair.
challenge_tag_count_wproj = pd.merge(challenge_tag_count, challenge_wproj_tag, on='id')
tag_freq_by_proj = (
    challenge_tag_count_wproj
    .groupby(['num_of_tags', 'proj_scale_tag'])
    .size()
    .reset_index(name='count')
)

fig, ax = plt.subplots(figsize=(16, 6.67), dpi=200)

# One group of bars per tag count, one bar per project-scale tag.
sns.barplot(
    data=tag_freq_by_proj,
    x='num_of_tags',
    y='count',
    hue='proj_scale_tag',
    orient='v',
    linewidth=0,
    ax=ax,
)
sns.despine(ax=ax, left=True)
ax.set(
    title='Number of Tags Distribution',
    xlabel='Number of Tags',
    ylabel='Count',
    ylim=(0, 500),
)
ax.yaxis.grid(True, color='white', alpha=0.5)


In [None]:
# Column subset of the project-scale summary. `reindex` does not raise on an
# unknown label: any column listed here that `project_scale` lacks shows up
# filled with NaN.
# NOTE(review): the capitalised `*_Challenge` names presumably refer to the
# "Challenge" type counts -- confirm they exist in `project_scale`.
ps_tChallenge = project_scale.reindex(columns=[
    'tag',
    'num_of_project',
    'num_of_challenge',
    'num_of_completed',
    'num_of_Challenge',
    'num_of_completed_Challenge',
])

In [None]:
# Number of scoped challenges (those with a top-2 prize row) per project-scale
# tag, as a two-column frame: `proj_scale_tag`, `count`.
#
# The column names are set explicitly via `rename_axis` / `reset_index(name=)`
# rather than by renaming whatever `value_counts` produced: pandas 2.x names
# the result `count` and the index `proj_scale_tag`, so the previous
# `rename(columns={'index': ..., 'proj_scale_tag': 'count'})` yielded two
# `count` columns there. This form gives the same frame on old and new pandas.
scoped_count = (
    challenge_top2_prize
    .set_index('id')
    .join(challenge_wproj_tag.set_index('id'), how='inner')
    .proj_scale_tag
    .value_counts()
    .sort_index()
    .rename_axis('proj_scale_tag')
    .reset_index(name='count')
)