In [None]:
# Pickled DataFrame that the notebook loads as its input.
DATA_PATH = '../data/processed/02_cleaned_df.pkl'

# Columns that get one-hot encoded: the role column and the technology columns.
ROLE_COLS = ['DevType']
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
]

# Role labels that are dropped from the encoded frame before any analysis.
EXCLUDE_ROLES = [
    'Other (please specify):',
    'Student',
    'Designer',
    'Educator',
    'Marketing or sales professional',
    'Engineering manager',
    'Senior Executive (C-Suite, VP, etc.)',
    'Product manager',
    'Engineer, site reliability',
]


In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

from scripts.preprocessing import one_hot_encode

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

### Initialize and preprocess

In [None]:
# Read the pickled DataFrame stored at DATA_PATH.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
processed_df = pd.read_pickle(DATA_PATH)

In [None]:
# One-hot encode the role column together with every technology column
columns_to_encode = ROLE_COLS + TECH_COLS
ohe_df = one_hot_encode(processed_df, columns_to_encode)

In [None]:
# Remove the excluded roles, matched against the second level of the column index
ohe_df = ohe_df.drop(columns=EXCLUDE_ROLES, level=1)

# Visualize

## 1. Jobs' frequency

In [None]:
# Sum every role column and list the roles from largest to smallest total
role_totals = ohe_df['DevType'].sum()
jobs_freq = role_totals.sort_values(ascending=False).reset_index()
jobs_freq.columns = ['role', 'n_answers']

In [None]:
# Bar chart of the per-role totals, with slanted tick labels
fig = (
    px.bar(jobs_freq, x='role', y='n_answers', text_auto='.2s')
    .update_xaxes(tickangle=-30)
    .update_layout(width=1000, height=500)
)
fig.show()

## 2. Skills' frequency

In [None]:
# Total every technology column (the DevType columns are left out)
tech_totals = ohe_df[TECH_COLS].sum()
skills_freq = tech_totals.reset_index()
skills_freq.columns = ['group', 'skill', 'freq']

In [None]:
# Treemap with skills nested under their group; tile size and colour both follow `freq`
treemap_options = dict(
    path=['group', 'skill'],
    values='freq',
    color='freq',
    color_continuous_scale='BuPu',
)
fig = px.treemap(skills_freq, **treemap_options)
fig.update_layout(width=1400, height=700)
fig.show()

## 3. Skills & Jobs relationship

In [None]:
# Roles ordered by ascending total, skills ordered by descending total
role_totals_ascending = ohe_df['DevType'].sum().sort_values()
sorted_roles = role_totals_ascending.index.tolist()

skill_totals_descending = ohe_df[TECH_COLS].sum().sort_values(ascending=False)
sorted_skills = skill_totals_descending.droplevel(level=0).index.tolist()

In [None]:
# For each role, calculate the percentage of each skill
def _skill_percentages(role):
    """Return the mean (x100) of every one-hot skill column over the rows flagged with `role`."""
    in_role = (ohe_df[('DevType', role)] == 1)

    # The mean of a 0/1 column is a proportion; scaling by 100 turns it into a percentage
    per_group = {}
    for tech_col in TECH_COLS:
        per_group[tech_col] = ohe_df.loc[in_role, tech_col].mean() * 100

    return pd.concat(per_group)


# One series per role, in the same order as sorted_roles
skills = [_skill_percentages(role) for role in sorted_roles]

In [None]:
# Combine the per-role series into one table: label columns by role, drop the
# group level from the index, order the skills, then flip so each role is a row
skills = (
    pd.concat(skills, axis=1)
    .set_axis(sorted_roles, axis=1)
    .reset_index(level=0, drop=True)
    .loc[sorted_skills]
    .T
)

In [None]:
# Heatmap of the `skills` table: its columns run along x and its index along y
heatmap_trace = go.Heatmap(
    z=skills,
    x=skills.columns,
    y=skills.index,
    colorscale='magma',
    ygap=1,
)
fig = go.Figure(data=heatmap_trace)
fig.update_layout(width=1800, height=500)
fig.show()
# Export left disabled (neither `os` nor `FIG_DIR` is defined in the cells shown here):
#fig.write_html(os.path.join(FIG_DIR, 'heatmap.html'))

## 4. Jobs dendrogram

In [None]:
# Dendrogram built from the rows of `skills`, labelled with the table's index
dendrogram_options = dict(labels=skills.index, orientation='left', color_threshold=0)
fig = ff.create_dendrogram(skills, **dendrogram_options)
fig.update_layout(height=500, width=700, showlegend=False)
fig.show()

## 5. Skills clustering

### Generate tSNE embeddings

In [None]:
# Standardise the columns of `skills`, then restore the row and column labels
scaler = StandardScaler()
std_skills = pd.DataFrame(
    scaler.fit_transform(skills),
    index=skills.index,
    columns=skills.columns,
)

In [None]:
# Embed the columns of std_skills (one point per skill) in two dimensions with t-SNE.
# NOTE(review): n_iter=10**10 is effectively an unbounded iteration cap, and recent
# scikit-learn releases rename `n_iter` to `max_iter` — confirm against the installed version.
tsne_model = TSNE(
    n_components=2,
    perplexity=3,
    learning_rate=0.01,
    init='pca',
    method='barnes_hut',
    n_jobs=6,
    n_iter=10**10,
    random_state=0,
)
skill_embedding = tsne_model.fit_transform(std_skills.T)

# Keep the skill names as the index so the points can be labelled later
tsne_projection = pd.DataFrame(skill_embedding, index=std_skills.columns)

In [None]:
# Scatter of the two embedding dimensions, each point labelled with its skill name
fig = (
    px.scatter(x=tsne_projection[0], y=tsne_projection[1], text=tsne_projection.index)
    .update_traces(textposition='top center')
    .update_layout(height=1000, width=1000, title_text='TSNE')
)
fig.show()

### Cluster embeddings

In [None]:
# Fit one agglomerative model per candidate cluster count and remember the one
# whose silhouette score is the highest (a later tie replaces an earlier winner)
range_n_cluster = list(range(10, 25))
silhoutte_score = []
best_cluster_model = None

for n_clusters in range_n_cluster:
    candidate_model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    cluster_labels = candidate_model.fit_predict(tsne_projection)

    candidate_score = silhouette_score(tsne_projection, cluster_labels)
    silhoutte_score.append(candidate_score)

    # The current score is already in the list, so this is true exactly when
    # it ties or beats every score recorded so far
    if candidate_score >= np.max(silhoutte_score):
        best_cluster_model = candidate_model

In [None]:
# Silhouette score per cluster count, with a vertical line at the selected count
ax = plt.gca()
ax.plot(range_n_cluster, silhoutte_score)
ax.axvline(best_cluster_model.n_clusters, color='black')

In [None]:
# Turn each cluster id of the selected model into a string label
cluster_labels = [f"skills_group_{label}" for label in best_cluster_model.labels_]

In [None]:
# Same embedding scatter as before, now coloured by the cluster label of each skill
scatter_options = dict(
    x=tsne_projection[0],
    y=tsne_projection[1],
    text=tsne_projection.index,
    color=cluster_labels,
)
fig = (
    px.scatter(**scatter_options)
    .update_traces(textposition='top center')
    .update_layout(height=1000, width=1000, title_text='Cluster')
)
fig.show()

In [None]:
# Print clusters: group the skill names by their cluster label and list each group.
skills_clusters = tsne_projection.index.to_series().groupby(cluster_labels).apply(list)

# The loop variable is deliberately NOT called `skills`: that name holds the
# role-by-skill DataFrame used by the heatmap, dendrogram and scaler cells, and
# rebinding it here would leave a plain list behind and break any re-run of them.
for cluster, cluster_skills in skills_clusters.items():
    print(cluster)
    print(cluster_skills)