In [None]:
# Constants

# --- File locations (relative to the notebook's directory) ---
DF_PATH = "../data/processed/1_preprocessed_df.pkl"
JOB_LIST_PATH = "../data/raw/roles_short_names.csv"
FIG_PATH = "../reports/figures"
EXPORT_FEATURES_DIR = "../data/processed/"

# --- Column groups of the loaded DataFrame ---
ROLE_COLS = ['DevType']
TECH_COLS = [
    'DatabaseWorkedWith',
    'LanguageWorkedWith',
    # 'NEWCollabToolsWorkedWith',
    'PlatformWorkedWith',
    'MiscTechWorkedWith',
    'WebframeWorkedWith',
]
SYST_COLS = ['OpSys']


In [None]:
import pandas as pd 
import numpy as np
import pickle
import os
import logging
import yaml

# Visualisation matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Visualisation plotly
import plotly
import plotly.graph_objects as go
import plotly.express as px

# Preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.manifold import TSNE

# Scikit-learn packages
from sklearn.pipeline import make_pipeline
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import silhouette_score

# Display options
## Let pandas render up to 1000 rows before truncating a displayed frame
pd.set_option("display.max_rows", 1000)

<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> 📊🧮 Read Data and preprocess it </span></h2>

In [None]:
# Read Data
# Load the preprocessed DataFrame from the pickle file at DF_PATH.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle(DF_PATH)

In [None]:
def _one_hot_multilabel(series):
    """One-hot encode a Series whose cells each hold a collection of labels.

    Returns a DataFrame with one 0/1 column per distinct label, sharing the
    index of ``series``.
    """
    binarizer = MultiLabelBinarizer()
    encoded = binarizer.fit_transform(series)
    return pd.DataFrame(encoded, columns=binarizer.classes_, index=series.index)


# Encode every role / tech column separately, keyed by its source column name
encoded_dfs = {col: _one_hot_multilabel(df[col]) for col in ROLE_COLS + TECH_COLS}

# Merge the one-hot frames side by side; the dict keys become the first
# level of the resulting column MultiIndex
ohe_df = pd.concat(encoded_dfs, axis=1)

<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> 🪫 Dimensionality reduction </span></h2>

<p style="padding: 6px;
              color:#174f20;">
💡 We will use several dimensionality-reduction techniques for visualisation.
</p>

In [None]:
# Keep only the encoded skill columns (everything except the DevType roles)
skills_ohe = ohe_df.drop(columns='DevType').copy()

# Standardize each skill column before projecting / clustering
skill_scaler = StandardScaler()
std_skills = skill_scaler.fit_transform(skills_ohe)

In [None]:
# Inspect the standardized skills matrix returned by StandardScaler
std_skills

<font style="font-family:Georgia; color:#aa0c0c;">
📌 We need to standardize the data to get accurate clustering.
</font>

<h4 id="heading"><span style="background-color:#fcf3b9; color:#3d3710 ; display:fill;border-radius:5px; font-family:Georgia"> 📋 PCA </span></h4>

In [None]:
from sklearn.decomposition import PCA, KernelPCA

# Project the skills onto 2 components. The standardized matrix is transposed
# so that every skill (a column of skills_ohe) is treated as one sample.
# NOTE(review): KernelPCA is left at its default kernel — confirm plain PCA was not intended.
pca = KernelPCA(n_components=2)
X_pca = pca.fit_transform(std_skills.T)

# One row per skill, labelled with the (source column, skill) MultiIndex
pca_projection = pd.DataFrame(data=X_pca, index=skills_ohe.columns)

In [None]:
# Interactive docstring lookup, kept for reference only. It is commented out so
# that "Restart Kernel -> Run All" does not open the help pager.
# ?PCA

In [None]:
# Sanity check: one row per skill column and 2 projected components
pca_projection.shape

In [None]:
# Scatter the 2-D PCA coordinates of every skill, labelled with the skill name.
# droplevel(0) strips the source-column level of the index so that only the
# skill name is used as the text label.
fig = px.scatter(x=pca_projection[0], y=pca_projection[1], text=pca_projection.droplevel(0).index)
fig.update_traces(textposition='top center')
# The title names the projection that is actually plotted in this figure
fig.update_layout(height=1000, width=1000, title_text='PCA')
fig.show()

<h4 id="heading"><span style="background-color:#fcf3b9; color:#3d3710 ; display:fill;border-radius:5px; font-family:Georgia"> 📋 TruncatedSVD </span></h4>

In [None]:
# ?TruncatedSVD

# from sklearn.decomposition import TruncatedSVD

# svd = TruncatedSVD(n_components=2, algorithm='randomized',
#                    random_state=0,
#                    n_iter=5,
#                    n_oversamples=10,
#                    power_iteration_normalizer='auto',
#                    tol=0.0,)
# X_svd = svd.fit_transform(std_skills.T)
# svd_projection = pd.DataFrame(X_pca, index=skills_ohe.columns)

# svd_projection.shape

# fig = px.scatter(x=svd_projection[0], y=svd_projection[1], text=svd_projection.droplevel(0).index)
# fig.update_traces(textposition='top center')
# fig.update_layout(height=1000, width=1000, title_text='TSNE')
# fig.show()

<h4 id="heading"><span style="background-color:#fcf3b9; color:#3d3710 ; display:fill;border-radius:5px; font-family:Georgia"> 📋 TSNE </span></h4>

In [None]:
# Display the one-hot encoded skills table (all encoded columns except DevType)
skills_ohe

In [None]:
# Embed every skill in 2-D with t-SNE. The standardized matrix is transposed
# so that each skill (a column of skills_ohe) is one sample.
# NOTE(review): n_iter=10**10 is effectively "no iteration cap"; recent
# scikit-learn releases appear to rename this argument to max_iter — confirm
# against the installed version.
tsne_model = TSNE(
    n_components=2,
    perplexity=3,
    learning_rate=0.01,
    init='pca',
    method='barnes_hut',
    n_jobs=2,
    n_iter=10**10,
    random_state=0,
)
skill_embedding = tsne_model.fit_transform(std_skills.T)

# One row per skill with named coordinates, indexed by (source column, skill)
tsne_projection = pd.DataFrame(
    skill_embedding,
    columns=['x', 'y'],
    index=skills_ohe.columns,
)

In [None]:
# Show the shape of the t-SNE projection together with the coordinates of each skill
tsne_projection.shape, tsne_projection 

In [None]:
# Scatter the t-SNE coordinates; each point is labelled with its skill name
# (the source-column level of the index is dropped for readability)
fig = px.scatter(
    x=tsne_projection['x'],
    y=tsne_projection['y'],
    text=tsne_projection.droplevel(0).index,
)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='TSNE', height=1000, width=1000)
fig.show()

_________________

<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> 🎡 Cluster  </span></h2>

<font style="font-family:Georgia; color:#aa0c0c;">
📌 We don't know how many clusters we need.
</font>
<br>
<font style="font-family:Georgia; color:#174f20;">
💡 To solve this problem, we use the silhouette score to find which number of clusters gives the highest score.
</font>

In [None]:
# Candidate numbers of clusters to evaluate (5 to 24 inclusive)
range_n_cluster = list(range(5, 25))
silhoutte_score = []
best_cluster_model = None
best_score = -np.inf

for k in range_n_cluster:
    # Fit a Ward-linkage agglomerative clustering on the 2-D t-SNE coordinates
    candidate = AgglomerativeClustering(n_clusters=k, linkage='ward')
    labels = candidate.fit_predict(tsne_projection)

    # Average silhouette of this partition; recorded for the plot below
    score = silhouette_score(tsne_projection, labels)
    silhoutte_score.append(score)

    # ">=" means a tie is resolved in favour of the larger cluster count
    if score >= best_score:
        best_score = score
        best_cluster_model = candidate

In [None]:
# Silhouette score per candidate cluster count; the vertical line marks the
# cluster count of the selected model
silhouette_fig, silhouette_ax = plt.subplots()
silhouette_ax.plot(range_n_cluster, silhoutte_score)
silhouette_ax.axvline(best_cluster_model.n_clusters, color='black')

In [None]:
# Human-readable group name for every skill, in the row order of tsne_projection
cluster_labels = [f"skills_group_{label}" for label in best_cluster_model.labels_]

In [None]:
# cluster_labels

In [None]:
# Same t-SNE scatter as before, now coloured by the assigned skill group
skill_labels = tsne_projection.droplevel(0).index
fig = px.scatter(
    x=tsne_projection['x'],
    y=tsne_projection['y'],
    text=skill_labels,
    color=cluster_labels,
)
fig.update_traces(textposition='top center')
fig.update_layout(title_text='Cluster', height=800, width=800)
fig.show()

In [None]:
# Skill names as a Series (the source-column level of the index is dropped)
skill_names = tsne_projection.index.droplevel(0).to_series()

# Group the skill names by their cluster label and collect each group in a list
skills_clusters = skill_names.groupby(cluster_labels).agg(list)

* What this actually does: `.index` retrieves the index of the DataFrame.
* Because the index has multiple levels, we need to drop the first level; that is the purpose of `.droplevel(0)`.
* `.to_series()` transforms the index into a Series, on which we can apply pandas' built-in functions.
* `.groupby()` groups the index values, which contain the names of all the software, technologies and skills.
* Finally, we use `.agg(list/dict/...)` or `.apply(list/dict/...)` (we can also use `.transform(list/dict/...)`) to apply the aggregation to the elements of each group of the Series, which in our case are the index values.

In [None]:
#tsne_projection.index.droplevel(0).to_series().groupby(cluster_labels).apply(list)[0]

In [None]:
#cluster_labels

In [None]:
# Display the list of skills assigned to each cluster label
skills_clusters

In [None]:
# Print each cluster name followed, on the next line, by its list of skills
for cluster, skills in skills_clusters.items():
    print(cluster, skills, sep="\n")

_________________

<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> Create new features  </span></h2>

`skills_clusters` has two components:
 1. **skill**: the skills that were grouped together.
 2. **cluster**: the name of the group of skills.

To break down the cell below:
* `skills_ohe` is the data we already encoded, which has a multi-level column index.
* `.droplevel(0, axis=1)` drops the first level of the column index, returning a DataFrame with a single-level column index.
* Indexing with a list of names, `[skills]`, extracts a sub-table containing only the columns named in the list.
* Finally, `.sum(axis=1)` sums across the columns of that sub-table and stores the result in a single column.

To summarize:
- 📋 For each individual, we count how many technologies of each group they used in their work.

In [None]:
# skills_ohe.droplevel(0, axis=1)[skills]

In [None]:
# Skills table with a flat (skill-name only) column index
flat_skills_ohe = skills_ohe.droplevel(0, axis=1)

# One Series per cluster: for every respondent, the number of skills of that
# cluster they have (row-wise sum of the cluster's one-hot columns)
new_features = [
    flat_skills_ohe[skills].sum(axis=1).rename(cluster)
    for cluster, skills in skills_clusters.items()
]

# Assemble the per-cluster counts into one frame, one column per cluster
fe_clustered_skills = pd.concat(new_features, axis=1)

In [None]:
# Display the engineered features: per-respondent skill counts for each cluster
fe_clustered_skills

_________________

<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> Illustration Model  </span></h2>

#### Create train / test matrices 

In [None]:
# Feature matrix: the new per-cluster counts followed by the original one-hot
# skill columns (with the source-column level dropped)
flat_skills = skills_ohe.droplevel(0, axis=1)
combined_features_df = pd.concat([fe_clustered_skills, flat_skills], axis=1)

# Target matrix: the one-hot encoded DevType roles
roles_df = ohe_df['DevType'].copy()

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(
    combined_features_df,
    roles_df,
    test_size=0.3,
    random_state=0,
)

#### Compute sample weights to deal with class imbalance 

In [None]:
# Preview the per-sample, per-role weight matrix (inverse role frequency times
# the one-hot training targets). The weights are computed inline rather than
# through `class_weights`, which is only assigned in the following cell, so
# that this cell also runs on a fresh kernel.
np.multiply((1 / roles_df.sum(axis=0)).values, Y_train.values)

In [None]:
# Weight every role by the inverse of its frequency in the full target matrix
class_weights = 1 / roles_df.sum(axis=0)

# Scale the one-hot training targets by the role weights, then add them up
# per row: a sample's weight is the sum of the weights of its roles
weighted_targets = Y_train.values * class_weights.values
sample_weight = weighted_targets.sum(axis=1)

In [None]:
# Shape of the training target matrix (rows, role columns)
Y_train.values.shape

In [None]:
# Inspect the computed per-sample weights
sample_weight

#### Train and evaluate a classifier on each feature set 

In [None]:
def f1_from_confusion_matrix(confusion_matrix):
    """Compute the F1 score of the positive class from a 2x2 confusion matrix.

    Parameters
    ----------
    confusion_matrix : 2x2 array supporting ``[row, col]`` indexing
        Binary confusion matrix laid out as ``[[TN, FP], [FN, TP]]``, as
        produced per label by ``multilabel_confusion_matrix``.

    Returns
    -------
    float
        ``TP / (TP + 0.5 * (FP + FN))``. When the label has no true positives,
        false positives or false negatives the score is undefined; ``0.0`` is
        returned in that case so that averages over labels do not become NaN.
    """
    true_pos = confusion_matrix[1, 1]
    false_pos = confusion_matrix[0, 1]
    false_neg = confusion_matrix[1, 0]

    denominator = true_pos + 0.5 * (false_pos + false_neg)
    # Guard against 0/0 for labels that are never present nor predicted
    if denominator == 0:
        return 0.0
    return true_pos / denominator

In [None]:
# Column names of the two feature sets compared below: the raw one-hot skills
# versus the engineered per-cluster counts
feature_sets = {
    'original': skills_ohe.columns.droplevel(0).tolist(),
    'clusters': fe_clustered_skills.columns.tolist(),
}

In [None]:
# multilabel_confusion_matricies

In [None]:
def _per_role_f1(y_true, y_pred):
    """Return the list of F1 scores, one per role column, for the predictions."""
    matrices = multilabel_confusion_matrix(y_true, y_pred)
    return [f1_from_confusion_matrix(matrix) for matrix in matrices]


results = {}
role_names = roles_df.columns.to_list()

for feature_set_name, feature_set in feature_sets.items():
    # Restrict the train / test matrices to the columns of this feature set
    sub_train = X_train[feature_set].copy()
    sub_test = X_test[feature_set].copy()

    # One weighted logistic regression per role
    clf = MultiOutputClassifier(LogisticRegression())
    clf.fit(sub_train, Y_train, sample_weight=sample_weight)

    # Per-role F1 on the data the model was fitted on and on the held-out data
    f1_train_scores = _per_role_f1(Y_train, clf.predict(sub_train))
    f1_test_scores = _per_role_f1(Y_test, clf.predict(sub_test))

    # Store the per-role scores, ordered from worst to best test F1
    set_result = pd.DataFrame(
        {"train": f1_train_scores, "test": f1_test_scores},
        index=role_names,
    )
    results[feature_set_name] = set_result.sort_values('test')

    # Summary for this feature set
    print("Feature set: " + feature_set_name)
    print(".. Mean train F1:", np.mean(f1_train_scores))
    print(".. Mean test F1:", np.mean(f1_test_scores))
    print()

_________________

<h2 id="heading"><span style="background-color:#cefffb; color:#1b4946; display:fill;border-radius:5px; font-family:cursive"> Export new features  </span></h2>

In [None]:
# Persist the engineered cluster features as a pickle in the export directory
features_path = os.path.join(
    EXPORT_FEATURES_DIR,
    "features_skills_clusters.pkl",
)
fe_clustered_skills.to_pickle(features_path)

In [None]:
# Write the cluster -> skills mapping next to the features as a YAML file
description_path = os.path.join(
    EXPORT_FEATURES_DIR,
    "features_skills_clusters_description.yaml",
)
cluster_description = skills_clusters.to_dict()
with open(description_path, 'w') as outfile:
    yaml.dump(cluster_description, outfile)