In [None]:
import numpy as np

# Input dataframe (pickled) and the directory that receives model artifacts.
DATA_PATH = '../data/processed/02_cleaned_df.pkl'
MODEL_DIR = '../models'

# Survey column holding the role labels; its one-hot columns are the targets.
ROLE_COLS = ['DevType']

# Survey columns listing technologies; their one-hot columns are the features.
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
]

# Labels removed from the one-hot encoded frame before any model is trained.
EXCLUDE_ROLES = [
    'Other (please specify):',
    'Student',
    'Designer',
    'Educator',
    'Marketing or sales professional',
    'Engineering manager',
    'Senior Executive (C-Suite, VP, etc.)',
    'Product manager',
    'Engineer, site reliability',
]

In [None]:
import numpy as np
import pandas as pd

import pickle
import os
import yaml
import time
import datetime
import copy

from scripts.preprocessing import one_hot_encode
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LassoCV
from sklearn import linear_model

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

from sklearn.metrics import auc, accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.inspection import permutation_importance
from sklearn.ensemble import StackingClassifier

### Functions

In [None]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """Score every prediction column against the matching ground-truth column.

    Parameters
    ----------
    ground_truth : DataFrame with (at least) the columns of ``predictions``.
    predictions : DataFrame whose columns are scored one by one.
    metric_function : callable invoked as ``metric_function(truth, pred)``;
        its result is multiplied by 100 and rounded to two decimals.
    sort_values : when true, the scores are returned in ascending order.

    Returns
    -------
    pandas.Series of scores indexed by column name.
    """
    # Copies are handed to the metric so it cannot touch the caller's frames.
    scores_by_col = {
        name: round(
            metric_function(ground_truth[name].copy(), predictions[name].copy()) * 100,
            2,
        )
        for name in predictions.columns
    }

    scores = pd.Series(list(scores_by_col.values()), index=list(scores_by_col.keys()))
    return scores.sort_values() if sort_values else scores

In [None]:
def get_train_test_data(job, ohe_tech, ohe_roles):
    """Build a class-balanced 70/30 train/test split for one role.

    All rows where ``ohe_roles[job] == 1`` are used as positives and an
    equally sized random sample of the remaining rows as negatives. Each
    group is split 70/30 independently, so both the train and the test set
    keep the same positive/negative ratio.

    Parameters
    ----------
    job : column of ``ohe_roles`` to model.
    ohe_tech : one-hot feature frame sharing its index with ``ohe_roles``.
    ohe_roles : one-hot role frame; ``ohe_roles[job]`` becomes the target.

    Returns
    -------
    X_train, X_test, y_train, y_test
        In every returned object the positive rows come first, followed by
        the sampled negative rows (the two groups are not interleaved).
    """
    role_mask = (ohe_roles[job] == 1)
    others = role_mask[~role_mask]

    # Never ask for more negatives than exist: sampling without replacement
    # raises ValueError when the role covers more than half of the rows.
    n_other = min(int(role_mask.sum()), len(others))

    i_role = role_mask[role_mask].index.tolist()
    i_other = others.sample(n_other, random_state=0).index.tolist()

    # Split each class separately with a fixed seed for reproducibility.
    i_role_train,  i_role_test  = train_test_split(i_role,  test_size=0.3, random_state=0)
    i_other_train, i_other_test = train_test_split(i_other, test_size=0.3, random_state=0)

    i_train = i_role_train + i_other_train
    i_test  = i_role_test  + i_other_test

    X_train, y_train = ohe_tech.loc[i_train], ohe_roles[job].loc[i_train]
    X_test,  y_test  = ohe_tech.loc[i_test],  ohe_roles[job].loc[i_test]

    return X_train, X_test, y_train, y_test

### Load data and preprocess

In [None]:
# Read the pickled dataframe stored at DATA_PATH.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
processed_df = pd.read_pickle(DATA_PATH)

In [None]:
# One hot encode the role and technology columns with the project helper.
# NOTE(review): presumably the helper returns two-level columns
# (source column, category) — the level-based drop below relies on that; confirm.
ohe_df = one_hot_encode(processed_df, ROLE_COLS + TECH_COLS)
# Remove every column whose level-1 label is listed in EXCLUDE_ROLES.
ohe_df = ohe_df.drop(EXCLUDE_ROLES, axis=1, level=1)

In [None]:
# Split X & Y: technology columns become the features, role columns the targets.
# The outer column level is dropped so the remaining level is used directly as column names.
ohe_tech  = ohe_df[TECH_COLS].droplevel(0, axis=1)
ohe_roles = ohe_df[ROLE_COLS].droplevel(0, axis=1)

In [None]:
# Check sums: column totals of the one-hot role frame, smallest first
# (i.e. how many rows carry each role).
ohe_roles.sum().sort_values()

# Train models

### Create template model

In [None]:
# Base learner 1: a large forest of shallow (depth-3) trees.
rf_clf = RandomForestClassifier(n_estimators=5000, max_depth=3, random_state=0)

# Base learner 2: standardised features fed into an elastic-net logistic
# regression whose C and l1_ratio are chosen by a cross-validated grid search.
en_clf = Pipeline(steps=[
    ('std_scale', StandardScaler()),
    ('cv_elastic_net', GridSearchCV(
        estimator=linear_model.LogisticRegression(
            penalty='elasticnet',
            solver='saga',
            max_iter=1000,
            random_state=0,
        ),
        param_grid={
            'C':        np.linspace(0.5, 1.5, 10),
            'l1_ratio': np.linspace(0,   1,   10),
        },
        n_jobs=6,
    )),
])

# Template ensemble: both base learners stacked under a plain logistic
# regression. It is copied (never fitted directly) for each job.
stacked_clf = StackingClassifier(
    estimators=[
        ('random_forest', rf_clf),
        ('elastic_net',   en_clf),
    ],
    final_estimator=linear_model.LogisticRegression(),
)

### Train models

In [None]:
# Fit one independent copy of the template ensemble per role and keep the
# data split of every role for the evaluation steps further down.
data = {}
models = {}
unique_jobs = ohe_roles.columns.to_list()

for job in unique_jobs:
    print(str(datetime.datetime.now()) + ' ... Training model for ' + job)

    # Create and save data
    X_train, X_test, y_train, y_test = get_train_test_data(job, ohe_tech, ohe_roles)
    data[job] = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}

    # Fit and save model. The template is deep-copied so stacked_clf itself
    # is never fitted; job_model is rebound to a new copy on every iteration,
    # so the fitted object can be stored directly without copying it again.
    job_model = copy.deepcopy(stacked_clf)
    job_model.fit(X_train, y_train)
    models[job] = job_model

### Evaluate models

In [None]:
# Collect the 'weighted avg' row of the classification report for every job,
# once on the data the model was fitted on and once on the held-out split.
train_evaluation, test_evaluation = {}, {}

for job in unique_jobs:
    print(str(datetime.datetime.now()) + ' ... Evaluating ' + job)

    model = models[job]

    train_fit = classification_report(
        data[job]['y_train'],
        model.predict(data[job]['X_train']),
        output_dict=True,
    )
    test_fit = classification_report(
        data[job]['y_test'],
        model.predict(data[job]['X_test']),
        output_dict=True,
    )

    train_evaluation[job] = train_fit['weighted avg']
    test_evaluation[job] = test_fit['weighted avg']

# Turn the dicts into frames with one row per job and one column per metric.
test_evaluation = pd.DataFrame(test_evaluation).transpose()
train_evaluation = pd.DataFrame(train_evaluation).transpose()

In [None]:
# Training-split scores, shown with metrics as rows and jobs as columns.
train_evaluation.T

In [None]:
# Test-split scores, shown with metrics as rows and jobs as columns.
test_evaluation.T

### Calculate feature importances

In [None]:
# Reload previously exported models so the importances can be computed
# without retraining. The path is built from MODEL_DIR, matching the export
# step, and the file is opened in a context manager so the handle is closed.
# NOTE(review): when the file exists it replaces the models trained above,
# even if it stems from an older run. Pickle can execute arbitrary code on
# load — only open files from a trusted source.
models_path = os.path.join(MODEL_DIR, 'ensemble_models.pkl')
if os.path.exists(models_path):
    with open(models_path, 'rb') as handle:
        models = pickle.load(handle)
else:
    # First run: nothing has been exported yet, keep the in-memory models.
    print(models_path + ' not found ... using the models trained in this session')

In [None]:
# Permutation importance of every feature for every job's model, stored as
# one frame per job sorted from most to least important.
# NOTE(review): importances are measured on the training split; confirm this
# is intended rather than the held-out split.
features_imps = {}

for job in unique_jobs:
    print(str(datetime.datetime.now()) + ' ... Calculating feature importances ' + job)

    X_job = data[job]['X_train']
    y_job = data[job]['y_train']

    features_importances = permutation_importance(models[job],
                                                  X_job,
                                                  y_job,
                                                  n_repeats=12,
                                                  random_state=0,
                                                  n_jobs=6)
    # Drop the raw per-repeat scores so only the per-feature summary
    # statistics remain and the result converts to a flat frame.
    features_importances.pop('importances')
    features_importances = pd.DataFrame.from_dict(features_importances)
    # Label rows with the columns of the frame that was actually permuted,
    # not with whatever X_train was left over from the training loop.
    features_importances.index = X_job.columns

    features_imps[job] = features_importances.sort_values('importances_mean', ascending=False)

In [None]:
# Print the ten most important features per job. A plain loop is used so the
# cell does not build (and display) a throw-away list of None values.
for job, imp in features_imps.items():
    print(job, imp.index[:10].tolist())

### Exporting

In [None]:
# Persist the fitted per-job models. The target directory is created first so
# the export also works when MODEL_DIR does not exist yet.
os.makedirs(MODEL_DIR, exist_ok=True)
with open(os.path.join(MODEL_DIR, 'ensemble_models.pkl'), 'wb') as handle:
    pickle.dump(models, handle)

In [None]:
# Persist the train/test evaluation frames. The target directory is created
# first so the export also works when MODEL_DIR does not exist yet.
os.makedirs(MODEL_DIR, exist_ok=True)
with open(os.path.join(MODEL_DIR, 'ensemble_models_eval.pkl'), 'wb') as handle:
    pickle.dump({'train': train_evaluation, 'test': test_evaluation}, handle)

In [None]:
# Persist the per-job feature importance frames. The target directory is
# created first so the export also works when MODEL_DIR does not exist yet.
os.makedirs(MODEL_DIR, exist_ok=True)
with open(os.path.join(MODEL_DIR, 'ensemble_models_feature_importances.pkl'), 'wb') as handle:
    pickle.dump(features_imps, handle)