In [None]:
import pandas as pd
from joblib import dump
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from joblib import load


In [None]:
# Load the raw training and evaluation tables from CSV. These frames are
# only copied by the next cell, so they stay available as loaded.
train_df_raw = pd.read_csv('data/train_rosters.csv')
eval_df_raw = pd.read_csv('data/eval_rosters.csv')

In [None]:
# Work on filled copies so the raw frames stay as loaded; every missing
# value is replaced with the sentinel -1.
train_df = train_df_raw.copy().fillna(-1)
eval_df = eval_df_raw.copy().fillna(-1)

train_df.head()

In [None]:
def pop_features(df, features):
    """Remove every column named in ``features`` from ``df`` in place.

    Args:
        df: Frame to mutate.
        features: Iterable of column names to delete.

    Raises:
        KeyError: at the first name that is not present in ``df``; columns
            deleted before that point stay deleted.
    """
    for column in features:
        del df[column]

# Columns excluded from the feature matrix. The same list is applied to the
# train and eval frames so both keep an identical column set.
remove_features = ['Unnamed: 0', 'index',
                   "med_num_skills_of_each_worker", "avg_num_skills_of_each_worker",
                   "chi_num_skills_of_each_worker",
                   "diff_avg_skill_penalty_and_worker_penalty",
                   "local_skill_demand",
                   "deviations_skill_staffing_from_mean",
                   "num_days", 'num_workers', "staff_req_sparcity", "skill_scarcity"
                   ]



# pop_features mutates the frames in place, so this cell raises KeyError if
# it is run a second time without re-creating train_df / eval_df.
pop_features(train_df, remove_features)
pop_features(eval_df, remove_features)
# head must be *called*; printing the attribute only shows the bound method.
print(train_df.head())


In [None]:
# Split off the label column. pop() also removes 'target' from the frames,
# so what remains in train_df / eval_df is the feature matrix.
y_train = np.ravel(train_df.pop('target').values, order='C')
y_eval = eval_df.pop('target').values
y_test = np.ravel(y_eval, order='C')

# Standardise with statistics learned from the training features only, then
# apply the same transform to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(train_df.values)
X_train = scaler.transform(train_df.values)
X_test = scaler.transform(eval_df.values)

In [None]:
def max_prune(y):
    """Return ``(len(y) - sum(y)) / len(y)``.

    For 0/1 labels this is the share of entries equal to 0, which the plots
    use as the upper bound on the prune percentage.
    """
    n_total = len(y)
    return (n_total - np.sum(y)) / n_total

def confusion_matrix_scorer(clf, X, y):
    """Scorer-style callable returning the false negative rate of ``clf``.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y: True labels (0/1) for ``X``.

    Returns:
        The false negative rate rounded to 3 decimals. Lower is better, so
        this is not suitable for search utilities that maximise the score.
    """
    y_pred = clf.predict(X)
    # Pin the label order so the matrix is always 2x2, even when y and the
    # predictions together contain only one of the two classes.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    return false_negative_rate(cm)

def fnr_complement_scorer(clf, X, y):
    """Scorer-style callable returning ``1 - false negative rate`` of ``clf``.

    Higher is better, which is the orientation a ``scoring=`` callable needs.

    Args:
        clf: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on.
        y: True labels (0/1) for ``X``.

    Returns:
        One minus the (3-decimal rounded) false negative rate.
    """
    y_pred = clf.predict(X)
    # Pin the label order so the matrix is always 2x2, even when y and the
    # predictions together contain only one of the two classes.
    cm = confusion_matrix(y, y_pred, labels=[0, 1])
    return 1 - false_negative_rate(cm)

def false_negative_rate(cm):
    """Return ``cm[1][0] / (cm[1][0] + cm[1][1])`` rounded to 3 decimals.

    ``cm`` is indexed as ``cm[true][predicted]``, so row 1 holds the samples
    whose true label is 1.
    """
    missed, caught = cm[1][0], cm[1][1]
    return round(missed / (missed + caught), 3)

def negative_pred_val(cm):
    """Return ``cm[0][0] / (cm[0][0] + cm[1][0])`` rounded to 3 decimals.

    Column 0 of ``cm`` holds the samples predicted as class 0; this is the
    share of them whose true label is also 0.
    """
    true_neg, false_neg = cm[0][0], cm[1][0]
    return round(true_neg / (true_neg + false_neg), 3)

def prune_percentage(cm):
    """Return the share of all samples predicted as class 0, rounded to 3 decimals.

    Computed as ``(cm[0][0] + cm[1][0]) / sum(cm)``.
    """
    predicted_negative = cm[0][0] + cm[1][0]
    return round(predicted_negative / np.sum(cm), 3)

def print_metrics(clf):
    """Print cross-validated accuracy/F1 and confusion-matrix metrics for ``clf``.

    Reads the module-level ``X_test`` / ``y_test``. ``clf`` must already be
    fitted for the confusion-matrix part, which calls ``clf.predict``.

    NOTE(review): the 10-fold cross-validation runs on ``X_test``/``y_test``,
    i.e. it refits clones of ``clf`` on folds of the evaluation split.
    Confirm this is intended rather than cross-validating on the train split.
    """
    cv_reports = (
        ("Acc cv: %0.3f (+/- %0.2f)", 'accuracy'),
        ("F1 cv: %0.3f (+/- %0.2f)", 'f1_macro'),
    )
    for template, scoring in cv_reports:
        scores = cross_val_score(clf, X_test, y_test, cv=10, scoring=scoring)
        # The reported spread is two standard deviations of the fold scores.
        print(template % (scores.mean(), scores.std() * 2))

    cm = confusion_matrix(y_test, clf.predict(X_test))
    print(cm)
    print("fnr = ", false_negative_rate(cm))
    print("prune = ", prune_percentage(cm))

def threshold_to_metrics(clf, threshold=0.99):
    """Evaluate ``clf`` on the module-level test split with a gated class-0 decision.

    A sample is predicted as class 0 only when its class-0 probability is
    above ``threshold`` and also above its class-1 probability; otherwise it
    is predicted as class 1.

    Args:
        clf: Fitted estimator exposing ``predict_proba``; column 0 is read as
            the class-0 probability and column 1 as the class-1 probability.
        threshold: Minimum class-0 probability required to predict class 0.

    Returns:
        Dict with keys ``"q"`` (the threshold), ``"prune"`` (share predicted
        as class 0), ``"fnr"`` (false negative rate) and ``"cm"`` (the
        confusion matrix).
    """
    proba = clf.predict_proba(X_test)
    p_class0, p_class1 = proba[:, 0], proba[:, 1]

    # Class-0 probabilities at or below the threshold are zeroed so they
    # cannot win the comparison against the class-1 probability.
    gated_p0 = np.where(p_class0 <= threshold, 0, p_class0)
    y_pred = np.where(gated_p0 > p_class1, 0, 1)

    cm = confusion_matrix(y_test, y_pred)
    return {
        "q": threshold,
        "prune": prune_percentage(cm),
        "fnr": false_negative_rate(cm),
        "cm": cm,
    }

def print_metrics_with_threshold(clf, threshold=0.99):
    """Print the confusion matrix, FNR and prune share of ``clf`` at ``threshold``.

    Args:
        clf: Fitted estimator exposing ``predict_proba``.
        threshold: Decision threshold forwarded to ``threshold_to_metrics``.
    """
    metrics = threshold_to_metrics(clf, threshold)
    print(metrics['cm'])
    print("fnr = ", metrics['fnr'])
    # threshold_to_metrics stores the prune share under 'prune'; the former
    # 'prune_pc' lookup raised KeyError.
    print("prune = ", metrics['prune'])


In [None]:
# Baseline model: logistic-loss SGD with balanced class weights. The
# comparison plots below use it as their default base_clf.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
sgd = SGDClassifier(
    loss='log',
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
sgd.fit(X_train, y_train)

In [None]:
# Threshold sweep shared by the comparison plots, which iterate over
# np.arange(x_min, x_max, x_step).
x_min, x_max, x_step = 0, 1.0, .1

def plot_prune(clf, base_clf=sgd, plot_name=None):
    """Plot prune percentage against decision threshold for two classifiers.

    Thresholds come from ``np.arange(x_min, x_max, x_step)`` using the
    module-level sweep constants.

    Args:
        clf: Fitted classifier, drawn with the legend label 'model 1'.
        base_clf: Fitted reference classifier, drawn as 'model 0'. The
            default is the ``sgd`` object bound when this function is defined.
        plot_name: If given, the figure is saved to ``images/<plot_name>``
            before it is shown.

    Returns:
        The ``matplotlib.pyplot`` module.
    """
    def sweep(model):
        # (threshold, prune share) pairs ordered by threshold.
        return sorted(
            (q, threshold_to_metrics(model, q)['prune'])
            for q in np.arange(x_min, x_max, x_step)
        )

    x, y = zip(*sweep(clf))
    x_b, y_b = zip(*sweep(base_clf))

    line_clf, = plt.plot(x, y, label='model 1')
    line_base, = plt.plot(x_b, y_b, label='model 0')
    plt.legend(handles=[line_clf, line_base], loc='upper right')

    plt.xlabel("Decision Threshold")
    plt.ylabel("Prune Percentage")

    if plot_name is not None:
        plt.gcf().savefig(f"images/{plot_name}")

    plt.show()
    return plt

def plot_max_prune(clf, base_clf=sgd, plot_name=None):
    """Plot prune percentage relative to the achievable maximum, per threshold.

    Each classifier's prune share is divided by ``max_prune(y_test)``, so a
    value of 1.0 means the model prunes as many samples as there are
    class-0 samples in the test split.

    Args:
        clf: Fitted classifier, drawn with the legend label 'model 1'.
        base_clf: Fitted reference classifier, drawn as 'model 0'. The
            default is the ``sgd`` object bound when this function is defined.
        plot_name: If given, the figure is saved to ``images/<plot_name>``
            before it is shown.

    Returns:
        The ``matplotlib.pyplot`` module.
    """
    max_prune_pc = max_prune(y_test)
    q_metrics = {}
    base = {}
    for q in np.arange(x_min, x_max, x_step):
        q_metrics[q] = threshold_to_metrics(clf, q)['prune'] / max_prune_pc
        base[q] = threshold_to_metrics(base_clf, q)['prune'] / max_prune_pc

    x, y = zip(*sorted(q_metrics.items()))
    x_b, y_b = zip(*sorted(base.items()))

    l1, = plt.plot(x, y, label='model 1')
    l2, = plt.plot(x_b, y_b, label='model 0')
    plt.legend(handles=[l1, l2], loc='upper right')
    # Axis labels added for parity with plot_prune / plot_fnr, so the saved
    # figure can be read on its own.
    plt.xlabel("Decision Threshold")
    plt.ylabel("Prune Percentage / Maximum Prune")
    if plot_name is not None:
        fig = plt.gcf()
        fig.savefig(f"images/{plot_name}")

    plt.show()
    return plt


In [None]:
def plot_fnr(clf, base_clf=sgd, plot_name=None):
    """Plot false negative rate against decision threshold for two classifiers.

    Thresholds come from ``np.arange(x_min, x_max, x_step)`` using the
    module-level sweep constants.

    Args:
        clf: Fitted classifier, drawn with the legend label 'model 1'.
        base_clf: Fitted reference classifier, drawn as 'model 0'. The
            default is the ``sgd`` object bound when this function is defined.
        plot_name: If given, the figure is saved to ``images/<plot_name>``
            before it is shown.

    Returns:
        The ``matplotlib.pyplot`` module.
    """
    def sweep(model):
        # (threshold, false negative rate) pairs ordered by threshold.
        return sorted(
            (q, threshold_to_metrics(model, q)['fnr'])
            for q in np.arange(x_min, x_max, x_step)
        )

    x, y = zip(*sweep(clf))
    x_b, y_b = zip(*sweep(base_clf))

    line_clf, = plt.plot(x, y, label='model 1')
    line_base, = plt.plot(x_b, y_b, label='model 0')
    plt.legend(handles=[line_clf, line_base], loc='upper right')
    plt.xlabel("Decision Threshold")
    plt.ylabel("False Negative Rate")

    if plot_name is not None:
        plt.gcf().savefig(f"images/{plot_name}")
    plt.show()
    return plt

In [None]:
def plot_prune_single(clf, plot_name=None, range=(0, 1, .05)):
    """Plot prune percentage against decision threshold for one classifier.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        plot_name: If given, the figure is saved to ``images/<plot_name>``
            before it is shown.
        range: ``(start, stop, step)`` passed to ``np.arange`` to build the
            thresholds. The name shadows the builtin inside this function.

    Returns:
        The ``matplotlib.pyplot`` module.
    """
    start, stop, step = range[0], range[1], range[2]
    # (threshold, prune share) pairs ordered by threshold.
    curve = sorted(
        (q, threshold_to_metrics(clf, q)['prune'])
        for q in np.arange(start, stop, step)
    )
    x, y = zip(*curve)

    plt.plot(x, y)
    plt.xlabel("Decision Threshold")
    plt.ylabel("Prune Percentage")
    if plot_name is not None:
        plt.gcf().savefig(f"images/{plot_name}")
    plt.show()
    return plt

def plot_fnr_against_prune_single(clf, plot_name=None, range=(0, 1, .05)):
    """Plot prune percentage against false negative rate over a threshold sweep.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        plot_name: If given, the figure is saved to ``images/<plot_name>``
            before it is shown.
        range: ``(start, stop, step)`` passed to ``np.arange`` to build the
            thresholds. The name shadows the builtin inside this function.

    Returns:
        The ``matplotlib.pyplot`` module.
    """
    # Collect one (fnr, prune) point per threshold. A dict keyed by the
    # rounded FNR would silently drop every threshold that shares an FNR
    # value with a later one, so the points are kept in a list instead.
    points = []
    for q in np.arange(range[0], range[1], range[2]):
        metrics = threshold_to_metrics(clf, q)
        points.append((metrics['fnr'], metrics['prune']))

    points.sort()  # order along the x axis (FNR), ties broken by prune share
    x, y = zip(*points)

    plt.plot(x, y)
    plt.xlabel("False Negative Rate")
    plt.ylabel("Prune Percentage")
    if plot_name is not None:
        fig = plt.gcf()
        fig.savefig(f"images/{plot_name}")
    plt.show()
    return plt


def plot_fnr_single(clf, plot_name=None, range=(0, 1, .05)):
    """Plot false negative rate against decision threshold for one classifier.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        plot_name: If given, the figure is saved to ``images/<plot_name>``
            before it is shown.
        range: ``(start, stop, step)`` passed to ``np.arange`` to build the
            thresholds. The name shadows the builtin inside this function.

    Returns:
        The ``matplotlib.pyplot`` module.
    """
    start, stop, step = range[0], range[1], range[2]
    # (threshold, false negative rate) pairs ordered by threshold.
    curve = sorted(
        (q, threshold_to_metrics(clf, q)['fnr'])
        for q in np.arange(start, stop, step)
    )
    x, y = zip(*curve)

    plt.plot(x, y)
    plt.xlabel("Decision Threshold")
    plt.ylabel("False Negative Rate")
    if plot_name is not None:
        plt.gcf().savefig(f"images/{plot_name}")
    plt.show()
    return plt

In [None]:
# Same logistic-loss SGD configuration as the baseline, but without class
# weighting, to show the effect of class_weight='balanced'.
clf_0 = SGDClassifier(
    loss='log',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
clf_0.fit(X_train, y_train)

In [None]:
# FNR per threshold: unweighted clf_0 ('model 1') against the default
# base_clf ('model 0'); the figure is also saved under images/.
plot_fnr(clf_0, plot_name="sgd_fnr_raw_vs_balanced.png")

In [None]:
# Prune share per threshold: unweighted clf_0 ('model 1') against the
# default base_clf ('model 0'); the figure is also saved under images/.
plot_prune(clf_0, plot_name="sgd_prune_raw_vs_balanced.png")


In [None]:
# Logistic-loss SGD with elastic-net regularisation and balanced class weights.
clf_1 = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
clf_1.fit(X_train, y_train)

In [None]:
# FNR per threshold: clf_1 ('model 1') against the default base_clf ('model 0').
plot_fnr(clf_1)


In [None]:
# Prune share per threshold: clf_1 ('model 1') against the default base_clf ('model 0').
plot_prune(clf_1)


In [None]:
# SGD with the modified Huber loss and balanced class weights; no penalty
# argument is passed, so the estimator's default regulariser applies.
clf_2 = SGDClassifier(
    loss='modified_huber',
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
clf_2.fit(X_train, y_train)

In [None]:
# FNR per threshold: clf_2 ('model 1') against the default base_clf ('model 0').
plot_fnr(clf_2)

In [None]:
# Prune share per threshold: clf_2 ('model 1') against the default base_clf ('model 0').
plot_prune(clf_2)


In [None]:
# Modified Huber loss with elastic-net regularisation and balanced class
# weights; later cells use this model as the reference in comparisons.
clf_3 = SGDClassifier(
    loss='modified_huber',
    penalty="elasticnet",
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
clf_3.fit(X_train, y_train)

In [None]:
# FNR per threshold: clf_3 ('model 1') against clf_2 as base ('model 0').
plot_fnr(clf_3, clf_2)

In [None]:
# Prune share per threshold: clf_3 ('model 1') against clf_2 as base ('model 0').
plot_prune(clf_3, clf_2)

In [None]:
# Single-model figures for clf_3 over the default (0, 1, .05) threshold
# sweep; the second positional argument is plot_name, so each figure is
# saved under images/.
plot_fnr_single(clf_3, "sgd_tuned_q_fnr.png")
plot_prune_single(clf_3, "sgd_tuned_q_pruned.png")
plot_fnr_against_prune_single(clf_3, "sgd_tuned_fnr_against_prune.png")

In [None]:
# Modified Huber loss with L1 regularisation and balanced class weights.
clf_4 = SGDClassifier(
    loss='modified_huber',
    penalty="l1",
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
clf_4.fit(X_train, y_train)

In [None]:
# FNR per threshold: clf_4 ('model 1') against clf_3 as base ('model 0').
plot_fnr(clf_4, clf_3)

In [None]:
# Prune share per threshold: clf_4 ('model 1') against clf_3 as base ('model 0').
plot_prune(clf_4, clf_3)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with balanced class weights; no solver argument is
# passed, so the estimator's default solver is used.
clf_5 = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
clf_5.fit(X_train, y_train)

In [None]:
# FNR per threshold: clf_5 ('model 1') against clf_3 as base ('model 0').
plot_fnr(clf_5, clf_3)

In [None]:
# Prune share per threshold: clf_5 ('model 1') against clf_3 as base ('model 0').
plot_prune(clf_5, clf_3)

In [None]:
# Logistic regression fitted with the 'sag' solver and balanced class weights.
clf_6 = LogisticRegression(
    solver='sag',
    class_weight='balanced',
    max_iter=1000,
    tol=1e-3,
    random_state=0,
)
clf_6.fit(X_train, y_train)

In [None]:
# FNR per threshold: clf_6 ('model 1') against clf_3 as base ('model 0').
plot_fnr(clf_6, clf_3)

In [None]:
# Prune share per threshold: clf_6 ('model 1') against clf_3 as base ('model 0').
plot_prune(clf_6, clf_3)



In [None]:
import matplotlib.pyplot as plt

In [None]:
def roc(clf):
    """Compute the ROC curve and its area for ``clf`` on the module-level test split.

    Class 0 is treated as the positive class: the score is column 0 of
    ``predict_proba`` and ``pos_label`` is 0.

    Returns:
        Dict with keys ``"roc"`` (area under the curve), ``"fpr"`` and
        ``"tpr"`` (the curve coordinates returned by ``roc_curve``).
    """
    class0_scores = clf.predict_proba(X_test)[:, 0]
    fpr, tpr, _ = roc_curve(y_test, class0_scores, pos_label=0)
    return dict(roc=auc(fpr, tpr), fpr=fpr, tpr=tpr)

roc_1 = roc(clf_1)
roc_2 = roc(clf_2)
roc_3 = roc(clf_3)
roc_4 = roc(clf_4)



%matplotlib inline
plt.figure()
lw = 2
plt.plot(roc_1['fpr'], roc_1['tpr'], color='green',
         lw=lw, label='ROC dtree (area = %0.3f)' % roc_1['roc'])
plt.plot(roc_2['fpr'], roc_2['tpr'], color='red',
         lw=lw, label='ROC dtree (area = %0.3f)' % roc_2['roc'])
plt.plot(roc_3['fpr'], roc_3['tpr'], color='blue',
         lw=lw, label='ROC dtree (area = %0.3f)' % roc_3['roc'])
plt.plot(roc_4['fpr'], roc_4['tpr'], color='orange',
         lw=lw, label='ROC dtree (area = %0.3f)' % roc_4['roc'])


plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Negative Rate')
plt.ylabel('True Negative Rate')
plt.title('ROC Analysis')
plt.legend(loc="lower right")
plt.savefig('out.png')
plt.show()

In [None]:
import seaborn as sns
from sklearn.model_selection import GridSearchCV, StratifiedKFold
lr = SGDClassifier(max_iter=1000, tol=1e-3, loss='log', random_state=0)

# Candidate weights for class 1; class 0 receives the complement (1 - x).
weights = np.linspace(0.0,0.99,200)

# One class_weight dict per candidate, in the same order as `weights`.
param_grid = {'class_weight': [{0:1.0-x, 1:x} for x in weights]}

# Grid search on the train data with the default StratifiedKFold splitter,
# scored by 1 - false negative rate (higher is better).
gridsearch = GridSearchCV(estimator= lr,
                          param_grid= param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring=fnr_complement_scorer,
                          verbose=2).fit(X_train, y_train)

# Plot the mean CV score for each class-1 weight.
sns.set_style('whitegrid')
plt.figure(figsize=(12,8))
# param_grid assigns x to class 1, so the x axis must be `weights` itself;
# `1 - weights` is the class-0 weight and contradicted the axis label.
weigh_data = pd.DataFrame({ 'score': gridsearch.cv_results_['mean_test_score'], 'weight': weights})
# Keyword x/y: newer seaborn releases no longer accept them positionally.
sns.lineplot(x=weigh_data['weight'], y=weigh_data['score'])
plt.xlabel('Weight for class 1')
# The scorer is fnr_complement_scorer, not F1.
plt.ylabel('1 - False Negative Rate')
plt.xticks([round(i/10,1) for i in range(0,11,1)])
plt.title('Scoring for different class weights', fontsize=24)

In [None]:
# Second class-weight grid search, identical to the previous one except for
# the scoring callable.
# NOTE(review): `fnr_prune_scorer` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel unless it is
# defined elsewhere — confirm and add the missing scorer definition.
from sklearn.model_selection import GridSearchCV, StratifiedKFold
lr = SGDClassifier(max_iter=1000, tol=1e-3, loss='log', random_state=0)

#Setting the range for class weights
weights = np.linspace(0.0,0.99,200)

#Creating a dictionary grid for grid search
# Each candidate x is the weight of class 1; class 0 gets 1.0 - x.
param_grid = {'class_weight': [{0:1.0-x, 1:x} for x in weights]}

#Fitting grid search to the train data with 5 folds
gridsearch_2 = GridSearchCV(estimator= lr,
                          param_grid= param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring=fnr_prune_scorer,
                          verbose=2).fit(X_train, y_train)

In [None]:
# Plot the mean CV score of gridsearch_2 for each class-1 weight.
sns.set_style('whitegrid')
plt.figure(figsize=(12,8))
# param_grid assigns x to class 1, so the x axis must be `weights` itself;
# `1 - weights` is the class-0 weight and contradicted the axis label.
weigh_data = pd.DataFrame({ 'score': gridsearch_2.cv_results_['mean_test_score'], 'weight': weights})
# Keyword x/y: newer seaborn releases no longer accept them positionally.
sns.lineplot(x=weigh_data['weight'], y=weigh_data['score'])
plt.xlabel('Weight for class 1')
# The search is not scored by F1; label the axis with what is plotted.
plt.ylabel('Mean CV score')
plt.xticks([round(i/10,1) for i in range(0,11,1)])
plt.title('Scoring for different class weights', fontsize=24)

In [None]:
# Show the last 30 rows of the score/weight table built for the plot.
print(weigh_data.tail(30))