In [None]:
import datetime
import pickle
import os
import gc

import numpy as np
import pandas as pd

from scipy.stats import randint as sp_randint
from joblib import dump, load
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, Normalizer, Imputer, LabelBinarizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_validate
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score
# Metrics used by the evaluation and plotting cells further down.
from sklearn.metrics import auc, confusion_matrix, precision_recall_curve, roc_curve

# NOTE(review): `plt` (matplotlib.pyplot) and `sns` (seaborn) are used below but are not
# imported explicitly in this cell; presumably they arrive through the star imports — confirm.
from analytics_package.cic.preprocessing.preprocessing import *
from analytics_package.cic.outlier_processing.outlier_processing import *
from analytics_package.cic.pipeline.pipeline import *
from analytics_package.cic.io.file import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Modeling

In [None]:
# Column names grouped by the role they play in the pipeline:
#   'numeric' and 'nominal' are handed to ColumnSelector inside Model(),
#   'target' is the label column read when evaluating predictions.
# NOTE(review): the None entries look like placeholders to be replaced with real
# column names before running the notebook — confirm.
features_types = dict(
    numeric=[None],
    nominal=[None],
    target=None,
)

In [None]:
def Model(features_types, model):
    """Build the end-to-end estimator: column-wise preprocessing followed by `model`.

    Parameters
    ----------
    features_types : dict
        Must contain the keys 'numeric' and 'nominal'; each value is passed to
        ColumnSelector to pick the columns for the corresponding branch.
    model : estimator
        Estimator placed as the final 'model' step of the returned pipeline.

    Returns
    -------
    Pipeline
        Unfitted pipeline with two steps: 'preprocessing' (a FeatureUnion of the
        numeric and nominal branches, in that order) and 'model'.
    """
    # Numeric branch: select columns, impute missing values, then normalize.
    numeric_branch = Pipeline(steps=[
        ('features', ColumnSelector(features_types['numeric'])),
        ('transformer_impute', Imputer()),
        ('transformer_norm', Normalizer()),
    ])

    # Nominal branch: select columns, fill missing values with the string '0',
    # then label-encode the columns.
    nominal_branch = Pipeline(steps=[
        ('features', ColumnSelector(features_types['nominal'])),
        ('transformer_impute', FillNaN('0')),
        ('transformer_le', MultiColumnLabelEncoder()),
    ])

    # Both branches run on the same input; their outputs are concatenated
    # side by side, numeric columns first.
    preprocessing = FeatureUnion(transformer_list=[
        ('features_numeric', numeric_branch),
        ('features_dummies', nominal_branch),
    ])

    return Pipeline(steps=[
        ('preprocessing', preprocessing),
        ('model', model),
    ])

# Parameter Search

In [None]:
# Search space for the random forest. The "model__" prefix routes each
# parameter to the 'model' step of the pipeline returned by Model().
param_dist = dict(
    model__max_depth=[5, None],
    model__max_features=sp_randint(1, 11),
    model__min_samples_split=sp_randint(2, 11),
    model__criterion=["gini", "entropy"],
)

# Randomized search: 20 sampled parameter settings, each scored with 3-fold CV.
# NOTE(review): the search is only constructed here, never fitted, and no
# random_state is set, so sampled candidates differ between runs.
model_pipeline = Model(features_types=features_types, model=RandomForestClassifier(max_depth=5))
random_search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
)

# Cross-validation

In [None]:
# Fresh, unfitted pipeline used for the cross-validated estimate.
model_pipeline = Model(features_types=features_types, model=RandomForestClassifier(max_depth=5))

# 3-fold cross-validation with three scorers. The call is the last expression
# of the cell, so the resulting dict of scores is displayed as the cell output.
# NOTE(review): cross_validate fits clones of the estimator, so `model_pipeline`
# itself is still unfitted after this cell.
cross_validate(
    estimator=model_pipeline,
    X=X,
    y=y,
    scoring=['f1_weighted', 'accuracy', 'average_precision'],
    cv=3,
)

In [None]:
# cross_validate() only fits clones of the estimator, so the pipeline built in the
# previous cell is still unfitted. Fit it here before reading feature importances
# or predicting; without this the cell fails on a fresh kernel.
model_pipeline.fit(X, y)

# Feature names follow the FeatureUnion column order used in Model(): numeric
# columns first, then nominal ones. (The previous keys 'le' and 'binary' are not
# defined in features_types and raised KeyError.)
# NOTE(review): assumes MultiColumnLabelEncoder emits exactly one output column
# per nominal feature — confirm.
feature_names = features_types['numeric'] + features_types['nominal']
importances = pd.DataFrame({
    'importances': model_pipeline.named_steps['model'].feature_importances_
}, index=feature_names).sort_values(by='importances')

# `importances` is sorted ascending, so the 25 most important features are the
# tail, not the head; re-sort descending so the strongest feature is drawn first.
top_importances = importances.tail(25).reset_index().sort_values(by='importances', ascending=False)
plt.figure(figsize=(15, 8))
sns.barplot(x='importances', y='index', data=top_importances, color='#bdbdbd')
plt.show()

# True labels, hard predictions and the predicted probability of the second class.
# NOTE(review): X / y are presumably derived from defection_df, in which case
# these are in-sample (optimistic) metrics — confirm.
x = pd.DataFrame({
    'defectors': defection_df[features_types['target']],
    'pred': model_pipeline.predict(defection_df),
    'proba': model_pipeline.predict_proba(defection_df)[:, 1]
})
print(classification_report(x['defectors'], x['pred']))

# Use the union of true and predicted classes as explicit labels, so the matrix
# shape always matches its row/column names even if a class is never predicted.
class_labels = sorted(set(x['defectors'].unique()) | set(x['pred'].unique()))
confusion = pd.DataFrame(
    confusion_matrix(x['defectors'], x['pred'], labels=class_labels),
    columns=class_labels,
    index=class_labels,
)
_ = sns.heatmap(confusion, annot=True, fmt="d", cmap=sns.color_palette('gray'))
plt.xlabel('Predictions')
plt.ylabel('True')

In [None]:
def plot_roc(x):
    """Plot the ROC curve, with its AUC in the legend, on a new figure.

    Parameters
    ----------
    x : DataFrame-like
        Must provide a 'defectors' column (passed to roc_curve as the true
        labels) and a 'proba' column (passed as the scores).
    """
    y_true = x['defectors'].ravel()
    y_score = x['proba'].ravel()
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    line_width = 2
    plt.figure()
    # Model curve.
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='#fdae61',
        lw=line_width,
        label='ROC curve (area = %0.2f)' % roc_auc,
    )
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1], color='#2c7bb6', lw=line_width, linestyle='--')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc="lower right")
    plt.show()

In [None]:
def plot_pr_curve(x):
    """Plot the precision-recall curve together with a horizontal baseline.

    The baseline is the share of rows whose 'defectors' value is 1.
    Drawing happens on the current pyplot figure; the function does not
    create a new figure or call plt.show().

    Parameters
    ----------
    x : DataFrame
        Must provide a 'defectors' column (true labels) and a 'proba' column
        (scores passed to precision_recall_curve).
    """
    precision, recall, _thresholds = precision_recall_curve(x['defectors'], x['proba'])

    # Count of label 1 divided by the total number of rows.
    positives = x['defectors'].value_counts()[1]
    n_rows = x.shape[0]
    base = positives / n_rows

    plt.plot(recall, precision, lw=2, color='#fdae61')
    plt.plot([0, 1], [base, base], linestyle='--', lw=2,  color='#2c7bb6')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('P-R Curve')

In [None]:
# Draw both diagnostic plots for the evaluation frame `x` built above.
# Order matters for pyplot state: plot_pr_curve draws on the current figure,
# while plot_roc opens a new figure and calls plt.show().
plot_pr_curve(x)
plot_roc(x)

# Run Model

In [None]:
# Build a fresh pipeline, fit it on X / y, and persist the fitted pipeline.
# NOTE(review): no random_state is set on the forest, so the saved model
# differs between runs.
model_pipeline = Model(
    features_types=features_types,
    model=RandomForestClassifier(max_depth=5),
)
model_pipeline.fit(X, y)

# The same path is exported as the 'binary_location' environment variable
# in the next cell; keep the two in sync.
save_model(model_pipeline, 'output/binarized_models/model_pipeline.joblib')

In [None]:
# Set the environment variables that the score_model function below reads:
# where the fitted pipeline lives, which file to score, and where to write scores.
os.environ.update({
    'binary_location': "output/binarized_models/model_pipeline.joblib",
    'file_location': "input/data/data.csv",
    'output_file_location': "output/defection/scores/",
})

In [None]:
# Remove the in-memory pipeline and force a garbage-collection pass, so that
# the scoring step below has to load the model from disk rather than reuse
# this kernel's fitted object (presumably to test the saved artifact on its own).
del model_pipeline
gc.collect()

# Productionalization

In [None]:
def score_model():
    """Score the input file with the persisted pipeline and save the predictions.

    All configuration comes from environment variables:
      - 'binary_location': path of the serialized pipeline, passed to load_pickle.
      - 'file_location': pipe-delimited ('|') CSV file to score.
      - 'output_file_location': directory that receives the predictions as
        '<YYYY_MM_DD_HH_MM>.csv' (created if it does not exist).

    Raises
    ------
    KeyError
        If any of the three environment variables is not set.
    """
    # Read the whole configuration up front so a missing variable fails
    # immediately, before the model or the data are loaded.
    binary_location = os.environ['binary_location']
    file_location = os.environ['file_location']
    output_dir = os.environ['output_file_location']

    now = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")

    # Load Binarized Pipeline
    # NOTE(review): load_pickle presumably unpickles the file; unpickling can run
    # arbitrary code, so 'binary_location' must only point at trusted artifacts.
    # Also confirm load_pickle can read what save_model writes (.joblib file).
    model_pipeline = load_pickle(binary_location)

    # Load Data from csv
    data = pd.read_csv(file_location, delimiter='|')

    # Score and write to a csv file
    predictions = model_pipeline.predict(data)

    # Create the output directory on demand and join the path properly, so the
    # write neither fails on a fresh checkout nor depends on a trailing slash.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    np.savetxt(os.path.join(output_dir, now + '.csv'), predictions)

    print('Model Done')

In [None]:
# Run the scoring step end to end, using the environment variables set above.
score_model()