In [None]:
import datetime
import pickle
import os
import gc

import numpy as np
import pandas as pd

from scipy.stats import randint as sp_randint
from joblib import dump, load
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, Normalizer, Imputer, LabelBinarizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_validate
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, accuracy_score

from analytics_package.cic.preprocessing.preprocessing import *
from analytics_package.cic.outlier_processing.outlier_processing import *
from analytics_package.cic.pipeline.pipeline import *
from analytics_package.cic.io.file import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Modeling

In [None]:
# Column roles for the modelling pipeline. Model() reads the 'numeric' and
# 'nominal' lists; 'target' is kept alongside them for reference.
# NOTE(review): the None entries look like template placeholders — replace
# them with real column names before running the notebook.
features_types = dict(
    numeric=[None],
    nominal=[None],
    target=None,
)

In [None]:
def Model(features_types, model):
    """Assemble the preprocessing + estimator pipeline (nothing is fitted here).

    Parameters
    ----------
    features_types : dict
        Must provide ``'numeric'`` and ``'nominal'`` entries; each one is
        handed to a ``ColumnSelector``.
    model : object
        Estimator placed as the final ``'model'`` step.

    Returns
    -------
    Pipeline
        Two steps: ``'preprocessing'`` (a ``FeatureUnion`` of the numeric and
        nominal branches) followed by ``'model'``.
    """
    # NOTE(review): `Imputer` was removed from sklearn.preprocessing in
    # scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer); migrating
    # also requires changing the import cell.
    # NOTE(review): `Normalizer` rescales each row, not each column — confirm
    # that is intended rather than a per-feature scaler.
    numeric_selector = ColumnSelector(features_types['numeric'])
    numeric_branch = Pipeline([
        ('features', numeric_selector),
        ('transformer_impute', Imputer()),
        ('transformer_norm', Normalizer()),
    ])

    # Nominal branch: missing values become the string '0' before encoding.
    nominal_selector = ColumnSelector(features_types['nominal'])
    nominal_branch = Pipeline([
        ('features', nominal_selector),
        ('transformer_impute', FillNaN('0')),
        ('transformer_le', MultiColumnLabelEncoder()),
    ])

    preprocessing = FeatureUnion([
        ('features_numeric', numeric_branch),
        ('features_dummies', nominal_branch),
    ])

    return Pipeline([
        ('preprocessing', preprocessing),
        ('model', model),
    ])

# Parameter Search

In [None]:
# Search space for the randomized hyper-parameter search.
# sp_randint(low, high) draws integers from low up to high - 1.
# NOTE(review): max_features can be drawn as high as 10 — confirm the
# preprocessed feature matrix has at least that many columns.
param_dist = dict(
    max_depth=[5, None],
    max_features=sp_randint(1, 11),
    min_samples_split=sp_randint(2, 11),
    criterion=["gini", "entropy"],
)

# Estimator under search, defined here so the cell runs on a fresh kernel.
# The keys of `param_dist` are un-prefixed RandomForestClassifier
# hyper-parameters, so the search targets a bare forest; searching over the
# Model() pipeline instead would require `model__`-prefixed keys.
clf = RandomForestClassifier(random_state=42)

# Seeded so that Restart & Run All samples the same 20 candidates each time.
# This only configures the search; call random_search.fit(...) to run it.
random_search = RandomizedSearchCV(
    clf,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    random_state=42,
)

# Cross-validation

In [None]:
# cross_validate() expects the estimator first, then the data and the target.
# A fresh pipeline is built with the same configuration that the "Run Model"
# section fits, so each fold trains preprocessing and classifier together.
# NOTE(review): X and y are not defined in the visible notebook cells — load
# them before running this cell.
cross_validate(
    Model(features_types, RandomForestClassifier(max_depth=5)),
    X,
    y,
    cv=3,
)

# Run Model

In [None]:
# Build the full pipeline around a depth-limited random forest, fit it on the
# training data, then persist the fitted pipeline for the scoring step.
# NOTE(review): X and y are not defined in the visible notebook cells.
model_pipeline = Model(
    features_types=features_types,
    model=RandomForestClassifier(max_depth=5),
)
model_pipeline.fit(X, y)
save_model(model_pipeline, 'output/binarized_models/model_pipeline.joblib')

In [None]:
# Publish the file locations through environment variables; score_model()
# reads them back from os.environ.
os.environ.update({
    'binary_location': "output/binarized_models/model_pipeline.joblib",
    'file_location': "input/data/data.csv",
    'output_file_location': "output/defection/scores/",
})

In [None]:
# Remove the fitted pipeline from the notebook namespace before testing
# score_model(), which loads its own copy from disk.
# The NameError guard keeps the cell re-runnable: a second run (or a run
# before the pipeline was fitted) has nothing to delete.
try:
    del model_pipeline
except NameError:
    pass
gc.collect()

# Productionalization

In [None]:
def score_model():
    """Score the input file with the persisted pipeline and write predictions to CSV.

    Configuration is read from three environment variables:

    * ``binary_location`` — path handed to ``load_pickle`` to restore the pipeline.
    * ``file_location`` — pipe-delimited (``|``) CSV file with the rows to score.
    * ``output_file_location`` — directory that receives the predictions file;
      it is created if it does not exist yet.

    The predictions are written with ``np.savetxt`` to
    ``<output_file_location>/<YYYY_MM_DD_HH_MM>.csv``.

    Raises
    ------
    KeyError
        If any of the three environment variables is not set.
    """
    timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")

    # Resolve and prepare the destination first so a bad output location fails
    # before any loading or scoring is done.
    output_dir = os.environ['output_file_location']
    if output_dir:
        # An empty value means "current directory"; os.makedirs('') would raise.
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join works with or without a trailing separator on the directory.
    output_path = os.path.join(output_dir, timestamp + '.csv')

    # Load the persisted pipeline.
    # NOTE(review): load_pickle comes from a star import; if it unpickles the
    # file, only point binary_location at trusted artifacts.
    model_pipeline = load_pickle(os.environ['binary_location'])

    # Load the data to score from the pipe-delimited CSV.
    data = pd.read_csv(os.environ['file_location'], delimiter='|')

    # Score and write to a CSV file.
    # NOTE(review): np.savetxt uses a numeric format by default — confirm the
    # predicted labels are numeric.
    predictions = model_pipeline.predict(data)
    np.savetxt(output_path, predictions)

    print('Model Done')

In [None]:
# Run the scoring entry point; it reads its file locations from the
# environment variables set earlier in the notebook.
score_model()