# Training Models

In [1]:
#Importing Libraries
import sys
import os
import numpy as np
import pandas as pd
import mlflow

In [2]:
#Jupyter Notebook Settings
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
%matplotlib inline

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from xgboost import plot_tree
from scipy.stats import uniform
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.model_selection import cross_val_score


In [4]:
#importing local scripts
#Adding scripts path
sys.path.append(os.path.abspath(os.path.join('..')))
#importing data_manipulator script
from scripts.ML_modelling_utils import *

## Importing and Fixing Data

In [5]:
#Importing the collected Data
path = 'data/AdSmartABdata.csv'
repo = 'https://github.com/DePacifier/abtest-mlops'
all_dfs = import_all_data_using_tagslist(path=path, repo=repo, tags=[
                                         'chrome-mobile', 'chrome-mobile-view', 'facebook', 'samsung-internet', 'platform-6'])


In [6]:
all_dfs['chrome-mobile'].head(5)

Unnamed: 0,auction_id,experiment,date,hour,device_make,platform_os,browser,response
0,378645d2-f8cf-43c0-b1be-e699d846e596,control,2020-07-04,6,Generic Smartphone,6,Chrome Mobile,0
1,ef334830-f200-4c1b-aea2-b01b96ca0950,exposed,2020-07-03,12,Generic Smartphone,6,Chrome Mobile,0
2,49e7690d-4825-499c-9553-756af3198a10,control,2020-07-07,10,Generic Smartphone,6,Chrome Mobile,1
3,80b9fecc-ef23-4896-bd43-b81c074f49c3,control,2020-07-09,17,Generic Smartphone,6,Chrome Mobile,1
4,3dd09b49-9b11-45bd-ab0d-e279a8e5edc7,exposed,2020-07-03,0,Generic Smartphone,6,Chrome Mobile,0


In [7]:
# Spliting the date column to year, month and day columns and removing it
all_dfs_date_fixed = split_date_to_numbers(all_dfs, 'date')
all_dfs_date_fixed['chrome-mobile'].head(5)

Unnamed: 0,auction_id,experiment,hour,device_make,platform_os,browser,response,year,month,day
0,378645d2-f8cf-43c0-b1be-e699d846e596,control,6,Generic Smartphone,6,Chrome Mobile,0,2020,7,4
1,ef334830-f200-4c1b-aea2-b01b96ca0950,exposed,12,Generic Smartphone,6,Chrome Mobile,0,2020,7,3
2,49e7690d-4825-499c-9553-756af3198a10,control,10,Generic Smartphone,6,Chrome Mobile,1,2020,7,7
3,80b9fecc-ef23-4896-bd43-b81c074f49c3,control,17,Generic Smartphone,6,Chrome Mobile,1,2020,7,9
4,3dd09b49-9b11-45bd-ab0d-e279a8e5edc7,exposed,0,Generic Smartphone,6,Chrome Mobile,0,2020,7,3


In [8]:
#drop columns from each data, the grouping column b/c its a univalue column
#We have 5 dataframes of which 4 are grouped based on browser and 1 based on platform
for i in all_dfs_date_fixed:
    if(i != "platform-6"):
        all_dfs_date_fixed[i].drop('browser', axis=1, inplace=True)
    else:
        all_dfs_date_fixed[i].drop('platform_os', axis=1, inplace=True)

In [9]:

data_type_fixed_dfs = change_columns_to_numbers(all_dfs_date_fixed, ['experiment', 'device_make', 'browser'])
data_type_fixed_dfs['platform-6'].sample(5)


Unnamed: 0,auction_id,experiment,hour,device_make,browser,response,year,month,day
994,60079877-1f31-42ea-9336-ff20fdfad39c,0,3,13,1,0,2020,7,4
204,e3e1430c-21f4-4c8d-aca0-0798637bb2ce,0,15,13,1,0,2020,7,3
137,f347a028-3083-45a7-8aa6-173bf1b8a6bf,1,6,13,1,0,2020,7,4
1214,fb546a62-59f9-4d47-ae03-818e38f90107,0,8,33,3,0,2020,7,9
1058,96600bbd-5ecd-48a9-ad9a-7975bdc5dafa,1,7,47,2,0,2020,7,8


In [10]:
# #Get all train, validate and test sets
chrome_mobile_dict = get_train_validate_test_sets(data_type_fixed_dfs['chrome-mobile'], predicted_column='response', remove_columns=['auction_id'])
chrome_mobile_view_dict = get_train_validate_test_sets(data_type_fixed_dfs['chrome-mobile-view'], predicted_column='response', remove_columns=['auction_id'])
facebook_dict = get_train_validate_test_sets(data_type_fixed_dfs['facebook'], predicted_column='response', remove_columns=['auction_id'])
samsung_internet_dict = get_train_validate_test_sets(data_type_fixed_dfs['samsung-internet'], predicted_column='response', remove_columns=['auction_id'])
platform_6_dict = get_train_validate_test_sets(data_type_fixed_dfs['platform-6'], predicted_column='response', remove_columns=['auction_id'])

In [11]:
chrome_mobile_dict['train_x'].head()


Unnamed: 0,experiment,hour,device_make,platform_os,year,month,day
317,1,12,1,6,2020,7,10
504,0,15,1,6,2020,7,3
203,1,15,1,6,2020,7,8
393,0,15,1,6,2020,7,3
426,1,20,1,6,2020,7,8


# Training

> Training only done for 4 or the 5 data(samsung-internet is omitted)

In [12]:
mlflow.sklearn.autolog(log_input_examples=True,disable_for_unsupported_versions=True, silent=True)
# mlflow.xgboost.autolog(log_input_examples=True,disable_for_unsupported_versions=True, silent=True)

In [13]:
def calculate_metrics(y_test, y_preds):
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    r_sq = r2_score(y_test, y_preds)
    mae = mean_absolute_error(y_test, y_preds)

    return {'RMSE Score': rmse, 'R2_Squared': r_sq, 'MAE Score': mae}


In [14]:
def evaluate_model(dt_classifier,x_train, y_train ,x_test, y_test):
    print("Train Accuracy :", accuracy_score(
        y_train, dt_classifier.predict(x_train)))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, dt_classifier.predict(X_train)))
    print("-"*50)
    print("Test Accuracy :", accuracy_score(
        y_test, dt_classifier.predict(x_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(x_test)))


In [15]:
def plot_preds(y_test, y_preds, model_name):
    N = len(y_test)
    original = plt.scatter(np.arange(1, N+1), y_test, c='blue')
    prediction = plt.scatter(np.arange(1, N+1), y_preds, c='red')
    plt.xticks(np.arange(1, N+1))
    plt.xlabel('# Oberservation')
    plt.ylabel('Response')
    title = 'True labels vs. Predicted Labels ({})'.format(model_name)
    plt.title(title)
    # plt.figure(figsize=(20, 8))
    plt.legend((original, prediction), ('Original', 'Prediction'))
    plt.show()


In [16]:
def get_dt_graph(dt_classifier, x, show=False):
    plt.figure(figsize=(25, 20))
    treeplt = tree.plot_tree(dt_classifier,
                       feature_names=x.columns,
                       class_names=['No Response', "Yes Response"],
                       filled=True)
    if(show):
        plt.show()
    return plt


## Logistic Regression Model

In [17]:
def train_logistic_model(x_train,y_train,x_valid, y_valid,cross_val_size:int=5):
    cv_acc_results = []
    model_list = []
    c = [0.001,0.01,0.1,1,10,100,1000]
    for i in c:
        # with mlflow.start_run():
        model = LogisticRegression(penalty='l2', C=i, random_state=0)
        model.fit(x_train, y_train)
        kfold = KFold(n_splits=cross_val_size)
        results = cross_val_score(model, x_train, y_train, cv=kfold)
        mlflow.log_param("Model","Logistic Regression")
        mlflow.log_param("Penalty", 'l2')
        mlflow.log_metric("C Value", i)
        pre_acc_score = model.score(x_valid, y_valid)
        cv_scores = (results.mean(), results.std())
        cv_acc_results.append(cv_scores[0])
        model_list.append(model)

            # measure_metrics = calculate_metrics(y_valid, y_preds=model.predict(x_valid.values))
            # mlflow.log_param("Model Type", "Logistic Regression")
            # mlflow.set_tag('Model Type', 'Logistic Regression')
            # mlflow.log_metric("Predicted Accuracy Score",pre_acc_score)
            # mlflow.log_metric("Cross Validation Mean Accuracy Score",cv_scores[0])
            # mlflow.log_metric("Cross Validation Std Score",cv_scores[1])
            # mlflow.log_metric("RMSE Score", measure_metrics['RMSE Score'])
            # mlflow.log_metric("R2_Squared Score", measure_metrics['R2_Squared'])
            # mlflow.log_metric("MAE Score", measure_metrics['MAE Score'])
            # modelpath = '../models/model-%s-%f-%f-%f'%('lr',cv_scores[0],cv_scores[1],i)
            # figpath = '../models/fig-%s-%f-%f-%f.png'%('lr', cv_scores[0], cv_scores[1],i)
            # roc_plot = plot_roc_curve_log(x_valid, y_valid, model=model)
            # try:
            #     roc_plot.savefig(figpath)
            #     mlflow.log_artifact(figpath)
            #     mlflow.sklearn.save_model("model",modelpath)
            #     mlflow.log_artifact(modelpath)
            # except:
            #     # Model already exists
            #     pass
            # print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
    # predicted = model.predict(x_valid.values)
    # print("Parameter Values: ")
    # print(model.coef_)
    # print(metrics.accuracy_score(y_valid, predicted))
    # print(metrics.classification_report(y_valid, predicted))
    # print(calculate_metrics(y_valid, predicted))
    # plot_preds(y_valid, predicted, "Logistic Regression")
    best_model = model_list[cv_acc_results.index(min(cv_acc_results))]


    return best_model


In [18]:
def plot_roc_curve_log(x_test,y_test,model, show=False):
    logit_roc_auc = roc_auc_score(y_test, model.predict(x_test))
    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(x_test)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    if(show):
        plt.show()
    
    return plt


In [19]:
# For chrome mobile
import warnings
warnings.filterwarnings('ignore')
chrome_mobile_model = train_logistic_model(
    chrome_mobile_dict['train_x'], chrome_mobile_dict['train_y'], chrome_mobile_dict['val_x'], chrome_mobile_dict['val_y'])


In [20]:
chrome_mobile_model

LogisticRegression(C=0.01, random_state=0)

In [21]:
chrome_mobile_view_model = train_logistic_model(
    chrome_mobile_view_dict['train_x'], chrome_mobile_view_dict['train_y'], chrome_mobile_view_dict['val_x'], chrome_mobile_view_dict['val_y'])


In [22]:
facebook_model = train_logistic_model(
    facebook_dict['train_x'], facebook_dict['train_y'], facebook_dict['val_x'], facebook_dict['val_y'])


In [23]:
samsung_internet_model = train_logistic_model(
    samsung_internet_dict['train_x'], samsung_internet_dict['train_y'], samsung_internet_dict['val_x'], samsung_internet_dict['val_y'])


In [24]:
platform_6_model = train_logistic_model(
    platform_6_dict['train_x'], platform_6_dict['train_y'], platform_6_dict['val_x'], platform_6_dict['val_y'])


## Decision Trees

In [14]:
def train_decision_tree(x_train, y_train, x_valid, y_valid):
    dt = DecisionTreeClassifier(random_state=42)
    params = {
        'max_depth': [*range(3,15)],
        'min_samples_split': [*range(2,10)],
        'min_samples_leaf': [*range(1,10)],
        'criterion': ["gini", "entropy"],
        'max_features': ['sqrt','log2'],
        # 'presort': [False, True],
        'class_weight': [None, 'balanced']
    }
    grid_search = GridSearchCV(estimator=dt,
                            param_grid=params,
                            cv=5, n_jobs=4, verbose=1, scoring="accuracy")
    grid_search.fit(x_train, y_train)

    # with mlflow.start_run():
    #     measure_metrics = calculate_metrics(y_valid, y_preds=grid_search.best_estimator_.predict(x_valid.values))
    #     mlflow.log_param("Model Type", "Decision Tree")
    #     mlflow.set_tag('Model Type', 'Decision Tree')
    #     mlflow.set_tag('Model Status', 'Best')
    #     mlflow.log_metric("Predicted Accuracy Score", grid_search.score(x_valid,y_valid))
    #     mlflow.log_metric("RMSE Score", measure_metrics['RMSE Score'])
    #     mlflow.log_metric("R2_Squared Score",measure_metrics['R2_Squared'])
    #     mlflow.log_metric("MAE Score", measure_metrics['MAE Score'])

    #     modelpath = '../models/model-%s-%f-%f' % ('dt', grid_search.best_score_, grid_search.best_index_)
    #     figpath = '../models/fig-%s-%f-%f.png' % ('dt', grid_search.best_score_, grid_search.best_index_)
    #     dt_graph = get_dt_graph(grid_search.best_estimator_, x_train)
    #     try:
    #         dt_graph.savefig(figpath)
    #         mlflow.log_artifact(figpath)
    #         mlflow.sklearn.save_model("model", modelpath)
    #         mlflow.log_artifact(modelpath)
    #     except:
    #         # Model already exists
    #         pass
    # score_df = pd.DataFrame(grid_search.cv_results_)
    # score_df.nlargest(5,"mean_test_score")
    # dt_best = grid_search.best_estimator_
    # print("Score: {}".format(clf.score(x_valid, y_valid)))
    # print(metrics.accuracy_score(y_valid, predicted))
    # print(metrics.classification_report(y_valid, predicted))
    # print(clf.best_score_, clf.best_params_)
    # print(calculate_metrics(y_valid, predicted))
    # plot_preds(y_valid, predicted, "Decision Tree")

    return grid_search


In [15]:
chrome_mobile_tree_model = train_decision_tree(
    chrome_mobile_dict['train_x'], chrome_mobile_dict['train_y'], chrome_mobile_dict['val_x'], chrome_mobile_dict['val_y'])

Fitting 5 folds for each of 6912 candidates, totalling 34560 fits


In [None]:
chrome_mobile_tree_model.best_estimator_


In [None]:
chrome_mobile_view_tree_model = train_decision_tree(
    chrome_mobile_view_dict['train_x'], chrome_mobile_view_dict['train_y'], chrome_mobile_view_dict['val_x'], chrome_mobile_view_dict['val_y'])


Error: Kernel is dead

In [None]:
facebook_tree_model = train_decision_tree(
    facebook_dict['train_x'], facebook_dict['train_y'], facebook_dict['val_x'], facebook_dict['val_y'])


In [None]:
samsung_internet_tree_model = train_decision_tree(
    samsung_internet_dict['train_x'], samsung_internet_dict['train_y'], samsung_internet_dict['val_x'], samsung_internet_dict['val_y'])


In [None]:
platform_6_tree_model = train_decision_tree(
    platform_6_dict['train_x'], platform_6_dict['train_y'], platform_6_dict['val_x'], platform_6_dict['val_y'])


## XGB Boost

In [None]:
def get_xgbc_graph(model, rankdir='LR', show=False):
    plt.figure(figsize=(500, 400))
    fig = plot_tree(model, num_trees=0, rankdir=rankdir)
    
    if(show):
        plt.show()
        
    return plt


In [14]:
# k-fold cross validation evaluation of xgboost model
# CV model
def train_xgb_classifier(x_train, y_train, x_valid, y_valid):
    mlflow.xgboost.autolog()
    clf_xgb = XGBClassifier(objective='binary:logistic')
    params = {
        'max_depth': [*range(3,10)],
        'min_child_weight':[*range(1,6)],
        'learning_rate':uniform(0.01, 0.59),
        'subsample':uniform(0.3,0.6),
        'colsample_bytree':uniform(0.5,0.4),
    }

    kfold = KFold(n_splits=5,shuffle=True)

    random_search = RandomizedSearchCV(clf_xgb, param_distributions=params,
    cv=kfold, n_iter=5, scoring='accuracy', error_score=0, verbose=1, n_jobs=4)

    random_search.fit(x_train, y_train)

    # score_df = pd.DataFrame(random_search.cv_results_)
    # score_df.nlargest(5, "mean_test_score")
    # xgb_best_classifier = random_search.best_estimator_
    #additional
    # random_search.best_params_
    # random_search.get_params
    # random_search.best_index_
    # random_search.best_score_

    # with mlflow.start_run():
    #     measure_metrics = calculate_metrics(y_valid, y_preds=random_search.best_estimator_.predict(x_valid.values))
    #     mlflow.log_param("Model Type", "XGB Classifier")
    #     mlflow.set_tag('Model Type', 'XGB Classifier')
    #     mlflow.set_tag('Model Status', 'Best')
    #     mlflow.log_metric("Predicted Accuracy Score",random_search.score(x_valid, y_valid))
    #     mlflow.log_metric("RMSE Score", measure_metrics['RMSE Score'])
    #     mlflow.log_metric("R2_Squared Score", measure_metrics['R2_Squared'])
    #     mlflow.log_metric("MAE Score", measure_metrics['MAE Score'])

    #     modelpath = '../models/model-%s-%f-%f' % ('xgbC', random_search.best_score_, random_search.best_index_)
    #     figpath = '../models/fig-%s-%f-%f.png' % ('xgbC', random_search.best_score_, random_search.best_index_)
    #     xgbc_graph = get_xgbc_graph(random_search.best_estimator_)
    #     try:
    #         xgbc_graph.savefig(figpath)
    #         mlflow.log_artifact(figpath)
    #         mlflow.sklearn.save_model("model", modelpath)
    #         mlflow.log_artifact(modelpath)
    #     except:
    #         # Model already exists
    #         pass

    return random_search

In [15]:
chrome_mobile_xgbc_model = train_xgb_classifier(
    chrome_mobile_dict['train_x'], chrome_mobile_dict['train_y'], chrome_mobile_dict['val_x'], chrome_mobile_dict['val_y'])


Fitting 5 folds for each of 5 candidates, totalling 25 fits


exception calling callback for <Future at 0x2543d867ac0 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "C:\Python38\lib\site-packages\joblib\externals\loky\_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "C:\Python38\lib\site-packages\joblib\parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "C:\Python38\lib\site-packages\joblib\parallel.py", line 792, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "C:\Python38\lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Python38\lib\site-packages\joblib\parallel.py", line 777, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Python38\lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
    future = self._workers.submit(SafeFunction(func))
  File "C:\Python38\lib\site-packages\joblib\externals\loky\reusable_executor.

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:

chrome_mobile_xgbc_model.best_estimator_


## Too Much Compution Time Required to train xgb classifier with these parameters

In [None]:
# chrome_mobile_view_xgbc_model = train_xgb_classifier(
#     chrome_mobile_view_dict['train_x'], chrome_mobile_view_dict['train_y'], chrome_mobile_view_dict['val_x'], chrome_mobile_view_dict['val_y'])


In [None]:
# facebook_xgbc_model = train_xgb_classifier(
#     facebook_dict['train_x'], facebook_dict['train_y'], facebook_dict['val_x'], facebook_dict['val_y'])


In [None]:
# samsung_internet_xgbc_model = train_xgb_classifier(
#     samsung_internet_dict['train_x'], samsung_internet_dict['train_y'], samsung_internet_dict['val_x'], samsung_internet_dict['val_y'])


In [None]:
# platform_6_xgbc_model = train_xgb_classifier(
#     platform_6_dict['train_x'], platform_6_dict['train_y'], platform_6_dict['val_x'], platform_6_dict['val_y'])
