# Binary Logistic Regression Model Training via sklearn

## Tech Spec
* Google Cloud Compute Engine
* n1-standard-4 (4 vCPUs, 15 GB memory)
* Debian GNU/Linux 9

## Model Training

In [None]:
import numpy as np
import pandas as pd
import time

from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, \
                            auc, \
                            confusion_matrix, \
                            log_loss, make_scorer, \
                            roc_auc_score, roc_curve, \
                            precision_recall_curve, \
                            precision_score, \
                            recall_score, \
                            f1_score
from sklearn.model_selection import GridSearchCV, \
                                    train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler

### Load Data

In [None]:
# Path of the pickled training set, relative to the notebook's working directory.
TRAINING_DATA_PATH = '../data/preprocessed_training_data.pkl'

# NOTE: unpickling can execute arbitrary code, so only load pickles from a trusted source.
df = pd.read_pickle(TRAINING_DATA_PATH)

### Brief Data Exploration

In [None]:
# Print the leading rows of the frame to eyeball the columns and their values.
first_rows = df.head()
print(first_rows)

In [None]:
# DataFrame.info() writes its summary directly to stdout and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

The training dataset has over 3.5 million records and 17 features. 

In [None]:
# Count the rows in each class of the `install` column; leaving the result as
# the cell's last expression makes the notebook display it.
install_counts = df['install'].value_counts()
install_counts

We find that the classes are extremely imbalanced: the ratio of installs to non-installs is roughly 1:82.

In [None]:
# Columns that are scaled as numerical features.
numerical_columns = [
    'startCount',
    'viewCount',
    'installCount',
    'startCount1d',
    'startCount7d',
    'timeSinceLastStart',
]

# Columns that are one-hot encoded as categorical features.
categorical_columns = [
    'campaignId',
    'sourceGameId',
    'country',
]

In [None]:
# For every categorical column, print its name, a separator rule, the frequency
# of each distinct value and a spacer line.
for column_name in categorical_columns:
    frequencies = df[column_name].value_counts()
    print(column_name, "==========", frequencies, "        ", sep="\n")

This shows us that the cardinality of both the campaignId and sourceGameId features is very high.

### Data Preprocessing

In [None]:
# Numerical features are rescaled with RobustScaler, with centering enabled.
robust_scaler = RobustScaler(with_centering=True)
numerical_pipeline = make_pipeline(robust_scaler)

In [None]:
# Categorical features are one-hot encoded; handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that were absent during fit.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_pipeline = make_pipeline(one_hot_encoder)

In [None]:
# Route each group of columns through its own sub-pipeline; any column that is
# not listed in either group is dropped (remainder='drop').
column_transformers = [
    ('numerical_preprocessing', numerical_pipeline, numerical_columns),
    ('categorical_preprocessing', categorical_pipeline, categorical_columns),
]
preprocessor = ColumnTransformer(column_transformers, remainder='drop')

### Dataset Training/Test Split

In [None]:
# Feature matrix: the numerical columns followed by the categorical ones.
feature_columns = numerical_columns + categorical_columns
X = df[feature_columns]
# Target vector: the `install` column.
y = df['install']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

## Performance Metrics

In addition to AUROC, log-loss and prediction bias, the classifier's performance is measured with precision and recall.

In [None]:
def log_loss_score(clf, x, y):
    """Return the log-loss of a fitted classifier on the samples ``x``.

    Args:
        clf: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        x: Feature matrix to predict on.
        y: True labels for the rows of ``x``.

    Returns:
        The value computed by ``sklearn.metrics.log_loss``.
    """
    # Pass the classifier's own class ordering so the probability columns are
    # matched to the right labels even when `y` lacks one of the classes;
    # without `labels`, log_loss infers the label set from `y` alone.
    return log_loss(y, clf.predict_proba(x), labels=clf.classes_)

def auroc_score(clf, x, y):
    """Return the area under the ROC curve of a fitted classifier on ``x``.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        x: Feature matrix to predict on.
        y: True labels for the rows of ``x``.

    Returns:
        The value computed by ``sklearn.metrics.roc_auc_score``.
    """
    class_probabilities = clf.predict_proba(x)
    # The second probability column is used as the ranking score.
    second_column_scores = class_probabilities[:, 1]
    return roc_auc_score(y, second_column_scores)

## Grid Search

In [None]:
# Full modelling pipeline: preprocessing, then random under-sampling, then an
# L2-penalised logistic regression. Seeds are fixed for reproducibility.
undersampler = RandomUnderSampler(random_state=0)
classifier = LogisticRegression(penalty='l2', max_iter=2000, random_state=0)
pipeline = make_pipeline(preprocessor, undersampler, classifier)

In [None]:
# Candidate values for the logistic regression's C parameter.
param_range = [
    0.01,
    0.1,
    1.0,
    10.0,
]
# Grid keyed by "<pipeline step name>__<parameter name>".
param_grid = [dict(logisticregression__C=param_range)]

In [None]:
# Time a 3-fold cross-validated grid search that ranks candidates by ROC AUC.
search_started = time.time()
gs = GridSearchCV(pipeline, param_grid, scoring='roc_auc', cv=3)
gs.fit(X_train, y_train)
elapsed_minutes = (time.time() - search_started) / 60.0

# Report the wall-clock duration, then the best CV score and its parameters.
print('{} minutes'.format(elapsed_minutes))
print(gs.best_score_)
print(gs.best_params_)

The grid search took 11 minutes on a million rows of data. The best configuration (L2 penalty, C=0.1) achieved an AUROC of 0.73.

## Optimal classifier training time

In [None]:
# Time a single fit of the pipeline with the hyper-parameters the grid search
# reported as best (L2 penalty, C=0.1). The cell previously used C=1.0, which
# is not the configuration the grid search selected.
t_0 = time.time()
pipeline = make_pipeline(preprocessor,
                         RandomUnderSampler(random_state=0),
                         LogisticRegression(C=0.1, penalty='l2', max_iter=2000, random_state=0))
pipeline.fit(X_train, y_train)
print('{} seconds'.format((time.time() - t_0)))

## Performance Metrics

In [None]:
# Hard class predictions and class probabilities on the held-out set. The
# probabilities are computed once and shared by log-loss and AUROC.
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)

print("Precision: {}%".format(int(100 * precision_score(y_test, y_pred))))
print("Recall: {}%".format(int(100 * recall_score(y_test, y_pred))))
# Log-loss is an unbounded quantity rather than a percentage, so report the raw value.
print("Log-loss: {:.4f}".format(log_loss(y_test, y_proba)))
print("AUROC: {}%".format(int(100 * roc_auc_score(y_test, y_proba[:, 1]))))

# confusion_matrix expects (y_true, y_pred): rows are true classes and columns
# are predicted classes. Only in that order does ravel() yield tn, fp, fn, tp;
# with the arguments swapped, fp and fn are exchanged.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("True Negatives: {}, False Positives: {}, False Negatives: {}, True Positives: {}".format(tn, fp, fn, tp))

# Prediction bias: mean of the predicted labels minus mean of the actual labels.
print("Prediction bias: {}".format(np.mean(y_pred) - np.mean(y_test)))

On a million data points, we find that the model has a very low precision of 2%, but perhaps this is justified by the recall of 67%.
When using the 3.7 million rows, we find that the same result holds true.