# Binary Logisitic Regression Model Training Via sklearn

## Tech Spec
* Google Cloud Compute Engine
* n1-standard-4 (4 vCPUs, 15 GB memory)
* Debian GNU/ Linux 9

## Model Training

In [1]:
import numpy as np
import pandas as pd
import time

from imblearn.pipeline import make_pipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, \
                            auc, \
                            confusion_matrix, \
                            log_loss, make_scorer, \
                            roc_auc_score, roc_curve, \
                            precision_recall_curve, \
                            precision_score, \
                            recall_score, \
                            f1_score
from sklearn.model_selection import GridSearchCV, \
                                    train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler

### Load Data

In [2]:
df = pd.read_pickle('../data/preprocessed_training_data.pkl')

### Brief Data Exploration

In [3]:
print(df.head())

                         id           timestamp                campaignId  \
0  5c36658fb58fad351175f0b6 2019-01-09 21:20:15  59687f0d896a6b0e5ce6ea15   
1  5c38d5ab1c16172870186b5a 2019-01-11 17:43:07  59687f0d896a6b0e5ce6ea15   
2  5c38815de8f4e50e256e4f9c 2019-01-11 11:43:25  59687f0d896a6b0e5ce6ea15   
3  5c409ace532d5806d2c6a5e6 2019-01-17 15:10:06  59687f0d896a6b0e5ce6ea15   
4  5c3904b92d798c41e7f3088a 2019-01-11 21:03:53  59687f0d896a6b0e5ce6ea15   

  platform softwareVersion sourceGameId country  startCount  viewCount  \
0      ios          11.4.1      1373094      US          25         24   
1      ios            12.1      2739989      US          10          9   
2      ios          12.1.2      1373094      US          27         26   
3      ios          12.1.2      1217749      US          15         14   
4      ios          12.0.1      1373094      US          20         18   

   clickCount  installCount           lastStart  startCount1d  startCount7d  \
0           0

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3738937 entries, 0 to 3738936
Data columns (total 18 columns):
id                    object
timestamp             datetime64[ns]
campaignId            object
platform              object
softwareVersion       object
sourceGameId          object
country               object
startCount            int64
viewCount             int64
clickCount            int64
installCount          int64
lastStart             datetime64[ns]
startCount1d          int64
startCount7d          int64
connectionType        object
deviceType            object
install               int64
timeSinceLastStart    float64
dtypes: datetime64[ns](2), float64(1), int64(7), object(8)
memory usage: 513.5+ MB
None


The training dataset has over 3.5 million records and 17 features. 

In [5]:
df['install'].value_counts()

0    3694193
1      44744
Name: install, dtype: int64

We find that the class distribution of the install to no-install status is extremely imbalanced at 1:82. 

In [6]:
numerical_columns = ['startCount', 'viewCount', 'installCount', 'startCount1d', 'startCount7d', 'timeSinceLastStart']
categorical_columns = ['campaignId', 'sourceGameId', 'country']

In [7]:
for feat in categorical_columns:
    print(feat)
    print("==========")
    print(df[feat].value_counts())
    print("        ")

campaignId
5c3bfb0b36c2c6cc18710e7b    41740
5c385d02ee4549000d8b9ddd    36861
5c0f2ff2f4ee9d00225714c2    32856
5afbea849f23a400284f2619    30286
5c26db700f371292325680ec    28537
                            ...  
5c335c897bda2f05734c8d1e        1
5c3cdf290ad04f2a9cb5d411        1
5c0150ec9d08963e623ab452        1
5c125e8f8bb1330034ef5287        1
5c344fa5031ca4647aa33da8        1
Name: campaignId, Length: 9692, dtype: int64
        
sourceGameId
1711292    73156
1483109    35089
1782302    34015
111890     28319
2762289    27812
           ...  
1702207        1
1168174        1
1717206        1
1767070        1
1374997        1
Name: sourceGameId, Length: 34849, dtype: int64
        
country
US    579740
RU    287701
IN    267325
BR    200542
DE    148443
       ...  
CF         3
NF         3
IO         3
TD         2
FK         1
Name: country, Length: 221, dtype: int64
        


This shows us that the cardinality of the campaignId and sourceGameId features are very high.

 ### Data Preprocessing

In [8]:
numerical_pipeline = make_pipeline(RobustScaler(with_centering=True))

In [9]:
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

In [10]:
preprocessor = ColumnTransformer(
    [('numerical_preprocessing', numerical_pipeline, numerical_columns), 
     ('categorical_preprocessing', categorical_pipeline, categorical_columns)], 
    remainder='drop')

### Dataset Training/Test Split

In [11]:
X = df[numerical_columns + categorical_columns]
y = df['install']

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)

## Performance Metrics

The metrics used to measure the classifier performance other than AUROC, log-loss and prediction bias are the precision and recall.

In [12]:
def log_loss_score(clf, x, y):
    return log_loss(y, clf.predict_proba(x))

def auroc_score(clf, x, y):
    return roc_auc_score(y, clf.predict_proba(x)[:, 1])

## 2.4 Grid Search

In [13]:
pipeline = make_pipeline(preprocessor, 
                         RandomUnderSampler(random_state=0), 
                         LogisticRegression(penalty='l2', max_iter=2000, random_state=0))

In [14]:
param_range = [0.01, 0.1, 1.0, 10.0]
param_grid = [{'logisticregression__C': param_range}]

In [15]:
t_0 = time.time()
gs = GridSearchCV(estimator=pipeline,
                  param_grid=param_grid,
                  scoring='roc_auc',
                  cv=3)
gs.fit(X_train, y_train)
print('{} minutes'.format((time.time() - t_0) / 60.0))
print(gs.best_score_)
print(gs.best_params_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

6.712116340796153 minutes
0.7261716779067208
{'logisticregression__C': 0.1}


11 minutes on a million rows of data - AUROC score of 0.73. l2 penalty, C=0.1.

## Optimal classifier training time

In [13]:
t_0 = time.time()
pipeline = make_pipeline(preprocessor, 
                         RandomUnderSampler(random_state=0), 
                         LogisticRegression(C=1.0, penalty='l2', max_iter=1000, random_state=0))
pipeline.fit(X_train, y_train)
print('{} seconds'.format((time.time() - t_0)))

42.110485792160034 seconds


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## Performance Metrics

In [14]:
y_pred = pipeline.predict(X_test)
print("Precision: {}%".format(int(100 * precision_score(y_test, y_pred))))
print("Recall: {}%".format(int(100 * recall_score(y_test, y_pred))))
print("Log-loss: {}%".format(int(100 * log_loss_score(pipeline, X_test, y_test))))
print("AUROC: {}%".format(int(100 * roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1]))))
tn, fp, fn, tp = confusion_matrix(y_pred, y_test).ravel()
print("True Negatives: {}, Fale Positives: {}, False Negatives: {}, True Positives: {}".format(tn, fp, fn, tp))
print("Prediction bias: {}".format(sum(y_pred) / len(y_pred) - sum(y_test) / len(y_test)))

Precision: 2%
Recall: 67%
Log-loss: 61%
AUROC: 72%
True Negatives: 489217, Fale Positives: 2879, False Negatives: 249622, True Positives: 6070
Prediction bias: 0.32996384001882884


On a million data points, we find that the model has a very low precision of 2% but perhaps this is jutified by the recall of 67%. 
When using the 3.7 million rows, we find that the same result holds true.