In [8]:
import csv
import datetime as dt
import gc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import time


from scipy.sparse import csc_matrix, hstack 
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.metrics import accuracy_score, \
                            auc, \
                            confusion_matrix, \
                            log_loss, make_scorer, \
                            roc_auc_score, roc_curve, \
                            precision_recall_curve, \
                            precision_score, \
                            recall_score, \
                            f1_score
from sklearn.model_selection import GridSearchCV, \
                                    learning_curve, \
                                    StratifiedKFold, \
                                    train_test_split

In [74]:
df = pd.read_pickle('preprocessed_training_data.pkl')
print(df.head())
print(df.shape)

                 campaignId platform softwareVersion sourceGameId country  \
0  59687f0d896a6b0e5ce6ea15      ios          11.4.1      1373094      US   
1  59687f0d896a6b0e5ce6ea15      ios            12.1      2739989      US   
2  59687f0d896a6b0e5ce6ea15      ios          12.1.2      1373094      US   
3  59687f0d896a6b0e5ce6ea15      ios          12.1.2      1217749      US   
4  59687f0d896a6b0e5ce6ea15      ios          12.0.1      1373094      US   

   startCount  viewCount  clickCount  installCount  startCount1d  \
0          25         24           0             2             1   
1          10          9           2             0             1   
2          27         26           0             0             1   
3          15         14           2             0             3   
4          20         18           0             0            13   

   startCount7d connectionType deviceType  install  timeSinceLastStart  
0             8       cellular  iPhone8,2        0     

In [75]:
X_num = df[['startCount', 'viewCount', 'clickCount', 'installCount', 'startCount1d', 'startCount7d', 'timeSinceLastStart']].values
X_cat = df[['campaignId', 'sourceGameId', 'country', 'platform', 'softwareVersion', 'connectionType', 'deviceType']]

print(X_num.shape)
print(X_cat.shape)

y = df['install']
enc = OneHotEncoder(handle_unknown='ignore')
X_cat = enc.fit_transform(X_cat)
X = hstack((X_cat, X_num))

(3738937, 7)
(3738937, 7)


In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train = csc_matrix(X_train)
y_train = y_train.values
X_test = csc_matrix(X_test)
y_test = y_test.values

In [77]:
def undersample_fit(x, y):
    zero_indices = []
    one_indices = []
    for idx, class_label in enumerate(y):
        if class_label == 0:
            zero_indices.append(idx)
        else:
            one_indices.append(idx)
    
    resampled_indices = one_indices + random.sample(zero_indices, len(one_indices))
    random.shuffle(resampled_indices)
    resampled_x, resampled_y = [], []
    for idx in resampled_indices:
        resampled_x.append(x[idx].toarray()[0])
        resampled_y.append(y[idx])
    return csc_matrix(resampled_x), resampled_y

In [78]:
def log_loss_score(clf, x, y):
    return log_loss(y, clf.predict_proba(x))

def auroc_score(clf, x, y):
    return roc_auc_score(y, clf.predict_proba(x)[:, 1])

In [79]:
clf = make_pipeline(RobustScaler(with_centering=False),
                    SGDClassifier(loss='log', penalty='l1', alpha=0.0001))

In [80]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('robustscaler',
                 RobustScaler(copy=True, quantile_range=(25.0, 75.0),
                              with_centering=False, with_scaling=True)),
                ('sgdclassifier',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='log',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l1', power_t=0.5, random_state=None,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=0, warm_start=False))],
         verbose=False)

In [81]:
auroc_score(clf, X_train, y_train)

0.6523589915585692

In [82]:
auroc_score(clf, X_test, y_test)

0.6484047458388864