In [None]:
import numpy as np
import pandas as pd
import random

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
def get_class_label_indices(df, target_var):
    """Collect the index labels of ``df`` for each binary class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``target_var``.
    target_var : str
        Name of the column holding the class label.

    Returns
    -------
    tuple of (list, list)
        Index labels of the rows whose target equals 0, followed by the
        index labels of the rows whose target equals 1. Rows holding any
        other value appear in neither list.
    """
    labels = df[target_var]
    negatives, positives = (
        labels.index[labels == cls].tolist() for cls in (0, 1)
    )
    return negatives, positives

In [None]:
def upsampled_stratified_test_train_indices(zeros, ones, test_split=0.3):
    """Build a stratified train/test split with an upsampled positive class.

    A fraction ``test_split`` of each class is held out for testing. In the
    training set the remaining positives are sampled *with replacement*
    until there are as many of them as training negatives, so individual
    positives may appear several times (or not at all).

    Parameters
    ----------
    zeros : iterable
        Index labels of the negative (0) class. Not modified.
    ones : iterable
        Index labels of the positive (1) class. Not modified.
    test_split : float, default 0.3
        Fraction of each class reserved for the test set; the per-class
        test size is ``int(test_split * len(cls))``.

    Returns
    -------
    tuple of (list, list)
        Shuffled training indices and shuffled test indices.

    Raises
    ------
    IndexError
        Raised by ``random.choices`` when no positives are left for
        training while at least one training negative exists.

    Notes
    -----
    Uses the global ``random`` module state; call ``random.seed(...)``
    beforehand for a reproducible split.
    """
    # Shuffle private copies: shuffling the arguments directly would
    # silently reorder the caller's lists (and fail for tuples / Index).
    zeros = list(zeros)
    ones = list(ones)
    random.shuffle(zeros)
    random.shuffle(ones)

    n_test_zeros = int(test_split * len(zeros))
    n_test_ones = int(test_split * len(ones))

    train_zeros = zeros[n_test_zeros:]
    # Upsample only from positives that are NOT in the test set, so the
    # two sets stay disjoint.
    train_ones = random.choices(ones[n_test_ones:], k=len(train_zeros))

    train_indices = train_zeros + train_ones
    test_indices = zeros[:n_test_zeros] + ones[:n_test_ones]

    random.shuffle(train_indices)
    random.shuffle(test_indices)

    return train_indices, test_indices

In [None]:
def generate_feature_and_target_vectors(df, indices, feature_cols=None,
                                        target_col='install'):
    """Extract the feature matrix and target vector for the given rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain the feature and target columns.
    indices : list
        Index labels of the rows to select, in output order. Labels may
        repeat (e.g. after upsampling).
    feature_cols : sequence of str, optional
        Columns used as features. Defaults to ``['startCount',
        'viewCount', 'clickCount', 'installCount']``.
    target_col : str, default 'install'
        Column used as the target.

    Returns
    -------
    tuple of (list, numpy.ndarray)
        ``X`` as a list of per-row feature lists and ``y`` as a 1-D array.
    """
    if feature_cols is None:
        # NOTE(review): 'installCount' is used to predict 'install'; confirm
        # it does not leak the label before trusting the scores.
        feature_cols = ['startCount', 'viewCount', 'clickCount',
                        'installCount']

    X = df.loc[indices, list(feature_cols)].values.tolist()
    # Select the target as a one-column frame and flatten it to 1-D.
    y = np.ravel(df.loc[indices, [target_col]].values)
    return X, y

In [None]:
# Columns kept from the raw file. Despite the name, the list also carries
# the target column 'install' alongside the predictor columns.
features = [
    'startCount',
    'viewCount',
    'clickCount',
    'installCount',
    'lastStart',
    'startCount1d',
    'startCount7d',
    'install',
]
# The file is semicolon-delimited; keep only the columns listed above.
data = pd.read_csv('training_data.csv', sep=';').loc[:, features]

In [None]:
# Seed the stdlib RNG first: the split below shuffles and upsamples with the
# `random` module, so without a seed every Restart & Run All would produce a
# different train/test partition (and different scores).
random.seed(0)

# Index labels per class, then a 70/30 stratified split whose training part
# has the positive class upsampled to match the negatives.
zeros, ones = get_class_label_indices(data, 'install')
train, test = upsampled_stratified_test_train_indices(zeros, ones, 0.3)

X_train, y_train = generate_feature_and_target_vectors(data, train)
X_test, y_test = generate_feature_and_target_vectors(data, test)

In [None]:
# Standardise the features, then fit a logistic regression.
# The grid below searches both L1 and L2 penalties. The default solver
# ('lbfgs' since scikit-learn 0.22) only supports L2, so every L1 candidate
# would fail to fit; 'liblinear' supports both penalties.
pipe_lr = Pipeline([('scl', StandardScaler()),
                   ('clf', LogisticRegression(solver='liblinear',
                                              random_state=0))])
# Inverse regularisation strengths to try (smaller C = stronger penalty).
param_range = [0.01, 0.1, 1.0, 10.0, 100.0]
param_grid = [{'clf__C': param_range,
              'clf__penalty': ['l1', 'l2']}]

In [None]:
# Exhaustive search over `param_grid`, scored by ROC AUC with 3-fold
# cross-validation, using all available cores.
# NOTE(review): X_train contains upsampled (repeated) positive rows, so the
# same row can presumably land in both the fitting and validation folds;
# treat best_score_ as optimistic and rely on the held-out test set.
gs = GridSearchCV(pipe_lr, param_grid, scoring='roc_auc', cv=3, n_jobs=-1)
gs.fit(X_train, y_train)  # fit() returns the search object itself
print(gs.best_score_)

In [None]:
# Hyper-parameter combination that achieved the best cross-validated score.
# (Attribute access on the fitted search object; `gs_best_params_` was an
# undefined name.)
print(gs.best_params_)

In [None]:
# Predict on the held-out test set with the refitted best estimator.
# `y_pred` was never assigned anywhere in the notebook, so define it here.
y_pred = gs.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(y_pred[:100], y_test[:100])
# scikit-learn expects (y_true, y_pred): rows are true classes, columns are
# predicted classes. The swapped order would transpose the matrix.
print(confusion_matrix(y_test, y_pred))

In [None]:
# ROC AUC on the held-out test set. roc_auc_score expects (y_true, y_score);
# pass the predicted probability of the positive class (column 1) rather
# than hard 0/1 labels so the ranking quality is actually measured.
print(roc_auc_score(y_test, gs.predict_proba(X_test)[:, 1]))