In [None]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import math
import pickle

import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.autograd import Variable
from torch import nn as nn
from torch.optim import lr_scheduler
import copy
import time
from tqdm import tqdm, tqdm_notebook

from sklearn.metrics import roc_auc_score, log_loss, roc_auc_score, roc_curve, auc, classification_report, balanced_accuracy_score

import matplotlib as mpl

import matplotlib.pyplot as plt
import seaborn as sns

# Plotting / display configuration for the whole notebook.
# seaborn "ticks" style is applied to every matplotlib figure drawn below.
sns.set(style="ticks")
# Let pandas display up to 100 columns before truncating wide frames.
pd.set_option('display.max_columns', 100)

# Draw figures inline in the notebook output, using the SVG format.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Load and transform data to tensor

In [None]:
# Seed the NumPy and PyTorch random number generators with one fixed value.
# The same seed is applied again in the model-definition cell.
seed=622
np.random.seed(seed)
torch.manual_seed(seed)

In [None]:
# Switch between a local run (data files in the working directory) and
# Google Colab (data files on the mounted Google Drive).
colab_env = False

if colab_env:
    # Colab only: mount Google Drive so the data files become reachable.
    from google.colab import drive
    drive.mount('/content/gdrive')

    data_dir = '/content/gdrive/My Drive/Colab Notebooks/we_data/'
    # On Colab the sub-sampled training set is used.
    resampled_file = 'subsampled'
else:
    data_dir = ''
    # Locally the re-sampled training set is used.
    resampled_file = 'resampled'

# The full 'train' table is not loaded here; it is streamed in chunks in the
# "Train on entire train dataset" section.
validation = pd.read_hdf(data_dir + 'preprocessed.h5', 'validation')

if colab_env:
    # NOTE(review): the test split is only loaded on Colab, so the test-set
    # cells at the end of the notebook cannot run locally as written.
    test = pd.read_hdf(data_dir + 'preprocessed.h5', 'test')

# SECURITY: unpickling executes arbitrary code stored in the file; only load
# pickles that were produced by this project.
# The context manager closes the file handle once the objects are loaded.
with open(data_dir + resampled_file, 'rb') as fh:
    X_resampled, y_resampled = pickle.load(fh)

In [None]:
# Validation features and labels: every column except the click label and the
# pay price is a model input.
X_validation = validation.drop(columns=['click', 'payprice']).values
y_validation = validation['click'].values

# The pickle loaded above is bound to X_resampled / y_resampled.
# np.asarray accepts both a DataFrame and a plain ndarray, so either pickle
# layout works.
X_train = np.asarray(X_resampled)
y_train = np.asarray(y_resampled)

# The network's input width is taken from the validation set, so both sets
# must have the same number of feature columns.
assert X_train.shape[1] == X_validation.shape[1], (
    'training and validation sets have a different number of features')

# Keep Dataset and DataLoader under separate names instead of rebinding one.
train_dataset = TensorDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).float())
train_loader = DataLoader(
    train_dataset, num_workers=4, batch_size=50000, shuffle=True)

validation_dataset = TensorDataset(
    torch.from_numpy(X_validation).float(),
    torch.from_numpy(y_validation).float())
validation_loader = DataLoader(
    validation_dataset, batch_size=40000, shuffle=True, num_workers=4)

# Define model

In [None]:
# Re-apply the seed so the weight initialisation below is the same every time
# this cell is run.
seed=622
np.random.seed(seed)
torch.manual_seed(seed)

# number of nodes per layer; the input width follows the feature matrix
input_layer = X_validation.shape[1]
hidden_layer_1 = 2**11
hidden_layer_2 = 2**11
hidden_layer_3 = 2**11
hidden_layer_4 = 2**11
output_layer = 1

# linear layers, each with Xavier-uniform initialised weights
linear1 = nn.Linear(input_layer, hidden_layer_1)
nn.init.xavier_uniform_(linear1.weight)

linear2 = nn.Linear(hidden_layer_1, hidden_layer_2)
nn.init.xavier_uniform_(linear2.weight)

linear3 = nn.Linear(hidden_layer_2, hidden_layer_3)
nn.init.xavier_uniform_(linear3.weight)

linear4 = nn.Linear(hidden_layer_3, hidden_layer_4)
nn.init.xavier_uniform_(linear4.weight)

output = nn.Linear(hidden_layer_4, output_layer)
nn.init.xavier_uniform_(output.weight)

# drop nodes while training to reduce over-fitting; the one Dropout module is
# reused after every hidden layer
drop_prob = 0.5
dropout = nn.Dropout(p=drop_prob)

# sequential network: four hidden layers followed by a sigmoid output
model = torch.nn.Sequential(
    linear1,
    dropout,
    nn.PReLU(),
    linear2,
    dropout,
    nn.PReLU(),
    linear3,
    dropout,
    nn.PReLU(),
    linear4,
    dropout,
    nn.ReLU(),
    output,
    nn.Sigmoid()
)

# use the GPU when available; .to(device) is the only move that is needed
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

print('Running on {}'.format(device))

# enable parallel processing
model = nn.DataParallel(model)

# loss function: created before it is moved to the device (moving it first
# raised NameError on CUDA machines)
criterion = nn.BCELoss().to(device)

# optimiser
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# decay the learning rate by a factor of 0.1 every `step_size` scheduler steps
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)


print(model)

# Train function

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and restore the weights with the best validation ROC AUC.

    Every epoch runs a training pass over the module-level ``train_loader``
    followed by an evaluation pass over the module-level
    ``validation_loader``; batches are moved to the module-level ``device``.

    Loss is averaged over the samples of the phase. ROC AUC and balanced
    accuracy are computed once per phase over all predictions of that phase
    (not averaged over mini-batches), so a mini-batch that happens to contain
    a single class cannot make ``roc_auc_score`` raise.

    Parameters
    ----------
    model : torch.nn.Module
        Network producing one value per sample; the value is passed to
        ``criterion`` and thresholded at 0.5 for the balanced accuracy.
    criterion : callable
        Loss called as ``criterion(predictions, targets)``.
    optimizer : torch.optim.Optimizer
        Optimiser updating the parameters of ``model``.
    scheduler : object
        Learning-rate scheduler; ``step()`` is called once per epoch, after
        the training pass.
    num_epochs : int, optional
        Number of train/validation rounds (default 25).

    Returns
    -------
    model
        The same ``model`` object with the best weights loaded.
    filename : str
        Suggested file name encoding the best ROC AUC, its balanced accuracy
        and its validation loss.
    rocauc_history : list of float
        Validation ROC AUC of every epoch.
    """
    print('Running on {}\n'.format(device))

    # validation ROC AUC of every epoch
    rocauc_history = []

    # keep track of time spent
    since = time.time()

    # weights of the best model seen so far
    best_model_wts = copy.deepcopy(model.state_dict())

    # metrics belonging to the best model seen so far
    best_roc = 0.0
    best_acc = 0.0
    model_loss = None

    num_updates = 0

    for epoch in range(num_epochs):
        print('\nEpoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # each epoch consists of a training and a validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'

            if is_train:
                model.train()
                loader = train_loader
            else:
                model.eval()
                loader = validation_loader
            loader_len = len(loader.dataset)

            running_loss = 0.0
            # targets and predicted probabilities of the whole phase
            epoch_targets = []
            epoch_probs = []

            # loop through mini-batches
            for batch, (data, target) in enumerate(loader):
                data = data.to(device)
                target = target.to(device)

                with torch.set_grad_enabled(is_train):
                    # reshape(-1) keeps a 1-d tensor even for a batch of one,
                    # where squeeze() would collapse it to 0-d
                    y_prob = model(data).reshape(-1)
                    loss = criterion(y_prob, target)

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * data.size(0)
                epoch_probs.append(y_prob.detach().cpu().numpy())
                epoch_targets.append(target.cpu().numpy())

                if (batch + 1) % 50 == 0:
                    print('{} Epoch: {} [{}/{} ({:.0f}%)]'.format(
                        phase, epoch, (batch + 1) * len(data),
                        len(loader.dataset), 100. * (batch + 1) / len(loader)))

            if is_train:
                # step the scheduler only after the optimiser has stepped,
                # otherwise the first learning-rate value is skipped
                scheduler.step()

            # scores of the current phase
            y_true = np.concatenate(epoch_targets)
            y_score = np.concatenate(epoch_probs)
            # cast the thresholded classes to the label dtype so both arrays
            # carry the same kind of labels
            y_class = (y_score > 0.5).astype(y_true.dtype)

            epoch_loss = running_loss / loader_len
            epoch_roc = roc_auc_score(y_true, y_score)
            epoch_acc = balanced_accuracy_score(y_true, y_class)

            print(
                '\n\t{}:\tLoss {:.5f},\tROC AUC {:.5f},\tBalanced Acc {:.5f}'.
                format(phase, epoch_loss, epoch_roc, epoch_acc))

            if not is_train:
                # keep track of ROC AUC development
                rocauc_history.append(epoch_roc)

                if model_loss is None:
                    model_loss = epoch_loss

                # remember the model whenever validation ROC AUC does not drop
                if epoch_roc >= best_roc:
                    print('\t\tsave updated model')
                    best_roc = epoch_roc
                    # record the accuracy of the best model as well; it was
                    # previously never updated and always reported as 0
                    best_acc = epoch_acc
                    model_loss = epoch_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
                    num_updates += 1

    print('\n')
    print('=' * 30)
    print('=' * 30)
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s\n'.format(
        time_elapsed // 60, time_elapsed % 60))
    print(
        'Model Loss:\t{:4f}\nROC AUC:\t{:4f}\nAccuracy:\t{:4f}\nModel updates:\t{}'
        .format(model_loss, best_roc, best_acc, num_updates))

    filename = 'model_roc_{:.3f}_balanced_acc_{:.3f}_model_loss_{:.3f}.pt'.format(
        best_roc, best_acc, model_loss)

    # load the best weights and return this model
    model.load_state_dict(best_model_wts)

    return model, filename, rocauc_history

In [None]:
# train model
best_model, filename, history = train_model(model, criterion, optimizer, exp_lr_scheduler, num_epochs=2)

# Save the model next to the notebooks. The Colab mount point only exists on
# Colab; locally the same Google Drive folder is used that the validation
# section loads saved models from.
if colab_env:
    model_dir = '/content/gdrive/My Drive/Colab Notebooks/'
else:
    model_dir = '/Users/davidvanrooij/Google Drive/Colab Notebooks/'

file_poiter = model_dir + filename
torch.save(best_model, file_poiter)

# diagnostic plot: validation ROC AUC per epoch
plt.plot(history)
plt.title('ROC AUC evaluation over time')
plt.xlabel('Epoch')
plt.ylabel('ROC AUC')
sns.despine()
plt.show()

## Train on entire train dataset

In [None]:
# Train on the full training table by streaming it from the HDF5 file in five
# row chunks. train_model reads the module-level `train_loader`, so that name
# is rebound for every chunk.
if colab_env:
    # same locations as the data-loading cell / the Colab notebook folder
    h5_path = '/content/gdrive/My Drive/Colab Notebooks/we_data/preprocessed.h5'
    model_dir = '/content/gdrive/My Drive/Colab Notebooks/'
else:
    h5_path = 'preprocessed.h5'
    model_dir = '/Users/davidvanrooij/Google Drive/Colab Notebooks/'

# the context manager closes the HDF5 file even when training fails
with pd.HDFStore(h5_path, 'r') as store:
    train_hdf = store.get_storer('train')

    chunks = np.array_split(np.arange(0, train_hdf.shape[0]), 5)

    for i, chunk in enumerate(chunks):
        if chunk.size == 0:
            # fewer rows than chunks: nothing to train on
            continue

        print('/'*100)
        print('chunk {}'.format(i))
        print('/'*100)
        print('')

        start = int(chunk[0])
        # `stop` is exclusive in HDFStore.select, so add one to keep the last
        # row of the chunk
        stop = int(chunk[-1]) + 1

        # release the previous chunk's tensors before reading the next chunk
        train_loader = None

        sub_frame = store.select('train', start=start, stop=stop)

        X_train_chunk = sub_frame.drop(columns=['click', 'payprice']).values
        y_train_chunk = sub_frame['click'].values

        train_dataset = TensorDataset(
            torch.from_numpy(X_train_chunk).float(),
            torch.from_numpy(y_train_chunk).float())
        train_loader = DataLoader(
            train_dataset, num_workers=4, batch_size=50000, shuffle=True)

        model, filename, history = train_model(
            model, criterion, optimizer, exp_lr_scheduler, num_epochs=2)

        # save the model that was just trained on this chunk
        file_poiter = model_dir + filename
        torch.save(model, file_poiter)
    

## Validation

In [None]:
# File name of the saved model to evaluate.
# NOTE(review): files written by train_model start with 'model_roc_'; confirm
# that the leading 'n' in this name is intended.
file = 'nmodel_roc_0.851_balanced_acc_0.000_model_loss_0.012.pt'

# Colab reads from the mounted drive and keeps the stored device placement;
# a local run reads from the local Google Drive folder and maps all tensors
# to the CPU.
PATH = ('/content/gdrive/My Drive/Colab Notebooks/' if colab_env
        else '/Users/davidvanrooij/Google Drive/Colab Notebooks/') + file
if not colab_env:
    print(PATH)

# map_location=None is torch.load's default, i.e. no remapping on Colab
model = torch.load(PATH, map_location=None if colab_env else 'cpu')

# switch to evaluation mode before predicting
model.eval()
print(model)

In [None]:
# Predict the click probability (pCTR) for every validation row, 200 batches
# at a time to bound memory use.
# Splitting the feature matrix (an ndarray) instead of the DataFrame avoids
# dropping the label columns once per batch.
validation_features = validation.drop(columns=['click', 'payprice']).values
batches = np.array_split(validation_features, 200)

pCTR = []
# inference only: no autograd graph is needed
with torch.no_grad():
    for batch in batches:
        data = torch.from_numpy(batch).float()

        output = model(data)

        # reshape(-1) keeps a 1-d array even for a one-row batch, so extend()
        # always receives an iterable
        y_hat_prob = output.cpu().numpy().reshape(-1)

        pCTR.extend(y_hat_prob)

In [None]:
%%time
payprice = validation['payprice']
clicks = validation['click']

no_click, click = np.bincount(clicks)
avgCTR = click / (no_click + click)


def bid_strategy(lamda, const, pCTR):
    """Non-linear bid: sqrt(const / lamda * pCTR + const**2) - const.

    ``pCTR`` may be a scalar or a sequence of predicted click probabilities;
    the result has the matching shape.
    """
    return np.sqrt(np.multiply((const / lamda), pCTR) + const**2) - const


lamda_range = np.linspace(1e-7, 1e-4, num=20)
const_range = np.arange(10, 100, 10)

parameter_grid = [(x,i) for x in const_range for i in lamda_range]

# budget every parameter setting starts with
BUDGET = 6250*1000

# Plain arrays give positional access; indexing the Series with an integer is
# a label lookup and only works when the index happens to be 0..n-1.
payprice_values = payprice.values
click_values = clicks.values
assert len(pCTR) == len(payprice_values), 'need one pCTR per validation row'

statistics = {}

# simulate the auctions for every (const, lamda) combination
for const, lamda in tqdm(parameter_grid):
    budget_remaining = BUDGET

    impressions = 0
    spend = 0
    won_clicks = 0
    too_expensive = 0

    # bids for all ad requests
    bids = bid_strategy(lamda, const, pCTR)

    for bid, second_highest_bid, clicked in zip(bids, payprice_values, click_values):
        if second_highest_bid > budget_remaining:
            # cannot afford this impression, whatever the bid
            too_expensive += 1
        elif bid >= second_highest_bid:
            # auction won: pay the second-highest price
            impressions += 1
            spend += second_highest_bid
            won_clicks += clicked
            budget_remaining -= second_highest_bid

    statistics[(const, lamda)] = {
        'impressions': impressions,
        'spend': spend,
        'clicks': won_clicks,
        'too_expensive': too_expensive
    }


statistics = pd.DataFrame(statistics).T
statistics['CTR'] = statistics['clicks'] / statistics['impressions']
statistics['aCPM'] = statistics['spend'] / statistics['impressions'] 
statistics['aCPC'] = (statistics['spend']/1000) / statistics['clicks']
print('done')

In [None]:
# Show the five parameter settings (rows are indexed by (const, lamda)) with
# the highest number of clicks.
statistics.sort_values('clicks', ascending=False).head(5)

## Use NN to predict pCTR on test and apply non-linear bidding strategy

In [None]:
# Predict the click probability (pCTR) for every test row, 200 batches at a
# time to bound memory use.
# Splitting the feature matrix (an ndarray) instead of the DataFrame avoids
# dropping the label columns once per batch.
test_features = test.drop(columns=['click', 'payprice']).values
batches = np.array_split(test_features, 200)

pCTR_test = []
# inference only: no autograd graph is needed
with torch.no_grad():
    for batch in batches:
        data = torch.from_numpy(batch).float()

        output = model(data)

        # reshape(-1) keeps a 1-d array even for a one-row batch, so extend()
        # always receives an iterable
        y_hat_prob = output.cpu().numpy().reshape(-1)

        pCTR_test.extend(y_hat_prob)

In [None]:
X_test = test.drop(columns=['click', 'payprice'])

# Bid for every test row with fixed strategy parameters (lamda, const).
bids = bid_strategy(0.000016, 30, pCTR_test)

# The raw test file supplies the bid ids, in row order.
test_raw = pd.read_csv(
    '/content/gdrive/My Drive/Colab Notebooks/we_data/test.csv')

# Two-column frame: bid id and the bid price rounded to one decimal.
df_bids = pd.DataFrame({
    'bidid': test_raw['bidid'].values,
    'bidprice': np.round(bids, 1),
})

# Write the bids to a time-stamped CSV file.
timestamp = time.strftime('%Y-%m-%d_%H:%M:%S')
file = '/content/gdrive/My Drive/Colab Notebooks/bid_attemnt_{}.csv'.format(
    timestamp)
df_bids.to_csv(file, index=False)