In [0]:
from __future__ import print_function, division

from google.colab import drive
drive.mount('/content/gdrive')

import numpy as np
import pandas as pd
import math
import pickle
import torch
import time
from tqdm import tqdm, tqdm_notebook
from sklearn.externals import joblib

from sklearn.metrics import balanced_accuracy_score, roc_auc_score, \
                            classification_report, log_loss

from sklearn.linear_model import LogisticRegression

import sys
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/')
from multi_agent_simulator import BiddingAgent, BiddingEnvironment, BidStrategy


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")
pd.set_option('display.max_columns', 100)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2

In [0]:
# Validation split, read from the preprocessed HDF5 store on the mounted Drive.
validation = pd.read_hdf(
    '/content/gdrive/My Drive/Colab Notebooks/we_data/preprocessed.h5',
    key='validation',
)

# Collect pCTR from different models

In [0]:
# Load the trained LR.
# NOTE(review): pickle runs arbitrary code while loading -- only unpickle
# model files that come from a trusted source.
with open("/content/gdrive/My Drive/Colab Notebooks/lr_model", "rb") as model_file:
    lr = pickle.load(model_file)

# Use LR to predict pCTR. The label ('click') and 'payprice' columns are
# removed before scoring; column 1 of predict_proba is kept as the pCTR
# (presumably the probability of click == 1 -- confirm against lr.classes_).
pCTR_lr = lr.predict_proba(validation.drop(columns=['payprice', 'click']))[:, 1]



In [0]:
# Load the best NN.
# NOTE(review): torch.load unpickles a full model object here -- only load
# checkpoints that come from a trusted source.
file = 'nmodel_roc_0.856_balanced_acc_0.000_model_loss_0.017.pt'
PATH = '/content/gdrive/My Drive/Colab Notebooks/' + file

model = torch.load(PATH)
model.eval()

# Use the NN to predict pCTR in 200 chunks.
# The label and pay-price columns are dropped once up front instead of once
# per batch; the labels are not needed for inference.
features = validation.drop(columns=['click', 'payprice']).values
batches = np.array_split(features, 200)

pCTR_NN = []
with torch.no_grad():  # inference only: do not record an autograd graph
    for batch in batches:
        data = torch.from_numpy(batch).float()
        output = model(data)

        # reshape(-1) instead of squeeze(): a single-row batch must stay 1-D,
        # otherwise extend() would be handed a 0-d array and fail.
        y_hat_prob = output.float().cpu().numpy().reshape(-1)

        pCTR_NN.extend(y_hat_prob)

pCTR_NN = np.array(pCTR_NN)
# Drop the temporaries so they do not linger in the kernel namespace.
del(features, data, model, output, y_hat_prob)
print('done')

done


In [0]:
# Load the pickled XGB model.
# NOTE(review): pickle runs arbitrary code while loading -- only unpickle
# model files that come from a trusted source.
with open("/content/gdrive/My Drive/Colab Notebooks/XGB_ROC_0.8911", "rb") as model_file:
    xgb = pickle.load(model_file)

# Score the validation features; column 1 of predict_proba is kept as the pCTR.
pCTR_XGB = xgb.predict_proba(validation.drop(columns=['payprice', 'click']))[:, 1]
print('done')

done


In [0]:
# Ensemble pCTR: unweighted element-wise mean of the LR, NN and XGB predictions.
pCTR_ensemble = np.array([pCTR_lr, pCTR_NN, pCTR_XGB]).mean(axis=0)

In [0]:
# Wrap the validation frame in a BiddingEnvironment (from multi_agent_simulator).
# The BiddingAgent instances used in the constant search are constructed
# against this environment.
environment = BiddingEnvironment(validation)

In [0]:
# Baseline average CTR on the validation set: clicks / all rows.
click_counts = np.bincount(validation['click'].values)
# Unpacking succeeds only when bincount returns exactly two bins (labels 0 and 1).
no_click, click = click_counts
avgCTR = click / click_counts.sum()

In [0]:

# Search the linear-bidding constant on the validation set.
#
# np.arange with a float step accumulates rounding error, so the raw grid
# contains values such as 8.600000000000001. Those values end up as the index
# of the result table, where label lookups like .loc[8.6] would miss. Rounding
# the grid to one decimal gives the intended constants 6.0, 6.2, ..., 9.8.
BUDGET = 6250 * 1000  # budget handed to every simulated agent

results_ensemble = {}
for const in tqdm(np.round(np.arange(6, 10, .2), 1)):

    # a new agent is created for every constant that is tried
    agent = BiddingAgent(BUDGET, environment)

    bids = BidStrategy.linear_bidding(pCTR_ensemble, avgCTR, const)
    agent.simulate(bids, criteria='1')
    results_ensemble[const] = agent.statistics()

# transpose: one row per constant, one column per reported statistic
results_ensemble = pd.DataFrame(results_ensemble).T

100%|██████████| 20/20 [01:00<00:00,  3.10s/it]


In [0]:
# Show the five constants that collected the most clicks, best first.
(
    results_ensemble
    .sort_values(by='clicks', ascending=False)
    .iloc[:5]
)

Unnamed: 0,CTR,aCPC,aCPM,budget_left,clicks,impressions,items,lost,spend,too_expensive
8.6,0.001373,39.739389,54.569407,10916.0,157.0,114333.0,303925.0,189592.0,6239084.0,0.0
8.4,0.001385,39.266173,54.381901,124477.0,156.0,112639.0,303925.0,191286.0,6125523.0,0.0
8.0,0.001419,38.048142,54.00903,352538.0,155.0,109194.0,303925.0,194731.0,5897462.0,0.0
7.6,0.001469,36.473748,53.583468,596569.0,155.0,105507.0,303925.0,198418.0,5653431.0,0.0
7.8,0.001443,37.273226,53.800846,472650.0,155.0,107384.0,303925.0,196541.0,5777350.0,0.0


In [0]:
# Test split, read from the same preprocessed HDF5 store as the validation data.
test = pd.read_hdf(
    '/content/gdrive/My Drive/Colab Notebooks/we_data/preprocessed.h5',
    key='test',
)

In [0]:
# Feature frame shared by all three models: the test frame without the
# 'click' and 'payprice' columns. Built once instead of once per model/batch.
test_features = test.drop(columns=['click', 'payprice'])

# --- NN ---------------------------------------------------------------------
# Load the best NN.
# NOTE(review): torch.load unpickles a full model object here -- only load
# checkpoints that come from a trusted source.
file = 'nmodel_roc_0.856_balanced_acc_0.000_model_loss_0.017.pt'
PATH = '/content/gdrive/My Drive/Colab Notebooks/' + file

model = torch.load(PATH)
model.eval()

# Use the NN to predict pCTR in 200 chunks.
batches = np.array_split(test_features.values, 200)

pCTR_NN_testset = []
with torch.no_grad():  # inference only: do not record an autograd graph
    for batch in batches:
        data = torch.from_numpy(batch).float()
        output = model(data)

        # reshape(-1) instead of squeeze(): a single-row batch must stay 1-D,
        # otherwise extend() would be handed a 0-d array and fail.
        y_hat_prob = output.float().cpu().numpy().reshape(-1)

        pCTR_NN_testset.extend(y_hat_prob)

pCTR_NN_testset = np.array(pCTR_NN_testset)

# --- LR ---------------------------------------------------------------------
# Load the trained LR.
# NOTE(review): pickle runs arbitrary code while loading -- only unpickle
# model files that come from a trusted source.
with open("/content/gdrive/My Drive/Colab Notebooks/lr_model", "rb") as model_file:
    lr = pickle.load(model_file)

# Use LR to predict pCTR (column 1 of predict_proba).
pCTR_lr_testset = lr.predict_proba(test_features)[:, 1]

# --- XGB --------------------------------------------------------------------
xgb = joblib.load('/content/gdrive/My Drive/Colab Notebooks/xgb_model')
# The validation ensemble uses predict_proba(...)[:, 1] for XGB. A classifier's
# predict() returns hard class labels, which must not be averaged with
# probabilities, so prefer predict_proba whenever the loaded model offers it.
# NOTE(review): this is a different model file than the one scored on the
# validation set ('XGB_ROC_0.8911') -- confirm that is intended.
if hasattr(xgb, 'predict_proba'):
    pCTR_XGB_testset = xgb.predict_proba(test_features.values)[:, 1]
else:
    pCTR_XGB_testset = xgb.predict(test_features.values)

# --- Ensemble ---------------------------------------------------------------
# Unweighted element-wise mean of the three models' predictions.
pCTR_ensemble_testset = np.average(np.array([pCTR_NN_testset, pCTR_lr_testset, pCTR_XGB_testset]), axis=0)

print('done')



done


In [0]:
# NOTE(review): X_test is not used in this cell; kept in case later cells rely on it.
X_test = test.drop(columns=['click', 'payprice'])

# Bidding constant for this submission. Defined once so the bids and the
# output file name cannot drift apart.
BID_CONST = 6.4
bids = BidStrategy.linear_bidding(pCTR_ensemble_testset, avgCTR, BID_CONST)

# get bid id's from the raw test file
test_raw = pd.read_csv(
    '/content/gdrive/My Drive/Colab Notebooks/we_data/test.csv')

# export to file: two columns, 'bidid' and 'bidprice' (rounded to one decimal)
df_bids = pd.DataFrame(
    np.round(bids, 1), index=test_raw['bidid'].values, columns=['bidprice'])
df_bids.index.name = 'bidid'
df_bids = df_bids.reset_index()

# file name carries the bidding constant and a timestamp of the export
file = '/content/gdrive/My Drive/Colab Notebooks/bid_attemnt_ensemble_linear_{}_{}.csv'.format(
    BID_CONST, time.strftime('%Y-%m-%d_%H:%M:%S'))
df_bids.to_csv(file, index=False)