In [2]:
from __future__ import print_function, division

from google.colab import drive
drive.mount('/content/gdrive')

import numpy as np
import pandas as pd
import math
import pickle
import torch
import time
from tqdm import tqdm, tqdm_notebook
from sklearn.externals import joblib

from sklearn.metrics import balanced_accuracy_score, roc_auc_score, \
                            classification_report, log_loss

from sklearn.linear_model import LogisticRegression

import sys
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/')
from multi_agent_simulator import BiddingAgent, BiddingEnvironment, BidStrategy


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")
pd.set_option('display.max_columns', 100)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Read the 'validation' table from the preprocessed HDF5 store on Drive.
validation = pd.read_hdf(
    path_or_buf='/content/gdrive/My Drive/Colab Notebooks/we_data/preprocessed.h5',
    key='validation')

# Collect pCTR from different models

In [4]:
# Load the trained logistic-regression model.
# The file is opened in a `with` block so the handle is closed right after
# unpickling (the bare `open(...)` form leaked it).
# NOTE: unpickling executes arbitrary code -- only load trusted model files.
with open("/content/gdrive/My Drive/Colab Notebooks/lr_model", "rb") as lr_file:
    lr = pickle.load(lr_file)

# Use LR to predict pCTR: every column except the auction outcome columns
# ('payprice', 'click') is fed to the model, and column index 1 of
# predict_proba is kept (presumably the click=1 class -- confirm via
# lr.classes_).
pCTR_lr = lr.predict_proba(validation.drop(columns=['payprice', 'click']))[:, 1]



In [5]:
# load the best NN
# NOTE(review): this loads a whole pickled module. On recent PyTorch releases
# `torch.load` defaults to `weights_only=True`, which rejects such files;
# pass `weights_only=False` there (trusted files only).
file = 'nmodel_roc_0.856_balanced_acc_0.000_model_loss_0.017.pt'
PATH = '/content/gdrive/My Drive/Colab Notebooks/' + file

model = torch.load(PATH)
model.eval()

# use NN to predict pCTR
# The validation frame is scored in 200 chunks to bound peak memory.
batches = np.array_split(validation, 200)

pCTR_NN = []
# Inference only: disable autograd so no computation graph is built or kept
# alive for each batch.
with torch.no_grad():
    for batch in batches:
        data = batch.drop(columns=['click', 'payprice']).values
        data = torch.from_numpy(data).float()

        output = model(data)

        # Flatten to one score per row. `atleast_1d` keeps a one-row batch
        # iterable: `squeeze()` alone would turn it into a 0-d array and
        # `extend` would raise.
        y_hat_prob = np.atleast_1d(
            output.detach().float().cpu().numpy().squeeze())

        pCTR_NN.extend(y_hat_prob)

pCTR_NN = np.array(pCTR_NN)
# Release the model and the last batch's tensors; only pCTR_NN is needed
# from here on.
del(data, model, output, y_hat_prob)
print('done')

done


In [6]:
# Load the persisted XGBoost model and score the validation features.
xgb_model_path = '/content/gdrive/My Drive/Colab Notebooks/xgb_model'
xgb = joblib.load(xgb_model_path)

validation_features = validation.drop(columns=['payprice', 'click']).values
# NOTE(review): if `xgb` is a classifier wrapper, `predict` returns class
# labels rather than probabilities -- confirm this really yields a pCTR.
pCTR_XGB = xgb.predict(validation_features)
print('done')



done


In [0]:
# Ensemble pCTR: unweighted element-wise mean of the three models' scores.
model_scores = np.array([pCTR_lr, pCTR_NN, pCTR_XGB])
pCTR_ensemble = np.mean(model_scores, axis=0)

In [11]:
# Baseline average CTR on the validation set: clicks / (clicks + non-clicks).
click_counts = np.bincount(validation['click'].values)
no_click, click = click_counts
avgCTR = click / (no_click + click)

environment = BiddingEnvironment(validation)

# Sweep the linear-bidding constant over [6.4, 6.5) in steps of 0.005.
# Each candidate is simulated by a fresh agent with the same starting budget,
# and its statistics are stored under the constant that produced them.
budget = 6250*1000
candidate_consts = np.arange(6.4, 6.5, .005)

results_ensemble = {}
for const in tqdm(candidate_consts):
    agent = BiddingAgent(budget, environment)
    bids = BidStrategy.linear_bidding(pCTR_ensemble, avgCTR, const)
    agent.simulate(bids, criteria='1')
    results_ensemble[const] = agent.statistics()

# One row per constant, one column per statistic.
results_ensemble = pd.DataFrame(results_ensemble).T

100%|██████████| 20/20 [00:52<00:00,  2.60s/it]


In [12]:
# Show the five sweep rows with the most clicks.
results_ensemble.sort_values(by='clicks', ascending=False).iloc[:5]

Unnamed: 0,CTR,aCPC,aCPM,budget_left,clicks,impressions,items,lost,spend,too_expensive
6.4,0.001535,38.78895,59.52401,4979.0,161.0,104916.0,303925.0,199009.0,6245021.0,0.0
6.41,0.001533,38.819851,59.529441,4.0,161.0,104990.0,303925.0,198922.0,6249996.0,13.0
6.415,0.001533,38.81987,59.527769,1.0,161.0,104993.0,303925.0,198882.0,6249999.0,50.0
6.42,0.001533,38.81987,59.523233,1.0,161.0,105001.0,303925.0,198837.0,6249999.0,87.0
6.425,0.001533,38.81987,59.528335,1.0,161.0,104992.0,303925.0,198788.0,6249999.0,145.0


In [18]:
# Evaluate the ORTB1 bidding strategy on the validation environment with a
# fresh agent and the same starting budget (6250*1000) as the linear sweep.
agent = BiddingAgent(6250*1000, environment)

# NOTE(review): 10 and 2e-8 are presumably the ORTB tuning parameters
# (c and lambda) -- confirm against BidStrategy.ortb1.
bids = BidStrategy.ortb1(pCTR_ensemble, 10, 2e-8)
agent.simulate(bids, criteria='1')
# Last expression: the statistics dict is rendered as the cell output.
agent.statistics()

{'CTR': 0.0006431584024934756,
 'aCPC': 120.1923076923077,
 'aCPM': 77.3026926073889,
 'budget_left': 0,
 'clicks': 52,
 'impressions': 80851,
 'items': 303925,
 'lost': 3698,
 'spend': 6250000,
 'too_expensive': 219376}

In [0]:
# Read the 'test' table from the same preprocessed HDF5 store on Drive.
test = pd.read_hdf(
    path_or_buf='/content/gdrive/My Drive/Colab Notebooks/we_data/preprocessed.h5',
    key='test')

In [20]:
# Score the test set with the NN, LR and XGBoost models and average their
# predictions into the ensemble pCTR.

# All three models consume the same feature columns, so drop the outcome
# columns once instead of once per model.
test_features = test.drop(columns=['click', 'payprice'])

# load the best NN
# NOTE(review): this loads a whole pickled module. On recent PyTorch releases
# `torch.load` defaults to `weights_only=True`, which rejects such files;
# pass `weights_only=False` there (trusted files only).
file = 'nmodel_roc_0.856_balanced_acc_0.000_model_loss_0.017.pt'
PATH = '/content/gdrive/My Drive/Colab Notebooks/' + file

model = torch.load(PATH)
model.eval()

# use NN to predict pCTR
# The test frame is scored in 200 chunks to bound peak memory.
batches = np.array_split(test_features, 200)

pCTR_NN_testset = []
# Inference only: disable autograd so no computation graph is built or kept
# alive for each batch.
with torch.no_grad():
    for batch in batches:
        data = torch.from_numpy(batch.values).float()

        output = model(data)

        # Flatten to one score per row. `atleast_1d` keeps a one-row batch
        # iterable: `squeeze()` alone would turn it into a 0-d array and
        # `extend` would raise.
        y_hat_prob = np.atleast_1d(
            output.detach().float().cpu().numpy().squeeze())

        pCTR_NN_testset.extend(y_hat_prob)

pCTR_NN_testset = np.array(pCTR_NN_testset)

# load the trained LR
# Opened in a `with` block so the file handle is closed after unpickling.
# NOTE: unpickling executes arbitrary code -- only load trusted model files.
with open("/content/gdrive/My Drive/Colab Notebooks/lr_model", "rb") as lr_file:
    lr = pickle.load(lr_file)

# use LR to predict pCTR: column index 1 of predict_proba is kept
# (presumably the click=1 class -- confirm via lr.classes_).
pCTR_lr_testset = lr.predict_proba(test_features)[:, 1]


# Load different models
xgb = joblib.load('/content/gdrive/My Drive/Colab Notebooks/xgb_model')
# NOTE(review): if `xgb` is a classifier wrapper, `predict` returns class
# labels rather than probabilities -- confirm this really yields a pCTR.
pCTR_XGB_testset = xgb.predict(test_features.values)

# Unweighted element-wise mean of the three models' scores.
pCTR_ensemble_testset = np.average(np.array([pCTR_NN_testset, pCTR_lr_testset, pCTR_XGB_testset]), axis=0)

print('done')



done


In [23]:
# Display the ensembled test-set pCTR array as the cell output.
pCTR_ensemble_testset

array([8.72092017e-02, 4.41478144e-01, 6.34906610e-04, ...,
       1.79798253e-04, 3.47865679e-01, 4.21911805e-05])

In [0]:
# Build the submission file: linear bids on the test set, keyed by bid id.
X_test = test.drop(columns=['click', 'payprice'])

# Linear bidding with constant 6.4 (the top row of the validation sweep when
# sorted by clicks) and the validation-set average CTR.
bids = BidStrategy.linear_bidding(pCTR_ensemble_testset, avgCTR, 6.4)

# The raw test CSV supplies the bid ids.
# NOTE(review): this assumes its row order matches the preprocessed `test`
# frame -- confirm.
test_raw = pd.read_csv(
    '/content/gdrive/My Drive/Colab Notebooks/we_data/test.csv')
bid_ids = test_raw['bidid'].values

# One row per bid: the id as a regular column plus the price rounded to one
# decimal place.
rounded_bids = np.round(bids, 1)
df_bids = pd.DataFrame(rounded_bids, index=bid_ids, columns=['bidprice'])
df_bids = df_bids.rename_axis('bidid').reset_index()

# Export to a timestamped CSV on Drive.
timestamp = time.strftime('%Y-%m-%d_%H:%M:%S')
file = '/content/gdrive/My Drive/Colab Notebooks/bid_attemnt_ensemble_linear_6.4_{}.csv'.format(
    timestamp)
df_bids.to_csv(file, index=False)