In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import gc

# Notebook-wide plotting configuration.
# Default figure size for every matplotlib figure: 8 x 5 inches.
plt.rcParams['figure.figsize'] = (8, 5)
# NOTE(review): `IPython.display.set_matplotlib_formats` appears to be deprecated
# in recent IPython releases — confirm against the installed version.
from IPython.display import set_matplotlib_formats
# Ask the inline backend to emit figures in both PNG and PDF form.
set_matplotlib_formats('png', 'pdf')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Seaborn defaults with fonts scaled by 1.4, then switch to the "ticks" style.
# NOTE(review): depending on the IPython/seaborn versions, the magic above or
# `sns.set` may reset rcParams set earlier — confirm the figure size survives.
sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [2]:
# Add the PyCharm project root to the import path so the project packages
# (`models`, `algorithms`, `Evaluator`, `DataHandler`) can be imported.
# The entry is relative, so it resolves against the kernel's working directory.
import sys
sys.path.append('../')

from models.CTRXGboost import CTRXGboost

from algorithms.CEAlgorithm import CEAlgorithm
# Explicit names instead of `import *`: these are the two names this notebook
# takes from the module, and listing them makes their origin visible.
from algorithms.CDFBid import CDFBid, CECDFAlgorithm
from Evaluator import Evaluator
from DataHandler import DataHandler

# Widen pandas' console rendering: up to 30 columns, 1000 characters per line.
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

# The locations of the datasets are defined in the next cell.

In [3]:
# Locations of the train / validation / test CSV files, relative to the
# notebook's working directory.
train_set_path, vali_set_path, test_set_path = (
    '../../dataset/we_data/train.csv',
    '../../dataset/we_data/validation.csv',
    '../../dataset/we_data/test.csv',
)

In [4]:
# Create a data handler instance from the three dataset paths
# (debug mode switched off).
data_handler = DataHandler(
    train_set_path,
    vali_set_path,
    test_set_path,
    debug_mode=False,
)

In [5]:
# Unpack the five objects returned by the data handler.
# presumably train X/y, validation X/y and test X — TODO confirm against DataHandler.get_datasets
(tx, ty,
 vx, vy,
 tex) = data_handler.get_datasets()

In [6]:
# Fix NumPy's global random seed before building and training the bidder.
np.random.seed(111)

# CDF bidder combining a CECDFAlgorithm instance with an XGBoost CTR model.
cdf_bid = CDFBid(
    CECDFAlgorithm(),
    CTRXGboost(),
    n_rounds=1,
    n_ctr=5,
)

# Training reads its data through the handler; the captured output below
# reports a run of roughly 49 minutes.
cdf_bid.train(data_handler)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 5
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 8
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: keypage - orig: 19, after min freq : 19
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 33
####### categories: creative - orig: 128, after min freq : 104
####### categories: domain - orig: 8477, after min freq : 281
####### categories: city - orig: 370, after min freq : 319
####### categories: slotid - orig: 14080, after min freq : 325
[0]	validation_0-logloss:0.600089	validation_1-logloss:0.599216
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping

 20%|████████████████▌                                                                  | 1/5 [10:15<41:02, 615.74s/it]

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 5
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 8
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: keypage - orig: 19, after min freq : 19
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 33
####### categories: creative - orig: 128, after min freq : 104
####### categories: domain - orig: 8428, after min freq : 282
####### categories: city - orig: 369, after min freq : 317
####### categories: slotid - orig: 14083, after min freq : 323
[0]	validation_0-logloss:0.600266	validation_1-logloss:0.599328
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping

 40%|█████████████████████████████████▏                                                 | 2/5 [19:55<30:14, 604.85s/it]

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 5
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 8
####### categories: slotvisibility - orig: 11, after min freq : 10
####### categories: keypage - orig: 19, after min freq : 19
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 33
####### categories: creative - orig: 128, after min freq : 104
####### categories: domain - orig: 8356, after min freq : 279
####### categories: city - orig: 369, after min freq : 319
####### categories: slotid - orig: 13971, after min freq : 330
[0]	validation_0-logloss:0.600777	validation_1-logloss:0.599576
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping

 60%|█████████████████████████████████████████████████▊                                 | 3/5 [34:27<22:50, 685.19s/it]

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 5
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 8
####### categories: slotvisibility - orig: 11, after min freq : 10
####### categories: keypage - orig: 19, after min freq : 19
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 33
####### categories: creative - orig: 128, after min freq : 104
####### categories: domain - orig: 8419, after min freq : 274
####### categories: city - orig: 370, after min freq : 317
####### categories: slotid - orig: 13962, after min freq : 325
[0]	validation_0-logloss:0.600109	validation_1-logloss:0.599217
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping

 80%|██████████████████████████████████████████████████████████████████▍                | 4/5 [42:47<10:29, 629.61s/it]

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 5
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 8
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: keypage - orig: 19, after min freq : 19
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 33
####### categories: creative - orig: 129, after min freq : 104
####### categories: domain - orig: 8321, after min freq : 286
####### categories: city - orig: 370, after min freq : 318
####### categories: slotid - orig: 14046, after min freq : 324
[0]	validation_0-logloss:0.600799	validation_1-logloss:0.599554
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [49:23<00:00, 559.47s/it]
  alpha = mu * ((mu * (1 - mu)) / var - 1)
  beta = (1 - mu) * ((mu * (1 - mu)) / var - 1)
  cond = logical_and(cond, (asarray(arg) > 0))


146 349.9412891634894 17.80783321432661
136 474.6859771387371 9.819457754802139
202 497.8135739650449 4.019093523159486
202 477.9608976706322 3.86575893933015
202 511.72555375151137 2.873328576064144
202 523.2714059042581 2.6466973934112175
202 513.3322097204476 2.1863481811844214
202 527.2599887897559 2.059205350548423
202 507.0839006330521 1.592823348894755
202 481.7082268618426 1.655346782885599
Optimal base bid mean, std: 6.163333666859772, 0.16736143784410396
Optimal min bid mean, std: 0.12020523453823247, 0.8761338811253185


In [9]:
from Evaluator import _evaluate

In [12]:
# Predict bids for the train, validation and test feature sets, in that order.
train_bids, valid_bids, test_bids = [
    cdf_bid.predict(features) for features in (tx, vx, tex)
]

  alpha = mu * ((mu * (1 - mu)) / var - 1)
  beta = (1 - mu) * ((mu * (1 - mu)) / var - 1)
  cond = logical_and(cond, (asarray(arg) > 0))
  alpha = mu * ((mu * (1 - mu)) / var - 1)
  beta = (1 - mu) * ((mu * (1 - mu)) / var - 1)
  cond = logical_and(cond, (asarray(arg) > 0))


In [11]:
# Evaluate the bids predicted for the validation set (`valid_bids`) against `vy`;
# the returned tuple is displayed as the cell output.
# NOTE(review): the meaning of each tuple field is defined by Evaluator._evaluate — confirm there.
_evaluate(vy, valid_bids)

(171,
 0.002223320158102767,
 5955.344,
 77.43062200956938,
 34.82657309941521,
 76912)

In [7]:
import dill
import gzip

# Persist the trained bidder as a gzip-compressed dill pickle.
# The `with` block closes the gzip stream deterministically, so the compressed
# data and gzip trailer are flushed to disk instead of depending on when the
# file object happens to be garbage-collected.
with gzip.open('../pretrain/cdf_xgb.bin.gz', 'w+b') as model_file:
    dill.dump(cdf_bid, model_file)

In [None]:
# Map each bid id to its predicted bid, for the validation and the test set.
vdict = {bid_id: bid for bid_id, bid in zip(vx.bidid, valid_bids)}
tedict = {bid_id: bid for bid_id, bid in zip(tex.bidid, test_bids)}

# Merge both lookups into one dict; on a duplicate bid id the test entry wins,
# because it is merged last.
dump = {**vdict, **tedict}

In [13]:
# Persist the bid-id -> bid lookup (`dump`) as a gzip-compressed dill pickle.
# The `with` block guarantees the gzip stream is closed and fully flushed,
# rather than leaving the handle open until garbage collection.
with gzip.open('../pretrain/prepared_cdf_xgb.gz', 'w+b') as prepared_file:
    dill.dump(dump, prepared_file)

In [14]:
# Persist the raw bid predictions for all three splits as a gzip-compressed
# dill pickle. The `with` block guarantees the gzip stream is closed and fully
# flushed, rather than leaving the handle open until garbage collection.
with gzip.open('../pretrain/cdfbid_xgb_preds.bin.gz', 'wb') as preds_file:
    dill.dump(
        {
            'train': train_bids,
            'valid': valid_bids,
            'test': test_bids,
        },
        preds_file,
    )