In [23]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc

# Default figure size (width, height) in inches for matplotlib figures.
plt.rcParams['figure.figsize'] = (8, 5)
# NOTE(review): recent IPython releases deprecate this import location in favour
# of matplotlib_inline.backend_inline -- confirm against the installed version.
from IPython.display import set_matplotlib_formats
# Request both PNG and PDF representations for inline figures.
set_matplotlib_formats('png', 'pdf')

# Draw figures inside the notebook output area.
%matplotlib inline

# Seaborn theme with fonts scaled by 1.4, then the "ticks" axes style.
# NOTE(review): sns.set() rewrites matplotlib rcParams; depending on the seaborn
# version it may override the figure size chosen above -- verify.
sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [24]:
# Add the parent directory (the PyCharm project root) to the import search path
# so the project modules imported below can be resolved.
# NOTE: '../' is relative, so it depends on the kernel's working directory.
import sys
sys.path.append('../')

from models.CTRLogistic import CTRLogistic
from algorithms.LinearBid import LinearBid

from algorithms.CEAlgorithm import CEAlgorithm
from algorithms.ConstantBid import ConstantBid, ConstantBidCE
from algorithms.RandomBid import RandomBid
from Evaluator import Evaluator
from DataHandler import DataHandler

# Widen pandas' console rendering: show up to 30 columns, 1000 characters per line.
pd.options.display.max_columns = 30
pd.options.display.width = 1000

# Define the locations of the datasets:

In [25]:
# Locations of the train / validation / test CSV files, relative to the notebook.
train_set_path, vali_set_path, test_set_path = (
    '../../dataset/we_data/train.csv',
    '../../dataset/we_data/validation.csv',
    '../../dataset/we_data/test.csv',
)

In [26]:
# Create the data handler over the three CSV files, with debug mode switched off.
data_handler = DataHandler(
    train_set_path,
    vali_set_path,
    test_set_path,
    debug_mode=False,
)

In [27]:
# Unpack the five datasets returned by the handler. Judging by later cells,
# tx / vx / tex are the train / validation / test inputs passed to predict(),
# and ty / vy are the label frames that expose a `click` column.
tx, ty, vx, vy, tex = data_handler.get_datasets()

# Logistic

## Calibrated probability + avg pCTR

In [28]:
# Seed NumPy's global random generator so NumPy-based randomness in later cells
# is repeatable.
# NOTE(review): CEAlgorithm below is created with n_jobs=10 -- confirm that its
# workers actually honour this seed.
np.random.seed(100)

In [29]:
# Optimiser settings and the logistic CTR model handed to the bidding strategy.
ceAlgo = CEAlgorithm(n_samples=100, p=0.2, max_iter=10, n_jobs=10)
ctrLog = CTRLogistic(max_iter=500)

# Linear bidding strategy built from the optimiser and the CTR model.
algo = LinearBid(ceAlgo, ctrModel=ctrLog, n_rounds=1)

# The evaluator receives both the strategy and the dataset handler.
ev = Evaluator(algo, data_handler)

# NOTE: despite the name, the recorded output shows evaluate() returns a
# 6-element tuple rather than a single click count.
number_clicks = ev.evaluate()

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 6
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 9
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: slotheight - orig: 14, after min freq : 14
####### categories: keypage - orig: 19, after min freq : 19
####### categories: slotwidth - orig: 21, after min freq : 21
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 35
####### categories: useragent - orig: 38, after min freq : 38
####### categories: creative - orig: 131, after min freq : 131
####### categories: slotprice - orig: 286, after min freq : 286
####### categories: city - orig: 370, after min freq : 370
####### categories: domain - ori

In [30]:
# Display the evaluation result. The six values look like
# (clicks, CTR, spend, CPM, CPC, impressions): 166 / 112016 matches the second
# entry and 6091.884 / 166 matches the fifth -- TODO confirm against Evaluator.
number_clicks

(166,
 0.0014819311526924725,
 6091.884,
 54.38405227824596,
 36.69809638554217,
 112016)

In [11]:
# Inspect the avg_ctr_test attribute stored on the bidding algorithm.
# NOTE(review): this cell's execution count (11) is lower than its neighbours',
# so the saved output may come from an earlier run -- re-run top to bottom.
algo.avg_ctr_test

0.0006587308902696465

In [32]:
# ROC AUC of the CTR model's predictions on the validation inputs,
# scored against the validation click labels.
p = algo.ctrModel.predict(vx)
roc_auc_score(y_true=vy.click, y_score=p)

0.8648783285890742

In [12]:
# Bids from the linear strategy for each split, computed in
# train -> valid -> test order with the matching `mode` flag.
train_bids, valid_bids, test_bids = (
    algo.predict(features, mode=split)
    for features, split in ((tx, 'train'), (vx, 'valid'), (tex, 'test'))
)

In [13]:
import dill, gzip

# Persist the LinearBid bids for all three splits as a single gzip-compressed
# dill file. The context manager closes the file handle as soon as the dump
# finishes (or fails) instead of leaving it open until garbage collection,
# so the compressed stream is reliably flushed to disk.
with gzip.open('../pretrain/linearbid_logistic_preds.bin.gz', 'wb') as preds_file:
    dill.dump({'train': train_bids,
               'valid': valid_bids,
               'test': test_bids}, preds_file)

### Non-linear

In [15]:
from algorithms.LinearBid import NonLinearBid

In [16]:
# Non-linear bidding strategy that reuses the CTR model object already held by
# `algo` rather than constructing a new one.
nonlinear_log = NonLinearBid(ctrModel=algo.ctrModel)
# Fit the strategy's base-bid parameters using the data handler; the recorded
# output reports the selected "c" and "lmd" values.
nonlinear_log.train_base_bid(data_handler)

Optimal c, lmd: 80, 1.4000000000000018e-06


In [18]:
# Bids from the non-linear strategy for the train / valid / test inputs, in that
# order. These rebind the names that previously held the LinearBid bids.
train_bids, valid_bids, test_bids = map(nonlinear_log.predict, (tx, vx, tex))

In [19]:
import dill
import gzip  # imported here too so this cell does not rely on an earlier cell's import

# Persist the NonLinearBid bids for all three splits as a single gzip-compressed
# dill file. The context manager closes the file handle as soon as the dump
# finishes (or fails) instead of leaving it open until garbage collection,
# so the compressed stream is reliably flushed to disk.
with gzip.open('../pretrain/nonlinearbid_logistic_preds.bin.gz', 'wb') as preds_file:
    dill.dump({'train': train_bids,
               'valid': valid_bids,
               'test': test_bids}, preds_file)

In [21]:
from Evaluator import _evaluate

In [22]:
# Score the validation bids against the validation labels with the module-level
# helper. In notebook order, valid_bids was last assigned from
# nonlinear_log.predict(vx), so this evaluates the non-linear strategy.
_evaluate(vy, valid_bids)

(164,
 0.0013104170162443769,
 6238.458,
 49.84744828247477,
 38.039378048780485,
 125151)

## + avg CTR

In [16]:
# Re-evaluate the linear strategy with avg_ctr taken from the training labels.
# NOTE(review): this mutates `algo` in place, so any earlier cell re-run after
# this one will see the modified object.
# Presumably makes evaluate() reuse the already-fitted CTR model -- confirm.
algo.use_pretrained = True
# Mean of the training click labels.
algo.avg_ctr = ty.click.mean()
# Clear the stored base parameters; the recorded output shows a base bid being
# searched for again afterwards.
algo._base_params = None
number_clicks = ev.evaluate()

125.18548705985702
141.85238346419584
144.3068124533798
144.32580144279368
converge!!
Optimal mean, std: 4.972043214925374, 0.0077510061610544185
Optimal base_bid: 144.32580144279368


In [17]:
# Display the result tuple of the re-evaluation. The recorded values are
# consistent with (clicks, CTR, spend, CPM, CPC, impressions), e.g.
# 166 / 112126 matches the second entry -- TODO confirm against Evaluator.
number_clicks

(166,
 0.0014804773201576797,
 6118.484,
 54.567932504503865,
 36.858337349397594,
 112126)