In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc

plt.rcParams['figure.figsize'] = (8, 5)
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [2]:
# Make the parent directory importable so the project-local modules below resolve.
# NOTE: '../' is a relative entry, so it is resolved against the kernel's current
# working directory; start the kernel from the notebook's folder.
import sys
sys.path.append('../')

# Project-local CTR model and linear bidding strategy used in this notebook.
from CTRModels.CTRLogistic import CTRLogistic
from algorithms.LinearBid import LinearBid

# Project-local bid-parameter search, alternative bidding strategies,
# the evaluator and the dataset loader.
from algorithms.CEAlgorithm import CEAlgorithm
from algorithms.ConstantBid import ConstantBid, ConstantBidCE
from algorithms.RandomBid import RandomBid
from Evaluator import Evaluator
from DataHandler import DataHandler

# Let pandas print up to 30 columns and use up to 1000 characters of width.
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

# Define the locations of the datasets:

In [3]:
# Locations of the train / validation / test CSV files, relative to the
# kernel's working directory.
train_set_path, vali_set_path, test_set_path = (
    '../../dataset/we_data/train.csv',
    '../../dataset/we_data/validation.csv',
    '../../dataset/we_data/test.csv',
)

In [4]:
# Build the data handler from the three CSV locations, with debug mode off.
data_handler = DataHandler(
    train_set_path,
    vali_set_path,
    test_set_path,
    debug_mode=False,
)

In [5]:
# Unpack the five objects returned by the handler.
# Presumably: train features/labels, validation features/labels and test
# features — TODO confirm against DataHandler.get_datasets.
(tx, ty,
 vx, vy,
 tex) = data_handler.get_datasets()

# Logistic

* Calibrated probability + avgCTR $\approx$ Calibrated probability + avgPCTR > avgCTR

## Calibrated probability + avg pCTR

In [30]:
# Fix NumPy's global random state so the run below is repeatable.
# NOTE(review): the search below is created with n_jobs=10; whether its worker
# processes honour this seed cannot be told from here — confirm.
np.random.seed(seed=100)

In [31]:
# Feature columns handed to the logistic CTR model in this run (order preserved).
LOGISTIC_FEATURES = (
    'slotformat',
    'adexchange',
    'os',
    'weekday',
    'advertiser',
    'browser',
    'slotvisibility',
    'slotheight',
    'keypage',
    'slotwidth',
    'hour',
    'region',
    'useragent',
    'creative',
    # 'slotprice' ranks low in feature importance but gave more clicks in the end;
    # the 'slotprice_z' variant is left out of this run.
    'slotprice',
    'city',
    'domain',
    'slotid',
    # Plain 'IP' is used; the 'IP split' variant is not.
    'IP',
    # 'usertag' is left out; 'bag of tags' is included.
    'bag of tags',
    'url',
)

# Search settings for the bid parameter (the run log prints "Optimal base_bid").
ceAlgo = CEAlgorithm(
    n_samples=100,
    p=0.2,
    max_iter=10,
    n_jobs=10,
)

# Logistic CTR model restricted to the features listed above.
ctrLog = CTRLogistic(features=list(LOGISTIC_FEATURES))

# Linear bidding strategy driven by the CTR model and tuned by the search above.
algo = LinearBid(ceAlgo, ctrModel=ctrLog, n_rounds=1)

# Evaluate the strategy on the loaded data. Despite its name, `number_clicks`
# holds the whole tuple of metrics returned by the evaluator.
ev = Evaluator(algo, data_handler)
number_clicks = ev.evaluate()

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 6
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 9
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: slotheight - orig: 14, after min freq : 14
####### categories: keypage - orig: 19, after min freq : 19
####### categories: slotwidth - orig: 21, after min freq : 21
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 35
####### categories: useragent - orig: 38, after min freq : 38
####### categories: creative - orig: 131, after min freq : 131
####### categories: slotprice - orig: 286, after min freq : 286
####### categories: city - orig: 370, after min freq : 370
####### categories: domain - ori



112.87275081737168
135.11295519030205
141.22656417814787
141.3722709575078
converge!!
Optimal mean, std: 4.951190122010974, 0.020322827903079873
Optimal base_bid: 141.3722709575078


In [29]:
# Show the tuple returned by ev.evaluate(). From the printed numbers the fields
# appear to be (clicks, CTR, spend, CPM, CPC, impressions): field 2 equals
# field 1 / field 6, field 5 equals field 3 / field 1, and field 4 equals
# 1000 * field 3 / field 6 — TODO confirm against Evaluator.
# NOTE(review): this cell's execution count (29) is lower than that of the
# evaluate cell above (31), so the stored output may come from an earlier run.
number_clicks

(161,
 0.0014919425833773503,
 5811.543,
 53.8539656945873,
 36.0965403726708,
 107913)

In [24]:
# Result tuple kept from an earlier run (execution count 24).
# "both" presumably means the 'slotprice' and 'slotprice_z' features referred
# to by the comments in the neighbouring cells — TODO confirm.
number_clicks # without both

(165,
 0.0014790114825073278,
 6063.087,
 54.34772904509641,
 36.74598181818182,
 111561)

In [25]:
# Score the CTR model: predict on vx and compute the ROC AUC of those
# predictions against the `click` labels in vy
# (vx / vy are presumably the validation split — TODO confirm).
p = algo.ctrModel.predict(vx)
roc_auc_score(vy.click, p)

0.864863936241018

In [16]:
# Result tuple kept from an earlier run (execution count 16).
# NOTE(review): the feature configuration behind this output is not recorded in
# this cell; the comment in the following cell suggests the "z-score only"
# variant — confirm before quoting these numbers.
number_clicks

(165,
 0.001500982461247362,
 5950.751,
 54.13316898333455,
 36.065157575757574,
 109928)

In [17]:
# ROC AUC of the CTR model's predictions on vx against the `click` labels in vy.
# "z-score only" presumably means 'slotprice_z' was enabled and 'slotprice'
# disabled in the feature list for this run — TODO confirm.
p = algo.ctrModel.predict(vx) # with z-score only
roc_auc_score(vy.click, p)

0.864863936241018

In [13]:
# Result tuple kept from an earlier run (execution count 13), presumably with
# both 'slotprice_z' and 'slotprice' in the feature list — TODO confirm.
number_clicks # with z-score and slotprice

(165,
 0.0015056667822532074,
 5948.434,
 54.28096654682167,
 36.051115151515155,
 109586)

In [10]:
# ROC AUC of the CTR model's predictions on vx against the `click` labels in vy.
# NOTE(review): execution count 10 — this output predates the cells above, and
# the feature configuration that produced it is not recorded here.
p = algo.ctrModel.predict(vx)
roc_auc_score(vy.click, p)

0.8647839226095246

## + avg CTR

In [12]:
# Re-evaluate the same LinearBid instance with its avg_ctr replaced by the mean
# of the `click` column of ty.
# NOTE(review): this mutates `algo` in place and writes the private attribute
# `_base_bid`, so any cell run afterwards sees the modified object.
# Presumably tells LinearBid to reuse the already-fitted CTR model — TODO confirm.
algo.use_pretrained = True
# Observed click rate of the labels in ty.
algo.avg_ctr = ty.click.mean()
# Presumably clears the stored base bid so it is searched for again — TODO confirm.
algo._base_bid = None
number_clicks = ev.evaluate()

In [13]:
# Show the result tuple of the re-evaluation with avg_ctr set from ty.click.mean().
# Fields appear to be (clicks, CTR, spend, CPM, CPC, impressions), judging by
# the arithmetic relations between the printed values — TODO confirm.
number_clicks

(142,
 0.0012839175761082831,
 6253.955,
 56.54621651190336,
 44.04193661971831,
 110599)

In [10]:
# NOTE(review): the output stored for this cell has five fields, whereas the
# other result tuples in this notebook have six, so it looks stale (produced by
# a different version of the evaluation code) — re-run to refresh.
number_clicks

(166, 0.001440597066736093, 6301446, 54.68581098672221, 2.634315996677588e-05)

## avg pCTR

In [14]:
# Same search settings as the calibrated run.
ceAlgo2 = CEAlgorithm(
    n_samples=100,
    p=0.2,
    max_iter=10,
    n_jobs=10,
)

# Logistic CTR model with probability calibration switched off and the
# model's default feature set.
ctrLog2 = CTRLogistic(calibrate_prob=False)

# Independent bidder / evaluator pair, so the first experiment's objects
# are left untouched.
algo2 = LinearBid(ceAlgo2, ctrModel=ctrLog2, n_rounds=1)
ev2 = Evaluator(algo2, data_handler)

# Note: this rebinds `number_clicks` to the result tuple of the second experiment.
number_clicks = ev2.evaluate()

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 6
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 9
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: slotheight - orig: 14, after min freq : 14
####### categories: keypage - orig: 19, after min freq : 19
####### categories: slotwidth - orig: 21, after min freq : 21
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 35
####### categories: useragent - orig: 38, after min freq : 38
####### categories: creative - orig: 131, after min freq : 131
####### categories: slotprice - orig: 286, after min freq : 286
####### categories: city - orig: 370, after min freq : 370
####### categories: domain - ori



96.7565660037767
119.55339125630691
128.49267937756036
122.81299342911112
108.25445796292934
106.64339465892144
109.85344206999187
106.84904124386385
107.37733883971593
converge!!
Optimal mean, std: 4.676291896385382, 0.01070193049999594
Optimal base_bid: 107.37733883971593


In [15]:
# Inspect the avg_ctr value held by the second bidder after evaluation.
# Presumably this is the mean predicted CTR of the uncalibrated model — TODO
# confirm against LinearBid. The displayed value is far above the click rates
# (~0.001) reported in the result tuples of this notebook.
algo2.avg_ctr

0.2450299400976302

In [16]:
# Show the result tuple of the uncalibrated experiment (ev2.evaluate()).
# Fields appear to be (clicks, CTR, spend, CPM, CPC, impressions), judging by
# the arithmetic relations between the printed values — TODO confirm.
number_clicks

(104,
 0.0009561372056890164,
 6261.28,
 57.56387272342812,
 60.20461538461538,
 108771)