In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc

plt.rcParams['figure.figsize'] = (8, 5)
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [2]:
# Add the PyCharm project root (the parent directory) to the import path so
# the project-local modules imported below can be resolved.
import sys

# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if '../' not in sys.path:
    sys.path.append('../')

from models.CTRXGboost import CTRXGboost
from algorithms.LinearBid import LinearBid

from algorithms.CEAlgorithm import CEAlgorithm
from algorithms.ConstantBid import ConstantBid, ConstantBidCE
from algorithms.RandomBid import RandomBid
from Evaluator import Evaluator
from DataHandler import DataHandler

# Widen pandas' text rendering: show up to 30 columns per frame and allow
# 1000 characters per line before wrapping.
for option_name, option_value in (
    ('display.max_columns', 30),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Define the locations of the datasets:

In [3]:
# Locations of the train / validation / test CSV files, given relative to the
# working directory the notebook is run from.
train_set_path, vali_set_path, test_set_path = (
    '../../dataset/we_data/train.csv',
    '../../dataset/we_data/validation.csv',
    '../../dataset/we_data/test.csv',
)

In [5]:
# Create a data handler instance from the three dataset paths, with debug
# mode switched off.
data_handler = DataHandler(
    train_set_path,
    vali_set_path,
    test_set_path,
    debug_mode=False,
)

In [6]:
# Unpack the five datasets returned by the handler. Later cells use `vx` as
# the model input and `vy.click` as the labels when scoring on validation.
# NOTE(review): tx/ty are presumably the training pair and tex the test
# features -- confirm against DataHandler.get_datasets.
(
    tx, ty,
    vx, vy,
    tex,
) = data_handler.get_datasets()

# XGBoost Training

In [7]:
# Seed NumPy's global random state before the training run.
# NOTE(review): only the legacy global NumPy RNG is seeded here; whether the
# parallel workers (n_jobs) or XGBoost pick this up is not visible -- confirm.
np.random.seed(seed=111)

In [8]:
# CEAlgorithm instance, handed to LinearBid as its first argument below.
# NOTE(review): presumably a cross-entropy-method parameter search -- confirm.
ceAlgo = CEAlgorithm(
    n_samples=100,
    p=0.3,
    max_iter=10,
    n_jobs=10,
)

# CTR model built with its default settings (XGBoost-backed, judging by the
# class name and the logloss trace this cell prints).
ctrXG = CTRXGboost()

# Linear bidding strategy wired to the two components above.
algo = LinearBid(
    ceAlgo,
    ctrModel=ctrXG,
    n_rounds=1,
    submission=True,
)

# Evaluator ties the strategy to the loaded datasets.
ev = Evaluator(algo, data_handler)

# Running the evaluation prints per-feature category counts and the XGBoost
# logloss trace. NOTE: despite its name, `number_clicks` receives the whole
# result of evaluate(), which the next cell shows to be a 6-element tuple.
number_clicks = ev.evaluate()

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 5
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 8
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: keypage - orig: 19, after min freq : 19
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 33
####### categories: creative - orig: 128, after min freq : 104
####### categories: domain - orig: 8477, after min freq : 281
####### categories: city - orig: 370, after min freq : 319
####### categories: slotid - orig: 14080, after min freq : 325
[0]	validation_0-logloss:0.600089	validation_1-logloss:0.599216
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping

In [9]:
# Show the evaluation result (a 6-element tuple, not just a click count).
# NOTE(review): the meaning of each position is defined by Evaluator.evaluate
# -- confirm there before relying on any index.
number_clicks

(173,
 0.0015403518769143102,
 6097.353,
 54.28941698126648,
 35.24481502890173,
 112312)

In [10]:
# Persist the bidding strategy object (`algo`) as a gzip-compressed dill dump.
import dill
import gzip

# The context manager flushes and closes the gzip stream as soon as the dump
# finishes, instead of leaving an open handle for the garbage collector to
# finalise. 'wb' is the documented binary write mode for gzip.open.
with gzip.open('../pretrain/linearbid_xgb.bin.gz', 'wb') as model_file:
    dill.dump(algo, model_file)

In [11]:
# Display `avg_ctr_valid` from the strategy object.
# NOTE(review): presumably the average click-through rate on the validation
# set -- confirm where LinearBid assigns it.
algo.avg_ctr_valid

0.0006646376573167722

In [12]:
# Run the strategy's CTR model on the validation inputs and report the ROC AUC
# of those predictions against the observed `click` column.
# NOTE(review): assumes predict() returns scores/probabilities rather than
# hard class labels -- confirm in CTRXGboost.
p = algo.ctrModel.predict(vx)
roc_auc_score(y_true=vy.click, y_score=p)

0.8953859582775773

In [13]:
# Display the strategy's private `_base_params` attribute.
# NOTE(review): presumably the base-bid parameter selected during evaluation
# -- confirm in LinearBid; this reaches into a private attribute.
algo._base_params

139.7535276118427