In [2]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc

plt.rcParams['figure.figsize'] = (8, 5)
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [3]:
# Put the parent of the current working directory on the import path so that
# the project-local packages imported below (the PyCharm project root) resolve.
import sys
sys.path.append('../')

from CTRModels.CTRXGboost import CTRXGboost
from algorithms.LinearBid import LinearBid

from algorithms.CEAlgorithm import CEAlgorithm
from algorithms.ConstantBid import ConstantBid, ConstantBidCE
from algorithms.RandomBid import RandomBid
from Evaluator import Evaluator
from DataHandler import DataHandler

# Widen pandas' display limits: show up to 30 columns and allow 1000
# characters per printed line before wrapping.
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

# Define the locations of the datasets:

In [4]:
# Locations of the train / validation / test CSV files, relative to the
# notebook's working directory.
train_set_path, vali_set_path, test_set_path = (
    '../../dataset/we_data/train.csv',
    '../../dataset/we_data/validation.csv',
    '../../dataset/we_data/test.csv',
)

In [5]:
# Combine the train and validation sets
# df = pd.concat([pd.read_csv(train_set_path), pd.read_csv(vali_set_path)])
# df.to_csv('../../dataset/we_data/submission_train.csv', index=False)

In [6]:
# Build the data handler from the three dataset paths, with debug mode off.
data_handler = DataHandler(
    train_set_path,
    vali_set_path,
    test_set_path,
    debug_mode=False,
)

In [7]:
# Unpack the five datasets returned by the handler. Later cells feed vx to the
# CTR model, read click labels from vy.click and bid ids from tex.bidid;
# tx/ty are presumably the training features/labels — confirm in
# DataHandler.get_datasets.
tx, ty, vx, vy, tex = data_handler.get_datasets()

# XGBoost Training

In [8]:
# Seed NumPy's global RNG so that any NumPy-based randomness in the following
# cells is repeatable.
# NOTE(review): whether the n_jobs=10 workers configured below draw from this
# seed is not visible in this notebook — confirm before relying on it.
np.random.seed(111)

In [9]:
# Optimiser and CTR model that the linear bidding strategy is built from.
# NOTE(review): CEAlgorithm is presumably a cross-entropy-method search over
# the bid parameters, and n_jobs=10 suggests parallel workers — confirm in
# algorithms/CEAlgorithm.
ceAlgo = CEAlgorithm(n_samples=100, p=0.3, max_iter=10, n_jobs=10)
ctrXG = CTRXGboost()

# submission=True presumably selects the final-submission configuration of
# LinearBid — TODO confirm what it changes.
algo = LinearBid(ceAlgo, ctrModel=ctrXG, n_rounds=1, submission=True)

ev = Evaluator(algo, data_handler)

# Run the evaluation. The log printed below shows that this also performs the
# categorical feature encoding and the XGBoost training. Despite its name, the
# result is a 6-tuple (displayed in the next cell), not a single click count.
number_clicks = ev.evaluate()

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 5
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 8
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: keypage - orig: 19, after min freq : 19
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 33
####### categories: creative - orig: 128, after min freq : 104
####### categories: domain - orig: 8477, after min freq : 281
####### categories: city - orig: 370, after min freq : 319
####### categories: slotid - orig: 14080, after min freq : 325
[0]	validation_0-logloss:0.600547	validation_1-logloss:0.599452
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping

In [10]:
# Display the evaluation result tuple; the meaning of each position is defined
# by Evaluator.evaluate, which is not visible in this notebook.
number_clicks

(170,
 0.0015225471317898884,
 6076.349,
 54.420751421790335,
 35.74322941176471,
 111655)

In [11]:
# Average CTR stored on the algorithm; the attribute name suggests it is
# measured on the validation set — confirm in LinearBid.
algo.avg_ctr_valid

0.0006646376573167723

In [12]:
# ROC AUC of the CTR model's predictions for vx, scored against the click
# labels in vy.
p = algo.ctrModel.predict(vx)
roc_auc_score(y_true=vy.click, y_score=p)

0.8914570347010106

In [28]:
# Inspect the base bid parameter currently held by the algorithm (a private
# attribute of LinearBid).
algo._base_params

145

In [38]:
# Use an alternative base bid: overwrite the value displayed by the previous
# cell (145) with a hand-picked 140.
# NOTE(review): this is a manual tweak of a private attribute with no recorded
# rationale; the submission generated below depends on it, and re-running the
# evaluation cell would presumably reset it — confirm.
algo._base_params = 140

In [39]:
# Compute the bid prices for the test set; the meaning of mode='test' is
# defined by LinearBid.predict (not visible in this notebook).
bids = algo.predict(tex, mode='test')

# Generate csv

In [40]:
# Start the submission frame from the test set's bidid column.
submission = tex.bidid.to_frame()

In [41]:
# Attach the predicted bids as the bidprice column
# (assumes bids is aligned row-for-row with tex — TODO confirm).
submission['bidprice'] = bids

In [42]:
# Write the submission file without the index column.
submission.to_csv('../../submissions/xgboost_train_and_valid.csv', index=False)

In [43]:
# Number of non-zero bids: shape of the index array that nonzero() returns for
# the first axis.
bids.nonzero()[0].shape

(253626,)

# Submit to leaderboard

In [47]:
# Accumulates the HTTP responses of the leaderboard uploads so that earlier
# replies stay available after a new submission.
r = []

In [48]:
import requests

# Upload the submission CSV to the leaderboard and keep the response.
# NOTE(review): the last path segment of the URL looks like a per-group
# credential — confirm, and if so load it from an environment variable rather
# than committing it with the notebook.
with open('../../submissions/xgboost_train_and_valid.csv', 'rb') as f:
    # requests applies no timeout by default, so a stalled server would block
    # the kernel indefinitely. Allow 10 s to connect and 300 s of server
    # silence while the submission is being scored.
    r.append(
        requests.post(
            'http://deepmining.cs.ucl.ac.uk/api/upload/wining_criteria_2/31pr3HIVQEC9',
            files={'file': f},
            timeout=(10, 300),
        )
    )

In [49]:
# Show the body of the most recent upload response, decoded as UTF-8.
print(r[-1].content.decode('utf-8'))

{
  "daily submission limit": 5, 
  "group": "3", 
  "ranking": 4, 
  "result": {
    "clicks": 14, 
    "cost": 4348.604165398205, 
    "cpc": 310.6145832427289, 
    "ctr": 0.000782472613458529, 
    "impressions": 17892
  }, 
  "today tried times": 1
}

