In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import gc

# Default size (width, height in inches) for figures created via matplotlib.
plt.rcParams['figure.figsize'] = (8, 5)
# NOTE(review): recent IPython releases deprecate IPython.display.set_matplotlib_formats
# in favour of matplotlib_inline.backend_inline — confirm against the installed version.
from IPython.display import set_matplotlib_formats
# Request both PNG and PDF representations of inline figures.
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

# Apply seaborn's defaults with a 1.4x font scale, then switch to the "ticks" axes style.
sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [2]:
# Make the project root (one level up) importable so that the project-local
# packages imported below (models, algorithms, DataHandler, Evaluator) resolve.
import sys

# Guarded so that re-running this cell does not keep appending duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

from models.CTRLogistic import CTRLogistic
from models.CTRXGboost import CTRXGboost
from algorithms.LinearBid import LinearBid, NonLinearBid

from algorithms.CEAlgorithm import CEAlgorithm
from algorithms.ConstantBid import ConstantBid, ConstantBidCE
from algorithms.RandomBid import RandomBid
from DataHandler import DataHandler
from Evaluator import _evaluate

# Widen pandas' text rendering: show up to 30 columns and allow 1000-character lines.
pd.options.display.max_columns = 30
pd.options.display.width = 1000

import dill as pickle
import gzip

# Load data files

In [3]:
# Locations of the train / validation / test CSV files, relative to this notebook.
train_set_path, vali_set_path, test_set_path = (
    '../../dataset/we_data/train.csv',
    '../../dataset/we_data/validation.csv',
    '../../dataset/we_data/test.csv',
)

In [4]:
# Create the data handler over the three CSV files, with debug mode switched off.
data_handler = DataHandler(
    train_set_path,
    vali_set_path,
    test_set_path,
    debug_mode=False,
)

In [5]:
# Unpack the five objects returned by the handler. Presumably (features, labels)
# for train and validation plus the test features — TODO confirm against DataHandler.
tx, ty, vx, vy, tex = data_handler.get_datasets()

# Train

In [6]:
# Load the pre-trained linear-bid XGBoost strategy from its gzip-compressed pickle.
# 'rb' is the documented read mode for gzip.open (the previous 'r+b' also required
# write access to the file), and the context manager closes the handle promptly.
# Only unpickle artifacts produced by this project: unpickling can execute code.
with gzip.open('../pretrain/linearbid_xgb.bin.gz', 'rb') as fh:
    linear_xgb = pickle.load(fh)

In [10]:
# Build the non-linear bidding strategy around the CTR model held by the linear strategy.
nonlinear_xgb = NonLinearBid(ctrModel=linear_xgb.ctrModel)
# Fit the bid parameters; the cell output reports the selected `c` and `lmd`.
nonlinear_xgb.train_base_bid(data_handler)

Optimal c, lmd: 40, 1.1000000000000013e-06


In [11]:
# Score the non-linear strategy's bids against the validation labels.
# NOTE(review): judging by the displayed values the tuple looks like
# (clicks, CTR, cost, CPM, CPC, impressions) — confirm against Evaluator._evaluate.
_evaluate(vy, nonlinear_xgb.predict(vx))

(169,
 0.0014150903899453223,
 5921.501,
 49.58259857486163,
 35.0384674556213,
 119427)

In [12]:
# Persist the tuned strategy as a gzip-compressed pickle. The context manager
# closes the file deterministically, which is what finalises the gzip stream;
# previously the handle was left for the garbage collector to close.
with gzip.open('../pretrain/nonlinearbid_xgb.bin.gz', 'wb') as fh:
    pickle.dump(nonlinear_xgb, fh)

# Load pretrained

In [6]:
# Load both pre-trained XGBoost bidding strategies from gzip-compressed pickles.
# Files are opened read-only ('rb') and closed as soon as each object is loaded.
with gzip.open('../pretrain/linearbid_xgb.bin.gz', 'rb') as fh:
    linear_xgb = pickle.load(fh)
with gzip.open('../pretrain/nonlinearbid_xgb.bin.gz', 'rb') as fh:
    nonlinear_xgb = pickle.load(fh)

In [7]:
# Load the pre-computed bid predictions of the remaining strategies.
# Each file is opened read-only ('rb') and closed right after unpickling instead
# of leaving the handle open until garbage collection.
with gzip.open('../pretrain/linearbid_logistic_preds.bin.gz', 'rb') as fh:
    linear_log_preds = pickle.load(fh)
with gzip.open('../pretrain/nonlinearbid_logistic_preds.bin.gz', 'rb') as fh:
    nonlinear_log_preds = pickle.load(fh)
with gzip.open('../pretrain/cdfbid_logistic_preds.bin.gz', 'rb') as fh:
    cdf_log_preds = pickle.load(fh)
with gzip.open('../pretrain/cdfbid_xgb_preds.bin.gz', 'rb') as fh:
    cdf_xgb_preds = pickle.load(fh)

In [8]:
# Training-split bids per strategy: stored predictions are looked up by split name,
# while the two XGBoost strategy objects predict directly on the training features.
pred_log = linear_log_preds['train']
# NOTE(review): mode='valid' is passed for the training features — presumably because
# this split is labelled like validation; confirm against LinearBid.predict.
pred_xgb = linear_xgb.predict(tx, mode='valid')
pred_nnlog = nonlinear_log_preds['train']
pred_nnxgb = nonlinear_xgb.predict(tx)

# CDF-based strategies, both read from stored predictions.
pred_cdflog = cdf_log_preds['train']
pred_cdfxgb = cdf_xgb_preds['train']

In [9]:
# Stack the six strategies' bids row-wise: one row per strategy, in this fixed order.
bids_raw = np.vstack((
    pred_log,
    pred_xgb,
    pred_nnlog,
    pred_nnxgb,
    pred_cdflog,
    pred_cdfxgb,
))

In [16]:
# Place the training labels next to the transposed bids (one column per strategy).
train_y = pd.DataFrame(np.hstack((ty.values, bids_raw.T)))
# Restore the label column names; the bid columns keep their positional integer labels.
train_y.columns = [
    *ty.columns.to_list(),
    *train_y.columns[len(ty.columns):].tolist(),
]

In [17]:
# Cache the training labels + per-strategy bids. The `with` block guarantees the
# gzip stream is flushed and closed rather than relying on garbage collection.
with gzip.open('../pretrain/ensemble_train_y_preds.bin.gz', 'wb') as fh:
    pickle.dump(train_y, fh)

In [18]:
# Validation-split bids per strategy: stored predictions are looked up by split name,
# while the two XGBoost strategy objects predict directly on the validation features.
pred_log = linear_log_preds['valid']
pred_xgb = linear_xgb.predict(vx, mode='valid')
pred_nnlog = nonlinear_log_preds['valid']
pred_nnxgb = nonlinear_xgb.predict(vx)

# CDF-based strategies, both read from stored predictions.
pred_cdflog = cdf_log_preds['valid']
pred_cdfxgb = cdf_xgb_preds['valid']

In [19]:
# Stack the six strategies' validation bids row-wise, in the same order as for training.
bids_raw = np.vstack((
    pred_log,
    pred_xgb,
    pred_nnlog,
    pred_nnxgb,
    pred_cdflog,
    pred_cdfxgb,
))

In [20]:
# Place the validation labels next to the transposed bids (one column per strategy).
valid_y = pd.DataFrame(np.hstack((vy.values, bids_raw.T)))
# Restore the label column names; the bid columns keep their positional integer labels.
valid_y.columns = [
    *vy.columns.to_list(),
    *valid_y.columns[len(vy.columns):].tolist(),
]

In [21]:
# Cache the validation labels + per-strategy bids. The `with` block guarantees the
# gzip stream is flushed and closed rather than relying on garbage collection.
with gzip.open('../pretrain/ensemble_valid_y_preds.bin.gz', 'wb') as fh:
    pickle.dump(valid_y, fh)

In [35]:
# Test-split bids per strategy: stored predictions are looked up by split name,
# while the two XGBoost strategy objects predict directly on the test features.
pred_log = linear_log_preds['test']
# Unlike the train/validation cells, the linear strategy is called with mode='test' here.
pred_xgb = linear_xgb.predict(tex, mode='test')
pred_nnlog = nonlinear_log_preds['test']
pred_nnxgb = nonlinear_xgb.predict(tex)

# CDF-based strategies, both read from stored predictions.
pred_cdflog = cdf_log_preds['test']
pred_cdfxgb = cdf_xgb_preds['test']

In [40]:
# Stack the six strategies' test bids row-wise, in the same order as for train/validation.
bids_raw = np.vstack((
    pred_log,
    pred_xgb,
    pred_nnlog,
    pred_nnxgb,
    pred_cdflog,
    pred_cdfxgb,
))

In [41]:
# One column per strategy (integer labels 0..5), one row per test bid request.
# There are no labels for the test split, so only the bids are stored.
test_y = pd.DataFrame(bids_raw.T)

In [42]:
# Cache the per-strategy test bids. The `with` block guarantees the gzip stream
# is flushed and closed rather than relying on garbage collection.
with gzip.open('../pretrain/ensemble_test_preds.bin.gz', 'wb') as fh:
    pickle.dump(test_y, fh)

# Ensembling

## Load predictions

In [6]:
# Reload the cached label + per-strategy bid tables for train and validation.
# Context managers close each gzip handle as soon as the object is unpickled.
with gzip.open('../pretrain/ensemble_train_y_preds.bin.gz', 'rb') as fh:
    train_y = pickle.load(fh)
with gzip.open('../pretrain/ensemble_valid_y_preds.bin.gz', 'rb') as fh:
    valid_y = pickle.load(fh)

## Compute weights

In [22]:
from Evaluator import _evaluate
from algorithms.CEEnsemble import CEEnsemble

In [23]:
# First element of the evaluation tuple for every individual strategy on the
# training split (presumably the click count); bid columns start at position 3.
init = np.array([
    _evaluate(ty, train_y.iloc[:, col_pos].values)[0]
    for col_pos in range(3, train_y.shape[1])
])
print(init)

# Normalise so the per-strategy scores sum to one.
init = init / init.sum()

[1698 1785 1657 1779 1721 1791]


In [24]:
# Display the normalised per-strategy shares (init divided by its own sum).
init

array([0.16278401, 0.17112453, 0.15885342, 0.17054932, 0.16498898,
       0.17169974])

In [25]:
# Validation metrics of each individual strategy, one printed tuple per bid column
# (bid columns start at position 3, after the label columns).
for col_pos in range(3, valid_y.shape[1]):
    print(_evaluate(vy, valid_y.iloc[:, col_pos].values))

(166, 0.0014819311526924725, 6091.884, 54.38405227824596, 36.69809638554217, 112016)
(173, 0.0015403518769143102, 6097.353, 54.28941698126648, 35.24481502890173, 112312)
(164, 0.0013104170162443769, 6238.458, 49.84744828247477, 38.039378048780485, 125151)
(169, 0.0014150903899453223, 5921.501, 49.58259857486163, 35.0384674556213, 119427)
(157, 0.0014448606215661554, 5888.972, 54.19582002742474, 37.50937579617834, 108661)
(171, 0.002223320158102767, 5955.344, 77.43062200956938, 34.82657309941521, 76912)


In [59]:
# Fix NumPy's global seed before the ensemble-weight search.
# NOTE(review): with n_jobs=12 the global seed may not reach worker processes —
# confirm that CEEnsemble is actually reproducible under this seed.
np.random.seed(121)
# Columns 5 and 7 are excluded from both tables; given the stacking order used when
# the tables were built these look like the non-linear-logistic and CDF-logistic
# bids — verify against the cached files.
wgt = CEEnsemble(n_samples=100, p=0.2, max_iter=10, n_jobs=12).train(
    train_y.drop(columns=[5, 7]),
    valid_y.drop(columns=[5, 7]),
)

102.0 [0.23451611 0.27900058 0.19820285 0.33990962]
175.0 [0.14382936 0.16176251 0.13967941 0.56564127]
201.0 [0.0931865  0.12225656 0.13072693 0.52131517]
202.0 [0.09074314 0.14238043 0.13603581 0.52744718]
202.0 [0.1074495  0.10085408 0.13460124 0.49677356]
201.0 [0.13906216 0.12519896 0.13546826 0.55819499]
201.0 [0.15517284 0.10539188 0.15616291 0.56386529]
199.0 [0.16869958 0.12419077 0.18888206 0.58404809]
202.0 [0.14575741 0.13120231 0.19251608 0.56610793]
202.0 [0.13408145 0.11834282 0.16515827 0.57197318]
Optimal weights: [0.13408145 0.11834282 0.16515827 0.57197318]


In [60]:
# Ensemble bid = weighted sum of the retained strategies' validation bids
# (label columns are skipped via iloc[:, 3:]).
valid_bids = valid_y.drop(columns=[5, 7]).iloc[:, 3:].dot(wgt)
# NOTE(review): the full valid_y table (labels plus bids) is passed here, whereas the
# per-strategy cells pass vy — confirm _evaluate only reads the label columns.
_evaluate(valid_y, valid_bids)

(174.0,
 0.001796555569322265,
 6260.933,
 64.64433362243423,
 35.98237356321839,
 96852)

In [61]:
# Reload the cached per-strategy test bids; the context manager closes the gzip
# handle as soon as the object is unpickled.
with gzip.open('../pretrain/ensemble_test_preds.bin.gz', 'rb') as fh:
    test_y = pickle.load(fh)

In [62]:
# Weighted sum of the retained strategies' test bids. Columns 2 and 4 are dropped
# here because the test table has no label columns in front of the bids.
test_bids = test_y.drop(columns=[2, 4]).dot(wgt)

In [63]:
# bidid -> ensemble bid lookup: validation entries first, then test entries
# (a test bidid would overwrite an identical validation bidid).
dump = dict(zip(vx.bidid, valid_bids))
dump.update(zip(tex.bidid, test_bids))

In [64]:
# Persist the bidid -> bid lookup. The `with` block guarantees the gzip stream is
# flushed and closed rather than relying on garbage collection.
with gzip.open('../pretrain/prepared_ensemble.gz', 'wb') as fh:
    pickle.dump(dump, fh)

# Submission

In [65]:
# Assemble the submission table: one row per test bid id with its ensemble bid.
submission = tex.bidid.to_frame()
# Assign by position rather than by index label. Assigning a Series aligns on the
# index, so any mismatch between tex's index and the default integer index of the
# stacked predictions would silently yield NaN bids. Positional assignment matches
# how bid ids and bids are paired with zip() elsewhere in this notebook, and it
# raises if the lengths differ.
submission['bidprice'] = np.asarray(test_bids)

In [66]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
submission.to_csv('../../submissions/ensemble.csv', index=False)

In [67]:
# Collects responses from the submission API; only the (currently commented-out)
# upload cells append to it.
r = []

In [70]:
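# NOTE(review): upload is disabled. The URL below appears to embed a team-specific
# token; if this is re-enabled, read it from an environment variable instead.
# import requests
# with open('../../submissions/ensemble.csv', 'rb') as f:
#     r += [requests.post('http://deepmining.cs.ucl.ac.uk/api/upload/wining_criteria_1/31pr3HIVQEC9', 
#                       files={'file': f})]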

In [71]:
# `r` only holds a response once an upload cell has actually been executed.
# Guard the lookup so a fresh Restart & Run All (with the upload commented out)
# does not fail with IndexError on the empty list.
if r:
    print(r[-1].content.decode('utf-8'))
else:
    print('No submission response recorded - run the upload cell first.')

{
  "best result": {
    "clicks": 186, 
    "cost": 5990.831000001053, 
    "cpc": 32.208768817209965, 
    "ctr": 0.0019278207334010489, 
    "impressions": 96482
  }, 
  "daily submission limit": 5, 
  "group": "3", 
  "ranking": 1, 
  "result": {
    "clicks": 186, 
    "cost": 5990.831000001053, 
    "cpc": 32.208768817209965, 
    "ctr": 0.0019278207334010489, 
    "impressions": 96482
  }, 
  "today tried times": 4
}



In [72]:
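# NOTE(review): upload is disabled. The URL below appears to embed a team-specific
# token; if this is re-enabled, read it from an environment variable instead.
# import requests
# with open('../../submissions/ensemble.csv', 'rb') as f:
#     r += [requests.post('http://deepmining.cs.ucl.ac.uk/api/upload/wining_criteria_2/31pr3HIVQEC9', 
#                       files={'file': f})]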

In [73]:
# `r` only holds a response once an upload cell has actually been executed.
# Guard the lookup so a fresh Restart & Run All (with the upload commented out)
# does not fail with IndexError on the empty list.
if r:
    print(r[-1].content.decode('utf-8'))
else:
    print('No submission response recorded - run the upload cell first.')

{
  "daily submission limit": 5, 
  "group": "3", 
  "ranking": 3, 
  "result": {
    "clicks": 21, 
    "cost": 2304.788217015783, 
    "cpc": 109.75181985789443, 
    "ctr": 0.0026714158504007124, 
    "impressions": 7861
  }, 
  "today tried times": 1
}

