In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm
import gc

plt.rcParams['figure.figsize'] = (8, 5)
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [2]:
# Add the PyCharm project root to the import path so the project-local
# packages below can be imported from this notebook's directory.
import sys
if '../' not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append('../')

from models.CTRLogistic import CTRLogistic

from algorithms.CEAlgorithm import CEAlgorithm
# Explicit names instead of `import *`: CDFBid and CECDFAlgorithm are the only
# names the cells of this notebook use from this module, and spelling them out
# keeps the namespace clean and shows where each name comes from.
from algorithms.CDFBid import CDFBid, CECDFAlgorithm
from Evaluator import Evaluator
from DataHandler import DataHandler

# Widen pandas' console display so wide frames are not truncated.
pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

# Define the locations of the datasets:

In [3]:
# Locations of the train / validation / test CSV files, relative to the
# directory this notebook is run from.
train_set_path, vali_set_path, test_set_path = (
    '../../dataset/we_data/train.csv',
    '../../dataset/we_data/validation.csv',
    '../../dataset/we_data/test.csv',
)

In [4]:
# Create the data handler instance from the three CSV locations, with debug
# mode switched off.
data_handler = DataHandler(
    train_set_path,
    vali_set_path,
    test_set_path,
    debug_mode=False,
)

In [5]:
# Unpack the five objects returned by the data handler.
# NOTE(review): presumably train features/labels (tx, ty), validation
# features/labels (vx, vy) and test features (tex) -- confirm against
# DataHandler.get_datasets.
(tx, ty, vx, vy, tex) = data_handler.get_datasets()

In [6]:
# Fix NumPy's global RNG so the run is repeatable.
np.random.seed(100)

# Build the bidder: the CE-CDF algorithm and a logistic CTR model, run for a
# single round. The two components are constructed in the same order as the
# arguments are passed, so any RNG use in their constructors is unchanged.
bid_algorithm = CECDFAlgorithm()
ctr_model = CTRLogistic(max_iter=500)
cdf_bid = CDFBid(bid_algorithm, ctr_model, n_rounds=1)

# Evaluate the bidder with the shared data handler.
# NOTE: despite its name, `number_clicks` receives whatever ev.evaluate()
# returns; in the recorded run that is a 6-element tuple, not a single count.
# This cell took roughly 33 minutes in the recorded run (see the progress bar
# in its output).
ev = Evaluator(cdf_bid, data_handler)
number_clicks = ev.evaluate()

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 6
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 9
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: slotheight - orig: 14, after min freq : 14
####### categories: keypage - orig: 19, after min freq : 19
####### categories: slotwidth - orig: 21, after min freq : 21
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 35
####### categories: useragent - orig: 38, after min freq : 38
####### categories: creative - orig: 131, after min freq : 131
####### categories: slotprice - orig: 286, after min freq : 286
####### categories: city - orig: 370, after min freq : 370
####### categories: domain - ori

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [33:26<00:00, 189.88s/it]


142 284.37003030883335 18.511161303695932
186 488.23733076340017 11.701268245932262
181 614.3011719679836 9.396981296571665
192 798.1356906008754 7.824958933952929
194 842.4041851338989 9.33430808260245
195 846.2297463601091 10.25252564344463
186 914.5090300293996 10.724446211240585
195 928.4178516597553 14.229708348314809
193 941.6766403004673 17.795827982513366
193 944.3954302052351 24.32832287434821
Optimal base bid mean, std: 6.849404961610803, 0.04774944512590976
Optimal min bid mean, std: 3.1138069551322176, 0.3945485188730979


In [7]:
# Display the value returned by ev.evaluate(). In the recorded output this is
# a 6-element tuple in which element 2 == element 1 / element 6,
# element 4 == 1000 * element 3 / element 6 and element 5 == element 3 /
# element 1 -- presumably (clicks, CTR, spend, CPM, CPC, impressions);
# TODO confirm against Evaluator.evaluate.
number_clicks

(157,
 0.0014448606215661554,
 5888.972,
 54.19582002742474,
 37.50937579617834,
 108661)

In [9]:
# Run the fitted bidder over the train, validation and test inputs, in that
# order, keeping one prediction result per split.
train_bids, valid_bids, test_bids = [
    cdf_bid.predict(split) for split in (tx, vx, tex)
]

In [None]:
import dill, gzip

In [None]:
# Map each bid id to its predicted bid, separately for validation and test.
vdict = {bid_id: bid for bid_id, bid in zip(vx.bidid, valid_bids)}
tedict = {bid_id: bid for bid_id, bid in zip(tex.bidid, test_bids)}

# Merge both lookups into one; if a bid id occurred in both splits the test
# entry would overwrite the validation entry.
dump = dict(vdict)
dump.update(tedict)

In [8]:
# Serialize the merged bidid -> bid lookup to a gzip-compressed dill file.
# The `with` block closes the gzip stream as soon as the dump finishes, so the
# archive is fully flushed to disk instead of relying on garbage collection
# of an anonymous file handle.
with gzip.open('../pretrain/prepared_cdf_logistic.gz', 'wb') as fh:
    dill.dump(dump, fh)

In [11]:
# Serialize the per-split predictions (keys 'train', 'valid', 'test') to a
# gzip-compressed dill file. The `with` block guarantees the gzip stream is
# closed and flushed once the dump completes, rather than leaking the handle.
with gzip.open('../pretrain/cdfbid_logistic_preds.bin.gz', 'wb') as fh:
    dill.dump({'train': train_bids,
               'valid': valid_bids,
               'test': test_bids}, fh)