In [1]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import gc

plt.rcParams['figure.figsize'] = (8, 5)
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png', 'pdf')

%matplotlib inline

sns.set(font_scale=1.4)
sns.set_style("ticks")

from sklearn.metrics import roc_auc_score

In [2]:
# add pycharm project root path for ease of importing 
import sys
sys.path.append('../')

from models.CTRLogistic import CTRLogistic
from algorithms.LinearBid import LinearBid

from algorithms.CEAlgorithm import CEAlgorithm
from algorithms.ConstantBid import ConstantBid, ConstantBidCE
from algorithms.RandomBid import RandomBid
from Evaluator import Evaluator
from DataHandler import DataHandler

pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 1000)

# define the places of the datasets:

In [3]:
train_set_path = '../../dataset/we_data/train.csv'
vali_set_path = '../../dataset/we_data/validation.csv'
test_set_path = '../../dataset/we_data/test.csv'

In [10]:
# Combine train and validations sets 
# df = pd.concat([pd.read_csv(train_set_path), pd.read_csv(vali_set_path)])
# df.to_csv('../../dataset/we_data/submission_train.csv', index=False)

In [4]:
# create a data hanlder instance:
data_handler = DataHandler('../../dataset/we_data/submission_train.csv', 
                           vali_set_path, test_set_path, debug_mode=False)

In [5]:
tx, ty, vx, vy, tex = data_handler.get_datasets()

# Logistic Training

In [7]:
np.random.seed(100)

In [8]:
ceAlgo = CEAlgorithm(n_samples=100, p=0.2, max_iter=10, n_jobs=10)
ctrLog = CTRLogistic(max_iter=500)

algo = LinearBid(ceAlgo, ctrModel=ctrLog, n_rounds=1)

ev = Evaluator(algo, data_handler)

number_clicks = ev.evaluate()

####### categories: slotformat - orig: 4, after min freq : 4
####### categories: adexchange - orig: 5, after min freq : 5
####### categories: os - orig: 6, after min freq : 6
####### categories: weekday - orig: 7, after min freq : 7
####### categories: advertiser - orig: 9, after min freq : 9
####### categories: browser - orig: 9, after min freq : 9
####### categories: slotvisibility - orig: 11, after min freq : 11
####### categories: slotheight - orig: 14, after min freq : 14
####### categories: keypage - orig: 19, after min freq : 19
####### categories: slotwidth - orig: 21, after min freq : 21
####### categories: hour - orig: 24, after min freq : 24
####### categories: region - orig: 35, after min freq : 35
####### categories: useragent - orig: 38, after min freq : 38
####### categories: creative - orig: 131, after min freq : 131
####### categories: slotprice - orig: 287, after min freq : 287
####### categories: city - orig: 370, after min freq : 370
####### categories: domain - ori

In [9]:
number_clicks

(194,
 0.0017398947094644892,
 6097.051,
 54.681581331109136,
 31.428097938144333,
 111501)

In [11]:
algo.avg_ctr_train

0.0006667230855212643

In [12]:
p = algo.ctrModel.predict(vx)
roc_auc_score(vy.click, p)

0.9499355245626201

In [13]:
algo._base_params

147.87555131613308

In [14]:
data_handler = DataHandler(train_set_path, 
                           vali_set_path, test_set_path, debug_mode=False)

In [15]:
tx, ty, vx, vy, tex = data_handler.get_datasets()

In [16]:
train_bids = algo.predict(tx)
valid_bids = algo.predict(vx)
test_bids = algo.predict(tex)

In [17]:
import dill, gzip

In [19]:
dill.dump({'train': train_bids,
          'valid': valid_bids,
          'test': test_bids}, gzip.open('../pretrain/submission_linearbid_logistic_preds.bin.gz','wb'))

In [23]:
# use alternative base bid  
algo._base_params = 173

In [13]:
bids = algo.predict(tex)

# Generate csv

In [14]:
submission = tex.bidid.to_frame()

In [15]:
submission['bidprice'] = bids

In [16]:
submission.to_csv('../../submissions/logistic_train_and_valid.csv', index=False)

In [21]:
bids[:20]

array([684.02117336, 531.4805529 ,  24.08786669, 207.59675084,
       139.12554941,  72.0125071 ,  28.67383405, 324.45525797,
        12.48264281, 154.03316957,  22.56088865, 647.82064534,
       123.8856705 ,  80.91017608,  67.23857502,  84.40791557,
        27.79163566,  26.75938326,  59.04562626,  83.09240152])

# Submit to leaderboard

In [24]:
r = []

In [25]:
# import requests
# with open('../../submissions/logistic_train_and_valid.csv', 'rb') as f:
#     r += [requests.post('http://deepmining.cs.ucl.ac.uk/api/upload/wining_criteria_1/31pr3HIVQEC9', 
#                       files={'file': f})]

In [26]:
print(r[-1].content.decode('utf-8'))

{
  "best result": {
    "clicks": 170, 
    "cost": 6249.999000002058, 
    "cpc": 36.764700000012105, 
    "ctr": 0.00140619054709084, 
    "impressions": 120894
  }, 
  "daily submission limit": 5, 
  "group": "3", 
  "ranking": 1, 
  "result": {
    "clicks": 170, 
    "cost": 6249.999000002058, 
    "cpc": 36.764700000012105, 
    "ctr": 0.00140619054709084, 
    "impressions": 120894
  }, 
  "today tried times": 5
}

