In [1]:
import sys
import csv
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
import numpy as np
from math import sqrt

from IPython.display import clear_output

In [2]:
# Load the train / validation / test splits from the local data/ folder.
train_set = pd.read_csv("data/train.csv")
valid_set = pd.read_csv("data/validation.csv")
test_set = pd.read_csv("data/test.csv")

# One (rows, columns) tuple per line, in train / validation / test order.
print(*(frame.shape for frame in (train_set, valid_set, test_set)), sep="\n")

(2430981, 25)
(303925, 25)
(303375, 22)


In [24]:
# downsample train set, too big
def downsampling(data, ratio=500, random_state=615):
    """Shrink a frame by keeping every click row and a random subset of non-click rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``click`` column whose values are 0 (no click) or 1 (click).
    ratio : int, optional
        Number of non-click rows to keep per click row (default 500).
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the subset is reproducible
        (default 615).

    Returns
    -------
    pandas.DataFrame
        The sampled non-click rows followed by all click rows; original index
        labels are kept.
    """
    no_click = data[data['click'] == 0]
    do_click = data[data['click'] == 1]

    # Sampling without replacement raises if more rows are requested than
    # exist, so cap the request at the number of available non-click rows.
    nums = min(len(do_click) * ratio, len(no_click))

    new_no_click = no_click.sample(n=nums, random_state=random_state)
    return pd.concat([new_no_click, do_click])

In [25]:
# TODO: decide whether usertag should be dropped for problem 3 (it is currently multi-hot encoded below)
# slotprice is treated as a continuous variable

def data_preprocessing(data, enforce_cols = None):
    """Turn a raw bid-log frame into a numeric feature frame.

    Steps, in order:
      1. drop the identifier-like columns listed in ``to_drop_columns``;
      2. expand the comma-separated ``usertag`` string into one
         ``usertag_<tag>`` indicator column per tag;
      3. replace every missing value with the string "unknown";
      4. one-hot encode with ``pd.get_dummies`` using its defaults;
      5. optionally align the columns with ``enforce_cols``;
      6. sort the columns by name.

    The caller's frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; must contain ``usertag`` and every column in
        ``to_drop_columns``.
    enforce_cols : iterable of str, optional
        Column names of an already-processed (training) frame. When given,
        columns that are not in it are removed and columns that are missing
        are added, filled with 0.

    Returns
    -------
    pandas.DataFrame
        Encoded frame with columns in sorted order.
    """
    # drop features
    to_drop_columns = ['bidid', 'keypage', 'userid', 'url', 'urlid',
                       'IP', 'domain', 'slotid', 'creative'] # , 'usertag'
    data = data.drop(to_drop_columns, axis=1)

    # "a,b,c" -> "a|b|c" -> one 0/1 column per tag ('|' is the default
    # separator of Series.str.get_dummies).
    tag_dummies = (
        data['usertag']
        .str.split(',')
        .str.join('|')
        .str.get_dummies()
        .add_prefix('usertag_')
    )
    data = data.join(tag_dummies).drop(['usertag'], axis=1)

    # NOTE: get_dummies is called without `columns=`, so integer-coded fields
    # (e.g. weekday, hour, city) stay numeric. A numeric column that contained
    # missing values now holds the string "unknown" and therefore does get
    # one-hot encoded (see e.g. adexchange_unknown in the outputs).
    data = pd.get_dummies(data.fillna("unknown"))

    # match test set and training set columns
    if enforce_cols is not None:
        wanted = set(enforce_cols)
        present = set(data.columns)
        extra = [col for col in data.columns if col not in wanted]
        missing = {col: 0 for col in enforce_cols if col not in present}
        data = data.drop(extra, axis=1).assign(**missing)

    return data.reindex(sorted(data.columns), axis=1)

In [27]:
# Down-sample and encode the training split first; validation and test are
# then encoded against the training columns so all three frames line up.
train = data_preprocessing(downsampling(train_set))
valid, test = (
    data_preprocessing(raw_split, train.columns)
    for raw_split in (valid_set, test_set)
)

print(*(frame.shape for frame in (train, valid, test)), sep="\n")

(898293, 136)
(303925, 136)
(303375, 136)


In [28]:
# Remove the auction price columns from all three feature frames. `click`
# stays in train/valid (it is split off as the label in a later cell) but is
# removed from test, where it only exists as a zero-filled placeholder added
# when the test columns were aligned with the training columns.
to_drop_columns = ['bidprice', 'payprice']

# errors='ignore' makes the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
train = train.drop(columns=to_drop_columns, errors='ignore')
valid = valid.drop(columns=to_drop_columns, errors='ignore')
test = test.drop(columns=to_drop_columns + ['click'], errors='ignore')

print(train.shape)
print(valid.shape)
print(test.shape)

(898293, 134)
(303925, 134)
(303375, 133)


In [21]:
# Preview the first rows of the processed training frame.
# NOTE(review): the saved output shows browser_* columns, which the current
# data_preprocessing does not create — presumably a stale output; re-run to refresh.
train.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,browser_chrome,browser_firefox,browser_ie,browser_maxthon,...,usertag_13874,usertag_14273,usertag_15398,usertag_16593,usertag_16617,usertag_16661,usertag_16706,usertag_16751,usertag_16753,weekday
0,0,1,0,0,0,3427,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5
1,1,0,0,0,0,2821,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,3427,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
3,1,0,0,0,0,1458,0,0,1,0,...,0,0,0,0,0,0,0,0,0,6
4,0,1,0,0,0,2259,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5


In [10]:
# Preview the first rows of the processed validation frame.
valid.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,city,click,hour,region,...,useragent_windows_chrome,useragent_windows_firefox,useragent_windows_ie,useragent_windows_maxthon,useragent_windows_opera,useragent_windows_other,useragent_windows_safari,useragent_windows_sogou,useragent_windows_theworld,weekday
0,1,0,0,0,0,1458,79,0,20,79,...,0,0,1,0,0,0,0,0,0,4
1,1,0,0,0,0,3476,79,0,21,79,...,1,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,3358,2,0,8,2,...,0,0,1,0,0,0,0,0,0,4
3,0,1,0,0,0,3358,205,0,15,201,...,1,0,0,0,0,0,0,0,0,5
4,0,1,0,0,0,3476,135,0,18,134,...,1,0,0,0,0,0,0,0,0,1


In [11]:
# Preview the first rows of the processed test frame.
test.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,city,hour,region,slotformat_0,...,useragent_windows_chrome,useragent_windows_firefox,useragent_windows_ie,useragent_windows_maxthon,useragent_windows_opera,useragent_windows_other,useragent_windows_safari,useragent_windows_sogou,useragent_windows_theworld,weekday
0,0,0,1,0,0,3427,159,12,146,1,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,2997,1,14,1,0,...,0,0,0,0,0,0,0,0,0,3
2,1,0,0,0,0,1458,34,19,27,0,...,0,0,1,0,0,0,0,0,0,5
3,0,0,0,1,0,2821,245,21,238,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,3386,35,20,27,1,...,0,0,1,0,0,0,0,0,0,2


In [29]:
# Split the label column (`click`) from the features; the test frame is
# used as the feature matrix as it is.
train_y, valid_y = train['click'], valid['click']
train_x = train.drop(columns='click')
valid_x = valid.drop(columns='click')
test_x = test

In [30]:
def rmse(pred, train):
    """Root-mean-squared error between `pred` and the labels held by `train`.

    `train` only needs a ``get_label()`` method that returns the true values
    in the same order as `pred`.
    """
    residual = pred - train.get_label()
    return np.sqrt((residual ** 2).mean())

def KFold_model(trainX, trainY, test, n_splits=5, num_boost_round=500,
                early_stopping_rounds=50, random_state=615):
    """Train one XGBoost model per K-fold split and average their predictions.

    For every fold a booster is trained on the in-fold rows, with early
    stopping monitored on the held-out rows, and is then used to predict
    `test`. The per-fold predictions are averaged element-wise.

    Parameters
    ----------
    trainX : array-like, shape (n_samples, n_features)
        Training features; must be convertible to a numeric array.
    trainY : array-like, shape (n_samples,)
        Training labels.
    test : array-like, shape (n_test, n_features)
        Rows to predict, with the same feature order as `trainX`.
    n_splits : int, optional
        Number of folds (default 5).
    num_boost_round : int, optional
        Maximum boosting rounds per fold (default 500).
    early_stopping_rounds : int, optional
        Stop a fold when the held-out RMSE has not improved for this many
        rounds (default 50).
    random_state : int, optional
        Seed for the shuffled fold split (default 615).

    Returns
    -------
    list of float
        One averaged prediction per row of `test`.
    """
    # An explicit float dtype avoids an object array when the frame mixes
    # bool dummy columns with integer columns. Per the XGBoost documentation
    # DMatrix holds feature values as 32-bit floats, so the cast should not
    # change what the model sees.
    trainX = np.asarray(trainX, dtype=np.float32)
    trainY = np.asarray(trainY, dtype=np.float32)
    test = np.asarray(test, dtype=np.float32)

    # NOTE(review): 'silent' is presumably deprecated in newer XGBoost
    # releases in favour of 'verbosity' — confirm against the installed version.
    xgb_params = {'eta': 0.3,
                  'max_depth': 3,
                  'subsample': 1.0,
                  'colsample_bytree': 1.0,
                  'objective': 'reg:logistic',
                  'eval_metric': 'rmse',
                  'seed': 99,
                  'silent': True}

    # The test matrix does not depend on the fold, so build it once.
    d_test = xgb.DMatrix(test)
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    fold_preds = []
    for train_index, holdout_index in kf.split(trainX):
        d_train = xgb.DMatrix(trainX[train_index], trainY[train_index])
        d_valid = xgb.DMatrix(trainX[holdout_index], trainY[holdout_index])

        # The last watchlist entry ('valid') is the one used for early stopping.
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(xgb_params, d_train, num_boost_round, watchlist,
                          maximize=False, verbose_eval=50,
                          early_stopping_rounds=early_stopping_rounds)

        # NOTE(review): depending on the XGBoost version, predict() may use
        # every trained round rather than only the best iteration — confirm.
        fold_preds.append(model.predict(d_test))

    # Element-wise mean over folds, accumulated in float64.
    return np.mean(np.asarray(fold_preds, dtype=np.float64), axis=0).tolist()

In [31]:
# Fold-averaged XGBoost predictions for every validation row
# (reused as pCTR by the bidding cells).
valid_p = KFold_model(train_x, train_y, valid_x)

[0]	train-rmse:0.355583	valid-rmse:0.355573
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.04006	valid-rmse:0.041173
[100]	train-rmse:0.039295	valid-rmse:0.040918
[150]	train-rmse:0.038851	valid-rmse:0.040775
[200]	train-rmse:0.038505	valid-rmse:0.040706
[250]	train-rmse:0.038227	valid-rmse:0.040696
[300]	train-rmse:0.037961	valid-rmse:0.040664
[350]	train-rmse:0.03769	valid-rmse:0.040667
Stopping. Best iteration:
[300]	train-rmse:0.037961	valid-rmse:0.040664

[0]	train-rmse:0.355556	valid-rmse:0.355627
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.039775	valid-rmse:0.041695
[100]	train-rmse:0.038997	valid-rmse:0.041138
[150]	train-rmse:0.038713	valid-rmse:0.04109
[200]	train-rmse:0.038333	valid-rmse:0.041015
[250]	train-rmse:0.03808	valid-rmse:0.040935
Sto

## Linear bidding strategy on XGBoost pCTR predictions

In [34]:
# Sweep the base bid of the linear strategy  bid = base_bid * pCTR / avgCTR
# over the validation set and record clicks / CTR / spend / aCPM / aCPC for
# each value tried.
pCTR = valid_p
avgCTR = train_set['click'].sum() / len(train_set)

result_base_bid = []
result_clicks = []
result_CTR = []
result_spend = []
result_aCPM = []
result_aCPC = []

# Loop-invariant inputs, pulled out of pandas once: indexing the frame row by
# row inside the nested loop is far slower than iterating plain lists.
pctr_values = np.asarray(pCTR, dtype=float)
payprices = valid_set['payprice'].tolist()
clicked = (valid_set['click'].astype(str) == '1').tolist()
assert len(pctr_values) == len(valid_set), "need one pCTR per validation row"

# Budget is in the same units as payprice; spend is divided by 1000 only
# after the replay.
budget = 6250 * 1000

for base_bid in range(20, 150, 1):
    print(base_bid)
    bids = (pctr_values * base_bid / avgCTR).tolist()

    clicks = 0
    winning_impressions = 0
    spend = 0

    # Replay the auctions in log order; the remaining budget caps each bid.
    for wanted_bid, payprice, was_clicked in zip(bids, payprices, clicked):
        bid = min(wanted_bid, budget - spend)
        if bid >= payprice:
            spend += payprice
            winning_impressions += 1
            if was_clicked:
                clicks += 1
    spend /= 1000

    # Guard each ratio on its own denominator: with no won impressions CTR
    # used to raise ZeroDivisionError, and aCPM was reported as 0 whenever
    # impressions were won without any click.
    aCPM = spend / winning_impressions * 1000 if winning_impressions else 0
    aCPC = spend / clicks if clicks else 0
    CTR = clicks / winning_impressions if winning_impressions else 0

    result_base_bid.append(base_bid)
    result_clicks.append(clicks)
    result_CTR.append(CTR)
    result_spend.append(spend)
    result_aCPM.append(aCPM)
    result_aCPC.append(aCPC)

    clear_output()
    print(clicks)

79
21


KeyboardInterrupt: 

In [13]:
# Persist the base-bid sweep: one row per base bid tried.
submission = pd.DataFrame({
    'base_bid': result_base_bid,
    'Clicks': result_clicks,
    'CTR': result_CTR,
    'Spend': result_spend,
    'aCPM': result_aCPM,
    'aCPC': result_aCPC,
})
submission.to_csv('xgboost_linear.csv', index=False)

In [17]:
# Submission file for the linear strategy: predict the test set, then scale
# each pCTR by base bid 40 relative to the average training CTR.
avgCTR = train_set['click'].sum() / len(train_set)
test_p = KFold_model(train_x, train_y, test_x)
bids = [p * 40 / avgCTR for p in test_p]

submission = pd.DataFrame({
    'bidid': test_set['bidid'],
    'bidprice': bids,
})
submission.to_csv('Group 20.csv', index=False)

[0]	train-rmse:0.355319	valid-rmse:0.355311
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.037607	valid-rmse:0.03742
Stopping. Best iteration:
[28]	train-rmse:0.037636	valid-rmse:0.037413

The validation set score is :	0.037413470447063446

[0]	train-rmse:0.355302	valid-rmse:0.355346
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.037266	valid-rmse:0.038774
Stopping. Best iteration:
[23]	train-rmse:0.037302	valid-rmse:0.038757

The validation set score is :	0.038757454603910446

[0]	train-rmse:0.355337	valid-rmse:0.355277
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.037936	valid-rmse:0.036019
Stopping. Best iteration:
[40]	train-rmse:0.037949	

In [19]:
# Keep the test-set pCTR values next to their bid ids so that other bidding
# strategies can reuse them without retraining.
saveCTR = pd.DataFrame({
    'bidid': test_set['bidid'],
    'pCTR': test_p,
})
saveCTR.to_csv('xgb_pCTR.csv', index=False)

## Non-linear (ORTB-style) bidding strategy on XGBoost pCTR predictions

In [None]:
# Validation-set pCTR for the non-linear strategy. This repeats the earlier
# `valid_p = KFold_model(train_x, train_y, valid_x)` call and retrains every fold.
# NOTE(review): presumably `pCTR = valid_p` would give the same values — confirm.
pCTR = KFold_model(train_x, train_y, valid_x)

In [42]:
# Sweep the constant `c` of the non-linear bidding function
#   bid = sqrt(c / lambda * pCTR + c^2) - c
# over the validation set and record clicks / CTR / spend / aCPM / aCPC for
# each value tried.
result_base_bid = []
result_clicks = []
result_CTR = []
result_spend = []
result_aCPM = []
result_aCPC = []

lambda_val = 7 * (10 ** (-6))

# Loop-invariant inputs, pulled out of pandas once: indexing the frame row by
# row inside the nested loop is far slower than iterating plain lists.
pctr_values = np.asarray(pCTR, dtype=float)
payprices = valid_set['payprice'].tolist()
clicked = (valid_set['click'].astype(str) == '1').tolist()
assert len(pctr_values) == len(valid_set), "need one pCTR per validation row"

# Budget is in the same units as payprice; spend is divided by 1000 only
# after the replay.
budget = 6250 * 1000

for c in range(50, 100, 1):
    print(c)
    # Alternative bidding function, kept for reference:
    #bids = [(c * (((p + sqrt((c ** 2) * (lambda_val ** 2) + p ** 2)) / (c * lambda_val)) ** (1 / 3) 
    #              - ((c * lambda_val) / (p + sqrt((c ** 2) * (lambda_val ** 2) + p ** 2))) ** (1 / 3))) for p in pCTR]
    bids = (np.sqrt((c / lambda_val * pctr_values) + (c ** 2)) - c).tolist()

    clicks = 0
    winning_impressions = 0
    spend = 0

    # Replay the auctions in log order; the remaining budget caps each bid.
    for wanted_bid, payprice, was_clicked in zip(bids, payprices, clicked):
        bid = min(wanted_bid, budget - spend)
        if bid >= payprice:
            spend += payprice
            winning_impressions += 1
            if was_clicked:
                clicks += 1
    spend /= 1000

    # Guard each ratio on its own denominator: with no won impressions CTR
    # used to raise ZeroDivisionError, and aCPM was reported as 0 whenever
    # impressions were won without any click.
    aCPM = spend / winning_impressions * 1000 if winning_impressions else 0
    aCPC = spend / clicks if clicks else 0
    CTR = clicks / winning_impressions if winning_impressions else 0

    result_base_bid.append(c)
    result_clicks.append(clicks)
    result_CTR.append(CTR)
    result_spend.append(spend)
    result_aCPM.append(aCPM)
    result_aCPC.append(aCPC)

    clear_output()
    print(clicks)

106


In [60]:
# Persist the sweep over `c`: one row per value tried.
submission = pd.DataFrame({
    'c': result_base_bid,
    'Clicks': result_clicks,
    'CTR': result_CTR,
    'Spend': result_spend,
    'aCPM': result_aCPM,
    'aCPC': result_aCPC,
})
submission.to_csv('xgboost_ortb1_kukuku.csv', index=False)

In [61]:
# submission file generator (non-linear strategy)
# Reload the test-set pCTR values from the path the earlier
# `saveCTR.to_csv('xgb_pCTR.csv', ...)` cell writes to. The previous
# "xgb/xgb_pCTR.csv" path only worked if the file had been moved by hand.
pCTR_file = pd.read_csv("xgb_pCTR.csv")
pCTR = np.array(pCTR_file['pCTR'])

lambda_val = 7 * (10 ** (-6))
c = 120

# Vectorised form of  sqrt(c / lambda * p + c^2) - c  for every pCTR value.
bids = np.sqrt((c / lambda_val * pCTR) + (c ** 2)) - c

# Rows of the pCTR file are assumed to be in test_set order, as they were
# when the file was written.
submission = pd.DataFrame({'bidid': test_set['bidid'], 'bidprice': bids})
submission.to_csv('Group 20.csv', index=False)