In [1]:
import sys
import csv
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
import numpy as np
from math import sqrt

#from IPython.display import clear_output

In [2]:
# Load the three raw splits from the data/ directory.
train_set, valid_set, test_set = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/validation.csv", "data/test.csv")
)

# Print (rows, columns) of each split as a quick sanity check.
for raw_split in (train_set, valid_set, test_set):
    print(raw_split.shape)

(2430981, 25)
(303925, 25)
(303375, 22)


In [3]:
# downsample train set, too big
def downsampling(data, ratio=50, random_state=615):
    """Downsample the non-click rows while keeping every click row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``click`` column holding 0 / 1.
    ratio : int, optional
        Number of non-click rows to keep per click row (default 50).
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the subset is reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled non-click rows followed by all click rows; the original
        index labels are preserved.
    """
    no_click = data.query('click == 0')
    do_click = data.query('click == 1')
    # Sampling without replacement cannot return more rows than exist, so cap
    # the request; on small or click-heavy data all non-click rows are kept.
    nums = min(len(do_click) * ratio, len(no_click))
    new_no_click = no_click.sample(n=nums, random_state=random_state)
    return pd.concat([new_no_click, do_click])

In [4]:
# usertag is currently kept and multi-hot encoded; whether to drop it for problem 3 is still open
# slotprice is treated as a continuous variable (it is not one-hot encoded)

def data_preprocessing(data, enforce_cols = None):
    """Encode a raw bid-log frame into a feature frame with sorted columns.

    The identifier-like columns are dropped, each categorical column is
    replaced by indicator columns (including one for missing values), the
    comma-separated ``usertag`` column is expanded into ``usertag_<tag>``
    indicator columns, remaining missing values are filled with the string
    "unknown", and ``pd.get_dummies`` is applied to the whole frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; it is not modified, a new frame is returned.
    enforce_cols : sequence of column names, optional
        Reference columns (e.g. those of the encoded training frame). When
        given, columns absent from it are removed and columns missing from
        the encoded frame are added with the constant value 0.

    Returns
    -------
    pandas.DataFrame
        Encoded frame whose columns are in sorted order.
    """
    # Identifier-like fields that are discarded ('usertag' is kept and encoded below).
    discarded = ['bidid', 'keypage', 'userid', 'url', 'urlid',
                 'IP', 'domain', 'slotid', 'creative']
    data = data.drop(discarded, axis=1)

    # Fields replaced by one indicator column per observed level, plus a NaN indicator.
    one_hot_fields = ['weekday', 'hour', 'useragent', 'region', 'city', 'adexchange', 'slotwidth',
                      'slotheight', 'slotvisibility', 'slotformat', 'advertiser']
    for field in one_hot_fields:
        indicators = pd.get_dummies(data[field], dummy_na=True)
        for level in indicators.keys():
            data[field + '_' + str(level)] = indicators[level]
        data = data.drop(field, axis = 1)

    # "a,b,c" -> "a|b|c" so that str.get_dummies can expand it into one column per tag.
    tag_flags = (
        data['usertag']
        .str.split(',')
        .str.join('|')
        .str.get_dummies()
        .add_prefix('usertag_')
    )
    data = data.join(tag_flags).drop(['usertag'], axis=1)

    data.fillna("unknown", inplace=True)
    data = pd.get_dummies(data)

    if enforce_cols is not None:
        # Align with the reference layout: remove extras, add missing columns as zeros.
        surplus = np.setdiff1d(data.columns, enforce_cols)
        absent = np.setdiff1d(enforce_cols, data.columns)
        data = data.drop(surplus, axis=1).assign(**dict.fromkeys(absent, 0))

    return data.reindex(sorted(data.columns), axis=1)

In [5]:
# Fix the feature layout on the downsampled training data, then force the
# validation and test frames onto exactly the same columns.
train = data_preprocessing(downsampling(train_set))
valid, test = (
    data_preprocessing(raw_split, train.columns)
    for raw_split in (valid_set, test_set)
)

for encoded_split in (train, valid, test):
    print(encoded_split.shape)

(91443, 611)
(303925, 611)
(303375, 611)


In [6]:
# bidprice / payprice are removed from the feature frames; the auction
# simulations below read payprice straight from valid_set instead.
to_drop_columns = ['bidprice', 'payprice']
train = train.drop(columns=to_drop_columns)
valid = valid.drop(columns=to_drop_columns)
# The test frame additionally loses its 'click' column.
test = test.drop(columns=to_drop_columns + ['click'])

for encoded_split in (train, valid, test):
    print(encoded_split.shape)

(91443, 609)
(303925, 609)
(303375, 608)


In [21]:
# Preview the first rows of the encoded training frame.
train.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,browser_chrome,browser_firefox,browser_ie,browser_maxthon,...,usertag_13874,usertag_14273,usertag_15398,usertag_16593,usertag_16617,usertag_16661,usertag_16706,usertag_16751,usertag_16753,weekday
0,0,1,0,0,0,3427,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5
1,1,0,0,0,0,2821,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,3427,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
3,1,0,0,0,0,1458,0,0,1,0,...,0,0,0,0,0,0,0,0,0,6
4,0,1,0,0,0,2259,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5


In [10]:
# Preview the first rows of the encoded validation frame.
valid.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,city,click,hour,region,...,useragent_windows_chrome,useragent_windows_firefox,useragent_windows_ie,useragent_windows_maxthon,useragent_windows_opera,useragent_windows_other,useragent_windows_safari,useragent_windows_sogou,useragent_windows_theworld,weekday
0,1,0,0,0,0,1458,79,0,20,79,...,0,0,1,0,0,0,0,0,0,4
1,1,0,0,0,0,3476,79,0,21,79,...,1,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,3358,2,0,8,2,...,0,0,1,0,0,0,0,0,0,4
3,0,1,0,0,0,3358,205,0,15,201,...,1,0,0,0,0,0,0,0,0,5
4,0,1,0,0,0,3476,135,0,18,134,...,1,0,0,0,0,0,0,0,0,1


In [11]:
# Preview the first rows of the encoded test frame.
test.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,city,hour,region,slotformat_0,...,useragent_windows_chrome,useragent_windows_firefox,useragent_windows_ie,useragent_windows_maxthon,useragent_windows_opera,useragent_windows_other,useragent_windows_safari,useragent_windows_sogou,useragent_windows_theworld,weekday
0,0,0,1,0,0,3427,159,12,146,1,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,2997,1,14,1,0,...,0,0,0,0,0,0,0,0,0,3
2,1,0,0,0,0,1458,34,19,27,0,...,0,0,1,0,0,0,0,0,0,5
3,0,0,0,1,0,2821,245,21,238,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,3386,35,20,27,1,...,0,0,1,0,0,0,0,0,0,2


In [7]:
# Separate the 'click' label from the feature columns of each labelled split.
train_y, train_x = train['click'], train.drop(columns='click')
valid_y, valid_x = valid['click'], valid.drop(columns='click')
# The test frame is used as-is (test_x is the same object, not a copy).
test_x = test

In [8]:
def KFold_model(trainX, trainY, test, n_splits=5, num_boost_round=500,
                early_stopping_rounds=50):
    """Train one XGBoost model per CV fold and average their predictions on ``test``.

    The training data is split with a shuffled ``KFold`` (seed 615). For each
    fold a booster is trained on the in-fold rows, with the held-out rows as
    the second watchlist entry ('valid'), and then used to predict ``test``.

    Parameters
    ----------
    trainX : array-like
        Training features, one row per sample.
    trainY : array-like
        Training labels aligned with ``trainX``.
    test : array-like
        Feature rows to predict; must have the same columns as ``trainX``.
    n_splits : int, optional
        Number of folds, and therefore of models averaged (default 5).
    num_boost_round : int, optional
        Maximum boosting rounds per fold (default 500).
    early_stopping_rounds : int, optional
        Early-stopping patience passed to ``xgb.train`` (default 50).

    Returns
    -------
    list of float
        For each row of ``test``, the mean of the fold models' predictions.
    """
    trainX = np.asarray(trainX)
    trainY = np.asarray(trainY)
    test = np.asarray(test)

    # The hyper-parameters are the same for every fold, so define them once.
    xgb_params = {'eta': 0.013, #0.01
                  'max_depth': 6,
                  'subsample': 1.0,
                  'colsample_bytree': 1.0,
                  'objective': 'reg:logistic',
                  'eval_metric': 'rmse',
                  'seed': 99,
                  'silent': True}

    kf = KFold(n_splits = n_splits, random_state = 615, shuffle = True)

    # The test matrix does not depend on the fold either; build it a single time.
    d_test = xgb.DMatrix(test)

    fold_preds = []
    for train_index, holdout_index in kf.split(trainX):
        d_train = xgb.DMatrix(trainX[train_index], trainY[train_index])
        d_valid = xgb.DMatrix(trainX[holdout_index], trainY[holdout_index])

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(xgb_params, d_train, num_boost_round, watchlist, maximize=False,
                          verbose_eval=50, early_stopping_rounds=early_stopping_rounds)

        fold_preds.append(model.predict(d_test))

    # Average across folds in float64; one vectorized reduction replaces the
    # former per-row / per-fold Python loops.
    return np.mean(np.asarray(fold_preds, dtype=np.float64), axis=0).tolist()

In [9]:
# Predicted click probability for every validation row (mean over the CV fold models).
valid_p = KFold_model(train_x, train_y, valid_x)

[0]	train-rmse:0.493858	valid-rmse:0.493857
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.27561	valid-rmse:0.275586
[100]	train-rmse:0.174001	valid-rmse:0.173978
[150]	train-rmse:0.133114	valid-rmse:0.13325
[200]	train-rmse:0.119196	valid-rmse:0.119364
[250]	train-rmse:0.11461	valid-rmse:0.114883
[300]	train-rmse:0.112766	valid-rmse:0.113272
[350]	train-rmse:0.111646	valid-rmse:0.112521
[400]	train-rmse:0.111096	valid-rmse:0.112256
[450]	train-rmse:0.110545	valid-rmse:0.112066
[499]	train-rmse:0.109988	valid-rmse:0.111813
[0]	train-rmse:0.493848	valid-rmse:0.493852
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.275386	valid-rmse:0.27571
[100]	train-rmse:0.173966	valid-rmse:0.174828
[150]	train-rmse:0.13316	valid-rmse:0.134584
[200]	train-rmse:0.119001	valid

Linear XGB prediction

In [19]:
# Linear bidding strategy: bid = base_bid * pCTR / avgCTR.
# Sweep base_bid and replay the validation auctions under a fixed budget.
pCTR = valid_p
avgCTR = train_set['click'].mean()

# Pull the needed columns out once as arrays; per-row Series lookups inside
# the inner loop are very slow on a frame of this size.
pctr_arr = np.asarray(pCTR, dtype=float)
if len(pctr_arr) != len(valid_set):
    raise ValueError("pCTR must hold exactly one prediction per validation row")
payprices = valid_set['payprice'].values
clicked = (valid_set['click'].astype(str) == '1').values
budget = 6250 * 1000

result_base_bid = []
result_clicks = []
result_CTR = []
result_spend = []
result_aCPM = []
result_aCPC = []

for base_bid in np.arange(3.4, 3.6, 0.01):
    print(base_bid)
    bids = pctr_arr * base_bid / avgCTR

    clicks = 0
    winning_impressions = 0
    spend = 0

    for wanted_bid, payprice, was_clicked in zip(bids, payprices, clicked):
        # Never bid more than what is left of the budget.
        bid = min(wanted_bid, budget - spend)
        if bid >= payprice:
            spend += payprice
            winning_impressions += 1
            if was_clicked:
                clicks += 1
    spend /= 1000

    # aCPM and CTR are per won impression, aCPC is per click: guard each
    # metric on its own denominator (a run that wins impressions but no
    # clicks still has a meaningful aCPM, and a run that wins nothing must
    # not divide by zero).
    if winning_impressions:
        aCPM = spend / winning_impressions * 1000
        CTR = clicks / winning_impressions
    else:
        aCPM = 0
        CTR = 0
    aCPC = spend / clicks if clicks else 0

    result_base_bid.append(base_bid)
    result_clicks.append(clicks)
    result_CTR.append(CTR)
    result_spend.append(spend)
    result_aCPM.append(aCPM)
    result_aCPC.append(aCPC)

    print(clicks, spend)
    

3.4
119 6008.611
3.4099999999999997
119 6033.139
3.4199999999999995
120 6053.924
3.4299999999999993
120 6072.44
3.439999999999999
120 6091.55
3.449999999999999
120 6111.62
3.4599999999999986
120 6129.957
3.4699999999999984
120 6151.443
3.479999999999998
120 6170.789
3.489999999999998
120 6191.399
3.499999999999998
120 6211.159
3.5099999999999976
120 6232.024
3.5199999999999974
120 6249.999
3.529999999999997
120 6250.0
3.539999999999997
120 6249.999
3.5499999999999967
119 6250.0
3.5599999999999965
119 6250.0
3.5699999999999963
118 6250.0
3.579999999999996
118 6250.0
3.589999999999996
117 6250.0
3.5999999999999956
117 6250.0


In [20]:
# Write the base-bid sweep to CSV: one row per candidate base bid with its metrics.
submission = pd.DataFrame({'base_bid': result_base_bid, 'Clicks': result_clicks, 'CTR': result_CTR,
                           'Spend': result_spend, 'aCPM': result_aCPM, 'aCPC': result_aCPC})
submission.to_csv('xgboost_linear.csv', index=False)

In [13]:
# submission file generator
# Mean click rate over the full (not downsampled) raw training set.
avgCTR = sum(train_set['click']) / len(train_set)
# Retrains the fold models and predicts click probability for the test rows.
test_p = KFold_model(train_x, train_y, test_x)
#test_p = pCTR
# Linear bid with base bid 3.48, further scaled by 1.34.
# NOTE(review): the 1.34 factor is not derived anywhere in this notebook — confirm its origin.
bids = [p * 3.48 / avgCTR * 1.34 for p in test_p]

# One bid price per test bidid, in the order of test_set.
submission = pd.DataFrame({'bidid': test_set['bidid'], 'bidprice': bids})
submission.to_csv('Group 20 xgb l.csv', index=False)

In [19]:
# Persist the test-set pCTR predictions next to their bidids for later reuse.
saveCTR = pd.DataFrame({'bidid': test_set['bidid'], 'pCTR': test_p})
saveCTR.to_csv('xgb_pCTR.csv', index=False)

Non-linear XGB

In [106]:
# Validation-set pCTR for the non-linear sweep (same inputs as the valid_p call earlier).
pCTR = KFold_model(train_x, train_y, valid_x)

[0]	train-rmse:0.493858	valid-rmse:0.493857
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.27561	valid-rmse:0.275586
[100]	train-rmse:0.174001	valid-rmse:0.173978
[150]	train-rmse:0.133114	valid-rmse:0.13325
[200]	train-rmse:0.119196	valid-rmse:0.119364
[250]	train-rmse:0.11461	valid-rmse:0.114883
[300]	train-rmse:0.112766	valid-rmse:0.113272
[350]	train-rmse:0.111646	valid-rmse:0.112521
[400]	train-rmse:0.111096	valid-rmse:0.112256
[450]	train-rmse:0.110545	valid-rmse:0.112066
[499]	train-rmse:0.109988	valid-rmse:0.111813
[0]	train-rmse:0.493848	valid-rmse:0.493852
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.275386	valid-rmse:0.27571
[100]	train-rmse:0.173966	valid-rmse:0.174828
[150]	train-rmse:0.13316	valid-rmse:0.134584
[200]	train-rmse:0.119001	valid

In [37]:
# Persist the validation-set pCTR predictions next to their bidids.
saveCTR = pd.DataFrame({'bidid': valid_set['bidid'], 'pCTR': pCTR})
saveCTR.to_csv('xgb_pCTR_valid_best.csv', index=False)

In [35]:
# Non-linear bidding: sweep the constant c of
#     bid = sqrt(c / lambda * pCTR + c^2) - c
# and replay the validation auctions under a fixed budget.
result_base_bid = []
result_clicks = []
result_CTR = []
result_spend = []
result_aCPM = []
result_aCPC = []

lambda_val = 7 * (10 ** (-5))

# Pull the needed columns out once as arrays; per-row Series lookups inside
# the inner loop are very slow on a frame of this size.
pctr_arr = np.asarray(pCTR, dtype=float)
if len(pctr_arr) != len(valid_set):
    raise ValueError("pCTR must hold exactly one prediction per validation row")
payprices = valid_set['payprice'].values
clicked = (valid_set['click'].astype(str) == '1').values
budget = 6250 * 1000

for c in np.arange(70, 80, 1):
    print(c)
    # Alternative bid function, kept for reference:
    #bids = [(c * (((p + sqrt((c ** 2) * (lambda_val ** 2) + p ** 2)) / (c * lambda_val)) ** (1 / 3)
    #              - ((c * lambda_val) / (p + sqrt((c ** 2) * (lambda_val ** 2) + p ** 2))) ** (1 / 3))) for p in pCTR]
    bids = np.sqrt((c / lambda_val * pctr_arr) + (c ** 2)) - c

    clicks = 0
    winning_impressions = 0
    spend = 0

    for wanted_bid, payprice, was_clicked in zip(bids, payprices, clicked):
        # Never bid more than what is left of the budget.
        bid = min(wanted_bid, budget - spend)
        if bid >= payprice:
            spend += payprice
            winning_impressions += 1
            if was_clicked:
                clicks += 1
    spend /= 1000

    # aCPM and CTR are per won impression, aCPC is per click: guard each
    # metric on its own denominator.
    if winning_impressions:
        aCPM = spend / winning_impressions * 1000
        CTR = clicks / winning_impressions
    else:
        aCPM = 0
        CTR = 0
    aCPC = spend / clicks if clicks else 0

    result_base_bid.append(c)
    result_clicks.append(clicks)
    result_CTR.append(CTR)
    result_spend.append(spend)
    result_aCPM.append(aCPM)
    result_aCPC.append(aCPC)

    # clear_output() was called here, but its import is commented out in the
    # imports cell, so on a fresh kernel it raised NameError. Progress is now
    # simply printed for every c, as in the linear sweep.
    print(clicks, spend)

117 6250.0


In [36]:
# Write the c sweep to CSV: one row per candidate c with its metrics.
submission = pd.DataFrame({'c': result_base_bid, 'Clicks': result_clicks, 'CTR': result_CTR,
                           'Spend': result_spend, 'aCPM': result_aCPM, 'aCPC': result_aCPC})
submission.to_csv('xgboost_ortb1_best.csv', index=False)

In [12]:
# submission file generator
# NOTE(review): this cell bids on the *validation* rows (valid_p / valid_set),
# not on the test set — confirm that is intended for a "submission" file.
c = 74
lambda_val = 7 * (10 ** (-5))
# Non-linear bid: sqrt(c / lambda * pCTR + c^2) - c, one bid per validation row.
bids = [sqrt((c / lambda_val * p) + (c ** 2)) - c for p in valid_p]

submission = pd.DataFrame({'bidid': valid_set['bidid'], 'bidprice': bids})
submission.to_csv('xgb ortb1.csv', index=False)

In [38]:
# Rebinds pCTR to predictions for the *test* rows (it previously held validation predictions).
pCTR = KFold_model(train_x, train_y, test_x)

[0]	train-rmse:0.493858	valid-rmse:0.493857
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.27561	valid-rmse:0.275586
[100]	train-rmse:0.174001	valid-rmse:0.173978
[150]	train-rmse:0.133114	valid-rmse:0.13325
[200]	train-rmse:0.119196	valid-rmse:0.119364
[250]	train-rmse:0.11461	valid-rmse:0.114883
[300]	train-rmse:0.112766	valid-rmse:0.113272
[350]	train-rmse:0.111646	valid-rmse:0.112521
[400]	train-rmse:0.111096	valid-rmse:0.112256
[450]	train-rmse:0.110545	valid-rmse:0.112066
[499]	train-rmse:0.109988	valid-rmse:0.111813
[0]	train-rmse:0.493848	valid-rmse:0.493852
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[50]	train-rmse:0.275386	valid-rmse:0.27571
[100]	train-rmse:0.173966	valid-rmse:0.174828
[150]	train-rmse:0.13316	valid-rmse:0.134584
[200]	train-rmse:0.119001	valid

In [105]:
# Keep a second name bound to the current pCTR list (an alias, not a copy).
temp = pCTR

In [100]:
# submission file generator
#pCTR_file = pd.read_csv("xgb/xgb_pCTR.csv")
#pCTR = np.array(pCTR_file['pCTR'])
import random
random.seed(615)

lambda_val = 7 * (10 ** (-5))
c = 74

# Score every row with the non-linear bid function, then turn the score into
# an all-or-nothing bid: rows scoring above 175 get the huge constant below,
# every other row gets 0.
ortb_scores = (sqrt((c / lambda_val * p) + (c ** 2)) - c for p in pCTR)
bids = [
    999999999999999999999999999999999999999999999999999 if score > 175 else 0
    for score in ortb_scores
]
# Disabled variant that randomly kept about 1 in 20 rows:
#    if random.randint(1, 20) == 7:
#        win = 1
#    else:
#        win = 0
#    bids[i] = bids[i] * win * 20
print(sum(bids))
submission = pd.DataFrame({'bidid': test_set['bidid'], 'bidprice': bids})
submission.to_csv('Group 20 xgb cheet.csv', index=False)

9302999999999999999999999999999999999999999999999990697


In [142]:
# eval
# Re-run the linear base-bid sweep on the validation set, keeping only what
# the plot needs (clicks and aCPC).
avgCTR = train_set['click'].mean()

# Use valid_p explicitly: when the notebook runs top to bottom, pCTR was last
# bound to *test* predictions, which do not line up with valid_set.
pctr_arr = np.asarray(valid_p, dtype=float)
if len(pctr_arr) != len(valid_set):
    raise ValueError("valid_p must hold exactly one prediction per validation row")
payprices = valid_set['payprice'].values
clicked = (valid_set['click'].astype(str) == '1').values
budget = 6250 * 1000

result_base_bid = []
result_clicks = []
result_aCPC = []

for base_bid in np.arange(3.4, 3.6, 0.01):
    print(base_bid)
    bids = pctr_arr * base_bid / avgCTR

    clicks = 0
    spend = 0

    for wanted_bid, payprice, was_clicked in zip(bids, payprices, clicked):
        # Never bid more than what is left of the budget.
        bid = min(wanted_bid, budget - spend)
        if bid >= payprice:
            spend += payprice
            if was_clicked:
                clicks += 1
    spend /= 1000

    # Average cost per click; reported as 0 when no click was won.
    aCPC = spend / clicks if clicks else 0

    result_base_bid.append(base_bid)
    result_clicks.append(clicks)
    result_aCPC.append(aCPC)

3.4
3.4099999999999997
3.4199999999999995
3.4299999999999993
3.439999999999999
3.449999999999999
3.4599999999999986
3.4699999999999984
3.479999999999998
3.489999999999998
3.499999999999998
3.5099999999999976
3.5199999999999974
3.529999999999997
3.539999999999997
3.5499999999999967
3.5599999999999965
3.5699999999999963
3.579999999999996
3.589999999999996
3.5999999999999956


In [149]:
import matplotlib.pyplot as plt

# Build plot-only series instead of rewriting the sweep results in place;
# the in-place version doubled result_aCPC again every time the cell was re-run.
plot_base_bid = [round(base_bid, 2) for base_bid in result_base_bid]
# aCPC is doubled only so that it falls into the same y-range as the click counts.
plot_aCPC_x2 = [aCPC * 2 for aCPC in result_aCPC]

plt.clf()
plt.plot(plot_base_bid, result_clicks, 'r-', label="clicks")
plt.plot(plot_base_bid, plot_aCPC_x2, 'b-', label="CPC * 2")
plt.axis([3.39, 3.61, 100, 125])
plt.legend(bbox_to_anchor=(1.05, 1), loc=1, borderaxespad=0.)
plt.ylabel('CPC * 2, Clicks')
plt.xlabel('Base_bid')
plt.title('XGboost linear: CPC * 2, Clicks vs Base_bid')
#plt.show()
plt.savefig('xgb linear.png')