In [18]:
import sys
import csv
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import numpy as np
from math import sqrt
from sklearn.model_selection import KFold

#from IPython.display import clear_output

In [2]:
# load the three raw splits from disk
csv_paths = ("data/train.csv", "data/validation.csv", "data/test.csv")
train_set, valid_set, test_set = (pd.read_csv(path) for path in csv_paths)

# report (rows, columns) of every split
for split in (train_set, valid_set, test_set):
    print(split.shape)

(2430981, 25)
(303925, 25)
(303375, 22)


In [5]:
# downsample train set, too big
def downsampling(data, ratio=700, random_state=615):
    """Shrink the non-click class while keeping every click row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``click`` column holding 0 / 1.
    ratio : int, optional
        Number of non-click rows to keep per click row (default 700).
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` (default 615).

    Returns
    -------
    pandas.DataFrame
        The sampled non-click rows followed by all click rows; the original
        index labels are kept.
    """
    no_click = data.query('click == 0')
    do_click = data.query('click == 1')
    # DataFrame.sample raises when n exceeds the number of available rows,
    # so never ask for more negatives than exist.
    nums = min(len(do_click) * ratio, len(no_click))
    new_no_click = no_click.sample(n=nums, random_state=random_state)
    return pd.concat([new_no_click, do_click])

In [4]:
# usertag is dropped for now (problem 3); whether to encode it still needs more thought
# slotprice is treated as a continuous variable

def data_preprocessing(data, enforce_cols = None):
    """One-hot encode the categorical columns and align the column set.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; it must contain the identifier columns dropped below and
        the categorical columns encoded below.
    enforce_cols : iterable of str, optional
        Column names the result must have (the columns of the processed
        training frame). Columns that are not listed are removed; listed
        columns that are missing are added and filled with 0.

    Returns
    -------
    pandas.DataFrame
        A new frame whose columns are in sorted order. ``data`` itself is not
        modified.
    """
    # drop features
    to_drop_columns = ['bidid', 'keypage', 'userid', 'url', 'urlid',
                       'IP', 'domain', 'slotid', 'creative', 'usertag']
    data = data.drop(to_drop_columns, axis=1)

    # one hot encoding categorical variables
    categoricals = ['weekday', 'hour', 'useragent', 'region', 'city', 'adexchange', 'slotwidth',
                    'slotheight', 'slotvisibility', 'slotformat', 'advertiser']

    # Build every indicator block first and attach them with a single concat:
    # inserting hundreds of columns one at a time fragments the frame.
    encoded = []
    for tag in categoricals:
        d = pd.get_dummies(data[tag], dummy_na=True)
        # same naming as before: "<column>_<level>", with "<column>_nan" for missing
        d.columns = [tag + '_' + str(k) for k in d.columns]
        encoded.append(d)
    data = pd.concat([data.drop(categoricals, axis=1)] + encoded, axis=1)

    # match test set and training set columns
    if enforce_cols is not None:
        # enforce_cols is the columns of train set, to_drop and to_add finds the difference
        to_drop = np.setdiff1d(data.columns, enforce_cols)
        to_add = np.setdiff1d(enforce_cols, data.columns)
        data = data.drop(to_drop, axis=1)
        data = data.assign(**{c: 0 for c in to_add})

    # deterministic column order so that separately processed frames line up
    data = data.reindex(sorted(data.columns), axis=1)

    return data

In [6]:
# downsample and encode the training data, then align the other splits to its columns
train = data_preprocessing(downsampling(train_set))
valid, test = (data_preprocessing(raw, train.columns) for raw in (valid_set, test_set))

for frame in (train, valid, test):
    print(frame.shape)

(1256893, 551)
(303925, 551)
(303375, 551)


In [7]:
# remove the price columns from every split; test additionally loses its click column
to_drop_columns = ['bidprice', 'payprice']
train, valid = (frame.drop(to_drop_columns, axis = 1) for frame in (train, valid))
test = test.drop(to_drop_columns + ['click'], axis = 1)

for frame in (train, valid, test):
    print(frame.shape)

(1256893, 549)
(303925, 549)
(303375, 548)


In [8]:
# peek at the first rows of the processed training frame
train.head(n=5)

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_nan,advertiser_1458.0,advertiser_2259.0,advertiser_2261.0,advertiser_2821.0,advertiser_2997.0,...,useragent_windows_sogou,useragent_windows_theworld,weekday_0.0,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,weekday_6.0,weekday_nan
25572,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2193497,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1880322,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1044423,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1083144,0,0,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [9]:
# peek at the first rows of the processed validation frame
valid.head(n=5)

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_nan,advertiser_1458.0,advertiser_2259.0,advertiser_2261.0,advertiser_2821.0,advertiser_2997.0,...,useragent_windows_sogou,useragent_windows_theworld,weekday_0.0,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,weekday_6.0,weekday_nan
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
# peek at the first rows of the processed test frame
test.head(n=5)

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_nan,advertiser_1458.0,advertiser_2259.0,advertiser_2261.0,advertiser_2821.0,advertiser_2997.0,...,useragent_windows_sogou,useragent_windows_theworld,weekday_0.0,weekday_1.0,weekday_2.0,weekday_3.0,weekday_4.0,weekday_5.0,weekday_6.0,weekday_nan
0,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [11]:
# separate the click label from the feature columns; test has no label column left
train_y, train_x = train['click'], train.drop('click', axis = 1)
valid_y, valid_x = valid['click'], valid.drop('click', axis = 1)
test_x = test

In [16]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

def KFold_model(trainX, trainY, test, n_splits=5, num_boost_round=500,
                early_stopping_rounds=50):
    """Train one xgboost model per CV fold and average their predictions.

    Parameters
    ----------
    trainX : array-like, shape (n_samples, n_features)
        Training features.
    trainY : array-like, shape (n_samples,)
        Training targets.
    test : array-like, shape (n_test, n_features)
        Rows to predict.
    n_splits : int, optional
        Number of shuffled folds (default 5).
    num_boost_round : int, optional
        Upper bound on boosting rounds per fold (default 500).
    early_stopping_rounds : int, optional
        Early-stopping patience handed to ``xgb.train`` (default 50).

    Returns
    -------
    list of float
        For every row of ``test``, the mean of the fold models' predictions.
    """
    trainX = np.asarray(trainX)
    trainY = np.asarray(trainY)
    test = np.asarray(test)

    # NOTE(review): 'silent' and ntree_limit / best_ntree_limit belong to the
    # older xgboost API this notebook was written against; newer releases use
    # 'verbosity' and iteration_range -- confirm the installed version.
    xgb_params = {'eta': 0.3, 'max_depth': 3, 'subsample': 1.0
                  , 'colsample_bytree': 1.0, 'objective': 'reg:logistic'
                  , 'eval_metric': 'rmse', 'seed': 99, 'silent': True}

    kfold = KFold(n_splits=n_splits, random_state=7, shuffle=True)

    # the rows to predict are the same for every fold, so wrap them once
    d_test = xgb.DMatrix(test)

    fold_preds = []
    val_scores = []

    for train_index, val_index in kfold.split(trainX):
        valid_y = trainY[val_index]
        d_train = xgb.DMatrix(trainX[train_index], trainY[train_index])
        d_valid = xgb.DMatrix(trainX[val_index], valid_y)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        model = xgb.train(xgb_params, d_train, num_boost_round, watchlist, maximize=False,
                          verbose_eval=50, early_stopping_rounds=early_stopping_rounds)

        # score the held-out fold with the trees kept by early stopping
        valid_pred = model.predict(d_valid, ntree_limit=model.best_ntree_limit)
        val_score = sqrt(mean_squared_error(valid_y, valid_pred))
        print("The validation set score is :\t{}\n".format(val_score))
        val_scores.append(val_score)

        # use the same tree limit for the final predictions as for the score above
        fold_preds.append(model.predict(d_test, ntree_limit=model.best_ntree_limit))

    print(val_scores)

    # element-wise mean over the folds, returned as a plain list as before
    return np.mean(fold_preds, axis=0).tolist()

In [17]:
# out-of-fold ensemble prediction for the validation rows
valid_p = KFold_model(trainX=train_x, trainY=train_y, test=valid_x)

NameError: name 'KFold' is not defined

In [None]:
# Sweep the base bid of the linear strategy bid = base_bid * pCTR / avgCTR and
# replay the validation log under a fixed budget, recording the outcome per base bid.
avgCTR = train_set['click'].sum() / len(train_set)

# NOTE(review): pCTR was not defined anywhere in the notebook. The predictions
# for the validation rows (valid_p) are the only candidate -- confirm that no
# re-calibration step was meant to sit in between.
pCTR = np.asarray(valid_p, dtype=float)
assert len(pCTR) == len(valid_set), "one prediction per validation row expected"

# Plain lists are walked positionally: far cheaper than a Series lookup per
# row, and independent of valid_set's index labels.
pay_prices = valid_set['payprice'].values.tolist()
click_flags = valid_set['click'].values.tolist()

budget = 6250 * 1000

result_base_bid = []
result_clicks = []
result_CTR = []
result_spend = []
result_aCPM = []
result_aCPC = []

for base_bid in range(20, 80, 1):
    # progress indicator (the former clear_output() call was dropped: its
    # import is commented out, so it raised NameError after the first pass)
    print(base_bid)
    bids = (pCTR * base_bid / avgCTR).tolist()

    clicks = 0
    winning_impressions = 0
    spend = 0

    for bid, pay_price, clicked in zip(bids, pay_prices, click_flags):
        # never bid more than what is left of the budget
        bid = min(bid, budget - spend)
        if bid >= pay_price:
            spend += pay_price
            winning_impressions += 1
            if clicked == 1:
                clicks += 1
    # NOTE(review): presumably converts the summed pay prices to the reporting
    # unit -- confirm against the data description
    spend /= 1000

    # each ratio is guarded by its own denominator, so aCPM is still reported
    # when impressions were won without clicks and CTR cannot divide by zero
    aCPM = spend / winning_impressions * 1000 if winning_impressions else 0
    aCPC = spend / clicks if clicks else 0
    CTR = clicks / winning_impressions if winning_impressions else 0

    result_base_bid.append(base_bid)
    result_clicks.append(clicks)
    result_CTR.append(CTR)
    result_spend.append(spend)
    result_aCPM.append(aCPM)
    result_aCPC.append(aCPC)