In [1]:
import sys
import csv
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import numpy as np
from math import sqrt

from IPython.display import clear_output

In [2]:
# Load the train / validation / test splits from their CSV files
train_set = pd.read_csv("data/train.csv")
valid_set = pd.read_csv("data/validation.csv")
test_set = pd.read_csv("data/test.csv")

# Show (rows, columns) of every split, in train / validation / test order
for split in (train_set, valid_set, test_set):
    print(split.shape)

(2430981, 25)
(303925, 25)
(303375, 22)


In [3]:
# downsample train set, too big
def downsampling(data, ratio=700, random_state=615):
    """Downsample the non-click rows of ``data`` while keeping every click row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``click`` column; rows with ``click == 0`` are sampled,
        rows with ``click == 1`` are all kept.
    ratio : int, optional
        Number of non-click rows to keep per click row (default 700).
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` (default 615).

    Returns
    -------
    pandas.DataFrame
        The sampled non-click rows followed by all click rows.
    """
    no_click = data.query('click == 0')
    do_click = data.query('click == 1')
    # sample() draws without replacement and raises ValueError when asked for
    # more rows than exist, so cap the request at the available population.
    nums = min(len(do_click) * ratio, len(no_click))
    new_no_click = no_click.sample(n=nums, random_state=random_state)
    return pd.concat([new_no_click, do_click])

In [4]:
# 'usertag' is currently kept and multi-hot encoded (whether to drop it for
# problem 3 is still to be considered).
# slotprice is considered as continuous variable

def data_preprocessing(data, enforce_cols=None):
    """Turn a raw bid-log frame into a numeric, one-hot encoded feature frame.

    Steps:
      1. Drop identifier-like columns (bidid, keypage, userid, url, urlid, IP,
         domain, slotid, creative).
      2. Split ``useragent`` on ``'_'`` into ``os`` and ``browser`` columns.
      3. Multi-hot encode the comma-separated ``usertag`` column into
         ``usertag_<tag>`` indicator columns.
      4. Replace missing values with the string ``"unknown"`` and one-hot encode
         every non-numeric column with ``pd.get_dummies``. A numeric column that
         contains NaN therefore turns into indicator columns
         (e.g. ``<col>_unknown``) instead of staying numeric.
      5. Optionally align the columns to ``enforce_cols``.
      6. Sort the columns by name.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; must contain the dropped columns plus ``useragent`` and
        ``usertag``. It is not modified.
    enforce_cols : iterable of str, optional
        Column names the result must have (typically the processed training
        columns). Columns not listed are dropped; listed columns that are
        missing are added and filled with 0.

    Returns
    -------
    pandas.DataFrame
        Encoded frame with columns in sorted order.

    Notes
    -----
    NOTE(review): because step 4 depends on whether a column happens to contain
    NaN, the same raw column can be encoded differently in different splits;
    ``enforce_cols`` then silently zero-fills the mismatch.
    """
    # drop features
    to_drop_columns = ['bidid', 'keypage', 'userid', 'url', 'urlid',
                       'IP', 'domain', 'slotid', 'creative']  # 'usertag' is kept
    data = data.drop(columns=to_drop_columns)

    # "<os>_<browser>" -> two separate categorical columns
    agent_parts = data['useragent'].str.split('_', expand=True)
    agent_parts = agent_parts.rename(columns={0: 'os', 1: 'browser'})

    # "tag1,tag2,..." -> one 0/1 indicator column per distinct tag
    tag_flags = data['usertag'].str.get_dummies(sep=',').add_prefix('usertag_')

    data = (
        data.join(agent_parts)
            .join(tag_flags)
            .drop(columns=['useragent', 'usertag'])
    )

    # Non-in-place fillna: putting a string into numeric columns upcasts them,
    # which the in-place variant only does with a deprecation warning on
    # recent pandas versions.
    data = data.fillna("unknown")
    data = pd.get_dummies(data)

    # match test set and training set columns
    if enforce_cols is not None:
        wanted = set(enforce_cols)
        present = set(data.columns)
        extra = [col for col in data.columns if col not in wanted]
        missing = [col for col in enforce_cols if col not in present]
        data = data.drop(columns=extra)
        data = data.assign(**{col: 0 for col in missing})

    return data.reindex(sorted(data.columns), axis=1)

In [6]:
# Optional (currently disabled) downsampling step:
#train = downsampling(train_set)

# Encode the training split first; its columns define the layout that the
# validation and test splits are aligned to.
train = data_preprocessing(train_set)
train_columns = train.columns
valid, test = (
    data_preprocessing(split, train_columns) for split in (valid_set, test_set)
)

for encoded in (train, valid, test):
    print(encoded.shape)

(2430981, 114)
(303925, 114)
(303375, 114)


In [7]:
# Remove the price columns from every split, and the 'click' column from the
# test split (it is dropped there as well, leaving features only).
# errors="ignore" keeps this cell re-runnable: train/valid/test are rebound
# here, so a second run would otherwise raise KeyError on the missing columns.
to_drop_columns = ['bidprice', 'payprice']
train = train.drop(columns=to_drop_columns, errors="ignore")
valid = valid.drop(columns=to_drop_columns, errors="ignore")
test = test.drop(columns=to_drop_columns + ['click'], errors="ignore")

print(train.shape)
print(valid.shape)
print(test.shape)

(2430981, 112)
(303925, 112)
(303375, 111)


In [16]:
# Preview the first rows of the encoded training frame
train.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,browser_chrome,browser_firefox,browser_ie,browser_maxthon,...,usertag_13874,usertag_14273,usertag_15398,usertag_16593,usertag_16617,usertag_16661,usertag_16706,usertag_16751,usertag_16753,weekday
0,0,1,0,0,0,3427,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5
1,1,0,0,0,0,2821,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,3427,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
3,1,0,0,0,0,1458,0,0,1,0,...,0,0,0,0,0,0,0,0,0,6
4,0,1,0,0,0,2259,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5


In [9]:
# Preview the first rows of the encoded validation frame
valid.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,browser_chrome,browser_firefox,browser_ie,browser_maxthon,...,usertag_13874,usertag_14273,usertag_15398,usertag_16593,usertag_16617,usertag_16661,usertag_16706,usertag_16751,usertag_16753,weekday
0,1,0,0,0,0,1458,0,0,1,0,...,0,0,0,0,0,0,0,0,0,4
1,1,0,0,0,0,3476,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,0,0,0,3358,0,0,1,0,...,0,0,0,0,0,0,0,0,0,4
3,0,1,0,0,0,3358,1,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,0,1,0,0,0,3476,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Preview the first rows of the encoded test frame
test.head()

Unnamed: 0,adexchange_1.0,adexchange_2.0,adexchange_3.0,adexchange_4.0,adexchange_unknown,advertiser,browser_chrome,browser_firefox,browser_ie,browser_maxthon,...,usertag_13874,usertag_14273,usertag_15398,usertag_16593,usertag_16617,usertag_16661,usertag_16706,usertag_16751,usertag_16753,weekday
0,0,0,1,0,0,3427,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,2997,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2,1,0,0,0,0,1458,0,0,1,0,...,0,0,0,0,0,0,0,0,0,5
3,0,0,0,1,0,2821,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,3386,0,0,1,0,...,0,0,0,0,0,0,0,0,0,2


In [11]:
# Separate the 'click' label from the feature columns of the labelled splits
train_x, train_y = train.drop(columns='click'), train['click']
valid_x, valid_y = valid.drop(columns='click'), valid['click']
# The test frame is used as the feature matrix unchanged
test_x = test

In [23]:
# Fit five logistic regressions and remember the one with the lowest
# validation RMSE, together with its validation click probabilities.
error, best_model, pCTR = sys.maxsize, None, None
for attempt in range(5):
    # default L2 penalty
    model = linear_model.LogisticRegression()
    model.fit(train_x, train_y)
    # column 1 of predict_proba: probability of click = 1
    valid_p = model.predict_proba(valid_x)[:, 1]
    rms = sqrt(mean_squared_error(valid_y, valid_p))
    if not rms < error:
        # no improvement over the best fit so far
        continue
    error, best_model, pCTR = rms, model, valid_p

In [24]:
# mean predicted CTR over the validation set (sum of pCTR / number of rows)
sum(pCTR) / len(valid_x)

0.0008264186781649615

In [26]:
def _simulate_bidding(bids, payprices, clicked, budget):
    """Replay an auction log under a budget constraint.

    Rows are processed in order. For each row the bid is capped at the budget
    still available; the impression is won when that capped bid is at least the
    pay price, in which case the pay price is added to the spend.

    Parameters
    ----------
    bids : sequence of float
        Bid for each row.
    payprices : sequence of number
        Pay price for each row.
    clicked : sequence of bool
        Whether the row was a click.
    budget : number
        Total budget, in the same unit as ``payprices``.

    Returns
    -------
    tuple
        ``(clicks, winning_impressions, spend)`` with ``spend`` in the unit of
        ``payprices``.
    """
    clicks = 0
    winning_impressions = 0
    spend = 0
    for bid, payprice, did_click in zip(bids, payprices, clicked):
        # never bid more than what is left of the budget
        bid = min(bid, budget - spend)
        if bid >= payprice:
            spend += payprice
            winning_impressions += 1
            if did_click:
                clicks += 1
    return clicks, winning_impressions, spend


# Average CTR of the training set; bids are scaled by pCTR / avgCTR
avgCTR = train_set['click'].sum() / len(train_set)

# Total budget; spend is divided by 1000 after each replay.
# NOTE(review): presumably 6250 expressed in the (x1000) unit of payprice — confirm.
budget = 6250 * 1000

# Loop-invariant inputs, extracted once as plain lists: per-row Series lookups
# inside the replay loop are very slow at this data size.
payprices = valid_set['payprice'].tolist()
clicked = (valid_set['click'] == 1).tolist()
predicted = np.asarray(pCTR)

result_base_bid = []
result_clicks = []
result_CTR = []
result_spend = []
result_aCPM = []
result_aCPC = []

for base_bid in range(1, 131, 1):
    print(base_bid)
    # linear bidding strategy: bid = base_bid * pCTR / avgCTR
    bids = (predicted * base_bid / avgCTR).tolist()

    clicks, winning_impressions, spend = _simulate_bidding(
        bids, payprices, clicked, budget)
    spend /= 1000

    # Each ratio is guarded by its own denominator: CTR and aCPM depend on the
    # impressions won, aCPC on the clicks. (Previously CTR divided by zero when
    # nothing was won, and aCPM was reported as 0 whenever there were no clicks.)
    CTR = clicks / winning_impressions if winning_impressions else 0
    aCPM = spend / winning_impressions * 1000 if winning_impressions else 0
    aCPC = spend / clicks if clicks else 0

    result_base_bid.append(base_bid)
    result_clicks.append(clicks)
    result_CTR.append(CTR)
    result_spend.append(spend)
    result_aCPM.append(aCPM)
    result_aCPC.append(aCPC)

    clear_output()
    print(clicks)

71


In [15]:
# Gather the per-base-bid metrics of the sweep into one table and save it
sweep_metrics = {
    'base_bid': result_base_bid,
    'Clicks': result_clicks,
    'CTR': result_CTR,
    'Spend': result_spend,
    'aCPM': result_aCPM,
    'aCPC': result_aCPC,
}
submission = pd.DataFrame(sweep_metrics)
submission.to_csv('Question3_Jimmy.csv', index=False)

In [41]:
# generate online submission

# Base bid of the linear strategy used for the submission.
# NOTE(review): presumably picked from the base-bid sweep results — confirm.
BASE_BID = 40

avgCTR = train_set['click'].sum() / len(train_set)
# Score with the model selected on validation RMSE (`best_model`), i.e. the one
# whose pCTR drove the base-bid sweep, rather than `model`, which is merely the
# last model fitted in the selection loop.
test_p = best_model.predict_proba(test_x)[:, 1]
# linear bidding strategy: bid = BASE_BID * pCTR / avgCTR
bids = test_p * BASE_BID / avgCTR

submission = pd.DataFrame({'bidid': test_set['bidid'], 'bidprice': bids})
submission.to_csv('Group 20.csv', index=False)