### Problem 3. Linear Model

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GridSearchCV

from pandas import Series, DataFrame

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

#### Negative down sampling

In [4]:
def downsample(df, rate=0.025, random_state=None):
    """Negatively downsample the non-click rows of ``df``.

    Every row with ``click == 1`` is kept; rows with ``click == 0`` are
    sampled without replacement down to ``int(rate * n_negative)`` rows.
    The two parts are concatenated and shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``click`` column.
    rate : float, default 0.025
        Fraction of ``click == 0`` rows to keep. Any later probability
        re-calibration has to use the same value.
    random_state : int or numpy RandomState, optional
        Seed forwarded to ``DataFrame.sample`` so the result is reproducible.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with the sampled negatives and all positives
        (original index labels are preserved).

    Raises
    ------
    ValueError
        If ``rate`` is not in the interval (0, 1].
    """
    if not 0 < rate <= 1:
        raise ValueError('rate must be in the interval (0, 1]')

    # Filter the frame that was passed in, not a module-level DataFrame.
    df_maj = df[df['click'] == 0]
    df_min = df[df['click'] == 1]

    # Majority class sampled without replacement
    n_keep = int(rate * df_maj.shape[0])
    df_maj = df_maj.sample(n=n_keep, replace=False, random_state=random_state)

    # Combine the downsampled majority class and the minority class and shuffle
    df_downsample = pd.concat([df_maj, df_min]).sample(frac=1, random_state=random_state)

    return df_downsample

#### Feature engineering

In [5]:
# Function used to put slot prices into pre-defined buckets

def buckets(x):
    """Map a slot price to the label of its price bucket.

    Buckets are contiguous, so non-integer prices between the integer
    boundaries (e.g. 10.5 or 50.5) land in a real bucket instead of
    falling through to the catch-all:

    * ``0 <= x < 1``    -> ``'first_bucket'``
    * ``1 <= x <= 10``  -> ``'second_bucket'``
    * ``10 < x <= 50``  -> ``'third_bucket'``
    * ``50 < x <= 100`` -> ``'fourth_bucet'``
    * anything else (above 100, negative, NaN) -> ``'fifth_bucket'``

    Integer inputs get exactly the same label as before.
    """
    if 0 <= x < 1:
        return 'first_bucket'
    if 1 <= x <= 10:
        return 'second_bucket'
    if 10 < x <= 50:
        return 'third_bucket'
    if 50 < x <= 100:
        # NOTE(review): the label is misspelled ("bucet"); it is kept as-is
        # because it becomes part of the one-hot column names downstream.
        return 'fourth_bucet'
    return 'fifth_bucket'

In [6]:
# Function used to preprocess data
def features(df):
    """Build the model's categorical/numeric features from a raw log frame.

    Works on a copy, so the caller's frame is left untouched.

    Steps:
    * ``bidprice`` / ``payprice`` are divided by 1000 when present
      (the test set has neither column).
    * ``slotprice`` is bucketed with ``buckets`` (on the raw value) and then
      divided by 1000.
    * ``useragent`` is split on ``'_'`` into ``os`` and ``browser``.
    * ``weekday``, ``hour``, ``region``, ``city``, ``adexchange`` and
      ``advertiser`` are cast to strings so they are treated as categories.
    * ``slot_width*height`` combines the slot dimensions into one label.
    * Identifier / raw columns are dropped; columns missing from ``df`` are
      skipped instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain ``slotprice``, ``useragent``, ``slotwidth``,
        ``slotheight`` and the categorical columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the engineered columns. ``payprice`` is kept when it
        was present; ``bidprice`` is always removed.
    """
    df = df.copy()

    # `'bidprice' and 'payprice' in df.columns` only tested 'payprice';
    # handle each price column on its own so a missing one cannot raise.
    for price_col in ('bidprice', 'payprice'):
        if price_col in df.columns:
            df[price_col] = df[price_col] / 1000.0

    # Bucket on the raw slot price before rescaling it.
    df['slotprice_buckets'] = df['slotprice'].apply(buckets)
    df['slotprice'] = df['slotprice'] / 1000.0

    # useragent looks like "<os>_<browser>"
    ua_parts = df['useragent'].str.split('_')
    df['os'] = ua_parts.str[0]
    df['browser'] = ua_parts.str[1]

    # Numeric codes that are really categories
    for cat_col in ('weekday', 'hour', 'region', 'city', 'adexchange', 'advertiser'):
        df[cat_col] = df[cat_col].astype(str)

    df['slot_width*height'] = df['slotwidth'].astype(str) + '*' + df['slotheight'].astype(str)

    drop = ['bidid', 'bidprice', 'userid', 'url', 'urlid', 'IP', 'slotid', 'slotwidth', 'slotheight',
            'useragent']
    # Only drop what exists: e.g. the test set has no 'bidprice' column.
    df = df.drop(columns=[col for col in drop if col in df.columns])

    return df

In [7]:
def tag(df):
    """One-hot encode the comma-separated ``usertag`` column.

    Each value is converted with ``str(...)`` and split on ``','``. For every
    distinct tag ``t`` a 0/1 column ``'usertag_' + t`` is added that marks the
    rows whose tag list contains ``t``.

    The indicator columns are created in sorted tag order, so the column
    layout is deterministic across runs, and they are built in one go rather
    than inserted one at a time. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``usertag`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``usertag`` column holds the list of tags,
        followed by one integer indicator column per tag.
    """
    tag_lists = df['usertag'].apply(lambda x: str(x).split(','))

    unique_tags = sorted({t for tags in tag_lists for t in tags})
    tag_sets = [set(tags) for tags in tag_lists]

    indicators = pd.DataFrame(
        {'usertag_' + t: [int(t in tags) for tags in tag_sets] for t in unique_tags},
        index=df.index,
    )

    out = df.copy()
    out['usertag'] = tag_lists
    # An indicator replaces any pre-existing column of the same name.
    out = out.drop(columns=[col for col in indicators.columns if col in out.columns])

    return pd.concat([out, indicators], axis=1)

In [8]:
# Function used to extract the summary statistics

def stats(df):
    """Return a Series of campaign summary statistics for ``df``.

    The Series holds the column sums of ``click``, ``bidprice``, ``payprice``
    and ``advertiser``, plus:

    * ``impressions`` - number of rows in ``df``
    * ``CTR``  - clicks / impressions
    * ``CPM``  - payprice / impressions * 1000
    * ``eCPC`` - payprice / clicks

    NOTE(review): ``advertiser`` is summed like the other columns; that
    looks like an identifier, so the summed value is probably not
    meaningful - confirm before relying on it.
    """
    summed_columns = ['click', 'bidprice', 'payprice', 'advertiser']
    summary = df[summed_columns].sum()

    # Impressions: one row per impression
    summary['impressions'] = len(df.index)

    # Click-through rate
    summary['CTR'] = summary['click'] / summary['impressions']

    # Cost per thousand impressions
    cost_per_impression = summary['payprice'] / summary['impressions']
    summary['CPM'] = cost_per_impression * 1000.0

    # Effective cost per click
    summary['eCPC'] = summary['payprice'] / summary['click']

    return summary

#### Downsampled training data

In [9]:
# Full training log; kept around because the average CTR is computed from it later
train_df = pd.read_csv('train.csv')

# Downsampled copy of the training data, used to fit the Logistic Regression
new_train_df = downsample(train_df)

# The usertag feature is encoded on its own frame and joined back at the end
q = new_train_df['usertag'].to_frame()
new_train_df = features(new_train_df.drop('usertag', axis = 1))

# One indicator column per tag; the raw tag column itself is not a feature
q = tag(q).drop('usertag', axis = 1)

# Keep the numeric price columns out of get_dummies and re-attach them afterwards
cols = ['payprice', 'slotprice']
p = DataFrame(new_train_df[cols])

new_train_df = pd.get_dummies(new_train_df.drop(cols, axis = 1))
new_train_df = pd.concat([new_train_df, q, p], axis = 1)

#### Validation data

In [10]:
# Import the data
validation_df = pd.read_csv('validation.csv')

# The usertag feature is encoded on its own frame and joined back at the end
q = validation_df['usertag'].to_frame()
validation_df = features(validation_df.drop('usertag', axis = 1))

# One indicator column per tag; the raw tag column itself is not a feature
q = tag(q).drop('usertag', axis = 1)

# Keep the numeric price columns out of get_dummies and re-attach them afterwards
cols = ['payprice', 'slotprice']
p = DataFrame(validation_df[cols])

validation_df = pd.get_dummies(validation_df.drop(cols, axis = 1))
validation_df = pd.concat([validation_df, q, p], axis = 1)

In [11]:
# Restrict both frames to the columns they have in common, in validation order,
# so the fitted model sees the same features at train and validation time.
train_feat = new_train_df.columns
validation_feat = validation_df.columns

feat = []
for column in validation_feat:
    if column in train_feat:
        feat.append(column)

new_train_df = new_train_df[feat]
validation_df = validation_df[feat]

# Number of shared columns
len(feat)

4229

#### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [13]:
# Target is the click indicator; payprice is removed from the inputs as well
y_train = new_train_df['click']
X_train = new_train_df.drop(columns = ['click', 'payprice'])

In [14]:
# Sanity check: (number of rows, number of feature columns) of the training matrix
X_train.shape

(62522, 4227)

In [15]:
# Same target / input split as for the training frame
y_val = validation_df['click']
X_val = validation_df.drop(columns = ['click', 'payprice'])

#### GRID SEARCH

In [16]:
# Search 10 evenly spaced values of C in [0.01, 1] for an L2-penalised,
# class-balanced logistic regression, scored by ROC AUC with 5-fold CV.
tuned_parameters = [{
    'C': np.linspace(0.01, 1, 10),
    'penalty': ['l2'],
    'class_weight': ['balanced'],
}]

model = GridSearchCV(
    estimator=LogisticRegression(max_iter=500),
    param_grid=tuned_parameters,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
model.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid=[{'C': array([0.01, 0.12, 0.23, 0.34, 0.45, 0.56, 0.67, 0.78, 0.89, 1.  ]), 'penalty': ['l2'], 'class_weight': ['balanced']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [17]:
# Hard class predictions of the current `model` on the validation set
predictedgrid = model.predict(X_val)

# Confusion matrix: rows are the true class, columns the predicted class
class_labels = ['No click', 'Click']
conf_mat = DataFrame(
    metrics.confusion_matrix(y_val, predictedgrid),
    index = class_labels,
    columns = class_labels,
)
conf_mat

Unnamed: 0,No click,Click
No click,253765,49958
Click,58,144


In [None]:
# ROC AUC on the validation set, from the predicted probability of a click
val_click_proba = model.predict_proba(X_val)[:, 1]
print ('Logistic Regression AU ROC: ', metrics.roc_auc_score(y_val, val_click_proba))

#### "Normal" logistic regression (default C, no grid search)

In [33]:
# Single class-balanced logistic regression with the default C.
# NOTE: this rebinds `model`, so every later cell uses this estimator
# rather than the grid-search object.
model = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', max_iter = 500).fit(X_train, y_train)

predicted = model.predict(X_val)

val_click_proba = model.predict_proba(X_val)[:, 1]
print ('Logistic Regression AU ROC: ', metrics.roc_auc_score(y_val, val_click_proba))

Logistic Regression AU ROC:  0.83710557916846


#### Optimal Bids

In [18]:
# Calculate the pCTR, important to remember to re-calibrate because we trained on downsampled data
# Raw click probability from the model fitted on the downsampled data
raw_pctr = model.predict_proba(X_val)[:, 1]

# Re-calibrate with q = p / (p + (1 - p) / w); w = 0.025 is the same value
# as the negative downsampling rate used to build the training data.
# NOTE(review): the model is also fitted with class_weight='balanced' -
# confirm this correction is still the right one in that case.
predicted_CTR = raw_pctr / (raw_pctr + ((1 - raw_pctr) / 0.025))

In [19]:
# Values for the base_bid search
# Budget cap for the simulated campaign, compared against cumulative payprice
budget = 6250

# Running best of the base-bid search
best_basebid = 0
best_clicks = 0

# Average CTR of the full (not downsampled) training data:
# rows with click == 1 divided by all rows
click_counts = train_df['click'].value_counts()
avg_ctr = click_counts[1] / len(train_df)

In [110]:
# Walk through `spent` by position and set `index`:
#   * to the last position if the end is reached, or
#   * to the position just before the first entry that exceeds `budget`.
# If `spent` is empty the loop body never runs and `index` is not assigned.
# NOTE(review): `spent` is not defined by any earlier cell in this notebook
# (it is first assigned inside the base-bid search loop further down), so on
# a fresh kernel this cell raises NameError. It looks like a leftover
# experiment that was run out of order - confirm whether it can be removed.
for i in range(spent.shape[0]):
        if i == (spent.shape[0] - 1):
            index = i
            break
            
        elif spent.iloc[i] > budget:
            index = i - 1
            break
            
        else:
            pass

0.0

In [None]:
# Grid search to find the base_bid that yields the highest number of clicks
# Linear bidding strategy: bid = base_bid * pCTR / avg_CTR.
# The ratio does not depend on base_bid, so compute it once.
ctr_ratio = predicted_CTR / avg_ctr

# 1000 candidate base bids spanning the observed payprice range
candidate_base_bids = np.linspace(validation_df['payprice'].min(),
                                  validation_df['payprice'].max(), 1000)

for base_bid in candidate_base_bids:
    # Stored on validation_df, as before; the column holds the bids of the
    # last candidate once the loop has finished.
    validation_df['bid'] = base_bid * ctr_ratio

    # Payprice, bid and clicks for the instances we have the winning bid
    won = (validation_df['bid'] >= validation_df['slotprice']) \
          & (validation_df['bid'] >= validation_df['payprice'])
    impressions = validation_df.loc[won, ['payprice', 'bid', 'click']]

    # Cumulative spend in row order; `index` is the NUMBER of won impressions
    # whose running total is still below the budget.
    spent = impressions['payprice'].cumsum()
    spent = DataFrame(spent)
    index = len(spent[spent['payprice'] < budget])

    # Number of clicks among the first `index` won impressions.
    # `index` is a row count, so slice by position (.iloc). The previous
    # label-based .loc[:index] compared it with the original row labels of
    # the filtered frame and therefore counted the wrong rows.
    clicks = impressions['click'].iloc[:index].sum()

    if clicks > best_clicks:
        best_basebid = base_bid
        best_clicks = clicks

print(best_basebid)

# Bids for the best base bid; presumably the factor 1000 undoes the /1000
# price scaling applied during feature engineering - confirm.
best_bid = best_basebid * ctr_ratio * 1000

#### Test data

In [None]:
# Import the data
test_df = pd.read_csv('test.csv')

# The usertag feature is encoded on its own frame and joined back at the end
q = DataFrame(test_df['usertag'])

test_df = test_df.drop('usertag', axis = 1)
test_df = features(test_df)

q = tag(q)
q = q.drop('usertag', axis = 1)

test_df = pd.get_dummies(test_df)
test_df = pd.concat([test_df, q], axis = 1)

# Remove the price columns from the training frame if they are still there.
# 'bidprice' has already been removed by features(), and `feat` still lists
# 'payprice', so select/drop only what actually exists instead of raising.
new_train_df = new_train_df[[col for col in feat if col in new_train_df.columns]] \
                   .drop(columns = ['payprice', 'bidprice'], errors = 'ignore')

# The model was fitted on X_train, so the test frame must expose exactly those
# columns in the same order. `feat` also contains 'click' and 'payprice',
# which the test set does not have; indicator columns that never occur in the
# test data are filled with 0.
test_df = test_df.reindex(columns = X_train.columns, fill_value = 0)

predicted = model.predict(test_df)

In [None]:
# Strip any label / identifier / price columns that are still present.
# Several of them ('bidid', and on the test set 'click', 'payprice',
# 'bidprice') are already absent at this point, so missing names are ignored
# instead of raising KeyError.
X_test = test_df.drop(columns = ['click', 'bidid', 'payprice', 'bidprice'], errors = 'ignore')