In [1]:
pwd

'C:\\Users\\User\\OneDrive\\UCL\\term_2\\temp_git\\maai_bid_strategy'

In [1]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import math
import pickle
from tqdm import tqdm, tqdm_notebook
import sklearn

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import cross_val_score, KFold, train_test_split,GridSearchCV

from sklearn.metrics import balanced_accuracy_score, roc_auc_score, classification_report, log_loss, r2_score, accuracy_score

from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

from imblearn.metrics import classification_report_imbalanced

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

import matplotlib as mpl

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")
pd.set_option('display.max_columns', 100)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
X_resampled, y_resampled = pickle.load(open( "resampled_2", "rb" ))

In [3]:
def memory(df):
    print("Memory usage of the dataframe is {:.2f} MB".format(
        df.memory_usage().sum() / 1024**2))
    
    
def entropy(df, base = 2):
    """ Calculate the entropy for every column in a df"""
    
    entropy = {}
    
    for column in df.columns:
        prob = df[column].value_counts(normalize=True, sort=False)
        
        entropy[column] = -(prob * np.log(prob)/np.log(base)).sum()
        
    return pd.Series(entropy).sort_values(ascending=False)

In [7]:
google_drive = False
convert = False
path = 'C:/Users/User/OneDrive/UCL/term_2/git/maai_bid_strategy'

if(google_drive):
    from google.colab import drive
    drive.mount('/content/gdrive')
    
    train = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/train.csv')
    validation = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/validation.csv')
    test = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/test.csv')
    
elif(convert):
    train = pd.read_csv(path+'/we_data/train.csv')
    train.to_hdf('train.h5', 'train')
    
else:
    train = pd.read_hdf('train.h5', 'train')
#     validation = pd.read_csv('validation.h5','validation')
    validation = pd.read_csv(path+'/we_data/validation.csv')
    test = pd.read_csv(path+'/we_data/test.csv')

In [8]:
train.shape, test.shape, validation.shape

((2430981, 25), (303375, 22), (303925, 25))

In [4]:
no_click,click = train['click'].value_counts().values
print('Baseline average CTR {:.5%}'.format(click/(no_click+no_click)))

Baseline average CTR 0.03691%


### Feature Engineering

In [5]:
def count_featuresize_one_hot_encoding(df):
    """Calcualte the number of featuers nessecary for one hot encoding"""

    total_features = 0
    for column in df.columns:
        total_features += len(df[column].unique())

    print('Rougly {:,} features in the feature space'.format(total_features))

    return total_features


def feature_engineering(df):
    """ Enrich dataframe with additional features
    
        Note that all fields that are joined are slightly redundent when
        implementing more sophisticated models like NN that could pick up
        on these feature combinations, however, it can improve the perforamce
        of simpler models such as logisitc regression"""

    # split user agent into os and browser
    df['os'], df['browser'] = df['useragent'].str.split('_').str

    # apple users
    df['apple'] = df['useragent'].str.match(r'(ios)|(mac)').astype(np.uint8)

    # deterime mobile devivce or not
    df['mobieldevice'] = df['useragent'].str.match(r'(ios)|(android)').astype(
        np.uint8)

    # hour per day
    df['weekdayhour'] = df['weekday'].astype(str) + '_' + df['hour'].astype(
        str)

    # bin hours into time of day
    df['timeofday'] = pd.cut(
        df['hour'].astype(int),
        4,
        labels=["night", "morning", "afternoon", "evening"])

    # bin ad surface size categories
    min_ad = min(df['slotwidth'] * df['slotheight']) - 1
    max_ad = max(df['slotwidth'] * df['slotheight'])

    ad_bins = pd.IntervalIndex.from_breaks(
        [min_ad, 65520, 75000, 90000, max_ad])

    replace, with_ = [
        pd.Interval(min_ad, 65520),
        pd.Interval(65520, 75000),
        pd.Interval(75000, 90000),
        pd.Interval(90000, max_ad)
    ], ['small', 'medium', 'large', 'x-large']

    df['adsize'] = pd.cut(
        df['slotwidth'] * df['slotheight'], bins=ad_bins).replace(
            replace, with_)

    # bin slot price into categories
    price_bins = pd.IntervalIndex.from_breaks(
        [min(df['slotprice']), 10, 50, 100,
         max(df['slotprice'])],
        closed='left')
    replace, with_ = [
        pd.Interval(min(df['slotprice']), 10, closed='left'),
        pd.Interval(10, 50, closed='left'),
        pd.Interval(50, 100, closed='left'),
        pd.Interval(100, max(df['slotprice']), closed='left')
    ], ['1', '2', '3', '4']

    df['slotprice'] = pd.cut(
        df['slotprice'], bins=price_bins).replace(replace, with_)

    # ad size category and visability
    df['advisabilitysize'] = df['slotvisibility'].astype(
        str) + '_' + df['adsize'].astype(str)

    return df.drop(columns=['slotwidth', 'slotheight'])


def pre_process_one_hot_encoding(df):
    """ Preprocess the dataframe for one hot encoding
    
        - Split the filed user tags and binary encode
        - Convert numerical categories into strings 
        """

    # convert numerical categories into strings as a quick hack
    # for one hot encoding to work properly on numerical categories
    df['weekday'] = df['weekday'].astype(str)
    df['hour'] = df['hour'].astype(str)
    df['region'] = df['region'].astype(str)
    df['city'] = df['city'].astype(str)
    df['adexchange'] = df['adexchange'].astype(str)
    df['advertiser'] = df['advertiser'].astype(str)

    # already processed and not found in columns
    if ('usertag' not in df.columns):
        return df

    df['usertag'] = df['usertag'].astype(str)
    mlb = MultiLabelBinarizer()

    df = df.join(
        pd.DataFrame(
            mlb.fit_transform(df['usertag'].str.split(',')),
            columns='usertag_' + mlb.classes_,
            index=df.index))

    # drop the usertag column
    df = df.drop(columns='usertag')

    return df


def drop_colums(df):

    # Remove uniuqe and meaningless featuers that are not know a pirori
    columns = ['bidprice', 'urlid', 'bidid']

    # remove some very sparse fields to reduce featuers (highest entropy)
    columns.extend(['userid', 'url', 'domain', 'slotid', 'IP'])

    # only remove columns that are in the df
    columns = [column for column in columns if column in df.columns]

    return df.drop(columns=columns)


def add_missing_colums(df, columns, sort_columns=True):
    """ Due to the feature engineering there is a chance a discrapency occurs
        between 
        """

    missing = [x for x in columns if x not in df.columns]

    for x in missing:
        print('Adding {}'.format(x))
        df[x] = 0

    if (sort_columns):
        return df[sorted(df.columns)]

    return df

### Performance Metrics

In [3]:
def calcluate_num_impressions(df, grouping = False):
    imp = {}
    
    if(grouping):
        imp = df.groupby(grouping).size().to_dict()
            
    else:
        imp = len(df)
    
    print(imp)
    return imp
    


def calcluate_num_clicks(df, grouping = False):
    clicks = {}
    
    if(grouping):
        for index, group in df.groupby(grouping):
            clicks[index] = np.sum(group['click'])
            
    else:
        clicks = np.sum(df['click'])
    
    print(clicks)
    return clicks
    

def calcluate_ctr(df, grouping = False):
    ctr = {}
    
    if(grouping):
        for index, group in df.groupby(grouping):
            ctr[index] = np.average(group['click']) * 100
            
    else:
        ctr = np.average(df['click']) * 100
    
    print(ctr)
    return ctr
    
    
def average_cost_per_mille(df, grouping = False):
    cpm = {}
    
    if(grouping):
        cpm = df.groupby('weekday').agg({'payprice': np.mean}).to_dict()['payprice']
        
    else:
        cpm = np.mean(df['payprice'])
        
    print(cpm)
    return cpm

### Build Feature Matrix

In [9]:
"""
other fillna strategies for adexchange should be considered at some point!

the only columns that contain a lot of missing values that are used in the final 
analysis are adexchange and usertag. Different strategies have been considered but 
it was deemed to be the most informative to assign a 'unknown' class which is easely
achieved trhough filling 0's since they do not occur in the dataset

"""
%time train = drop_colums(train)
%time train = train.fillna(0)
%time train = feature_engineering(train)
%time train = pre_process_one_hot_encoding(train)
%time train = pd.get_dummies(train)

%time test = drop_colums(test)
%time test = test.fillna(0)
%time test = pre_process_one_hot_encoding(test)
%time test = feature_engineering(test)
%time test = pd.get_dummies(test)
print('done')



Wall time: 635 ms
Wall time: 1.23 s
Wall time: 1min 10s
Wall time: 19 s
Wall time: 33 s
Wall time: 517 ms
Wall time: 198 ms
Wall time: 2.33 s
Wall time: 11 s
Wall time: 2.68 s
done


In [10]:
%time validation = drop_colums(validation)
%time validation = validation.fillna(0)
%time validation = feature_engineering(validation)
%time validation = pre_process_one_hot_encoding(validation)
%time validation = pd.get_dummies(validation)
print('done')

Wall time: 854 ms
Wall time: 134 ms
Wall time: 8.77 s
Wall time: 2.25 s
Wall time: 2.58 s
done


In [11]:
# the featuere engineering can construct columns that do not occur in other sets 
# this adds the columns of the joined colomuns
joined_colums = [item for slist in [validation.columns, train.columns, test.columns] for item in slist]

%time train = add_missing_colums(train,joined_colums)
%time validation = add_missing_colums(validation,joined_colums)
%time test = add_missing_colums(test,joined_colums)


# this list should be empty!!
[x for x in validation.columns if x not in train.columns]

Wall time: 11.9 s
Adding useragent_android_ie
Adding useragent_android_maxthon
Adding useragent_other_firefox
Adding creative_7324
Adding creative_7332
Adding useragent_android_maxthon
Adding creative_7332
Wall time: 1.24 s
Adding click
Adding payprice
Adding useragent_linux_ie
Adding useragent_mac_maxthon
Adding useragent_mac_sogou
Adding creative_7327
Adding click
Adding payprice
Adding useragent_android_ie
Adding useragent_linux_ie
Adding useragent_mac_maxthon
Adding useragent_mac_sogou
Adding useragent_other_firefox
Adding creative_7324
Adding creative_7327
Wall time: 1.09 s


[]

In [12]:
train.to_hdf('preprocessed.h5', 'train')
validation.to_hdf('preprocessed.h5', 'validation')
test.to_hdf('preprocessed.h5', 'test')
print('done')

done


In [13]:
train.shape, test.shape, validation.shape

((2430981, 965), (303375, 965), (303925, 965))

In [4]:
train = pd.read_hdf('preprocessed.h5', 'train')
validation = pd.read_hdf('preprocessed.h5', 'validation')
test = pd.read_hdf('preprocessed.h5', 'test')

train.shape, validation.shape

((2430981, 965), (303925, 965))

In [5]:
# Defining a Grid Search CV Function for Selected model
def model_CVGridSearch(model, param_grid, n_jobs, rs, scoring):
    
    estimator = model()
    cv = [(slice(None), slice(None))]
    classifier = GridSearchCV(estimator=estimator, cv=cv, refit=True,
                              param_grid=param_grid, n_jobs=n_jobs, scoring=scoring)
    classifier.fit(X_train, y_train)
    print("Best Estimator learned through GridSearch for " + str(model))
    print(classifier.best_estimator_)
    
    return cv, classifier.best_estimator_, classifier.best_params_, classifier.cv_results_

## KNN Classifier

#### Evaluate on AUC as log-loss shouldn't be used to evaluate KNN models [https://medium.com/@bengikoseoglu/why-log-loss-metric-shouldnt-be-used-to-evaluate-nearest-neighbour-classification-1fe314f460a2]

In [5]:
rs = 21
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,random_state = rs)

In [7]:
%%time
seed = 9
k_range = [5,9,17,20,25]
weight_options = ["uniform"]

scoring = 'accuracy'
param_grid = dict(n_neighbors = k_range, weights = weight_options)
knn = KNeighborsClassifier()

knn_jobs = -1
knn_rs = seed

knn_cv, knn_best, knn_params, knn_cvResults = model_CVGridSearch(model = KNeighborsClassifier
                                                                 ,param_grid = param_grid,n_jobs= knn_jobs, rs=knn_rs, scoring=scoring)


Best Estimator learned through GridSearch for <class 'sklearn.neighbors.classification.KNeighborsClassifier'>
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')
Wall time: 13h 31min 3s


In [9]:
knn_cvResults



{'mean_fit_time': array([394.97855902, 402.54831052, 399.01326299, 405.49282765,
        403.12892032]),
 'std_fit_time': array([0., 0., 0., 0., 0.]),
 'mean_score_time': array([18864.36063218, 20559.63835478, 23245.77737117, 23942.0502007 ,
        25034.18642879]),
 'std_score_time': array([0., 0., 0., 0., 0.]),
 'param_n_neighbors': masked_array(data=[5, 9, 17, 20, 25],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_weights': masked_array(data=['uniform', 'uniform', 'uniform', 'uniform', 'uniform'],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 5, 'weights': 'uniform'},
  {'n_neighbors': 9, 'weights': 'uniform'},
  {'n_neighbors': 17, 'weights': 'uniform'},
  {'n_neighbors': 20, 'weights': 'uniform'},
  {'n_neighbors': 25, 'weights': 'uniform'}],
 'split0_test_score': array([0.86041483, 0.77691466, 0.69031126, 0.68163557, 0.64

In [10]:
%%time
y_hat = knn_best.predict(X_test)

print(classification_report_imbalanced(y_test, y_hat, target_names=['no click', 'click']))
print("Balanaced accuracy score: {:.3%}".format(balanced_accuracy_score(y_test, y_hat) ))
print('ROC AUC score {}'.format(roc_auc_score(y_test, y_hat)))
print('done')

                   pre       rec       spe        f1       geo       iba       sup

   no click       1.00      0.61      1.00      0.76      0.78      0.59     24988
      click       0.72      1.00      0.61      0.84      0.78      0.64     24961

avg / total       0.86      0.81      0.81      0.80      0.78      0.61     49949

Balanced accuracy score: 80.691%
ROC AUC score 0.806907120714205
done
Wall time: 1h 13min 42s


In [11]:
%%time
## ReFit Best model on all resampled dataset

knn_best.fit(X_resampled,y_resampled)

Wall time: 4min 17s


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [6]:
%%time
knn_best = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                                metric_params=None, n_jobs=-1, n_neighbors=5, 
                                p=2, weights='uniform')

knn_best.fit(X_resampled,y_resampled)

Wall time: 3min 58s


In [7]:
%%time 
y_validate_true = validation['click'].values
X_validate = validation.drop(columns=['click', 'payprice'])

Wall time: 1.3 s


In [8]:
%%time
y_validate_hat = knn_best.predict(X_validate)

print(classification_report_imbalanced(y_validate_true, y_validate_hat, target_names=['no click', 'click']))
print('ROC AUC score {}'.format(roc_auc_score(y_validate_true, y_validate_hat)))
print('done')

                   pre       rec       spe        f1       geo       iba       sup

   no click       1.00      0.81      0.53      0.89      0.66      0.44    303723
      click       0.00      0.53      0.81      0.00      0.66      0.42       202

avg / total       1.00      0.81      0.53      0.89      0.66      0.44    303925

ROC AUC score 0.6698290469400157
done
Wall time: 3h 22min 34s


In [9]:
%%time
y_validate_prob = knn_best.predict_proba(X_validate)

Wall time: 3h 20min 30s


### Find optimal base bid based on pCTR

In [None]:

%%time
payprice = validation['payprice']
clicks = validation['click']

# VERIFY THAT THE pCTR INDEED IS EQUAL TO THE PROB OF CLICK == 1
pCTR = y_validate_prob[:, 1]

no_click, click = np.bincount(y_validate_true)
avgCTR = click / (no_click + click)

bid_strategy = lambda base_bid: base_bid * (pCTR / avgCTR)

bid_range = np.arange(3.4, 3.8, 0.02)
bid_range_wide = np.arange(1, 5, .5)
bid_range = np.concatenate((bid_range, bid_range_wide), axis=0)

# bid_range = np.arange(1.7, 1.9, 0.01)
statistics = {}

# find optimal base_bid
for base_bid in tqdm(bid_range):
    
    budget_remaining = 6250*1000
    
    statistics[base_bid] = {
        'impressions':0,
        'spend': 0,
        'clicks': 0
    }
    
    # list of bids for all ad requests
    bids = bid_strategy(base_bid)
    
    # loop through all bids for every ad request
    for i in range(len(bids)):    
        second_highest_bid = payprice[i]
        
        won = bids[i] >= second_highest_bid and second_highest_bid <= budget_remaining
        
        if(won):
            statistics[base_bid]['impressions'] += 1
            statistics[base_bid]['spend'] += second_highest_bid
            statistics[base_bid]['clicks'] += clicks[i]
            
            # subtract current bid from budget 
            budget_remaining -= second_highest_bid
            
statistics = pd.DataFrame(statistics).T
statistics['CTR'] = statistics['clicks'] / statistics['impressions']
statistics['aCPM'] = statistics['spend'] / statistics['impressions'] 
statistics['aCPC'] = (statistics['spend']/1000) / statistics['clicks']
print('done')


In [None]:
statistics.sort_values('clicks', ascending=False).head(5)

### Apply pCTR to test set

In [None]:
%%time

X_test = test.drop(columns=['click', 'payprice'])
y_test_prob = knn_best.predict_proba(X_test)
avgCTR = click / (no_click + click)

base_bid = 3.62
pCTR = y_test_prob[:, 1]


bid_strategy = lambda base_bid: (base_bid * (pCTR / avgCTR))

In [None]:
# get bid id's

google_drive = False

if(google_drive):
    from google.colab import drive
    drive.mount('/content/gdrive')
    
    test_raw = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/test.csv')
    
else:
    test_raw = pd.read_csv('../we_data/test.csv')



In [None]:
# export to file
df_bids = pd.DataFrame(np.round(bid_strategy(base_bid),1), index=test_raw['bidid'].values, columns=['bidprice'])
df_bids.index.name = 'bidid'
df_bids = df_bids.reset_index()

df_bids.to_csv('bid_attemnt.csv', index=False)

In [2]:
# %%time
# y_validate_prob = knn_best.predict_proba(X_validate)
# print('Log loss {}'.format(log_loss(y_validate_true, y_validate_prob)))

In [16]:
# %%time

# uni_neigh = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='auto', leaf_size=30, 
#                                  p=2, metric='minkowski', metric_params=None, n_jobs=-1)
# uni_neigh.fit(X_train,y_train)

# dist_neigh = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='auto', leaf_size=30, 
#                                  p=2, metric='minkowski', metric_params=None, n_jobs=-1)
# dist_neigh.fit(X_train,y_train)

Wall time: 4min 7s


In [23]:
# %%time
# y_hat = dist_neigh.predict(X_test)

# print(classification_report_imbalanced(y_test, y_hat, target_names=['no click', 'click']))
# print("Balanced accuracy score: {:.3%}".format(balanced_accuracy_score(y_test, y_hat) ))
# print('ROC AUC score {}'.format(roc_auc_score(y_test, y_hat)))
# print('done')

                   pre       rec       spe        f1       geo       iba       sup

   no click       1.00      0.71      1.00      0.83      0.84      0.69     25101
      click       0.77      1.00      0.71      0.87      0.84      0.73     24848

avg / total       0.89      0.85      0.86      0.85      0.84      0.71     49949

Balanced accuracy score: 85.524%
ROC AUC score 0.8552415658639931
done
Wall time: 16min 36s
Wall time: 16min 36s


In [24]:
# %%time
# y_hat = uni_neigh.predict(X_test)

# print(classification_report_imbalanced(y_test, y_hat, target_names=['no click', 'click']))
# print("Balanced accuracy score: {:.3%}".format(balanced_accuracy_score(y_test, y_hat) ))
# print('ROC AUC score {}'.format(roc_auc_score(y_test, y_hat)))
# print('done')

                   pre       rec       spe        f1       geo       iba       sup

   no click       1.00      0.71      1.00      0.83      0.84      0.69     25101
      click       0.77      1.00      0.71      0.87      0.84      0.73     24848

avg / total       0.89      0.85      0.85      0.85      0.84      0.71     49949

Balanced accuracy score: 85.399%
ROC AUC score 0.8539874470564548
done
Wall time: 17min


In [None]:
# %%time
# y_validate_hat = dist_neigh.predict(X_validate)
# print(classification_report_imbalanced(y_validate_true, y_validate_hat, target_names=['no click', 'click']))
# print('ROC AUC score {}'.format(roc_auc_score(y_validate_true, y_validate_hat)))
# print('done')

In [3]:
# %%time
# y_validate_hat = uni_neigh.predict(X_validate)
# print(classification_report_imbalanced(y_validate_true, y_validate_hat, target_names=['no click', 'click']))
# print('ROC AUC score {}'.format(roc_auc_score(y_validate_true, y_validate_hat)))
# print('done')

In [4]:
# %%time
# y_validate_prob = dist_neigh.predict_proba(X_validate)
# print('Log loss {}'.format(log_loss(y_validate_true, y_validate_prob)))

In [12]:
# %%time
# for k in [1,5,9,17,20,25]:

#     neigh = KNeighborsClassifier(n_neighbors=k, weights='uniform', algorithm='auto', leaf_size=30, 
#                                  p=2, metric='minkowski', metric_params=None, n_jobs=-1) 
#     neigh.fit(X_train, y_train)
    
#     y_validate_prob = neigh.predict_proba(X_validate)
#     print('For k {} Log loss {}'.format(k, log_loss(y_validate_true, y_validate_prob)))
#     print('For k {} ROC AUC {}'.format(k, roc_auc_score(y_validate_true, y_validate_prob[:,1])))

For k 1 Log loss 4.133857136271352
For k 1 ROC AUC 0.6457579344623648
For k 5 Log loss 3.5690849487313066
For k 5 ROC AUC 0.6941830269197542
For k 9 Log loss 3.285574634612604
For k 9 ROC AUC 0.7004919281746529
Wall time: 33min 59s


In [13]:
# for k in [11,15,17,19]:

#     neigh = KNeighborsClassifier(n_neighbors=k, weights='uniform', algorithm='auto', leaf_size=30, 
#                                  p=2, metric='minkowski', metric_params=None, n_jobs=-1) 
#     neigh.fit(X_train, y_train)
    
#     y_validate_prob = neigh.predict_proba(X_validate)
#     print('For k {} Log loss {}'.format(k, log_loss(y_validate_true, y_validate_prob)))
#     print('For k {} ROC AUC {}'.format(k, roc_auc_score(y_validate_true, y_validate_prob[:,1])))

For k 11 Log loss 3.169505887897813
For k 11 ROC AUC 0.7041729056599025
For k 15 Log loss 2.9522700159010613
For k 15 ROC AUC 0.7121898021135269
For k 17 Log loss 2.8635431959821136
For k 17 ROC AUC 0.7127802551197722
For k 19 Log loss 2.781256169702471
For k 19 ROC AUC 0.7143356229717261
