In [2]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import math
import pickle

from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

from imblearn.metrics import classification_report_imbalanced

from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

import matplotlib as mpl

import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: seaborn "ticks" theme for all figures in this notebook.
sns.set(style="ticks")
# Show up to 100 columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Render matplotlib figures inline in the notebook, as SVG.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
# !pip install imbalanced-learn
# !pip install xgboost

In [3]:
def memory(df):
    """Print the total memory footprint of ``df`` in megabytes.

    The figure is the sum of ``df.memory_usage()`` divided by 1024**2.
    Nothing is returned.
    """
    total_bytes = df.memory_usage().sum()
    megabytes = total_bytes / 1024**2
    print("Memory usage of the dataframe is {:.2f} MB".format(megabytes))
    
    
def entropy(df, base = 2):
    """Calculate the Shannon entropy of every column in a dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are treated as discrete variables. Missing values
        are ignored because ``value_counts`` drops them by default.
    base : float, default 2
        Base of the logarithm; 2 gives the entropy in bits.

    Returns
    -------
    pandas.Series
        One entropy value per column, sorted from highest to lowest.
    """
    log_base = np.log(base)

    # Named differently from the function so the function is not shadowed.
    column_entropy = {}

    for column in df.columns:
        prob = df[column].value_counts(normalize=True, sort=False)

        # A constant (or completely empty) column sums to 0.0, which the
        # negation turns into -0.0; adding 0.0 normalises that to +0.0.
        column_entropy[column] = -(prob * np.log(prob) / log_base).sum() + 0.0

    return pd.Series(column_entropy, dtype=float).sort_values(ascending=False)

In [3]:
# Where to read the raw data from:
#   google_drive=True -> CSV files on a mounted Google Drive (Colab)
#   convert=True      -> local CSV files; the training set is also cached as HDF5
#   neither           -> training set from the HDF5 cache, the rest from local CSVs
google_drive = False
convert = False

if google_drive:
    from google.colab import drive
    drive.mount('/content/gdrive')

    data_dir = '/content/gdrive/My Drive/Colab Notebooks/we_data'
    train = pd.read_csv(data_dir + '/train.csv')

elif convert:
    data_dir = '../we_data'
    train = pd.read_csv(data_dir + '/train.csv')
    # NOTE(review): the cache is written to ./train.h5 but read back from
    # ../train.h5 in the branch below -- confirm which location is intended.
    train.to_hdf('train.h5', 'train')

else:
    data_dir = '../we_data'
    train = pd.read_hdf('../train.h5', 'train')

# The validation and test sets are needed by later cells in every mode, so they
# are loaded unconditionally (previously only the last branch defined them).
# Assumes validation.csv / test.csv sit next to train.csv -- TODO confirm for
# the Google Drive layout.
validation = pd.read_csv(data_dir + '/validation.csv')
test = pd.read_csv(data_dir + '/test.csv')

In [4]:
# Number of missing values per column of the raw training set.
train.isnull().sum()

click                   0
weekday                 0
hour                    0
bidid                   0
userid                  0
useragent               0
IP                      0
region                  0
city                    0
adexchange          49829
domain             137135
url                 86812
urlid             2430981
slotid                  0
slotwidth               0
slotheight              0
slotvisibility          0
slotformat              0
slotprice               0
creative                0
bidprice                0
payprice                0
keypage            504990
advertiser              0
usertag            497479
dtype: int64

In [5]:
# Frequency of each ad exchange id (rows with a missing adexchange are not counted).
train['adexchange'].value_counts()

3.0    819458
2.0    763428
1.0    718271
4.0     79995
Name: adexchange, dtype: int64

In [7]:
# Entropy in bits (base 2) of every column, highest first.
entropy(train)

bidid             21.213107
userid            21.136952
IP                18.103315
url               16.149660
usertag           14.872094
slotid             9.422239
domain             8.498091
city               7.422192
payprice           7.076786
creative           5.873560
region             4.533946
hour               4.382678
slotprice          3.269283
keypage            3.195136
slotwidth          3.064538
advertiser         2.930429
weekday            2.799784
bidprice           2.614787
slotvisibility     2.351020
slotheight         2.024556
adexchange         1.741802
slotformat         1.478337
useragent          1.447568
click              0.008738
urlid             -0.000000
dtype: float64

In [8]:
# Peek at the first rows of the raw training data.
train.head()

Unnamed: 0,click,weekday,hour,bidid,userid,useragent,IP,region,city,adexchange,domain,url,urlid,slotid,slotwidth,slotheight,slotvisibility,slotformat,slotprice,creative,bidprice,payprice,keypage,advertiser,usertag
0,0,5,22,b7bea80521fdecd95d2d761a38c91c3f09618066,2e880fb7d690cf7377b2e42e701728e3f3c0e4c1,windows_ie,125.37.175.*,2,2,2.0,trqRTvKaXTKfgg24JKTI,6447a7dfa30fe6eb410c91860e7c9b45,,2015392487,200,200,2,0,5,a4f763f78ef3eedfe614263b94a8924e,238,5,0f951a030abdaedd733ee8d114ce2944,3427,NaN
1,0,1,20,4f51205475678f5a124bc76b2c54163bf8eaa7eb,3a1fe01360ff8100e7d006b83b77a3e4c01d928c,windows_chrome,171.36.92.*,238,239,1.0,20fc675468712705dbf5d3eda94126da,3ddf173a94bd23c326683b6373c75dd4,,mm_10982364_973726_8930541,300,250,FourthView,Na,0,10722,294,23,,2821,NaN
2,0,3,13,b604e3fd054a658ab7ced4285ebf2ef54d2bd890,801d18a056b6fe6b06a794aef17fb0d6daff2414,windows_ie,59.46.106.*,40,41,2.0,trqRTJn7O95I1mKYUV,625d1b5916ea925332c7b326c0574cfa,,1720123646,250,250,2,0,5,798b2d49952d77f1eace9f23c210d0b5,238,24,0f951a030abdaedd733ee8d114ce2944,3427,10052100061386610110
3,0,6,23,0348beeae93e561584c3b50fc9e7746a33048ad7,0d6eaf2259699990e38a1fc5116f112070b9ecdc,windows_ie,114.250.226.*,1,1,1.0,5F97t5E0BTK7XhNrUMpENpn,dedc488b98ca20707bc9a723957e7d1f,,mm_10027070_118039_10308280,160,600,2,1,0,cb7c76e7784031272e37af8e7e9b062c,300,25,bebefa5efe83beee17a3d245e7c5085b,1458,138661006310111
4,0,5,6,268149c1789bce2bc9798ffd97ec431219bafeb3,a239d9bb642460d974ba67f85e63b8d3e214da0e,windows_ie,183.63.192.*,216,233,2.0,13625cb070ffb306b425cd803c4b7ab4,4199d1227c511fc776b76594dabff9f8,,1120200150,728,90,OtherView,Na,133,7330,277,133,,2259,NaN


In [9]:
# Look the two classes up by label instead of relying on the order in which
# value_counts() happens to return them.
click_counts = train['click'].value_counts()
no_click, click = click_counts.get(0, 0), click_counts.get(1, 0)

# CTR = clicks / impressions (clicks / non-clicks would be the odds, not the rate).
print('Baseline average CTR {:.5f}'.format(click / (click + no_click)))

Baseline average CTR 0.00074


## Feature engineering

In [4]:
def count_featuresize_one_hot_encoding(df):
    """Estimate how many features one hot encoding of ``df`` would produce.

    Adds up the number of distinct values of every column, prints the total
    and returns it.
    """
    distinct_per_column = [len(df[name].unique()) for name in df.columns]
    total_features = sum(distinct_per_column)

    print('Rougly {:,} features in the feature space'.format(total_features))

    return total_features


def feature_engineering(df):
    """Enrich the dataframe with additional, mostly categorical, features.

    The joined ("crossed") fields are slightly redundant for more
    sophisticated models such as neural networks, which can pick up on these
    feature combinations themselves; they can however improve the performance
    of simpler models such as logistic regression.

    All bin edges are fixed constants rather than derived from the data, so
    every data set passed through this function is binned identically.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``useragent``, ``weekday``, ``hour``,
        ``slotwidth``, ``slotheight``, ``slotvisibility`` and ``slotprice``.
        The new columns are added to this frame in place and ``slotprice`` is
        overwritten with its price category.

    Returns
    -------
    pandas.DataFrame
        The enriched frame without ``slotwidth`` and ``slotheight``.
    """
    weekday = df['weekday'].astype(str)
    hour = df['hour'].astype(str)

    # split user agent ("<os>_<browser>") into os and browser; expand=True
    # replaces tuple-unpacking of the .str accessor, which recent pandas
    # versions no longer support. reindex guarantees both parts exist.
    agent = df['useragent'].str.split('_', n=1, expand=True).reindex(
        columns=[0, 1])
    df['os'], df['browser'] = agent[0], agent[1]

    # apple users
    df['apple'] = df['useragent'].str.match(r'(ios)|(mac)').astype(np.uint8)

    # determine mobile device or not
    df['mobieldevice'] = df['useragent'].str.match(r'(ios)|(android)').astype(
        np.uint8)
    mobile = df['mobieldevice'].astype(str)

    # hour per day of the week
    df['weekdayhour'] = weekday + '_' + hour

    # mobile per day of the week
    df['mobileweekday'] = mobile + '_' + weekday

    # mobile per hour
    df['mobilehour'] = mobile + '_' + hour

    # browser per day of the week
    df['browserweekday'] = df['browser'] + '_' + weekday

    # browser per hour
    df['browserhour'] = df['browser'] + '_' + hour

    # os per day of the week
    df['osweekday'] = df['os'] + '_' + weekday

    # os per hour
    df['oshour'] = df['os'] + '_' + hour

    # os per day per hour
    df['osdayhour'] = df['os'] + '_' + weekday + '_' + hour

    # bin hours into time of day: 0-5, 6-11, 12-17, 18-23. These are the bins
    # an equal-width 4-way cut yields when hours span 0..23, but written out
    # so they do not shift when a data set covers only part of the day.
    df['timeofday'] = pd.cut(
        df['hour'].astype(int),
        bins=[-1, 5, 11, 17, 23],
        labels=["night", "morning", "afternoon", "evening"])

    # bin ad surface (width * height) into size categories; the outer edges
    # are open-ended so any area falls into a bin
    df['adsize'] = pd.cut(
        df['slotwidth'] * df['slotheight'],
        bins=[-np.inf, 65520, 75000, 90000, np.inf],
        labels=['small', 'medium', 'large', 'x-large'])

    # bin slot price into left-closed categories [..,10) [10,50) [50,100)
    # [100,..); the open upper edge keeps the maximum price from becoming NaN
    df['slotprice'] = pd.cut(
        df['slotprice'],
        bins=[-np.inf, 10, 50, 100, np.inf],
        right=False,
        labels=['1', '2', '3', '4'])

    # ad size category and visibility
    df['advisabilitysize'] = df['slotvisibility'].astype(
        str) + '_' + df['adsize'].astype(str)

    return df.drop(columns=['slotwidth', 'slotheight'])


def pre_process_one_hot_encoding(df):
    """Prepare the dataframe for one hot encoding.

    - ``weekday``, ``region``, ``city`` and ``adexchange`` are cast to strings
      so they are treated as categories; ``hour`` is cast to int.
    - If a ``usertag`` column is present, its comma separated tags are binary
      encoded into ``usertag_<tag>`` columns and the original column is
      dropped. A frame without ``usertag`` is returned after the casts only.

    The casts are written back into the frame that was passed in.
    """
    # quick hack: numerical categories become strings so that one hot
    # encoding handles them as categories
    casts = (('weekday', str), ('hour', int), ('region', str), ('city', str),
             ('adexchange', str))
    for name, kind in casts:
        df[name] = df[name].astype(kind)

    if 'usertag' in df.columns:
        df['usertag'] = df['usertag'].astype(str)

        binarizer = MultiLabelBinarizer()
        tag_matrix = binarizer.fit_transform(df['usertag'].str.split(','))
        tag_frame = pd.DataFrame(
            tag_matrix,
            columns='usertag_' + binarizer.classes_,
            index=df.index)

        # attach the indicator columns, then drop the raw usertag column
        df = df.join(tag_frame)
        df = df.drop(columns='usertag')

    return df


def add_missing_colums(df, columns, sort_columns=True):
    """Make sure ``df`` contains every column named in ``columns``.

    Feature engineering and one hot encoding can create columns in one data
    set that do not occur in another. Every name in ``columns`` that ``df``
    lacks is added as a column of zeros, so all sets end up with the same
    feature space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to complete. It is not modified; a new frame is returned
        whenever columns are added or sorted.
    columns : iterable of str
        Column names that must be present; duplicates are allowed.
    sort_columns : bool, default True
        Return the columns in sorted order, so that frames completed with the
        same ``columns`` share one column order.

    Returns
    -------
    pandas.DataFrame
    """
    present = set(df.columns)

    # dict.fromkeys removes duplicate names while keeping first-seen order
    missing = [name for name in dict.fromkeys(columns) if name not in present]

    if not missing and not sort_columns:
        return df

    target = list(df.columns) + missing
    if sort_columns:
        target = sorted(target)

    # One reindex adds all zero columns at once instead of inserting them one
    # at a time, which is slow and fragments the frame.
    return df.reindex(columns=target, fill_value=0)


def drop_colums(df):
    """Return ``df`` without the identifier-like and very sparse columns.

    Columns from the list that are not present in ``df`` are skipped, so the
    function can be applied to frames with differing column sets.
    """
    # unique and meaningless features that are not known a priori
    unwanted = ['bidprice', 'urlid', 'bidid']

    # very sparse fields, removed to reduce features (highest entropy)
    unwanted += ['userid', 'url', 'domain', 'slotid', 'IP']

    # only columns that actually exist can be dropped
    present = list(filter(lambda name: name in df.columns, unwanted))

    return df.drop(columns=present)

## Performance metrics

In [5]:
def calcluate_num_impressions(df, grouping = False):
    """Count impressions (rows), print the result and return it.

    With a truthy ``grouping`` the rows are counted per group and returned as
    a ``{group: count}`` dict; otherwise the total row count is returned.
    """
    imp = df.groupby(grouping).size().to_dict() if grouping else len(df)

    print(imp)
    return imp
    


def calcluate_num_clicks(df, grouping = False):
    """Sum the ``click`` column, print the result and return it.

    With a truthy ``grouping`` the sum is taken per group and returned as a
    ``{group: clicks}`` dict; otherwise the overall sum is returned.
    """
    if grouping:
        clicks = {
            key: np.sum(part['click'])
            for key, part in df.groupby(grouping)
        }
    else:
        clicks = np.sum(df['click'])

    print(clicks)
    return clicks
    

def calcluate_ctr(df, grouping = False):
    """Compute the click-through rate in percent, print it and return it.

    The rate is the average of the ``click`` column times 100. With a truthy
    ``grouping`` it is computed per group and returned as a ``{group: ctr}``
    dict; otherwise a single value for the whole frame is returned.
    """
    if grouping:
        ctr = {
            key: np.average(part['click']) * 100
            for key, part in df.groupby(grouping)
        }
    else:
        ctr = np.average(df['click']) * 100

    print(ctr)
    return ctr
    
    
def average_cost_per_mille(df, grouping = False):
    """Average the ``payprice`` column, print the result and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``payprice`` column (and the grouping column(s), if
        grouping is requested).
    grouping : str, list of str or bool, default False
        Column name(s) to group by. ``True`` is kept as an alias for
        ``'weekday'``, the key that used to be hard-coded whatever was passed.
        A falsy value returns one overall mean.

    Returns
    -------
    dict or float
        ``{group: mean payprice}`` when grouping, otherwise the overall mean.
    """
    if grouping:
        # Honour the requested key instead of always grouping by weekday.
        keys = 'weekday' if grouping is True else grouping
        cpm = df.groupby(keys)['payprice'].mean().to_dict()

    else:
        cpm = np.mean(df['payprice'])

    print(cpm)
    return cpm

In [12]:
# (rows, columns) of the train, validation and test sets.
train.shape, validation.shape, test.shape

((2430981, 25), (303925, 25), (303375, 22))

## Build feature matrix

In [13]:
"""
Other fillna strategies for adexchange should be considered at some point!

The only columns with many missing values that are used in the final
analysis are adexchange and usertag. Different strategies have been considered, but
it was deemed most informative to assign an 'unknown' class, which is easily
achieved by filling in 0's since they do not otherwise occur in the dataset.

"""

# Build the training feature matrix; each step is timed separately.
# 1. drop identifier-like / very sparse columns
%time train = drop_colums(train)
# 2. missing values become 0, which then acts as its own category
%time train = train.fillna(0)
# 3. add the engineered (crossed and binned) features
%time train = feature_engineering(train)
# 4. cast numerical categories to strings and binary encode the user tags
%time train = pre_process_one_hot_encoding(train)
# 5. one hot encode the remaining categorical columns
%time train = pd.get_dummies(train)

memory(train)
print('done')

CPU times: user 269 ms, sys: 212 ms, total: 481 ms
Wall time: 973 ms
CPU times: user 1.89 s, sys: 678 ms, total: 2.56 s
Wall time: 4.19 s
CPU times: user 2min 5s, sys: 25 s, total: 2min 30s
Wall time: 2min 52s
CPU times: user 35.4 s, sys: 51.9 s, total: 1min 27s
Wall time: 1min 51s
CPU times: user 1min 11s, sys: 38.6 s, total: 1min 49s
Wall time: 2min 18s
Memory usage of the dataframe is 6898.31 MB
done


In [14]:
%time test = drop_colums(test)
%time test = test.fillna(0)
%time test = pre_process_one_hot_encoding(test)
%time test = feature_engineering(test)
%time test = pd.get_dummies(test)
print('done')

CPU times: user 98.5 ms, sys: 348 ms, total: 447 ms
Wall time: 627 ms
CPU times: user 162 ms, sys: 83.6 ms, total: 245 ms
Wall time: 257 ms
CPU times: user 3.15 s, sys: 859 ms, total: 4.01 s
Wall time: 4.45 s
CPU times: user 17.1 s, sys: 5.84 s, total: 22.9 s
Wall time: 27.3 s
CPU times: user 6.91 s, sys: 2.03 s, total: 8.94 s
Wall time: 10.7 s
done


In [15]:
# Build the validation feature matrix with the same steps as the training set:
# drop columns, fill missing values with 0, engineer features, prepare for one
# hot encoding, then one hot encode.
%time validation = drop_colums(validation)
%time validation = validation.fillna(0)
%time validation = feature_engineering(validation)
%time validation = pre_process_one_hot_encoding(validation)
%time validation = pd.get_dummies(validation)
print('done')

CPU times: user 105 ms, sys: 368 ms, total: 474 ms
Wall time: 641 ms
CPU times: user 160 ms, sys: 81.7 ms, total: 241 ms
Wall time: 248 ms
CPU times: user 15.8 s, sys: 823 ms, total: 16.6 s
Wall time: 17.4 s
CPU times: user 3.56 s, sys: 692 ms, total: 4.25 s
Wall time: 4.53 s
CPU times: user 6.54 s, sys: 1.54 s, total: 8.08 s
Wall time: 8.89 s
done


In [16]:
# the featuere engineering can construct columns that do not occur in other sets 
# this adds the columns of the joined colomuns
joined_colums = [item for slist in [validation.columns, train.columns, test.columns] for item in slist]

%time train = add_missing_colums(train,joined_colums)
%time validation = add_missing_colums(validation,joined_colums)
%time test = add_missing_colums(test,joined_colums)
print('done')

CPU times: user 22.6 s, sys: 52.2 s, total: 1min 14s
Wall time: 1min 43s
CPU times: user 4.53 s, sys: 7.09 s, total: 11.6 s
Wall time: 16.1 s
CPU times: user 5.9 s, sys: 10.1 s, total: 16 s
Wall time: 23.9 s
done


In [17]:
# Every validation column must also exist in the training set. Fail loudly if
# that is not the case instead of relying on someone reading the output.
missing_in_train = [x for x in validation.columns if x not in train.columns]
assert not missing_in_train, 'columns missing from train: {}'.format(missing_in_train)
missing_in_train

[]

In [20]:
# Cache the three aligned feature matrices in one HDF5 file, each under its own
# key, so the preprocessing above does not have to be repeated.
train.to_hdf('preprocessed.h5', 'train')
validation.to_hdf('preprocessed.h5', 'validation')
test.to_hdf('preprocessed.h5', 'test')
print('done')

done


### OPTION 1: Apply feature hashing to all features

In [None]:
# test_dict = df_copy.drop(columns='click').T.to_dict().values()
# h = FeatureHasher(n_features=20000)
# maxtix = h.transform(test_dict)

In [529]:
# maxtix.toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

### OPTION 2: Get feature matrix of select number of features

# Balance dataset

There is a significant imbalance between the click and the non-click class. In an attempt to increase the numerical stability of the trained models, the majority class is under-sampled (observations are removed) and the minority class is over-sampled by generating new samples. For a sample $x_i$, a new sample $x_{new}$ is generated from its $k$ nearest neighbours.

SMOTETomek combines over-sampling (SMOTE) and under-sampling (removal of Tomek links).

In [3]:
%%time
# Reload the cached feature matrices from 'preprocessed.h5'. Only train and
# validation are loaded here; the test set line is left commented out.
train = pd.read_hdf('preprocessed.h5', 'train')
validation = pd.read_hdf('preprocessed.h5', 'validation')
# test = pd.read_hdf('preprocessed.h5', 'test')
print('done')

done
CPU times: user 346 ms, sys: 9.93 s, total: 10.3 s
Wall time: 24.5 s


In [24]:
%%time
training_data = train.loc[:100000]

pre_subsample_size = 10000#int(1e5)
final_resample_ratio = 1/8

print('Original:')
print(training_data['click'].value_counts())
print('')

# random sample from the non click class, pre subsameling will improve speed
# since SMOTETomek does not take an output number samples as parameter
class0_index = training_data[training_data['click'] == 0].index

# take random sample
%time random_subsample = np.random.choice(class0_index, pre_subsample_size, replace=False)
class0_downsampled = training_data.loc[random_subsample]
class1 = training_data[training_data['click'] == 1]

# combine dataframes
training_data = pd.concat([class0_downsampled,class1])

# free memory
del(class0_index, class0_downsampled, class1)

print('Random subsample class0:')
print(training_data['click'].value_counts())
print('')

# use all but one avalible processors
smote = SMOTE(k_neighbors=3, n_jobs=-1)
tomek = TomekLinks(n_jobs=-1)

smote_omek = SMOTETomek(ratio=final_resample_ratio, smote=smote, tomek=tomek)

y = training_data['click'].values
X = training_data.drop(columns=['click', 'payprice'])


%time X_resampled, y_resampled = smote_omek.fit_resample(X, y,)

print('Final balance:')
sample, count = np.unique(y_resampled, return_counts=True)
print(pd.Series(dict(np.array((sample,count)).T)))

print('\nBaseline accuracy: {:.3%}'.format(count[0]/len(y_resampled)))

# export resample
resample = pickle.dumps((X_resampled, y_resampled))

print('\ndone\n')

Original:
0    99930
1       71
Name: click, dtype: int64

CPU times: user 2.54 ms, sys: 234 µs, total: 2.77 ms
Wall time: 2.78 ms
Random subsample class0:
0    10000
1       71
Name: click, dtype: int64

CPU times: user 1min 46s, sys: 2.13 s, total: 1min 48s
Wall time: 1min 4s
Final balance:
0    10000
1     9979
dtype: int64

Baseline accuracy: 50.053%

done

CPU times: user 1min 47s, sys: 3.18 s, total: 1min 50s
Wall time: 1min 6s


## Train LR model

In [None]:
%%time
# load resampled date
# X_resampled, y_resampled = pickle.loads(resample)

lr = LogisticRegression(solver='lbfgs', max_iter=500, class_weight='balanced', n_jobs=-1) 

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled)
%time lr.fit(X_train, y_train)

# export model

# lr_model = pickle.dumps(lr)
print('done')

## Evaluate LR model testset from train

In [None]:
# lr = pickle.loads(lr_model)

# performance on the held-out part of the resampled training data
y_hat = lr.predict(X_test)
# ROC AUC is a ranking metric, so it is scored on the predicted probability of
# the positive class (second column of predict_proba) rather than on labels.
y_prob = lr.predict_proba(X_test)[:, 1]

print(classification_report_imbalanced(y_test, y_hat, target_names=['no click', 'click']))
print("Balanced accuracy score: {:.3%}".format(balanced_accuracy_score(y_test, y_hat) ))
# Scored on this cell's own hold-out set; the validation variables used here
# before are not defined until the next section.
print('ROC AUC score {}'.format(roc_auc_score(y_test, y_prob)))
print('done')

## Evaluate LR model on validation

In [18]:
# Split the validation set into target and features. 'payprice' is dropped
# together with the label, the same way the training matrix X was built.
y_validate = validation['click'].values
X_validate = validation.drop(columns=['click', 'payprice'])

In [None]:
y_hat_validate = lr.predict(X_validate)
# score ROC AUC on the predicted click probability, not on the hard labels
y_prob_validate = lr.predict_proba(X_validate)[:, 1]

print(classification_report_imbalanced(y_validate, y_hat_validate, target_names=['no click', 'click']))
print('ROC AUC score {}'.format(roc_auc_score(y_validate, y_prob_validate)))
print('done')

## Use LR for pCTR

In [18]:
# Predicted class probabilities for the validation set, one column per class
# in the order of lr.classes_; presumably the 'click' column serves as the
# pCTR -- confirm against the downstream use.
y_validate_prob = lr.predict_proba(X_validate)

0.5757432849101723

In [9]:
# Marker cell: only prints 'idle'.
print('idle')

idle
