In [1]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import math

from sklearn.preprocessing import MultiLabelBinarizer
from imblearn.combine import SMOTETomek

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

import matplotlib as mpl

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")
pd.set_option('display.max_columns', 100)

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
# !pip install imbalanced-learn
# !pip install xgboost

In [2]:
def memory(df):
    """Print the summed ``memory_usage()`` of ``df`` expressed in megabytes."""
    bytes_per_mb = 1024**2
    total_mb = df.memory_usage().sum() / bytes_per_mb

    print("Memory usage of the dataframe is {:.2f} MB".format(total_mb))
    
    
def entropy(df, base = 2):
    """Calculate the entropy of every column in a dataframe.

    For each column the relative frequency of every value is taken from
    ``value_counts(normalize=True)`` and the entropy is computed with
    logarithms in ``base``.

    Returns a Series indexed by column name, sorted from the highest to the
    lowest entropy.
    """
    log_base = np.log(base)

    def column_entropy(values):
        freq = values.value_counts(normalize=True, sort=False)
        return -(freq * np.log(freq)/log_base).sum()

    per_column = {name: column_entropy(df[name]) for name in df.columns}

    return pd.Series(per_column).sort_values(ascending=False)

In [3]:
# Switches selecting where the data is read from:
#   google_drive -> read the raw CSV from a mounted Google Drive (Colab)
#   convert      -> read the raw CSV locally and cache it as HDF5
#   neither      -> read the cached HDF5 file
google_drive = False
convert = False

if google_drive:
    from google.colab import drive
    drive.mount('/content/gdrive')

    data_dir = '/content/gdrive/My Drive/Colab Notebooks/we_data/'
    train = pd.read_csv(data_dir + 'train.csv')

elif convert:
    data_dir = '../we_data/'
    train = pd.read_csv(data_dir + 'train.csv')
    # NOTE(review): the cache is written to ./train.h5 but read back from
    # ../train.h5 in the branch below -- confirm which location is intended.
    train.to_hdf('train.h5', 'train')

else:
    data_dir = '../we_data/'
    train = pd.read_hdf('../train.h5', 'train')

# The validation and test sets are needed by later cells regardless of how
# `train` was obtained; previously they were only loaded in the last branch,
# leaving them undefined for the other two.
# NOTE(review): assumes validation.csv / test.csv sit next to train.csv on
# Google Drive as they do locally -- confirm.
validation = pd.read_csv(data_dir + 'validation.csv')
test = pd.read_csv(data_dir + 'test.csv')

In [4]:
train.isnull().sum()

NameError: name 'df' is not defined

In [None]:
train['adexchange'].value_counts()

In [None]:
# TODO: other fillna strategies for adexchange should be considered at some point.
#
# The only columns that contain a lot of missing values and are used in the
# final analysis are adexchange and usertag. Different strategies have been
# considered, but it was deemed most informative to assign an 'unknown' class,
# which is easily achieved by filling with 0 since 0 does not occur in the
# dataset.

# `df` is not defined in this notebook; the frame being cleaned is `train`.
train = train.fillna(0)

In [None]:
entropy(train)

In [None]:
train.head()

In [None]:
# Look the two classes up by label instead of unpacking value_counts(), whose
# order depends on which class happens to be more frequent.
click_counts = train['click'].value_counts()
no_click, click = click_counts.get(0, 0), click_counts.get(1, 0)

# CTR = clicks / impressions (clicks / non-clicks would be the odds instead)
print('Baseline average CTR {:.5f}'.format(click / (click + no_click)))

## Feature engineering

In [None]:
def count_featuresize_one_hot_encoding(df):
    """Calculate the number of features necessary for one hot encoding.

    The number of distinct values is summed over all columns of ``df``; the
    total is printed and returned.
    """
    total_features = sum(len(df[name].unique()) for name in df.columns)

    print('Rougly {:,} features in the feature space'.format(total_features))

    return total_features


def feature_engineering(df):
    """Enrich the dataframe with additional features.

    Note that all joined fields are slightly redundant when implementing
    more sophisticated models like NNs that could pick up on these feature
    combinations; however, they can improve the performance of simpler
    models such as logistic regression.

    Reads the columns useragent, weekday, hour, slotwidth, slotheight,
    slotprice and slotvisibility. The new columns are added to ``df`` in
    place and ``slotprice`` is overwritten by its price category; the
    returned frame is a copy without ``slotwidth`` and ``slotheight``.

    All bins use fixed edges, so the same value is always assigned to the
    same category regardless of which data set (train/validation/test) is
    being processed.
    """

    # split user agent into os and browser (text before / after the first
    # '_'); browser is NaN when there is no '_'
    agent_parts = df['useragent'].str.split('_', n=1)
    df['os'] = agent_parts.str[0]
    df['browser'] = agent_parts.str[1]

    # apple users
    df['apple'] = df['useragent'].str.match(r'(ios)|(mac)').astype(np.uint8)

    # determine mobile device or not
    df['mobieldevice'] = df['useragent'].str.match(r'(ios)|(android)').astype(
        np.uint8)

    weekday = df['weekday'].astype(str)
    hour = df['hour'].astype(str)
    mobile = df['mobieldevice'].astype(str)

    # hour per day
    df['weekdayhour'] = weekday + '_' + hour

    # mobile per day of the week
    df['mobileweekday'] = mobile + '_' + weekday

    # mobile per hour
    df['mobilehour'] = mobile + '_' + hour

    # browser per day of the week
    df['browserweekday'] = df['browser'] + '_' + weekday

    # browser per hour
    df['browserhour'] = df['browser'] + '_' + hour

    # os per day of the week
    df['osweekday'] = df['os'] + '_' + weekday

    # os per hour
    df['oshour'] = df['os'] + '_' + hour

    # os per day per hour
    df['osdayhour'] = df['os'] + '_' + weekday + '_' + hour

    # bin hours into time of day: [0, 6), [6, 12), [12, 18), [18, 24).
    # Explicit edges instead of "4 equal-width bins over the observed range",
    # which shifts when a data set does not contain every hour.
    df['timeofday'] = pd.cut(
        df['hour'].astype(int),
        bins=[0, 6, 12, 18, 24],
        right=False,
        labels=["night", "morning", "afternoon", "evening"])

    # bin ad surface size categories (right-closed intervals); the open-ended
    # outer edges keep the breaks valid whatever the observed min/max is
    ad_surface = df['slotwidth'] * df['slotheight']
    df['adsize'] = pd.cut(
        ad_surface,
        bins=[-np.inf, 65520, 75000, 90000, np.inf],
        labels=['small', 'medium', 'large', 'x-large'])

    # bin slot price into categories (left-closed intervals). Using the
    # observed maximum as the last, open, edge excluded the most expensive
    # slots from every bin and left them NaN.
    df['slotprice'] = pd.cut(
        df['slotprice'],
        bins=[-np.inf, 10, 50, 100, np.inf],
        right=False,
        labels=['1', '2', '3', '4'])

    # ad size category and visibility
    df['advisabilitysize'] = df['slotvisibility'].astype(
        str) + '_' + df['adsize'].astype(str)

    return df.drop(columns=['slotwidth', 'slotheight'])


def pre_process_one_hot_encoding(df):
    """Prepare the dataframe for one hot encoding.

    - Cast weekday, region, city and adexchange to strings and hour to int
      (these casts are written back into ``df``).
    - Split the comma separated usertag field and binary encode every tag
      as its own ``usertag_<tag>`` column, dropping the usertag column
      itself. Skipped when usertag is not a column (already processed).
    """

    # numerical categories are cast to strings as a quick hack for one hot
    # encoding to work properly on them
    # NOTE(review): hour is cast to int rather than str, so it is not
    # treated as a category here -- confirm this is intended.
    casts = (('weekday', str), ('hour', int), ('region', str),
             ('city', str), ('adexchange', str))
    for name, dtype in casts:
        df[name] = df[name].astype(dtype)

    if 'usertag' in df.columns:
        df['usertag'] = df['usertag'].astype(str)

        binarizer = MultiLabelBinarizer()
        tag_matrix = binarizer.fit_transform(df['usertag'].str.split(','))
        tag_frame = pd.DataFrame(
            tag_matrix,
            columns='usertag_' + binarizer.classes_,
            index=df.index)

        # attach the tag indicators and drop the raw usertag column
        df = df.join(tag_frame).drop(columns='usertag')

    return df


def add_missing_colums(df, columns, sort_columns=True):
    """Pad ``df`` with zero-filled columns for every name it is missing.

    Due to the feature engineering and one hot encoding, a discrepancy can
    occur between the columns of the different data sets. This adds every
    name in ``columns`` that ``df`` lacks as a column of zeros.

    Parameters
    ----------
    df : DataFrame to pad. It is not modified; a new frame is returned
        whenever columns have to be added.
    columns : iterable of column names that must exist in the result.
        Duplicates are allowed and handled once.
    sort_columns : when True (default) the result's columns are sorted by
        name; otherwise the original columns come first, followed by the
        added ones in order of first appearance in ``columns``.
    """
    present = set(df.columns)
    missing = []
    for name in columns:
        if name not in present:
            present.add(name)
            missing.append(name)

    if missing:
        # add all new columns in a single concat instead of inserting them
        # one by one, which is slow and fragments wide frames
        filler = pd.DataFrame(0, index=df.index, columns=missing)
        df = pd.concat([df, filler], axis=1)

    if sort_columns:
        return df[sorted(df.columns)]

    return df


def drop_colums(df):
    """Return ``df`` without the columns that are not used as features.

    Names from the lists below that are not columns of ``df`` are ignored.
    """

    # unique and meaningless features that are not known a priori
    unusable = ['bidprice', 'urlid', 'bidid']

    # very sparse fields, removed to reduce the number of features
    # (highest entropy)
    sparse = ['userid', 'url', 'domain', 'slotid', 'IP']

    # errors='ignore' drops only those names that are present in the df
    return df.drop(columns=unusable + sparse, errors='ignore')

## Performance metrics

In [None]:
def calcluate_num_impressions(df, grouping = False):
    """Print and return the number of impressions (rows) in ``df``.

    Without ``grouping`` the result is the row count. With ``grouping``
    (passed on to ``DataFrame.groupby``) it is a dict mapping every group
    key to the number of rows in that group.
    """
    imp = df.groupby(grouping).size().to_dict() if grouping else len(df)

    print(imp)
    return imp
    


def calcluate_num_clicks(df, grouping = False):
    """Print and return the sum of the ``click`` column of ``df``.

    Without ``grouping`` the result is the overall sum. With ``grouping``
    (passed on to ``DataFrame.groupby``) it is a dict mapping every group
    key to the sum of ``click`` within that group.
    """
    if not grouping:
        clicks = np.sum(df['click'])
    else:
        clicks = {
            key: np.sum(part['click'])
            for key, part in df.groupby(grouping)
        }

    print(clicks)
    return clicks
    

def calcluate_ctr(df, grouping = False):
    """Print and return the click-through rate of ``df`` as a percentage.

    The rate is the average of the ``click`` column times 100. Without
    ``grouping`` a single number is returned; with ``grouping`` (passed on
    to ``DataFrame.groupby``) a dict mapping every group key to the rate
    within that group.
    """
    if not grouping:
        ctr = np.average(df['click']) * 100
    else:
        ctr = {
            key: np.average(part['click']) * 100
            for key, part in df.groupby(grouping)
        }

    print(ctr)
    return ctr
    
    
def average_cost_per_mille(df, grouping = False):
    """Print and return the mean of the ``payprice`` column of ``df``.

    Without ``grouping`` the result is the overall mean. With ``grouping``
    (passed on to ``DataFrame.groupby``) it is a dict mapping every group
    key to the mean ``payprice`` within that group.

    NOTE(review): the result is only a cost per mille if ``payprice`` is
    already expressed per thousand impressions -- confirm the unit.
    """
    if grouping:
        # group by the requested key(s); this used to be hard-coded to
        # 'weekday' no matter what was passed in
        cpm = df.groupby(grouping)['payprice'].mean().to_dict()

    else:
        cpm = np.mean(df['payprice'])

    print(cpm)
    return cpm

In [None]:
train.shape, validation.shape, test.shape

## Build feature matrix

In [None]:
# Build the training feature matrix; every step is timed separately and
# rebinds `train`, so this cell is not meant to be run twice on the same frame.
# 1. remove the columns that are not used as features
%time train = drop_colums(train)
# 2. fill the remaining missing values with 0
%time train = train.fillna(0)
# 3. add the engineered features
%time train = feature_engineering(train)
# 4. cast the categorical columns and binary encode the user tags
%time train = pre_process_one_hot_encoding(train)
# 5. one hot encode the remaining categorical columns
%time train = pd.get_dummies(train)

# report the size of the resulting matrix
memory(train)
print('done')

NameError: name 'drop_colums' is not defined

CPU times: user 3.59 s, sys: 1.12 s, total: 4.71 s
Wall time: 5.94 s


NameError: name 'feature_engineering' is not defined

NameError: name 'pre_process_one_hot_encoding' is not defined

In [None]:
train.to_csv('../we_data/train_preprocessed.csv')

In [43]:
%time test = drop_colums(test)
%time test = test.fillna(0)
%time test = pre_process_one_hot_encoding(test)
%time test = feature_engineering(test)
%time test = pd.get_dummies(test)
print('done')

In [None]:
# Build the validation feature matrix; every step is timed separately and
# rebinds `validation`.
# 1. remove the columns that are not used as features
%time validation = drop_colums(validation)
# 2. fill the remaining missing values with 0
%time validation = validation.fillna(0)
# 3. add the engineered features
%time validation = feature_engineering(validation)
# 4. cast the categorical columns and binary encode the user tags
%time validation = pre_process_one_hot_encoding(validation)
# 5. one hot encode the remaining categorical columns
%time validation = pd.get_dummies(validation)
# NOTE(review): the disabled export below would write `train`, not
# `validation`, and to an absolute '/we_data' path -- fix before re-enabling.
# train.to_csv('/we_data/validation_preprocessed.csv')
print('done')

In [44]:
# find features/columns that occur in test set but not in trainset
# (every value is represented as "<column name><value>")
unique_test = [
    str(column) + str(x)
    for column in test.columns
    for x in test[column].unique()
]

unique_train = [
    str(column) + str(x)
    for column in train.columns
    for x in train[column].unique()
]

# look values up in a set: testing membership against the list rescans it for
# every test value, which makes the comparison quadratic
train_values = set(unique_train)

# values in test set but not in trainset
print([x for x in unique_test if x not in train_values])

['osdayhourios_2_19', 'osdayhourios_1_1', 'osdayhourios_0_2', 'osdayhourios_0_11']


In [None]:
# find features/columns that occur in validation set but not in trainset
# (every value is represented as "<column name><value>")
unique_validation = [
    str(column) + str(x)
    for column in validation.columns
    for x in validation[column].unique()
]

unique_train = [
    str(column) + str(x)
    for column in train.columns
    for x in train[column].unique()
]

# look values up in a set: testing membership against the list rescans it for
# every validation value, which makes the comparison quadratic
train_values = set(unique_train)

# values in validation set but not in trainset
print([x for x in unique_validation if x not in train_values])

In [None]:
# the featuere engineering can construct columns that do not occur in other sets 
# this adds the columns of the joined colomuns
joined_colums = [item for slist in [validation.columns, train.columns, test.columns] for item in slist]

%time train = add_missing_colums(train,joined_colums)
%time validation = add_missing_colums(validation,joined_colums)

CPU times: user 21.8 s, sys: 45.3 s, total: 1min 7s
Wall time: 1min 37s
CPU times: user 4.61 s, sys: 6.88 s, total: 11.5 s
Wall time: 18.1 s


In [None]:
[x for x in validation.columns if x not in train.columns]

[]

### OPTION 1: Apply feature hashing (`FeatureHasher`) to all features

In [None]:
# test_dict = df_copy.drop(columns='click').T.to_dict().values()
# h = FeatureHasher(n_features=20000)
# maxtix = h.transform(test_dict)

In [529]:
# maxtix.toarray()[0]

array([0., 0., 0., ..., 0., 0., 0.])

### OPTION 2: Get feature matrix of select number of features



## Balance dataset

There is a significant imbalance between the click and the non-click class. In an attempt to increase the numerical stability of the trained models, the majority class is under-sampled (observations are removed) and the minority class is over-sampled by generating new, synthetic samples. For a minority sample $x_i$, a new sample $x_{new}$ is generated by interpolating between $x_i$ and one of its $k$ nearest neighbours.

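`SMOTETomek` (used below) combines over- and under-sampling: SMOTE over-sampling of the minority class followed by removal of Tomek links. `SMOTEENN` is a similar combined method that cleans with edited nearest neighbours instead.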

In [None]:
training_data = train

print('Original:')
print(training_data['click'].value_counts())
print('')
# random sample from the non click class, pre subsameling will improve speed 
# since SMOTETomek does not take an output number samples as parameter
class0 = training_data.query('click == 0')
%time class0 = class0.sample(n=4e4)
class1 = training_data.query('click == 1')

# combine dataframes
training_data = pd.concat([class0,class1])

print('Random subsample class0:')
print(training_data['click'].value_counts())
print('')

smote_omek = SMOTETomek(random_state=0, ratio=1/8)

y = training_data['click'].values
X = training_data.drop(columns=['click', 'payprice'])

%time X_resampled, y_resampled = smote_omek.fit_resample(X, y)

print('Final balance:')
sample, count = np.unique(y_resampled, return_counts=True)
print(pd.Series(dict(np.array((sample,count)).T)))

print('\nBaseline accuracy: {:.3%}'.format(count[0]/len(y_resampled)))

Original:
0    2429188
1       1793
Name: click, dtype: int64



TypeError: 'float' object cannot be interpreted as an integer

Random subsample class0:
0    2429188
1       1793
Name: click, dtype: int64



## Cross validation accuracy
Takes a long time!

In [141]:
lr = LogisticRegression(solver='lbfgs', max_iter=1e4) # use solver=sag when training on the entire dataset

%time scores = cross_val_score(lr, X_resampled, y_resampled, cv=10) #error_score='f1',
print("Average accuracy: {:.3%} (+/- {:.2f})".format(np.mean(scores), np.std(scores) * 2))

# lr.fit(X_resampled, y_resampled)

CPU times: user 14min 50s, sys: 19.1 s, total: 15min 9s
Wall time: 8min 32s
Average accuracy: 99.383% (+/- 0.00)


## Balanced accuracy score

In [None]:
lr = LogisticRegression(solver='lbfgs', max_iter=1e4) 

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=0)
%time lr.fit(X_train, y_train)

y_hat = lr.predict(X_test)
print("Balanced accuracy score: {:.3%}".format(balanced_accuracy_score(y_test, y_hat)  ))

In [None]:
# separate the validation set into the feature matrix and the click labels;
# payprice is dropped as well, as it was for the training matrix
X_validate = validation.drop(columns=['click', 'payprice'])
y_validate = validation['click'].to_numpy()

In [None]:
# NOTE(review): incomplete expression left over from tab completion; `lr.pr`
# is not an attribute and raises AttributeError. Presumably a call such as
# lr.predict(X_validate) or lr.predict_proba(X_validate) was intended -- confirm.
lr.pr

In [None]:
# NOTE(review): `kek` is not defined anywhere in the visible notebook, so this
# raises NameError. It looks like leftover debugging (or a deliberate stop for
# "Run All") -- confirm and remove.
print(kek)