Ref: http://breezedeus.github.io/2014/11/15/breezedeus-feature-processing.html

In [1]:
import pandas as pd
import numpy as np
import collections
import random
from collections import Counter

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
train = pd.read_csv('./data/train.csv')
valid = pd.read_csv('./data/validation.csv')
test  = pd.read_csv('./data/test.csv')

In [3]:
t = train.drop(['bidprice','click','payprice'], axis=1)
v = valid.drop(['bidprice','click','payprice'], axis=1)

data = pd.concat([t, v, test], axis=0)
data.shape

(3038281, 22)

## To do list (^: already down, @: currently focused, &: currently skipped)
- ^ weekday
- ^ hour
- ^ bidid
- ^ userid
- ^ useragent
- & IP [list of values]
- ^ region
- ^ city
- ^ adexchange
- & domain
- & url
- ^ urlid
- & slotid
- ^ slotwidth
- ^ slotheight
- ^ slotvisibility
- ^ slotformat
- ^ slotprice
- ^ creative
- ^ keypage
- ^ advertiser
- & usertag (list of values)

## Others
- deal with missing values
- remove less frequent basic non-ordinal feature values

### Questions
- how to deal with categorical variable with a list of values ?

### drop useless features

In [4]:
# 
data = data.drop(['bidid', 'userid', 'urlid'], axis=1)

## deal with ordinal features

In [5]:
ordinal = ['slotwidth', 'slotheight', 'slotvisibility', 'slotprice']

In [6]:
%%time
# most basic treatment: take them as non-ordinal features
ordinal_dm = pd.concat([pd.get_dummies(data[c]) for c in ordinal],
                      axis=1, keys=ordinal)

CPU times: user 10.3 s, sys: 1.18 s, total: 11.5 s
Wall time: 11.6 s


## deal with non-ordinal features

In [7]:
basic_non_ord = ['weekday', 'hour', 'useragent', 
                 'region','city', 'adexchange','slotformat',
                 'creative','keypage','advertiser']

In [8]:
%%time
basic_non_ord_dm = pd.concat([pd.get_dummies(data[c]) for c in basic_non_ord],
                            axis=1, keys=basic_non_ord)

CPU times: user 19.6 s, sys: 2.71 s, total: 22.3 s
Wall time: 22.7 s


In [291]:
for c in basic_non_ord:
    print(c)
    print(basic_non_ord_dm.iloc[0][c])

weekday
0    0
1    0
2    0
3    0
4    0
5    1
6    0
Name: 0, dtype: uint8
hour
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    1
23    0
Name: 0, dtype: uint8
useragent
android_chrome      0
android_firefox     0
android_ie          0
android_maxthon     0
android_opera       0
android_other       0
android_safari      0
android_sogou       0
ios_other           0
ios_safari          0
linux_chrome        0
linux_firefox       0
linux_ie            0
linux_opera         0
linux_other         0
linux_safari        0
mac_chrome          0
mac_firefox         0
mac_maxthon         0
mac_opera           0
mac_other           0
mac_safari          0
mac_sogou           0
other_chrome        0
other_firefox       0
other_ie            0
other_opera         0
other_other         0
other_safari        0
windows_chrome      0
windows_firefox     0
windows_ie

In [271]:
# for x in basic_non_ord:
#     if x not in data.columns:
#         print(x)

## deal with complex non-ordinal features 

In [159]:
def count_freq_values(feat, numb=50):
    """
    @numb: value is frequent if occurs not less than numb times
    
    return
    @uniq_frq:  = number of unique frequent values
    @frq_pct: = number frequent values / number of all presented values
    """
    
    feat_count = Counter(feat)
    value_count = Counter(feat_count.values())
    # occurence of a value, number of such value
    vc_df = pd.DataFrame(value_count.most_common(), columns=['occur','count'])
    frq_vc_df = vc_df.loc[vc_df['occur'] > numb]
    
    uniq_frq = frq_vc_df['count'].sum()
    frq_pct = (frq_vc_df['occur'] * frq_vc_df['count']).sum() / len(feat)
    
    
    return uniq_frq, frq_pct
    

In [174]:
def convt_non_ord(feat, ft_name, numb=500):
    """
    Convert non-ordinal feature into one-hot.
        1. Replace (values having occurence <= numb) with 'infreqValue'
        2. Get dommies of the values

    """
    feat_df = pd.DataFrame(feat)
    feat_count = Counter(feat)
    
    # find all values with occurence > numb
    freq_values = []
    for k in feat_count:
        if feat_count[k] > numb:
            freq_values.append(k)
    
    # 1. Replace (values having occurence <= numb) with 'infreqValue'
    for idx, row in feat_df.iterrows():
        if row[ft_name] not in freq_values:
            row[ft_name] = 'infreqValue'
    
    # 2. Get dommies of the values
    print('start getting dummies')
    feat_dummies = pd.get_dummies(feat_df)
    
    return feat_dummies

In [258]:
# try to split into multiple columns
count_freq_values(data['IP'], 5)

(193079, 0.709816175659855)

In [263]:
# try to find pattern in url
count_freq_values(data['url'], 250)

(1022, 0.34574056843326867)

In [268]:
# too many
count_freq_values(data['usertag'], 115)

(1004, 0.5490216342728009)

In [None]:
pd, str.contain #parellel computing

###  group less frequent values together

In [163]:
count_freq_values(data['domain'], 1000)

(331, 0.8097766467288575)

In [168]:
count_freq_values(data['slotid'], 800)

(475, 0.7625637654976614)

In [171]:
%%time
domain_dummies = convt_non_ord(data['domain'], 'domain' , 1000)

start getting dummies
CPU times: user 2min 56s, sys: 1.89 s, total: 2min 57s
Wall time: 2min 58s


In [175]:
%%time
slotid_dummies = convt_non_ord(data['slotid'], 'slotid', 800)

start getting dummies
CPU times: user 3min 7s, sys: 2.57 s, total: 3min 10s
Wall time: 3min 11s


In [232]:
domain_dummies.shape

(3038281, 332)

In [178]:
X_train = pd.concat([ordinal_dm, basic_non_ord_dm], axis=1)

## Basic Model

### LogisticRegression

In [13]:
# downsample + upsample

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
%%time
lg = LogisticRegression()
lg.fit(X_train, y_train)

CPU times: user 58.8 s, sys: 33.3 s, total: 1min 32s
Wall time: 1min 37s


In [18]:
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_train, lg.predict(X_train))
auc

0.5

### Undersampling

In [30]:
X_train_pos.shape, X_train_neg.shape

((1793, 975), (2429188, 975))

In [122]:
def undersample(n=np.sum(y_train)):
    # get pos and neg
    X_train_pos = X_train[y_train == 1]
    y_train_pos = y_train[y_train == 1]
    X_train_neg = X_train[y_train != 1]
    y_train_neg = y_train[y_train != 1]
    
    # undersample negative labels
    X_train_neg_und = X_train_neg.sample(n=n, random_state=123)
    y_train_neg_und = y_train_neg.loc[X_train_neg_und.index]
    
    # concat and shuffle
    X_train_und = pd.concat([X_train_neg_und, X_train_pos], axis=0)
    y_train_und = pd.concat([y_train_neg_und, y_train_pos], axis=0)
    
    shuffled_idx = list(X_train_und.index)
    random.shuffle(shuffled_idx)
    X_train_und = X_train_und.loc[shuffled_idx]
    y_train_und = y_train_und.loc[shuffled_idx]
    
    return X_train_und, y_train_und

In [None]:
X_train_und, y_train_und = undersample()

In [118]:
X_train_und.shape

(3586, 975)

In [113]:
lg.fit(X_train_und, y_train_und)

auc_in = roc_auc_score(y_train_und, lg.predict(X_train_und))
print('insample(undersampled) roc auc: {}'.format(auc_in))

auc_in = roc_auc_score(y_train, lg.predict(X_train))
print('insample(original sample) roc auc: {}'.format(auc_in))

insample(undersampled) roc auc: 0.7267150027886224
insample(original sample) roc auc: 0.6774570981637622


In [114]:
auc_out = roc_auc_score(y_valid, lg.predict(X_valid))
print('out-sample(original sample) roc auc: {}'.format(auc_out))

out-sample(original sample) roc auc: 0.6446080135616016


### Oversampling
No significant improvement

In [152]:
X_train_und, y_train_und = undersample(int(np.sum(train['click']) * 1.2))

In [153]:
int(3.9999)

3

In [154]:
from imblearn.over_sampling import SMOTE

In [155]:
sm = SMOTE(random_state=42)
X_train_ovs, y_train_ovs = sm.fit_sample(X_train_und, y_train_und)

In [156]:
np.sum(y_train_ovs), y_train_ovs.shape[0]

(2151, 4302)

In [157]:
# # oversample 20%
lg.fit(X_train_ovs, y_train_ovs)

auc_in = roc_auc_score(y_train_ovs, lg.predict(X_train_ovs))
print('insample(oversampled) roc auc: {}'.format(auc_in))

auc_in = roc_auc_score(y_train, lg.predict(X_train))
print('insample(original sample) roc auc: {}'.format(auc_in))

auc_out = roc_auc_score(y_valid, lg.predict(X_valid))
print('out-sample(original sample) roc auc: {}'.format(auc_out))

insample(oversampled) roc auc: 0.7419804741980474
insample(original sample) roc auc: 0.677703324913299
out-sample(original sample) roc auc: 0.6480063598856997


In [138]:
# oversample 100%

insample(oversampled) roc auc: 0.8013106525376463
insample(original sample) roc auc: 0.6309470145337979
out-sample(original sample) roc auc: 0.6220072350969355


In [56]:
roc_auces # ensemble = 0.88, LR = 0.84 # LR without optimization = 0.81

array([0.51505326, 0.51724118, 0.51202622])

### LightGBM

#### data preprocessing

In [45]:
import lightgbm as lgb

In [46]:
t_lgb = train.drop(['bidprice','payprice'], axis=1)
v_lgb = valid.drop(['bidprice','payprice'], axis=1)

data_lgb = pd.concat([t_lgb, v_lgb], axis=0)

In [50]:
# drop useless features
data_lgb = data_lgb.drop(['bidid', 'userid', 'urlid'], axis=1)

In [51]:
data_lgb.iloc[0]

click                                            0
weekday                                          5
hour                                            22
useragent                               windows_ie
IP                                    125.37.175.*
region                                           2
city                                             2
adexchange                                       2
domain                        trqRTvKaXTKfgg24JKTI
url               6447a7dfa30fe6eb410c91860e7c9b45
slotid                                  2015392487
slotwidth                                      200
slotheight                                     200
slotvisibility                                   2
slotformat                                       0
slotprice                                        5
creative          a4f763f78ef3eedfe614263b94a8924e
keypage           0f951a030abdaedd733ee8d114ce2944
advertiser                                    3427
usertag                        

In [52]:
# ignore complex non-ordinal features
complex_non_ord = ['IP', 'domain', 'url', 'slotid', 'usertag']
data_lgb = data_lgb.drop(complex_non_ord, axis=1)

In [54]:
data_lgb.shape

(2734906, 15)