Ref: http://breezedeus.github.io/2014/11/15/breezedeus-feature-processing.html

In [4]:
import pandas as pd
import numpy as np
import collections
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('./data/train.csv')
valid = pd.read_csv('./data/validation.csv')
test  = pd.read_csv('./data/test.csv')

In [3]:
t = train.drop(['bidprice','click','payprice'], axis=1)
v = valid.drop(['bidprice','click','payprice'], axis=1)

data = pd.concat([t, v, test], axis=0)
data.shape

(3038281, 22)

## To do list (^: already down, @: currently focused, &: currently skipped)
- weekday
- hour
- ^ bidid
- ^ userid
- useragent
- & IP [list of values]
- region
- city
- adexchange
- & domain
- & url
- ^ urlid
- slotid
- slotwidth
- slotheight
- slotvisibility
- slotformat
- slotprice
- creative
- keypage
- advertiser
- & usertag (list of values)

## Others
- deal with missing values

### Questions
- 

### drop useless features

In [11]:
# 
data = data.drop(['bidid', 'userid', 'urlid'], axis=1)

In [12]:
data.head()

Unnamed: 0,weekday,hour,useragent,IP,region,city,adexchange,domain,url,slotid,slotwidth,slotheight,slotvisibility,slotformat,slotprice,creative,keypage,advertiser,usertag
0,5,22,windows_ie,125.37.175.*,2,2,2,trqRTvKaXTKfgg24JKTI,6447a7dfa30fe6eb410c91860e7c9b45,2015392487,200,200,2,0,5,a4f763f78ef3eedfe614263b94a8924e,0f951a030abdaedd733ee8d114ce2944,3427,null
1,1,20,windows_chrome,171.36.92.*,238,239,1,20fc675468712705dbf5d3eda94126da,3ddf173a94bd23c326683b6373c75dd4,mm_10982364_973726_8930541,300,250,FourthView,Na,0,10722,,2821,null
2,3,13,windows_ie,59.46.106.*,40,41,2,trqRTJn7O95I1mKYUV,625d1b5916ea925332c7b326c0574cfa,1720123646,250,250,2,0,5,798b2d49952d77f1eace9f23c210d0b5,0f951a030abdaedd733ee8d114ce2944,3427,10052100061386610110
3,6,23,windows_ie,114.250.226.*,1,1,1,5F97t5E0BTK7XhNrUMpENpn,dedc488b98ca20707bc9a723957e7d1f,mm_10027070_118039_10308280,160,600,2,1,0,cb7c76e7784031272e37af8e7e9b062c,bebefa5efe83beee17a3d245e7c5085b,1458,138661006310111
4,5,6,windows_ie,183.63.192.*,216,233,2,13625cb070ffb306b425cd803c4b7ab4,4199d1227c511fc776b76594dabff9f8,1120200150,728,90,OtherView,Na,133,7330,,2259,null


In [13]:
data.tail()

Unnamed: 0,weekday,hour,useragent,IP,region,city,adexchange,domain,url,slotid,slotwidth,slotheight,slotvisibility,slotformat,slotprice,creative,keypage,advertiser,usertag
303370,4,22,windows_chrome,114.230.167.*,80,90,3,31xSTvprdN1RFt,7bd1674758bce19d2266165af11f3176,ALLINONE_F_Width2,1000,90,0,0,50,c46090c887c257b61ab1fa11baee91d8,0f951a030abdaedd733ee8d114ce2944,3427,100631003110110
303371,3,15,windows_chrome,1.194.113.*,164,181,2,trqRTvKaXTKfgg24JKTI,8ebf4d20bb25f6d0be696299d5176256,1030498436,250,250,2,0,5,8dff45ed862a740986dbe688aafee7e5,15b749127478946f161a54dc0dad27c8,3476,100761006310006100241007510110
303372,2,0,windows_ie,122.96.49.*,80,85,3,trqRTud7jQ19gM9WJFMyNKTRm7,1e91b4c36dd4f3952bce1cc39a0d0cb4,discuz_8810998_008,960,90,0,0,20,fb5afa9dba1274beaf3dad86baf97e89,bebefa5efe83beee17a3d245e7c5085b,1458,10052100061007510077100311002410110
303373,6,8,mac_chrome,4.53.129.*,0,0,3,tK1jlK9rg5scFsf,370fde9a369042ab39157b85cdb69e92,Tech_F_Width1,1000,90,0,0,20,d01411218cc79bc49d2a4078c4093b76,b2e35064f3549d447edbbdfb1f707c8c,3427,
303374,5,12,windows_chrome,111.164.212.*,2,2,2,DFpETuFygZl7gspy,205c311e21f34dde62bfef22a011df77,3721625973,250,250,2,0,39,2abc9eaf57d17a96195af3f63c45dc72,bebefa5efe83beee17a3d245e7c5085b,1458,10031106841006311576100061008310110


### deal with non-ordinal features

In [111]:
def count_freq_values(feat, numb=50):
    """
    @numb: value is frequent if occurs not less than numb times
    
    return
    @uniq_frq:  = number of unique frequent values
    @frq_pct: = number frequent values / number of all presented values
    """
    
    feat_count = Counter(feat)
    value_count = Counter(feat_count.values())
    # occurence of a value, number of such value
    vc_df = pd.DataFrame(value_count.most_common(), columns=['occur','count'])
    frq_vc_df = vc_df.loc[vc_df['occur'] > numb]
    
    uniq_frq = frq_vc_df['count'].sum()
    frq_pct = (frq_vc_df['occur'] * frq_vc_df['count']).sum() / len(feat)
    
    
    return uniq_frq, frq_pct
    

In [226]:
def convt_non_ord(feat, numb=500):
    """
    Convert non-ordinal feature into one-hot.
        1. Replace (values having occurence <= numb) with 'infreqValue'
        2. Get dommies of the values

    """
    feat_df = pd.DataFrame(feat)
    feat_count = Counter(feat)
    
    # find all values with occurence > numb
    freq_values = []
    for k in feat_count:
        if feat_count[k] > numb:
            freq_values.append(k)
    
    # 1. Replace (values having occurence <= numb) with 'infreqValue'
    for idx, row in feat_df.iterrows():
        if row['domain'] not in freq_values:
            row['domain'] = 'infreqValue'
    
    # 2. Get dommies of the values
    print('start getting dummies')
    feat_dummies = pd.get_dummies(feat_df)
    
    return feat_dummies

In [258]:
# try to split into multiple columns
count_freq_values(data['IP'], 5)

(193079, 0.709816175659855)

In [229]:
count_freq_values(data['domain'], 1000)

(331, 0.8097766467288575)

In [263]:
# too many
count_freq_values(data['url'], 250)

(1022, 0.34574056843326867)

In [248]:
count_freq_values(data['slotid'], 500)

(673, 0.8031712669104668)

In [268]:
# too many
count_freq_values(data['usertag'], 115)

(1004, 0.5490216342728009)

In [230]:
%%time
domain_dummies = convt_non_ord(data['domain'], 1000)

start getting dummies
CPU times: user 2min 55s, sys: 2.34 s, total: 2min 58s
Wall time: 2min 59s


In [232]:
domain_dummies.shape

(3038281, 332)