In [12]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing

In [13]:
# Advertiser whose log split is processed by this notebook.
advertiser_id = '1458'

# Root directory holding one sub-directory per advertiser. The historical
# location remains the default so existing runs are unaffected; set the
# IPINYOU_DATA_DIR environment variable to use another location instead of
# editing this cell.
data_root = os.environ.get(
    "IPINYOU_DATA_DIR",
    "/Users/denniscimorosi/Desktop/Tesi/IPinYou/make-ipinyou-data-master/",
)

# Per-advertiser directory. The trailing empty component makes os.path.join
# end the result with a separator, because file paths are later built by plain
# string concatenation (`path + file_name`).
path = os.path.join(data_root, advertiser_id, "")
test_name = "test.log.txt"
train_name = "train.log.txt"

In [14]:
# Read the tab-separated logs for the selected advertiser: the training file
# first, then the test file.
train_df, test_df = (
    pd.read_csv(path + log_name, delimiter='\t')
    for log_name in (train_name, test_name)
)

In [15]:
# Categorical encoding to apply further down: 'label' or 'one-hot'.
encoding = 'one-hot'
# Up-sampling switch (off).
upsample = False

In [16]:
# Columns removed from both splits.
unused_columns = [
    'bidid', 'logtype', 'ipinyouid', 'IP', 'adexchange',
    'urlid', 'url', 'slotid', 'slotwidth', 'slotheight', 'slotvisibility',
    'slotformat', 'creative', 'keypage', 'advertiser', 'usertag',
]
# Two further columns are removed from the test split only.
test_only_columns = ['nclick', 'nconversation']

train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns + test_only_columns)

In [17]:
# Sum of the `click` column over the test split.
test_clicks = test_df['click']
test_clicks.sum()

515

In [18]:
# Number of rows in the training split.
len(train_df)

3083056

In [19]:
def split_os_browser(df):
    """Replace the ``useragent`` column with separate ``os`` and ``browser`` columns.

    Each ``useragent`` value is split on ``"_"``: the first piece becomes
    ``os`` and the second piece becomes ``browser``; any further pieces are
    ignored.  A value that is missing, or that has no second piece, yields
    ``'unknown'`` for the affected column instead of raising.

    The frame is modified in place (two columns added, ``useragent`` dropped)
    and is also returned, so ``df = split_os_browser(df)`` and a bare
    ``split_os_browser(df)`` both work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued ``useragent`` column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Vectorised split; indexing a too-short list through `.str[...]` gives
    # NaN rather than an IndexError, and NaN inputs stay NaN.
    pieces = df['useragent'].str.split("_")
    df['os'] = pieces.str[0].fillna('unknown')
    df['browser'] = pieces.str[1].fillna('unknown')

    df.drop(columns=['useragent'], inplace=True)

    return df

# Apply the user-agent split to both splits, training frame first.
train_df, test_df = map(split_os_browser, (train_df, test_df))

In [20]:
# Display the training frame to inspect the columns produced so far.
train_df

Unnamed: 0,click,weekday,hour,timestamp,region,city,domain,slotprice,bidprice,payprice,os,browser
0,0,4,0,20130606000104828,216,219,trqRTJkrBoq7JsNr5SqfNX,0,300,51,windows,ie
1,0,4,0,20130606000105075,124,125,trqRTJjrXqf7FmMs,0,300,87,windows,chrome
2,0,4,0,20130606000105119,94,98,5On-q5uvgN171m58uG,0,300,33,windows,ie
3,0,4,0,20130606000105254,1,1,DFpETuxoGQdcFNKbuKz,0,300,65,windows,ie
4,0,4,0,20130606000105284,368,369,trqRTvNNQIj7gspy,0,300,238,windows,chrome
...,...,...,...,...,...,...,...,...,...,...,...,...
3083051,0,3,23,20130612233441177,333,334,trqRTud7jQ19gM9WJFMyNKTRm7,20,300,20,windows,chrome
3083052,0,3,23,20130612233514614,0,0,,0,300,18,android,safari
3083053,0,3,23,20130612233549014,0,0,ersbQv1RdoTy1m58uG,20,300,20,windows,firefox
3083054,0,3,23,20130612234017010,216,217,3FF-e59aG5syJqKbuKz,70,300,70,other,other


In [21]:
# How many training rows have no `domain` value.
train_df["domain"].isna().sum()

169140

In [9]:
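# Aggregation thresholds and the resulting number of categories, based on advertiser 1458:
#   domain: threshold 1000 ==> 206 domains
#   city:   threshold 8000 ==> 91 cities
#   region: no threshold (all values kept) ==> 35 regions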

In [10]:
# Missing domains become an explicit 'unknown' category so they are counted
# and encoded like any other value.
train_df['domain'] = train_df['domain'].fillna('unknown')
test_df['domain'] = test_df['domain'].fillna('unknown')

# A value is kept only if it occurs MORE than this many times in the training
# split; everything else is merged into 'other'.
DOMAIN_MIN_COUNT = 1000
CITY_MIN_COUNT = 8000


def collapse_rare(test_df, train_df, feature_name, min_count):
    """Merge infrequent values of ``feature_name`` into a single ``'other'`` value.

    Frequencies are taken from the training frame only (non-null ``click``
    entries per value).  Values seen more than ``min_count`` times are kept
    and converted to ``str``; every other value, including values that only
    appear in the test frame, becomes ``'other'``.  The same set of kept
    values is applied to both frames.

    The column is overwritten in the frames that were passed in.

    Parameters
    ----------
    test_df, train_df : pandas.DataFrame
        Frames containing ``feature_name``; ``train_df`` must also contain
        ``click``.
    feature_name : str
        Column to aggregate.
    min_count : int
        Occurrence threshold (exclusive) for keeping a value.

    Returns
    -------
    tuple
        ``(test_df, train_df)``, test frame first.
    """
    occurrences = train_df.groupby(feature_name)['click'].count()
    frequent = occurrences[occurrences > min_count].index

    def collapse(column):
        # Vectorised membership test: no per-row Python call, and it also
        # works on an empty column.
        return column.astype(str).where(column.isin(frequent), 'other')

    train_df[feature_name] = collapse(train_df[feature_name])
    test_df[feature_name] = collapse(test_df[feature_name])
    return test_df, train_df


test_df, train_df = collapse_rare(test_df, train_df, 'domain', DOMAIN_MIN_COUNT)
test_df, train_df = collapse_rare(test_df, train_df, 'city', CITY_MIN_COUNT)

In [11]:
def label_enc(test_df, train_df, feature_name):
    """Integer-encode ``feature_name`` in both frames with one shared encoder.

    The encoder is fitted on the distinct values found in either frame, so a
    given value receives the same code in the train and test splits.  The
    column is overwritten in the frames that were passed in.

    Returns ``(test_df, train_df)``; note that the test frame comes first.
    """
    distinct_values = np.unique(
        np.concatenate([
            train_df[feature_name].unique(),
            test_df[feature_name].unique(),
        ])
    )

    shared_encoder = preprocessing.LabelEncoder()
    shared_encoder.fit(distinct_values)

    for frame in (test_df, train_df):
        frame[feature_name] = shared_encoder.transform(frame[feature_name])

    return test_df, train_df

# Label-encoding branch: encode each listed feature in turn.
# NOTE(review): 'city' is one-hot encoded in the 'one-hot' branch but is not
# label-encoded here; confirm whether that omission is intended.
if encoding == 'label':
    for feature in ('os', 'browser', 'domain'):
        test_df, train_df = label_enc(test_df, train_df, feature)

In [12]:
def one_hot_enc(test_df, train_df, feature_name):
    """One-hot encode ``feature_name`` in both frames with one shared binarizer.

    The binarizer is fitted on the distinct values found in either frame, so
    both results get the same indicator columns in the same order.  The
    indicator columns are named ``<feature_name>_<i>`` (``i`` is the column
    position in the binarizer output) and are appended after the existing
    columns; the original ``feature_name`` column is removed.

    Indicator rows are attached by position: the encoded matrix is given the
    input frame's own index, so frames whose index is not ``0..n-1`` (for
    example after filtering or resampling) are encoded correctly.

    NOTE: per the scikit-learn ``LabelBinarizer`` documentation, a feature
    with exactly two distinct values yields a single indicator column rather
    than two.

    Parameters
    ----------
    test_df, train_df : pandas.DataFrame
        Frames containing ``feature_name``.  They are not modified.
    feature_name : str
        Column to encode.

    Returns
    -------
    tuple
        ``(test_df, train_df)`` as new frames, test frame first.
    """
    # Fit on the values seen in either split.
    all_feats = np.unique(np.concatenate([
        train_df[feature_name].unique(),
        test_df[feature_name].unique(),
    ]))
    encoder = preprocessing.LabelBinarizer().fit(all_feats)

    def encode(df):
        transformed = encoder.transform(df[feature_name])
        ohe = pd.DataFrame(
            transformed,
            columns=[feature_name + '_' + str(i) for i in range(transformed.shape[1])],
            # Reuse the frame's index so the column-wise concat below is a
            # plain side-by-side placement instead of a label-based re-alignment.
            index=df.index,
        )
        return pd.concat([df.drop(columns=[feature_name]), ohe], axis=1)

    return encode(test_df), encode(train_df)

# One-hot branch: expand each listed feature into indicator columns, in this
# order.
if encoding == 'one-hot':
    for feature in ('os', 'browser', 'domain', 'city', 'region'):
        test_df, train_df = one_hot_enc(test_df, train_df, feature)


In [13]:
# Write both processed splits to the current working directory, without the
# index column. File names are prefixed with the advertiser id.
for frame, suffix in ((train_df, '.train.final.csv'), (test_df, '.test.final.csv')):
    frame.to_csv(advertiser_id + suffix, index=False)

In [14]:
# Display the training frame in its final, exported form.
train_df

Unnamed: 0,click,weekday,hour,timestamp,slotprice,bidprice,payprice,os_0,os_1,os_2,...,region_25,region_26,region_27,region_28,region_29,region_30,region_31,region_32,region_33,region_34
0,0,4,0,20130606000104828,0,300,51,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,4,0,20130606000105075,0,300,87,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,4,0,20130606000105119,0,300,33,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,4,0,20130606000105254,0,300,65,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,4,0,20130606000105284,0,300,238,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3083051,0,3,23,20130612233441177,20,300,20,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3083052,0,3,23,20130612233514614,0,300,18,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3083053,0,3,23,20130612233549014,20,300,20,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3083054,0,3,23,20130612234017010,70,300,70,0,0,0,...,0,0,0,0,0,0,0,0,0,0
