In [2]:
# importing necessary libraries
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as CM
import seaborn as sns
from pandas.io.json import json_normalize
from pandas.core.common import SettingWithCopyWarning
import datetime as datetime
from datetime import timedelta, date
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, scale
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split, GroupKFold
import gc
import time
import warnings

# Escalate pandas' SettingWithCopyWarning to an exception so that a
# chained-assignment mistake stops the cell instead of only emitting a warning.
warnings.simplefilter('error', SettingWithCopyWarning)
# Make sure automatic garbage collection is switched on.
gc.enable()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Use the 'fivethirtyeight' style sheet for every plot in this notebook.
plt.style.use('fivethirtyeight')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
%%time

# Load the flattened training set. fullVisitorId is read as a string and
# sessionId is used as the row index.
df_train = pd.read_csv("./Data/train-flattened.csv", 
                       dtype={'fullVisitorId': 'str'}, index_col='sessionId')

CPU times: user 6.96 s, sys: 670 ms, total: 7.64 s
Wall time: 7.67 s


In [4]:
# Lift the column-display limit so the wide frame is shown without truncation,
# then preview the first rows.
pd.set_option('display.max_columns', None)
df_train.head()

Unnamed: 0_level_0,channelGrouping,fullVisitorId,visitId,visitNumber,device_browser,device_deviceCategory,device_isMobile,device_operatingSystem,geoNetwork_city,geoNetwork_continent,geoNetwork_country,geoNetwork_metro,geoNetwork_networkDomain,geoNetwork_region,geoNetwork_subContinent,trafficSource_adContent,trafficSource_adwordsClickInfo.adNetworkType,trafficSource_adwordsClickInfo.gclId,trafficSource_adwordsClickInfo.isVideoAd,trafficSource_adwordsClickInfo.page,trafficSource_adwordsClickInfo.slot,trafficSource_campaign,trafficSource_isTrueDirect,trafficSource_keyword,trafficSource_medium,trafficSource_referralPath,trafficSource_source,totals_hits,totals_pageviews,totals_transactionRevenue,_year,_month,_day,_dayofWeek,_hour
sessionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
0000010278554503158_1477029466,Organic Search,10278554503158,1477029466,1,Chrome,desktop,False,Macintosh,not available in demo dataset,Oceania,New Zealand,not available in demo dataset,xtra.co.nz,not available in demo dataset,Australasia,,,,True,,,(not set),False,(not provided),organic,,google,11,8,0.0,2016,10,20,3,5
0000020424342248747_1480578901,Organic Search,20424342248747,1480578901,1,Chrome,desktop,False,Windows,La Victoria,Americas,Peru,(not set),munitrujillo.gob.pe,Lima Region,South America,,,,True,,,(not set),False,(not provided),organic,,google,17,13,0.0,2016,11,30,2,7
0000027376579751715_1486866293,Organic Search,27376579751715,1486866293,1,Chrome,desktop,False,Macintosh,not available in demo dataset,Americas,United States,not available in demo dataset,comcast.net,not available in demo dataset,Northern America,,,,True,,,(not set),False,(not provided),organic,,google,6,5,0.0,2017,2,11,5,2
0000039460501403861_1490629516,Social,39460501403861,1490629516,1,Chrome,desktop,False,Windows,not available in demo dataset,Americas,Brazil,not available in demo dataset,virtua.com.br,not available in demo dataset,South America,,,,True,,,(not set),False,,referral,/yt/about/pt-BR/,youtube.com,2,2,0.0,2017,3,27,0,15
0000040862739425590_1486836571,Paid Search,40862739425590,1486836571,1,Chrome,desktop,False,Macintosh,Oakland,Americas,United States,San Francisco-Oakland-San Jose CA,comcastbusiness.net,California,Northern America,,Google Search,Cj0KEQiAifvEBRCVx5up6Ojgr5oBEiQALHw1TrnWAHiMtZ...,False,1.0,Top,AW - Dynamic Search Ads Whole Site,False,1hZbAqLCbjwfgOH7,cpc,,google,2,2,0.0,2017,2,11,5,18


In [None]:
# Target (dependent variable): the transaction revenue column.
y_train = df_train['totals_transactionRevenue']
# Predictors: every other column of the training frame.
X_train = df_train.drop(columns=['totals_transactionRevenue'])

In [None]:
# https://www.kaggle.com/prashantkikani/teach-lightgbm-to-sum-predictions-fe
def browser_mapping(x):
    """Collapse a raw browser name into a small set of browser categories.

    The value is stringified and lower-cased before matching, so the result
    does not depend on whether the caller normalised it first (mixed-case
    names and non-string values such as NaN are accepted).

    Args:
        x: Browser name.

    Returns:
        str: the lower-cased name itself when it is exactly one of the major
        browsers listed below; ``'mobile browser'`` when it contains one of
        the keyword fragments; the lower-cased input unchanged when it
        contains ``'(not set)'``; ``'others'`` in every other case.
    """
    major_browsers = ('chrome', 'safari', 'firefox', 'internet explorer', 'edge',
                      'opera', 'coc coc', 'maxthon', 'iron')
    # Every fragment maps to the same bucket, so their relative order is
    # irrelevant and each one needs to be listed only once.
    mobile_fragments = ('android', 'samsung', 'mini', 'iphone', 'in-app', 'playstation',
                        'mozilla', 'chrome', 'blackberry', 'nokia', 'browser', 'amazon',
                        'lunascape', 'netscape', 'konqueror', 'puffin')

    name = str(x).lower()
    # Exact matches win over fragment matches ('chrome' vs. e.g. 'chrome mobile').
    if name in major_browsers:
        return name
    if any(fragment in name for fragment in mobile_fragments):
        return 'mobile browser'
    if '(not set)' in name:
        return name
    return 'others'
    
    
def adcontents_mapping(x):
    """Bucket an ad-content label into a coarse category.

    Matching is by substring and the checks run in a fixed order, so the
    first rule that applies decides the result.

    Args:
        x (str): Ad-content label. The function does no case folding itself;
            the keywords are lower-case.

    Returns:
        str: ``'google'``, ``'placement'``, ``'ad'`` or ``'others'``; labels
        containing ``'(not set)'`` or ``'nan'`` are returned unchanged.
    """
    if 'google' in x:
        return 'google'
    # 'placememnt' is matched on purpose as well — presumably a misspelling
    # that occurs in the raw data (TODO confirm).
    if any(token in x for token in ('placement', 'placememnt')):
        return 'placement'
    # Pass-through for unset / missing markers ('nan' is what str(NaN).lower() yields).
    if '(not set)' in x or 'nan' in x:
        return x
    return 'ad' if 'ad' in x else 'others'
    
def source_mapping(x):
    """Bucket a traffic-source string into a coarse source category.

    Matching is by substring. ``'google'`` and ``'youtube'`` are tested first,
    then the pass-through markers, then the remaining keywords in the fixed
    order of the table below; the first hit decides the result.

    Args:
        x (str): Traffic source. The function does no case folding itself;
            the keywords are lower-case.

    Returns:
        str: the matching keyword; the input unchanged when it contains
        ``'(not set)'`` or ``'nan'``; ``'others'`` when nothing matches.
    """
    # Order is significant: an earlier keyword shadows later ones, e.g.
    # 'lunametrics' must precede 'metrics' and 'mysearch' must precede 'search'.
    ordered_keywords = (
        'yahoo', 'facebook', 'reddit', 'bing', 'quora', 'outlook', 'linkedin',
        'pinterest', 'ask', 'siliconvalley', 'lunametrics', 'amazon', 'mysearch',
        'qiita', 'messenger', 'twitter', 't.co', 'vk.com', 'search', 'edu',
        'mail', 'ad', 'golang', 'direct', 'dealspotr', 'sashihara', 'phandroid',
        'baidu', 'mdn', 'duckduckgo', 'seroundtable', 'metrics', 'sogou',
        'businessinsider', 'github', 'gophergala', 'yandex', 'msn', 'dfa',
        'feedly', 'arstechnica', 'squishable', 'flipboard', 't-online.de',
        'sm.cn', 'wow', 'partners',
    )

    for leading in ('google', 'youtube'):
        if leading in x:
            return leading

    # Pass-through for unset / missing markers. Note that this is a substring
    # test, so any source that merely contains 'nan' is returned as-is too.
    if '(not set)' in x or 'nan' in x:
        return x

    for keyword in ordered_keywords:
        if keyword in x:
            return keyword
    return 'others'

# Bucket the three high-cardinality text columns: every value is stringified
# and lower-cased, passed through its mapping function, and the result is
# stored back as a string column.
for _column, _mapper in (
    ('device_browser', browser_mapping),
    ('trafficSource_adContent', adcontents_mapping),
    ('trafficSource_source', source_mapping),
):
    X_train[_column] = (
        X_train[_column]
        .map(lambda raw, _mapper=_mapper: _mapper(str(raw).lower()))
        .astype('str')
    )

# test['device.browser'] = test['device.browser'].map(lambda x:browser_mapping(str(x).lower())).astype('str')
# test['trafficSource.adContent'] = test['trafficSource.adContent'].map(lambda x:adcontents_mapping(str(x).lower())).astype('str')
# test['trafficSource.source'] = test['trafficSource.source'].map(lambda x:source_mapping(str(x).lower())).astype('str')

# def process_device(data_df):
#     print("process device ...")
#     data_df['source.country'] = data_df['trafficSource.source'] + '_' + data_df['geoNetwork.country']
#     data_df['campaign.medium'] = data_df['trafficSource.campaign'] + '_' + data_df['trafficSource.medium']
#     data_df['browser.category'] = data_df['device.browser'] + '_' + data_df['device.deviceCategory']
#     data_df['browser.os'] = data_df['device.browser'] + '_' + data_df['device.operatingSystem']
#     return data_df

# train = process_device(train)
# test = process_device(test)

# def custom(data):
#     print('custom..')
#     data['device_deviceCategory_channelGrouping'] = data['device.deviceCategory'] + "_" + data['channelGrouping']
#     data['channelGrouping_browser'] = data['device.browser'] + "_" + data['channelGrouping']
#     data['channelGrouping_OS'] = data['device.operatingSystem'] + "_" + data['channelGrouping']
    
#     for i in ['geoNetwork.city', 'geoNetwork.continent', 'geoNetwork.country','geoNetwork.metro', 'geoNetwork.networkDomain', 'geoNetwork.region','geoNetwork.subContinent']:
#         for j in ['device.browser','device.deviceCategory', 'device.operatingSystem', 'trafficSource.source']:
#             data[i + "_" + j] = data[i] + "_" + data[j]
    
#     data['content.source'] = data['trafficSource.adContent'] + "_" + data['source.country']
#     data['medium.source'] = data['trafficSource.medium'] + "_" + data['source.country']
#     return data

# train = custom(train)
# test = custom(test)

# Remove the device_isMobile flag from the predictors (presumably because
# device_deviceCategory already carries that information — TODO confirm).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
X_train = X_train.drop(columns=['device_isMobile'], errors='ignore')
pd.set_option('display.max_columns', None)
X_train.head()

In [None]:
# Frequency of each geoNetwork_city value (value_counts sorts by count, descending).
X_train['geoNetwork_city'].value_counts()

In [7]:
# Columns that must never be fed to the model: visitor / visit identifiers,
# the target and the visit start time. The target is listed under both naming
# schemes: this flattened frame uses underscores
# ('totals_transactionRevenue'), whereas the dotted form alone would never
# match one of its columns.
excluded_features = [
    'fullVisitorId', 'totals.transactionRevenue', 'totals_transactionRevenue',
    'visitId', 'visitStartTime'
]

# Every remaining object-dtype (string) column is treated as categorical.
categorical_features = [
    _f for _f in X_train.columns
    if _f not in excluded_features and X_train[_f].dtype == 'object'
]

In [8]:
# Replace every categorical column by integer codes (pd.factorize assigns -1
# to missing values). The uniques returned for each column are kept in
# category_indexers so that the same value -> code mapping can later be
# applied to other data instead of being lost after each iteration.
category_indexers = {}
for f in categorical_features:
    X_train[f], indexer = pd.factorize(X_train[f])
    category_indexers[f] = indexer


In [None]:
# Preview the predictors with all columns visible.
pd.set_option('display.max_columns', None)
X_train.head()

In [9]:
# Hyper-parameters that are unpacked into lgb.LGBMRegressor(**params).
#
# Row / column sampling is configured only through the primary LightGBM names
# 'bagging_fraction' and 'feature_fraction'. Their aliases 'subsample' and
# 'colsample_bytree' must not be set alongside them: with conflicting values
# LightGBM emits a warning and ignores the alias, which hides the value that
# is actually in effect.
#
# NOTE(review): per the LightGBM docs 'bagging_fraction' only takes effect
# when 'bagging_freq' is non-zero; it is not set here, so row bagging is
# presumably inactive — confirm whether that is intended.
params = {
    'learning_rate': 0.03,
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 31,
    'verbose': 1,
    'random_state': 42,
    'max_depth': 15,
    'lambda_l2': 0.02085548700474218,
    'lambda_l1': 0.004107624022751344,
    'bagging_fraction': 0.7934712636944741,
    'feature_fraction': 0.686612409641711,
    'min_child_samples': 21,
}

In [21]:
def get_folds(df=None, n_splits=5):
    """Build visitor-grouped K-fold splits as positional row indices.

    The split is made over the sorted unique ``fullVisitorId`` values (each
    visitor is its own group), and every visitor split is then translated
    back to the rows of ``df`` belonging to those visitors, so all sessions
    of one visitor land on the same side of a fold.

    Args:
        df: DataFrame with a ``fullVisitorId`` column.
        n_splits (int): Number of folds.

    Returns:
        list: one ``[train_positions, validation_positions]`` pair per fold,
        each element being a NumPy array of row positions into ``df``.
    """
    visitor_of_row = df['fullVisitorId']
    visitors = np.array(sorted(visitor_of_row.unique()))
    row_positions = np.arange(df.shape[0])

    def _rows_of(visitor_idx):
        # Row positions whose visitor is among the selected unique visitors.
        return row_positions[visitor_of_row.isin(visitors[visitor_idx])]

    splitter = GroupKFold(n_splits=n_splits)
    return [
        [_rows_of(train_idx), _rows_of(valid_idx)]
        for train_idx, valid_idx in splitter.split(X=visitors, y=visitors, groups=visitors)
    ]


In [None]:
# Visitor-grouped 5-fold cross-validation: fit one LightGBM regressor per
# fold, record its gain-based feature importances and collect out-of-fold
# (OOF) predictions for every training row.
folds = get_folds(df=X_train, n_splits=5)

train_features = [_f for _f in X_train.columns if _f not in excluded_features]
# print(train_features)

# One importance frame per fold; concatenated once after the loop rather than
# growing a DataFrame (starting from an empty one) on every iteration.
fold_importances = []
oof_reg_preds = np.zeros(X_train.shape[0])
# sub_reg_preds = np.zeros(test.shape[0])
for fold_, (trn_, val_) in enumerate(folds):
    print("Fold:", fold_)
    trn_x, trn_y = X_train[train_features].iloc[trn_], y_train.iloc[trn_]
    val_x, val_y = X_train[train_features].iloc[val_], y_train.iloc[val_]
    reg = lgb.LGBMRegressor(**params, n_estimators=1000)
    # NOTE(review): LightGBM 4.x removed the early_stopping_rounds / verbose
    # keywords of fit() in favour of callbacks; left as-is because the
    # installed LightGBM version is not visible from this notebook.
    reg.fit(
        trn_x, trn_y,
        eval_set=[(val_x, val_y)],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    fold_importances.append(pd.DataFrame({
        'feature': train_features,
        'gain': reg.booster_.feature_importance(importance_type='gain'),
        'fold': fold_ + 1,
    }))

    # Predict with the best early-stopped iteration and floor the predictions
    # at zero; only this fold's rows are written, so clipping them here is
    # equivalent to re-clipping the whole OOF array after every fold.
    val_preds = reg.predict(val_x, num_iteration=reg.best_iteration_)
    oof_reg_preds[val_] = np.clip(val_preds, 0, None)
#     _preds = reg.predict(X_test[train_features], num_iteration=reg.best_iteration_)
#     _preds[_preds < 0] = 0
#     sub_reg_preds += np.expm1(_preds) / len(folds)

importances = (
    pd.concat(fold_importances, axis=0, sort=False) if fold_importances else pd.DataFrame()
)

# Out-of-fold RMSE over the whole training set (displayed as the cell output).
mean_squared_error(y_train, oof_reg_preds)**.5


In [None]:
# Feature importance: plot the per-fold gain of every feature, ordered by the
# feature's mean gain across folds.
# 'gain_log' holds log1p(gain) so that the column matches its name and so that
# features with small gain stay visible next to the dominant ones.
importances['gain_log'] = np.log1p(importances['gain'])
mean_gain = importances[['gain', 'feature']].groupby('feature').mean()
importances['mean_gain'] = importances['feature'].map(mean_gain['gain'])

plt.figure(figsize=(8, 12))
sns.barplot(x='gain_log', y='feature', data=importances.sort_values('mean_gain', ascending=False))