In [1]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()


from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
  
from sklearn import model_selection, preprocessing, metrics
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.preprocessing import Imputer 

import lightgbm as lgb

from fastai.imports import *
from fastai.torch_imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
from fastai.structured import *
from fastai.column_data import *
PATH = 'data/'

import torch
import gc
%matplotlib inline


numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



# Get Data

In [2]:
train = pd.read_csv('data/extracted_fields_train.gz', dtype={'date': str, 'fullVisitorId': str, 'sessionId':str}, nrows=None)
test = pd.read_csv('data/extracted_fields_test.gz', dtype={'date': str, 'fullVisitorId': str, 'sessionId':str}, nrows=None)
train.shape, test.shape

((903653, 30), (804684, 30))

# Folding

In [3]:
def get_folds(df=None, n_splits=5):
    """Returns dataframe indices corresponding to Visitors Group KFold"""
    # Get sorted unique visitors
    unique_vis = np.array(sorted(df['fullVisitorId'].unique()))

    # Get folds
    folds = GroupKFold(n_splits=n_splits)
    fold_ids = []
    ids = np.arange(df.shape[0])
    for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
        fold_ids.append(
            [
                ids[df['fullVisitorId'].isin(unique_vis[trn_vis])],
                ids[df['fullVisitorId'].isin(unique_vis[val_vis])]
            ]
        )

    return fold_ids

# Functions (Feature engineering)

In [4]:
for df in [train, test]:
    df['vis_date'] = pd.to_datetime(df['visitStartTime'], unit='s')
    df['sess_date_dow'] = df['vis_date'].dt.dayofweek
    df['sess_date_hours'] = df['vis_date'].dt.hour
    df['sess_date_dom'] = df['vis_date'].dt.day
    df.sort_values(['fullVisitorId', 'vis_date'], ascending=True, inplace=True)
    df['next_session_1'] = (
        df['vis_date'] - df[['fullVisitorId', 'vis_date']].groupby('fullVisitorId')['vis_date'].shift(1)
    ).astype(np.int64) // 1e9 // 60 // 60
    df['next_session_2'] = (
        df['vis_date'] - df[['fullVisitorId', 'vis_date']].groupby('fullVisitorId')['vis_date'].shift(-1)
    ).astype(np.int64) // 1e9 // 60 // 60
    
#     df['max_visits'] = df['fullVisitorId'].map(
#         df[['fullVisitorId', 'visitNumber']].groupby('fullVisitorId')['visitNumber'].max()
#     )
    
    df['nb_pageviews'] = df['date'].map(
        df[['date', 'totals.pageviews']].groupby('date')['totals.pageviews'].sum()
    )
    
    df['ratio_pageviews'] = df['totals.pageviews'] / df['nb_pageviews']
    
#     df['nb_sessions'] = df['date'].map(
#         df[['date']].groupby('date').size()
#     )
    
#     df['nb_sessions_28_ma'] = df['date'].map(
#         df[['date']].groupby('date').size().rolling(28, min_periods=7).mean()
#     )

#     df['nb_sessions_28_ma'] = df['nb_sessions'] / df['nb_sessions_28_ma']

#     df['nb_sessions_per_day'] = df['date'].map(
#         df[['date']].groupby('date').size()
#     )
    
#     df['nb_visitors_per_day'] = df['date'].map(
#         df[['date','fullVisitorId']].groupby('date')['fullVisitorId'].nunique()
#     )

# Prepare data for deep-learning regression model

In [5]:
excluded_features = [
    'fullVisitorId', 'sessionId', 'totals_transactionRevenue', 
    'visitId', 'visitStartTime', 'date','vis_date', 'nb_sessions', 'max_visits'
    #excluded after feature importance:
]

cat_cols = [
    _f for _f in train.columns
    if (_f not in excluded_features) & (train[_f].dtype == 'object' or train[_f].dtype == 'int64')
]

num_cols = [c for c in train.columns if c not in cat_cols and c not in excluded_features]

In [6]:
num_cols

['visitNumber',
 'device.isMobile',
 'totals.bounces',
 'totals.hits',
 'totals.newVisits',
 'totals.pageviews',
 'totals.transactionRevenue',
 'trafficSource.isTrueDirect',
 'nb_pageviews',
 'ratio_pageviews']

In [7]:
dep = 'totals.transactionRevenue'
test[dep] = 0

In [8]:
# Model
print("prepare model ...")
X = train[cat_cols + num_cols + ['fullVisitorId']].copy()
X_test = test[cat_cols + num_cols + ['fullVisitorId']].copy()

prepare model ...


In [9]:
for v in cat_cols:
    print(v)
    X[v] = X[v].astype('category').cat.as_ordered()
    

X['fullVisitorId'] = X['fullVisitorId'].astype('category').cat.as_ordered()

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.subContinent
trafficSource.adContent
trafficSource.campaign
trafficSource.keyword
trafficSource.medium
trafficSource.referralPath
trafficSource.source
sess_date_dow
sess_date_hours
sess_date_dom
next_session_1
next_session_2


In [10]:
len(X.columns), len(X_test.columns), set(X_test.columns).difference(set(X.columns))

(33, 33, set())

In [11]:
for x in X:
    print(x)
    print(X[x].dtype.name)

channelGrouping
category
device.browser
category
device.deviceCategory
category
device.operatingSystem
category
geoNetwork.city
category
geoNetwork.continent
category
geoNetwork.country
category
geoNetwork.metro
category
geoNetwork.networkDomain
category
geoNetwork.region
category
geoNetwork.subContinent
category
trafficSource.adContent
category
trafficSource.campaign
category
trafficSource.keyword
category
trafficSource.medium
category
trafficSource.referralPath
category
trafficSource.source
category
sess_date_dow
category
sess_date_hours
category
sess_date_dom
category
next_session_1
category
next_session_2
category
visitNumber
float64
device.isMobile
float64
totals.bounces
float64
totals.hits
float64
totals.newVisits
float64
totals.pageviews
float64
totals.transactionRevenue
float64
trafficSource.isTrueDirect
float64
nb_pageviews
float64
ratio_pageviews
float64
fullVisitorId
category


In [12]:
apply_cats(X_test, X)
#if this does not work it is probably because of a duplicated column name ( throws dataframe has no attribute dtype error)

In [13]:
for v in num_cols:
    X[v] = X[v].astype('float32')
    X_test[v] = X_test[v].astype('float32')
    

X[dep] = X[dep].astype('float32')
X[dep] = X[dep].fillna(0)
X_test[dep] = X_test[dep].astype('float32')

## Create Dataframes for deep-learning

In [14]:
cat_sz = [(c, len(X[c].cat.categories)+1) for c in cat_cols]
cat_sz[:5]

[('channelGrouping', 9),
 ('device.browser', 55),
 ('device.deviceCategory', 4),
 ('device.operatingSystem', 21),
 ('geoNetwork.city', 650)]

In [15]:
emb_szs = [(c, max(10, min(50, (c+1)//2))) for _,c in cat_sz]
emb_szs[:5]

[(9, 10), (55, 28), (4, 10), (21, 11), (650, 50)]

In [16]:
import warnings; warnings.simplefilter('ignore')

In [18]:
from sklearn.metrics import mean_squared_error

def exp_rmse(y_pred, targ):
    return math.sqrt(mean_squared_error(targ, y_pred))

## Learning

In [19]:
folds = get_folds(df=X, n_splits=5)

submission = test.copy()

X_indexed = X.set_index("fullVisitorId")
df, y, nas, mapper = proc_df(X_indexed, 'totals.transactionRevenue', do_scale=True)

for fold_, (trn_, val_) in enumerate(folds):
    print("Fold:",fold_)
    submission['pred' + str(fold_)] = np.zeros(test.shape[0])
    
    trn_x = X.iloc[np.concatenate((trn_, val_), axis=0)]
    val_idx = val_
    
    train_indexed = trn_x.set_index("fullVisitorId")
    df, y, nas_, mapper_fold = proc_df(train_indexed, 'totals.transactionRevenue', do_scale=True, mapper=mapper, na_dict=nas)
    X_test_indexed = X_test.set_index("fullVisitorId")
    df_test, _, nas_, mapper_ = proc_df(X_test_indexed, 'totals.transactionRevenue', do_scale=True,
                                  mapper=mapper_fold, na_dict=nas)
    yl = np.log1p(y)
    max_y = np.max(yl)
    y_range = (0, max_y*1.2)
    
    md = ColumnarModelData.from_data_frame(PATH, val_idx, df, yl, cat_flds=cat_cols, bs=512, test_df=df_test)
    m = md.get_learner(emb_szs, len(df.columns)-len(cat_cols),
                   0.04, 1, [1000,500], [0.001, 0.01], y_range=y_range)
    
    m.fit(1e-4, 3, cycle_len = 2, cycle_mult = 1, metrics=[exp_rmse])
    
    _preds = m.predict(True)
    _preds[_preds < 0] = 0
    
    submission['pred' + str(fold_)] = np.expm1(_preds) / len(folds)
    #change_fc_data(learn, train_index, val_index)

Fold: 0


HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmse                                                                              
    0      2.965765   2.862887   1.679486  
    1      2.90406    2.77969    1.654479                                                                              
    2      2.669535   2.630386   1.609037                                                                              
    3      2.452558   2.617925   1.604871                                                                              
    4      2.573132   2.609807   1.601921                                                                              
    5      2.503374   2.623164   1.606143                                                                              

Fold: 1


HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmse                                                                              
    0      3.070762   2.938885   1.700346  
    1      2.805109   2.851498   1.67525                                                                               
    2      2.661179   2.666828   1.6208                                                                                
    3      2.587331   2.615499   1.603931                                                                              
    4      2.605077   2.596526   1.596943                                                                              
    5      2.487355   2.59626    1.597418                                                                              

Fold: 2


HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmse                                                                              
    0      2.851682   3.018563   1.724918  
    1      2.812611   2.932533   1.699725                                                                              
    2      2.625726   2.720748   1.636704                                                                              
    3      2.547041   2.700095   1.630454                                                                              
    4      2.408809   2.809148   1.663421                                                                              
    5      2.413825   2.683109   1.625439                                                                              

Fold: 3


HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmse                                                                              
    0      2.897342   3.124506   1.753102  
    1      2.976796   3.045097   1.730368                                                                              
    2      2.74208    2.810426   1.660368                                                                              
    3      2.618793   2.779965   1.651639                                                                              
    4      2.47389    2.773422   1.649696                                                                              
    5      2.473539   2.762012   1.645477                                                                              

Fold: 4


HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss   exp_rmse                                                                              
    0      2.848853   2.894716   1.689161  
    1      2.906121   2.801531   1.661602                                                                              
    2      2.589933   2.614784   1.604177                                                                              
    3      2.514462   2.596301   1.598332                                                                              
    4      2.635922   2.594426   1.596898                                                                              
    5      2.312667   2.585676   1.594928                                                                              



In [28]:
submission['PredictedLogRevenue'] = submission[['pred0','pred1','pred2','pred3','pred4']].sum(axis=1)

In [29]:
submission['PredictedLogRevenue'].mean(), np.log1p(submission['PredictedLogRevenue']).mean()

(59628.598, 0.17678119)

In [32]:
submission_grouped = submission.groupby(['fullVisitorId'], as_index=False).agg({'PredictedLogRevenue':'sum'})

In [33]:
submission_grouped['PredictedLogRevenue'] = np.log1p(submission_grouped['PredictedLogRevenue'])

In [34]:
submission_grouped['PredictedLogRevenue'].mean(), yl.mean()

(0.19046706, 0.2271182)

In [35]:
len(submission_grouped)

617242

In [36]:
csv_fn=f'{PATH}submission.csv'

submission_grouped[['fullVisitorId','PredictedLogRevenue']].to_csv(csv_fn, index=False)

submission_grouped

Unnamed: 0,fullVisitorId,PredictedLogRevenue
0,0000000259678714014,0.770001
1,0000049363351866189,0.011526
2,0000053049821714864,0.001315
3,0000059488412965267,0.079257
4,0000085840370633780,0.011969
5,0000091131414287111,0.001297
6,0000117255350596610,5.199658
7,0000118334805178127,0.010961
8,0000130646294093000,0.001590
9,0000150005271820273,0.004620


## Learning Rate finder

In [None]:
m.lr_find(end_lr=1e-2)
m.sched.plot()

## FullvisitorId indexed visitor level cv 15% train/validation split) using better features:

### 3e

- 2.908336154736353, 1.6928364618530287

### 6e

- 2.6615966842603833, 1.6192395528417023

### 9e

- 2.6171998919897654, 1.6051034699929898

### 9e3eh 

- 2.604989178404238, 1.601011082221721

### 10e3eh 

- 2.6047352634459537, 1.6008542096483447

## Date indexed visitor level cv (15.32% train/validation split) using better features:

### 3e

- 2.8000360318887103, 1.6550484801023893

### 5e

- 2.640869347781692, 1.6067949968854136


## Date indexed visitor level cv (15.32% train/validation split):

### 3e

- 3.0265201163562265, RMSE 1.3110982701455685 

### 6e

- 2.8375099688976335, RMSE 1.2135674675795136 

### 6e9em

- 2.796606798042731, RMSE 1.1948834400361132

### 9e9em 

- 2.7970978266890083, RMSE 1.1941790717723948

In [None]:
fi = nn_feat_importance(m, md, cat_cols, num_cols)
fi.plot('cols', 'imp', 'barh', figsize=(12,20), legend=False)

# Create user level predictions

In [None]:
log_preds_train = m.predict_dl(m.data.trn_dl)

In [None]:
log_preds_val = m.predict_dl(m.data.val_dl)

In [None]:
log_preds = m.predict(True)

In [None]:
log_preds.mean(), np.concatenate([log_preds_train, log_preds_val]).mean(), np.log(train['totals.transactionRevenue']).mean()

In [None]:
log_preds_train = np.concatenate([log_preds_train, log_preds_val])

In [None]:
train['predictions'] = log_preds_train
test['predictions'] = log_preds

In [None]:
train_user = train[cat_cols + num_cols + ['fullVisitorId', 'predictions']]
test_user = test[cat_cols + num_cols + ['fullVisitorId', 'predictions']]

In [None]:
for f in cat_cols:
    train_user[f], indexer = pd.factorize(train_user[f])
    test_user[f] = indexer.get_indexer(test_user[f])

In [None]:
trn_data = train_user[cat_cols + num_cols + ['fullVisitorId']].groupby('fullVisitorId').mean()

In [None]:
%%time
# Create a list of predictions for each Visitor
trn_pred_list = train_user[['fullVisitorId', 'predictions']].groupby('fullVisitorId')\
    .apply(lambda df: list(df.predictions))\
    .apply(lambda x: {'pred_'+str(i): pred for i, pred in enumerate(x)})

In [None]:
trn_all_predictions = pd.DataFrame(list(trn_pred_list.values), index=trn_data.index)
trn_feats = trn_all_predictions.columns
trn_all_predictions['t_mean'] = np.log1p(trn_all_predictions[trn_feats].mean(axis=1))
trn_all_predictions['t_median'] = np.log1p(trn_all_predictions[trn_feats].median(axis=1))
trn_all_predictions['t_sum_log'] = np.log1p(trn_all_predictions[trn_feats]).sum(axis=1)
trn_all_predictions['t_sum_act'] = np.log1p(trn_all_predictions[trn_feats].fillna(0).sum(axis=1))
trn_all_predictions['t_nb_sess'] = trn_all_predictions[trn_feats].isnull().sum(axis=1)
full_data = pd.concat([trn_data, trn_all_predictions], axis=1)
del trn_data, trn_all_predictions
gc.collect()
full_data.shape

In [None]:
%%time
sub_pred_list = test_user[['fullVisitorId', 'predictions']].groupby('fullVisitorId')\
    .apply(lambda df: list(df.predictions))\
    .apply(lambda x: {'pred_'+str(i): pred for i, pred in enumerate(x)})

In [None]:
sub_data = test_user[cat_cols + num_cols + ['fullVisitorId']].groupby('fullVisitorId').mean()
sub_all_predictions = pd.DataFrame(list(sub_pred_list.values), index=sub_data.index)
for f in trn_feats:
    if f not in sub_all_predictions.columns:
        sub_all_predictions[f] = np.nan
sub_all_predictions['t_mean'] = np.log1p(sub_all_predictions[trn_feats].mean(axis=1))
sub_all_predictions['t_median'] = np.log1p(sub_all_predictions[trn_feats].median(axis=1))
sub_all_predictions['t_sum_log'] = np.log1p(sub_all_predictions[trn_feats]).sum(axis=1)
sub_all_predictions['t_sum_act'] = np.log1p(sub_all_predictions[trn_feats].fillna(0).sum(axis=1))
sub_all_predictions['t_nb_sess'] = sub_all_predictions[trn_feats].isnull().sum(axis=1)
sub_full_data = pd.concat([sub_data, sub_all_predictions], axis=1)
del sub_data, sub_all_predictions
gc.collect()
sub_full_data.shape

In [None]:
train_user['target'] = yl
trn_user_target = train_user[['fullVisitorId', 'target']].groupby('fullVisitorId').sum()

In [None]:
for f in full_data.columns:
    print(f +  " - " + str(full_data[f].dtype))

In [None]:
for v in cat_cols:
    print(v)
    full_data[v] = full_data[v].astype('category').cat.as_ordered()
    
    
full_data['t_nb_sess'] = full_data['t_nb_sess'].astype('category').cat.as_ordered()

In [None]:
apply_cats(sub_full_data, full_data)
#if this does not work it is probably because of a duplicated column name ( throws dataframe has no attribute dtype error)

## Create Dataframes for deep-learning

In [None]:
full_data['totals.transactionRevenue'] = trn_user_target['target']
sub_full_data['totals.transactionRevenue'] = 0

In [None]:
full_data.shape, sub_full_data.shape

In [None]:
full_data = full_data.apply(lambda x: pd.to_numeric(x, downcast='float') if x.dtype == "float64" else x)

In [None]:
sub_full_data = sub_full_data.apply(lambda x: pd.to_numeric(x, downcast='float') if x.dtype == "float64" else x)

In [None]:
full_data.reset_index().to_feather('Dataframes/full_data')
sub_full_data.reset_index().to_feather('Dataframes/sub_full_data')

In [None]:
full_data = pd.read_feather('Dataframes/full_data')
sub_full_data = pd.read_feather('Dataframes/sub_full_data')

In [None]:
'''excluded_features = [
    'fullVisitorId', 'sessionId', 'totals_transactionRevenue', 
    'visitId', 'visitStartTime', 'date'
    #excluded after feature importance:
]

cat_cols = [
    _f for _f in full_data.columns
    if (_f not in excluded_features) & (full_data[_f].dtype.name == 'category')
]

num_cols = [c for c in full_data.columns if c not in cat_cols and c not in excluded_features]'''

In [None]:
df, y_user, nas, mapper = proc_df(full_data, 'totals.transactionRevenue', do_scale=True)

In [None]:
del full_data
gc.collect()

In [None]:
df_test, _, nas, mapper = proc_df(sub_full_data, 'totals.transactionRevenue', do_scale=True, 
                                  mapper=mapper, na_dict=nas)

In [None]:
del sub_full_data
gc.collect()

In [None]:
n = len(df)
samp_size = n

In [None]:
#train_size = len(df) - 110000  #percent user visitor level: 15.320371868405239
#val_idx = list(range(train_size, len(df))); len(val_idx)

In [None]:
len(val_idx)/len(df) * 100

In [None]:
yl_user = np.log1p(y_user)

max_y_user = np.max(yl_user)

y_range_user = (0, max_y_user*1.2)

In [None]:
md = ColumnarModelData.from_data_frame(PATH, val_idx, df, yl_user, cat_flds=cat_cols, bs=512, test_df=df_test)

In [None]:
cat_sz = [(c, len(full_data[c].cat.categories)+1) for c in cat_cols]
cat_sz

In [None]:
emb_szs = [(c, max(10, min(50, (c+1)//2))) for _,c in cat_sz]
emb_szs

In [None]:
m = md.get_learner(emb_szs, len(df.columns)-len(cat_cols),
                   0.04, 1, [1000,500], [0.001, 0.01], y_range=y_range_user)

#m = md.get_learner(emb_szs, len(df.columns)-len(cat_cols), 0.04, 1, 
#                           [  2000,  3000, 2500, 2250, 2000, 1500, 1000, 500], 
#                           [0.0001,0.0002,0.0005,0.0007,0.001,0.002,0.008,0.004], y_range_user=y_range_user)

## Learning Rate finder

In [None]:
import warnings; warnings.simplefilter('ignore')

In [None]:
m.lr_find(end_lr=1e-2)
m.sched.plot()

In [None]:
lr = 10 ** -4.6
lr

## Learning

In [None]:
from sklearn.metrics import mean_squared_error

def exp_rmse(y_pred, targ):
    return math.sqrt(mean_squared_error(targ, y_pred))

In [None]:
m.fit(lr, 1, cycle_len = 3, metrics=[exp_rmse])

In [None]:
m.save("customer_revenue_3e")

In [None]:
m.load("customer_revenue_3e")

In [None]:
m.fit(lr, 3, cycle_len = 2, cycle_mult = 2, metrics=[exp_rmse])

In [None]:
m.save("customer_revenue_3e_14em")

In [None]:
m.load("customer_revenue_3e_14em") 

In [None]:
m.fit(lr, 5, cycle_len = 1, cycle_mult = 1, metrics=[exp_rmse])

In [None]:
m.save("customer_revenue_3e_45em")

In [None]:
m.load("customer_revenue_3e_45em")

In [None]:
m.fit(lr, 4, cycle_len = 1, cycle_mult = 2, metrics=[exp_rmse])

# Create Submission 

In [None]:
x,y=m.predict_with_targs()
x = torch.from_numpy(x)
y = torch.from_numpy(y)
exp_rmse(x,y)

In [None]:
log_preds = m.predict(True)

log_preds, log_preds.mean(), len(log_preds)

In [None]:
submission = test.copy()

In [None]:
submission['PredictedLogRevenue'] = np.expm1(log_preds)

In [None]:
submission['PredictedLogRevenue'][(np.log(submission['totals.pageviews']) / np.log(submission['totals.hits'])) < 0.4].mean()

In [None]:
#submission['PredictedLogRevenue'][(submission['totals_pageviews'] / submission['totals_hits']) < 0.4] = 0.0

In [None]:
submission_grouped = submission.groupby(['fullVisitorId'], as_index=False).agg({'PredictedLogRevenue':'sum'})

In [None]:
submission_grouped['PredictedLogRevenue'] = np.log1p(submission_grouped['PredictedLogRevenue'])

In [None]:
submission_grouped['PredictedLogRevenue'].mean(), yl.mean()

In [None]:
len(submission_grouped)

In [None]:
csv_fn=f'{PATH}submission.csv'

submission_grouped[['fullVisitorId','PredictedLogRevenue']].to_csv(csv_fn, index=False)

submission_grouped

In [None]:
#plt.figure(figsize=(20,10))
#plt.plot(X['date'], y)
#plt.plot(submission['date'], submission['PredictedLogRevenue'])