In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()


from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
  
from sklearn import model_selection, preprocessing, metrics
from sklearn.metrics import mean_squared_error, roc_auc_score, log_loss
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold
from sklearn.preprocessing import Imputer 

import lightgbm as lgb
PATH = 'data/'

import torch
import gc

%matplotlib inline

# Get Data

In [None]:
# Identifier-like columns are read as strings instead of being parsed as numbers.
ID_DTYPES = {'date': str, 'fullVisitorId': str, 'sessionId': str}
# Set to an integer to load only the first N rows while iterating; None loads everything.
N_ROWS = None

# Both files live under PATH (defined with the imports) so the data location
# is configured in a single place.
train = pd.read_csv(os.path.join(PATH, 'extracted_fields_train.gz'),
                    dtype=ID_DTYPES, nrows=N_ROWS)
test = pd.read_csv(os.path.join(PATH, 'extracted_fields_test.gz'),
                   dtype=ID_DTYPES, nrows=N_ROWS)
train.shape, test.shape

## Folding

In [None]:
def get_folds(df=None, n_splits=5):
    """Build visitor-grouped K-fold splits expressed as row positions of ``df``.

    The split is drawn over the sorted unique values of ``df['fullVisitorId']``
    (each visitor is its own group), then mapped back to the positions of the
    rows belonging to those visitors, so all rows of a visitor land on the
    same side of a split.

    Parameters
    ----------
    df : DataFrame with a ``fullVisitorId`` column.
    n_splits : number of folds handed to ``GroupKFold``.

    Returns
    -------
    list of ``[train_positions, valid_positions]`` pairs (numpy integer arrays),
    one pair per fold.
    """
    visitor_col = df['fullVisitorId']
    # Sorting makes the visitor ordering independent of the row order of df
    visitors = np.array(sorted(visitor_col.unique()))
    row_positions = np.arange(df.shape[0])

    splitter = GroupKFold(n_splits=n_splits)
    return [
        [
            row_positions[visitor_col.isin(visitors[trn_idx])],
            row_positions[visitor_col.isin(visitors[val_idx])],
        ]
        for trn_idx, val_idx in splitter.split(X=visitors, y=visitors, groups=visitors)
    ]

# Feature engineering

### Date

In [None]:
# Derive calendar features from the session start time (epoch seconds).
for frame in (train, test):
    session_start = pd.to_datetime(frame['visitStartTime'], unit='s')
    frame['date'] = session_start
    frame['sess_date_dow'] = session_start.dt.dayofweek
    frame['sess_date_hours'] = session_start.dt.hour
    frame['sess_date_dom'] = session_start.dt.day

### hits per pageviews

In [None]:
# Ratio of hits to pageviews within a session.
for frame in (train, test):
    frame['hits_per_pageviews'] = frame['totals.hits'].div(frame['totals.pageviews'])
    # Inverse ratio, left disabled:
    #frame['pageviews_per_hits'] = frame['totals.pageviews'] / frame['totals.hits']

### Target

In [None]:
# Take the revenue column out of train (pop removes it from the frame);
# a missing revenue is treated as zero.
y_reg = train.pop('totals.transactionRevenue').fillna(0)
# Binary companion target: 1 when the row has any revenue.
y_clf = (y_reg > 0).astype(np.uint8)

# The test file may or may not carry the column; drop it when present.
if 'totals.transactionRevenue' in test.columns:
    del test['totals.transactionRevenue']

# Session prediction

In [None]:
# Columns that must never be fed to the models: identifiers, raw timestamps,
# the target, and columns this notebook writes back into `train` later on.
excluded_features = [
    'fullVisitorId', 'sessionId',
    # the revenue column is spelled with a dot in the data; the underscore
    # spelling is kept as well so nothing previously excluded slips back in
    'totals.transactionRevenue', 'totals_transactionRevenue',
    'visitId', 'visitStartTime', 'date', 'target',
    # out-of-fold session predictions are stored in `train` further down;
    # excluding them keeps this cell safe to re-run after that point
    'predictions',
    #excluded after feature importance:
]

# Categorical candidates: every non-excluded column still stored as object.
cat_cols = [
    col for col in train.columns
    if col not in excluded_features and train[col].dtype == 'object'
]

# Everything else that is not excluded is treated as numeric.
num_cols = [col for col in train.columns if col not in cat_cols and col not in excluded_features]

In [None]:
# Integer-encode each categorical column using the categories seen in train;
# test values are looked up in the same category index (get_indexer returns -1
# for values it cannot find).
for col in cat_cols:
    codes, categories = pd.factorize(train[col])
    train[col] = codes
    test[col] = categories.get_indexer(test[col])

In [None]:
folds = get_folds(df=train, n_splits=5)

train_features = [_f for _f in train.columns if _f not in excluded_features]
print(train_features)

# Loop-invariant inputs: slice the feature matrices once instead of per fold.
trn_matrix = train[train_features]
sub_matrix = test[train_features]

fold_importances = []  # one gain table per fold, concatenated after the loop
oof_reg_preds = np.zeros(train.shape[0])  # out-of-fold predictions, log1p space
sub_reg_preds = np.zeros(test.shape[0])   # fold-averaged test predictions, raw space
for fold_, (trn_, val_) in enumerate(folds):
    trn_x, trn_y = trn_matrix.iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = trn_matrix.iloc[val_], y_reg.iloc[val_]
    # The model is fit and evaluated on log1p(revenue)
    trn_y_log, val_y_log = np.log1p(trn_y), np.log1p(val_y)

    reg = lgb.LGBMRegressor(
        num_leaves=31,
        learning_rate=0.03,
        n_estimators=1000,
        subsample=.9,
        colsample_bytree=.9,
        random_state=1
    )
    reg.fit(
        trn_x, trn_y_log,
        eval_set=[(trn_x, trn_y_log), (val_x, val_y_log)],
        eval_names=['TRAIN', 'VALID'],
        early_stopping_rounds=50,
        verbose=100,
        eval_metric='rmse'
    )
    imp_df = pd.DataFrame()
    imp_df['feature'] = train_features
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    imp_df['fold'] = fold_ + 1
    fold_importances.append(imp_df)

    # Negative log-revenue predictions are clipped to zero
    val_preds = reg.predict(val_x, num_iteration=reg.best_iteration_)
    val_preds[val_preds < 0] = 0
    oof_reg_preds[val_] = val_preds

    _preds = reg.predict(sub_matrix, num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    # Test predictions are converted back to raw revenue before averaging
    sub_reg_preds += np.expm1(_preds) / len(folds)

# Single concat instead of growing a DataFrame inside the loop
importances = pd.concat(fold_importances, axis=0, sort=False)
del trn_matrix, sub_matrix

# Out-of-fold RMSE in log1p space
mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5

In [None]:
import warnings
warnings.simplefilter('ignore', FutureWarning)

# Per-feature gain averaged over folds; only used to order the bars.
gain_by_feature = importances.groupby('feature')['gain'].mean()
importances['gain_log'] = np.log1p(importances['gain'])
importances['mean_gain'] = importances['feature'].map(gain_by_feature)

ranked_importances = importances.sort_values('mean_gain', ascending=False)
plt.figure(figsize=(8, 12))
sns.barplot(x='gain_log', y='feature', data=ranked_importances)

# Evaluation Session Level

## K-Fold 10 Splits

### With zero probability

lr 0.005

- TRAIN's l2: 2.50506	TRAIN's rmse: 1.58274	VALID's l2: 2.60051	VALID's rmse: 1.61261
- TRAIN's l2: 2.50436	TRAIN's rmse: 1.58252	VALID's l2: 2.61015	VALID's rmse: 1.61560
- TRAIN's l2: 2.50974	TRAIN's rmse: 1.58421	VALID's l2: 2.61663	VALID's rmse: 1.61760
- TRAIN's l2: 2.56150	TRAIN's rmse: 1.60047	VALID's l2: 2.46969	VALID's rmse: 1.57152
- TRAIN's l2: 2.49708	TRAIN's rmse: 1.58021	VALID's l2: 2.79939	VALID's rmse: 1.67314
- TRAIN's l2: 2.50000	TRAIN's rmse: 1.58114	VALID's l2: 2.64223	VALID's rmse: 1.62549
- TRAIN's l2: 2.52860	TRAIN's rmse: 1.59016	VALID's l2: 2.54535	VALID's rmse: 1.59542
- TRAIN's l2: 2.51655	TRAIN's rmse: 1.58636	VALID's l2: 2.63061	VALID's rmse: 1.62191
- TRAIN's l2: 2.49502	TRAIN's rmse: 1.57956	VALID's l2: 2.64260	VALID's rmse: 1.62561
- TRAIN's l2: 2.51765	TRAIN's rmse: 1.58671	VALID's l2: 2.57660	VALID's rmse: 1.60518

RMSE 1.616597042093458

lr 0.01

- TRAIN's l2: 2.51229	TRAIN's rmse: 1.58502	VALID's l2: 2.60184	VALID's rmse: 1.61302
- TRAIN's l2: 2.50326	TRAIN's rmse: 1.58217	VALID's l2: 2.60899	VALID's rmse: 1.61524
- TRAIN's l2: 2.49482	TRAIN's rmse: 1.57950	VALID's l2: 2.61397	VALID's rmse: 1.61678
- TRAIN's l2: 2.55132	TRAIN's rmse: 1.59729	VALID's l2: 2.46891	VALID's rmse: 1.57128
- TRAIN's l2: 2.49887	TRAIN's rmse: 1.58078	VALID's l2: 2.80034	VALID's rmse: 1.67342
- TRAIN's l2: 2.50091	TRAIN's rmse: 1.58143	VALID's l2: 2.64032	VALID's rmse: 1.62491
- TRAIN's l2: 2.52558	TRAIN's rmse: 1.58921	VALID's l2: 2.54599	VALID's rmse: 1.59562
- TRAIN's l2: 2.50528	TRAIN's rmse: 1.58281	VALID's l2: 2.63096	VALID's rmse: 1.62202
- TRAIN's l2: 2.49931	TRAIN's rmse: 1.58092	VALID's l2: 2.64255	VALID's rmse: 1.62559
- TRAIN's l2: 2.52582	TRAIN's rmse: 1.58928	VALID's l2: 2.57615	VALID's rmse: 1.60504

RMSE 1.6164814214277488

lr 0.03

- TRAIN's rmse: 1.57665	TRAIN's l2: 2.48581	VALID's rmse: 1.61395	VALID's l2: 2.60484
- TRAIN's rmse: 1.58256	TRAIN's l2: 2.50449	VALID's rmse: 1.61439	VALID's l2: 2.60625
- TRAIN's rmse: 1.58275	TRAIN's l2: 2.50508	VALID's rmse: 1.61374	VALID's l2: 2.60415
- TRAIN's rmse: 1.59719	TRAIN's l2: 2.55101	VALID's rmse: 1.57272	VALID's l2: 2.47346
- TRAIN's rmse: 1.57332	TRAIN's l2: 2.47534	VALID's rmse: 1.67484	VALID's l2: 2.80508
- TRAIN's rmse: 1.57996	TRAIN's l2: 2.49626	VALID's rmse: 1.62192	VALID's l2: 2.63064
- TRAIN's rmse: 1.58678	TRAIN's l2: 2.51787	VALID's rmse: 1.59458	VALID's l2: 2.54268
- TRAIN's rmse: 1.58574	TRAIN's l2: 2.51456	VALID's rmse: 1.62201	VALID's l2: 2.63090
- TRAIN's rmse: 1.57782	TRAIN's l2: 2.48951	VALID's rmse: 1.62361	VALID's l2: 2.63611
- TRAIN's rmse: 1.58394	TRAIN's l2: 2.50887	VALID's rmse: 1.60376	VALID's l2: 2.57204

RMSE 1.6157454930569606

### Without zero probability

lr 0.03

- valid_0's l2: 2.60145  rmse: 1.61290
- valid_0's l2: 2.61947  rmse: 1.61848
- valid_0's l2: 2.62710  rmse: 1.62083
- valid_0's l2: 2.46507  rmse: 1.57005
- valid_0's l2: 2.80488  rmse: 1.67478
- valid_0's l2: 2.65395  rmse: 1.62909
- valid_0's l2: 2.55271  rmse: 1.59772
- valid_0's l2: 2.64510  rmse: 1.62638
- valid_0's l2: 2.66030  rmse: 1.63104
- valid_0's l2: 2.58712  rmse: 1.60845

RMSE 1.619066459555333

## K-Fold 5 Splits

### with squared features

- valid_0's l2: 2.63554  rmse: 1.62344
- valid_0's l2: 2.59276  rmse: 1.61021
- valid_0's l2: 2.63777  rmse: 1.62412
- valid_0's l2: 2.57201  rmse: 1.60375
- valid_0's l2: 2.70482  rmse: 1.64463

RMSE 1.6211580980148415

### without squared features

### Less Features 

### With non_zero_prob

- TRAIN's rmse: 1.57506	TRAIN's l2: 2.48080	VALID's rmse: 1.61943	VALID's l2: 2.62256
- TRAIN's rmse: 1.58358	TRAIN's l2: 2.50773	VALID's rmse: 1.60810	VALID's l2: 2.58598
- TRAIN's rmse: 1.58227	TRAIN's l2: 2.50356	VALID's rmse: 1.62366	VALID's l2: 2.63628
- TRAIN's rmse: 1.58931	TRAIN's l2: 2.52592	VALID's rmse: 1.60130	VALID's l2: 2.56417
- TRAIN's rmse: 1.57891	TRAIN's l2: 2.49294	VALID's rmse: 1.63769	VALID's l2: 2.68204

RMSE 1.6181068559555771

### Without non_zero_prob

- valid_0's l2: 2.68867  rmse: 1.63972 
- valid_0's l2: 2.62835  rmse: 1.62122 
- valid_0's l2: 2.65824  rmse: 1.63041 
- valid_0's l2: 2.61109  rmse: 1.61589 
- valid_0's l2: 2.73067  rmse: 1.65248 

RMSE, 0.03
1.6314694795822213

With hits / Pageview

- valid_0's l2: 2.63719  rmse: 1.62394
- valid_0's l2: 2.59768  rmse: 1.61173
- valid_0's l2: 2.64775  rmse: 1.62719
- valid_0's l2: 2.56416  rmse: 1.60130	
- valid_0's l2: 2.69958  rmse: 1.64304

RMSE, 0.03
1.6214463218021848

- Fold0: valid_0's l2: 2.69150  rmse: 1.64058	
- Fold1: valid_0's l2: 2.63377  rmse: 1.62289	
- Fold2: valid_0's l2: 2.66090  rmse: 1.63123	
- Fold3: valid_0's l2: 2.61475  rmse: 1.61702	
- Fold4: valid_0's l2: 2.73576  rmse: 1.65401	

RMSE, 0.03
1.6326631674894676

- Fold0: valid_0's l2: 2.69648  rmse: 1.64210  
- Fold1: valid_0's l2: 2.62972  rmse: 1.62164  
- Fold2: valid_0's l2: 2.65856  rmse: 1.63051  
- Fold3: valid_0's l2: 2.60930  rmse: 1.61533  
- Fold4: valid_0's l2: 2.72992  rmse: 1.65225  

RMSE, 0.01
1.6320138793107153

### All Features

- Fold0: valid_0's l2: 2.69807	rmse: 1.64258
- Fold1: valid_0's l2: 2.65322	rmse: 1.62887
- Fold2: valid_0's l2: 2.69910	rmse: 1.64289
- Fold3: valid_0's l2: 2.63584	rmse: 1.62353
- Fold4: valid_0's l2: 2.75505	rmse: 1.65984

RMSE
1.6392299336526066



### Without unique Feature

- Fold0: valid_0's l2: 2.69732	rmse: 1.64235
- Fold1: valid_0's l2: 2.65308	rmse: 1.62883
- Fold2: valid_0's l2: 2.69850	rmse: 1.64271
- Fold3: valid_0's l2: 2.63364	rmse: 1.62285
- Fold4: valid_0's l2: 2.75439	rmse: 1.65964

RMSE
1.638970608432159


In [None]:
# Store session-level predictions on both frames in raw-revenue space:
# the out-of-fold values were predicted on the log1p scale, so expm1 brings
# them back; sub_reg_preds was already accumulated through expm1.
train['predictions'] = np.expm1(oof_reg_preds)
test['predictions'] = sub_reg_preds

In [None]:
# Visitor-level features: mean of every session feature per fullVisitorId.
visitor_cols = train_features + ['fullVisitorId']
trn_data = train[visitor_cols].groupby('fullVisitorId').mean()

In [None]:
# Create a list of predictions for each Visitor, then turn every list into a
# {'pred_0': ..., 'pred_1': ...} mapping keyed by the position in that list
trn_pred_list = (
    train[['fullVisitorId', 'predictions']]
    .groupby('fullVisitorId')
    .apply(lambda grp: list(grp.predictions))
    .apply(lambda preds: {'pred_'+str(pos): value for pos, value in enumerate(preds)})
)

In [None]:
# One row per visitor with one pred_<i> column per position in the visitor's
# prediction list; positions a visitor does not have are NaN.
trn_all_predictions = pd.DataFrame(list(trn_pred_list.values), index=trn_data.index)
trn_feats = trn_all_predictions.columns
# Snapshot of the raw pred_* columns, taken before any summary column is added
sess_preds = trn_all_predictions[trn_feats]
trn_all_predictions['t_mean'] = np.log1p(sess_preds.mean(axis=1))
trn_all_predictions['t_median'] = np.log1p(sess_preds.median(axis=1))
trn_all_predictions['t_sum_log'] = np.log1p(sess_preds).sum(axis=1)
trn_all_predictions['t_sum_act'] = np.log1p(sess_preds.fillna(0).sum(axis=1))
# NOTE(review): despite the name, this counts the NaN (unused) pred_* slots,
# so it decreases as a visitor's number of predictions grows. The test-side
# cell computes it the same way; change both together or not at all.
trn_all_predictions['t_nb_sess'] = sess_preds.isnull().sum(axis=1)
full_data = pd.concat([trn_data, trn_all_predictions], axis=1)
del trn_data, trn_all_predictions, sess_preds
gc.collect()
full_data.shape

In [None]:
# Same per-visitor prediction mapping as for train, built from the test frame
sub_pred_list = (
    test[['fullVisitorId', 'predictions']]
    .groupby('fullVisitorId')
    .apply(lambda grp: list(grp.predictions))
    .apply(lambda preds: {'pred_'+str(pos): value for pos, value in enumerate(preds)})
)

In [None]:
# Visitor-level mean of the session features for the test set
visitor_cols = train_features + ['fullVisitorId']
sub_data = test[visitor_cols].groupby('fullVisitorId').mean()
sub_all_predictions = pd.DataFrame(list(sub_pred_list.values), index=sub_data.index)
# Make sure every pred_* column known from train exists here, as all-NaN if needed
absent_cols = [col for col in trn_feats if col not in sub_all_predictions.columns]
for col in absent_cols:
    sub_all_predictions[col] = np.nan
# Summaries are computed over the train pred_* columns only
sess_preds = sub_all_predictions[trn_feats]
sub_all_predictions['t_mean'] = np.log1p(sess_preds.mean(axis=1))
sub_all_predictions['t_median'] = np.log1p(sess_preds.median(axis=1))
sub_all_predictions['t_sum_log'] = np.log1p(sess_preds).sum(axis=1)
sub_all_predictions['t_sum_act'] = np.log1p(sess_preds.fillna(0).sum(axis=1))
# NOTE(review): counts the NaN pred_* slots rather than the predictions present;
# kept as is because the train-side feature is computed the same way.
sub_all_predictions['t_nb_sess'] = sess_preds.isnull().sum(axis=1)
sub_full_data = pd.concat([sub_data, sub_all_predictions], axis=1)
del sub_data, sub_all_predictions, sess_preds
gc.collect()
sub_full_data.shape

In [None]:
# Display the shapes of the visitor-level train and test frames side by side
# (the column counts can differ; the model cell selects full_data.columns from
# sub_full_data before predicting).
full_data.shape, sub_full_data.shape

# Visitor Level

In [None]:
# Visitor-level target: total revenue summed over all rows of each visitor.
train['target'] = y_reg
trn_user_target = train.groupby('fullVisitorId')[['target']].sum()

In [None]:
# reset_index exposes the fullVisitorId index as a column for get_folds
folds = get_folds(df=full_data[['totals.pageviews']].reset_index(), n_splits=5)

# Loop-invariant: test features restricted to, and ordered like, the training columns
sub_x = sub_full_data[full_data.columns]

oof_preds = np.zeros(full_data.shape[0])      # out-of-fold predictions, log1p space
sub_preds = np.zeros(sub_full_data.shape[0])  # fold-averaged test predictions, log1p space
fold_importances = []  # one gain table per fold, concatenated after the loop

for fold_, (trn_, val_) in enumerate(folds):
    trn_x, trn_y = full_data.iloc[trn_], trn_user_target['target'].iloc[trn_]
    val_x, val_y = full_data.iloc[val_], trn_user_target['target'].iloc[val_]
    # The model is fit and evaluated on log1p(total visitor revenue)
    trn_y_log, val_y_log = np.log1p(trn_y), np.log1p(val_y)

    reg = lgb.LGBMRegressor(
        num_leaves=31,
        learning_rate=0.03,
        n_estimators=2000,
        subsample=.9,
        colsample_bytree=.9,
        random_state=1
    )
    reg.fit(
        trn_x, trn_y_log,
        eval_set=[(trn_x, trn_y_log), (val_x, val_y_log)],
        eval_names=['TRAIN', 'VALID'],
        early_stopping_rounds=50,
        eval_metric='rmse',
        verbose=100
    )

    imp_df = pd.DataFrame()
    imp_df['feature'] = trn_x.columns
    imp_df['gain'] = reg.booster_.feature_importance(importance_type='gain')
    imp_df['fold'] = fold_ + 1
    fold_importances.append(imp_df)

    # Negative log-revenue predictions are clipped to zero
    val_preds = reg.predict(val_x, num_iteration=reg.best_iteration_)
    val_preds[val_preds < 0] = 0
    oof_preds[val_] = val_preds

    _preds = reg.predict(sub_x, num_iteration=reg.best_iteration_)
    _preds[_preds < 0] = 0
    # Averaged in log1p space: unlike the session model there is no expm1 here
    sub_preds += _preds / len(folds)

# Single concat instead of growing a DataFrame inside the loop
vis_importances = pd.concat(fold_importances, axis=0, sort=False)
del sub_x

# Out-of-fold RMSE in log1p space
mean_squared_error(np.log1p(trn_user_target['target']), oof_preds) ** .5

In [None]:
# Per-feature gain averaged over folds; only used to order the bars.
vis_gain_by_feature = vis_importances.groupby('feature')['gain'].mean()
vis_importances['gain_log'] = np.log1p(vis_importances['gain'])
vis_importances['mean_gain'] = vis_importances['feature'].map(vis_gain_by_feature)

# Keep only the first 300 rows of the ranked table so the chart stays readable
top_vis_importances = vis_importances.sort_values('mean_gain', ascending=False).iloc[:300]
plt.figure(figsize=(8, 20))
sns.barplot(x='gain_log', y='feature', data=top_vis_importances)

## Evaluation Visitor Level

## K-Fold 10 splits

### With non_zero_prob

- TRAIN's rmse: 1.52148	TRAIN's l2: 2.31489	VALID's rmse: 1.56443	VALID's l2: 2.44744
- TRAIN's rmse: 1.51932	TRAIN's l2: 2.30834	VALID's rmse: 1.58444	VALID's l2: 2.51045
- TRAIN's rmse: 1.51908	TRAIN's l2: 2.30761	VALID's rmse: 1.57039	VALID's l2: 2.46612
- TRAIN's rmse: 1.52238	TRAIN's l2: 2.31764	VALID's rmse: 1.54522	VALID's l2: 2.38772
- TRAIN's rmse: 1.52012	TRAIN's l2: 2.31077	VALID's rmse: 1.59477	VALID's l2: 2.5433
- TRAIN's rmse: 1.51456	TRAIN's l2: 2.29389	VALID's rmse: 1.56749	VALID's l2: 2.45703
- TRAIN's rmse: 1.51942	TRAIN's l2: 2.30865	VALID's rmse: 1.56088	VALID's l2: 2.43634
- TRAIN's rmse: 1.51498	TRAIN's l2: 2.29516	VALID's rmse: 1.58720	VALID's l2: 2.51920
- TRAIN's rmse: 1.52288	TRAIN's l2: 2.31917	VALID's rmse: 1.56484	VALID's l2: 2.44872
- TRAIN's rmse: 1.52454	TRAIN's l2: 2.32422	VALID's rmse: 1.57232	VALID's l2: 2.47219

RMSE 1.571257676804114


### Without non_zero_prob

lr 0.001

- TRAIN's rmse: 1.52429	TRAIN's l2: 2.32345	VALID's rmse: 1.55991	VALID's l2: 2.43332
- TRAIN's rmse: 1.51563	TRAIN's l2: 2.29713	VALID's rmse: 1.58561	VALID's l2: 2.51416
- TRAIN's rmse: 1.51117	TRAIN's l2: 2.28363	VALID's rmse: 1.57337	VALID's l2: 2.47548
- TRAIN's rmse: 1.52707	TRAIN's l2: 2.33193	VALID's rmse: 1.54223	VALID's l2: 2.37847
- TRAIN's rmse: 1.51562	TRAIN's l2: 2.29712	VALID's rmse: 1.59281	VALID's l2: 2.53705
- TRAIN's rmse: 1.51171	TRAIN's l2: 2.28526	VALID's rmse: 1.56985	VALID's l2: 2.46442
- TRAIN's rmse: 1.51306	TRAIN's l2: 2.28934	VALID's rmse: 1.56215	VALID's l2: 2.44032
- TRAIN's rmse: 1.50995	TRAIN's l2: 2.27995	VALID's rmse: 1.58890	VALID's l2: 2.52459
- TRAIN's rmse: 1.52052	TRAIN's l2: 2.31198	VALID's rmse: 1.56824	VALID's l2: 2.45939
- TRAIN's rmse: 1.52220	TRAIN's l2: 2.31711	VALID's rmse: 1.57491	VALID's l2: 2.48034

RMSE 1.5718626656048433


lr 0.005

- TRAIN's rmse: 1.51956	TRAIN's l2: 2.30907	VALID's rmse: 1.5599	VALID's l2: 2.43329
- TRAIN's rmse: 1.51441	TRAIN's l2: 2.29345	VALID's rmse: 1.58586	VALID's l2: 2.51495
- TRAIN's rmse: 1.49677	TRAIN's l2: 2.24033	VALID's rmse: 1.57383	VALID's l2: 2.47695
- TRAIN's rmse: 1.52621	TRAIN's l2: 2.32931	VALID's rmse: 1.54176	VALID's l2: 2.37701
- TRAIN's rmse: 1.51565	TRAIN's l2: 2.29720	VALID's rmse: 1.59303	VALID's l2: 2.53774
- TRAIN's rmse: 1.50685	TRAIN's l2: 2.27059	VALID's rmse: 1.56957	VALID's l2: 2.46356
- TRAIN's rmse: 1.50989	TRAIN's l2: 2.27977	VALID's rmse: 1.56205	VALID's l2: 2.44001
- TRAIN's rmse: 1.50754	TRAIN's l2: 2.27268	VALID's rmse: 1.58885	VALID's l2: 2.52445
- TRAIN's rmse: 1.51159	TRAIN's l2: 2.28492	VALID's rmse: 1.56813	VALID's l2: 2.45902
- TRAIN's rmse: 1.52198	TRAIN's l2: 2.31642	VALID's rmse: 1.57508	VALID's l2: 2.48088

RMSE 1.571872164984722


lr 0.01

- TRAIN's rmse: 1.51699	TRAIN's l2: 2.30127	VALID's rmse: 1.56030	VALID's l2: 2.43455
- TRAIN's rmse: 1.51223	TRAIN's l2: 2.28684	VALID's rmse: 1.58546	VALID's l2: 2.51368
- TRAIN's rmse: 1.50182	TRAIN's l2: 2.25548	VALID's rmse: 1.57297	VALID's l2: 2.47423
- TRAIN's rmse: 1.52759	TRAIN's l2: 2.33353	VALID's rmse: 1.54233	VALID's l2: 2.37878
- TRAIN's rmse: 1.50238	TRAIN's l2: 2.25716	VALID's rmse: 1.59187	VALID's l2: 2.53404
- TRAIN's rmse: 1.51471	TRAIN's l2: 2.29435	VALID's rmse: 1.57057	VALID's l2: 2.46668
- TRAIN's rmse: 1.51364	TRAIN's l2: 2.29109	VALID's rmse: 1.56199	VALID's l2: 2.43981
- TRAIN's rmse: 1.51227	TRAIN's l2: 2.28695	VALID's rmse: 1.58921	VALID's l2: 2.52558
- TRAIN's rmse: 1.51644	TRAIN's l2: 2.29959	VALID's rmse: 1.56804	VALID's l2: 2.45877
- TRAIN's rmse: 1.52187	TRAIN's l2: 2.31608	VALID's rmse: 1.57547	VALID's l2: 2.48210

RMSE 1.5718840456948198

## K-Fold 5 splits

### With hits per pageview

### without Squared

lr 0.03

- TRAIN's rmse: 1.51144	TRAIN's l2: 2.28445	VALID's rmse: 1.56907	VALID's l2: 2.46198
- TRAIN's rmse: 1.50291	TRAIN's l2: 2.25875	VALID's rmse: 1.57865	VALID's l2: 2.49214
- TRAIN's rmse: 1.50527	TRAIN's l2: 2.26583	VALID's rmse: 1.58278	VALID's l2: 2.50520
- TRAIN's rmse: 1.50634	TRAIN's l2: 2.26905	VALID's rmse: 1.55656	VALID's l2: 2.42288
- TRAIN's rmse: 1.50482	TRAIN's l2: 2.26447	VALID's rmse: 1.58725	VALID's l2: 2.51936

RMSE 1.5748986433552368


lr 0.01

- TRAIN's rmse: 1.50680	TRAIN's l2: 2.27046	VALID's rmse: 1.57004	VALID's l2: 2.46502
- TRAIN's rmse: 1.51557	TRAIN's l2: 2.29695	VALID's rmse: 1.57889	VALID's l2: 2.49290
- TRAIN's rmse: 1.50536	TRAIN's l2: 2.26609	VALID's rmse: 1.58213	VALID's l2: 2.50314
- TRAIN's rmse: 1.50420	TRAIN's l2: 2.26263	VALID's rmse: 1.55571	VALID's l2: 2.42022
- TRAIN's rmse: 1.50815	TRAIN's l2: 2.27452	VALID's rmse: 1.58657	VALID's l2: 2.51720

RMSE 1.5747041779812614

lr 0.005

- TRAIN's rmse: 1.50665	TRAIN's l2: 2.27000	VALID's rmse: 1.56936	VALID's l2: 2.46289
- TRAIN's rmse: 1.50025	TRAIN's l2: 2.25076	VALID's rmse: 1.57819	VALID's l2: 2.49069
- TRAIN's rmse: 1.50374	TRAIN's l2: 2.26123	VALID's rmse: 1.58196	VALID's l2: 2.50260
- TRAIN's rmse: 1.50823	TRAIN's l2: 2.27476	VALID's rmse: 1.55478	VALID's l2: 2.41733
- TRAIN's rmse: 1.49849	TRAIN's l2: 2.24546	VALID's rmse: 1.58635	VALID's l2: 2.51651

RMSE 1.574166833934208


### with non_zero_prob

lr 0.01

- TRAIN's rmse: 1.52449	TRAIN's l2: 2.32406	VALID's rmse: 1.57015	VALID's l2: 2.46539
- TRAIN's rmse: 1.51340	TRAIN's l2: 2.29036	VALID's rmse: 1.58288	VALID's l2: 2.50551
- TRAIN's rmse: 1.50925	TRAIN's l2: 2.27782	VALID's rmse: 1.58505	VALID's l2: 2.51237
- TRAIN's rmse: 1.52383	TRAIN's l2: 2.32205	VALID's rmse: 1.55757	VALID's l2: 2.42603
- TRAIN's rmse: 1.52238	TRAIN's l2: 2.31764	VALID's rmse: 1.58722	VALID's l2: 2.51927

RMSE 1.5766149759196142


### with Squared


- TRAIN's rmse: 1.51003	TRAIN's l2: 2.28020	VALID's rmse: 1.56920	VALID's l2: 2.46238
- TRAIN's rmse: 1.49890	TRAIN's l2: 2.24669	VALID's rmse: 1.57573	VALID's l2: 2.48293
- TRAIN's rmse: 1.51094	TRAIN's l2: 2.28293	VALID's rmse: 1.57968	VALID's l2: 2.49538
- TRAIN's rmse: 1.51107	TRAIN's l2: 2.28335	VALID's rmse: 1.55709	VALID's l2: 2.42452
- TRAIN's rmse: 1.50735	TRAIN's l2: 2.27210	VALID's rmse: 1.59088	VALID's l2: 2.53091

RMSE 1.5745538647223587

# Creating Submission

In [None]:
# sub_preds is already on the log1p scale: the visitor-level models are fit on
# np.log1p targets and their test predictions are averaged without expm1.
sub_full_data['PredictedLogRevenue'] = sub_preds

In [None]:
# Mean predicted log revenue over the rows of sub_full_data (one row per fullVisitorId).
sub_full_data['PredictedLogRevenue'].mean()

In [None]:
# Reference value from the training target.
# NOTE(review): y_reg has one entry per row of train, while PredictedLogRevenue
# has one entry per fullVisitorId, so the two means are not directly comparable.
np.log1p(y_reg).mean()

## Post Processing

In [None]:
#sub_full_data_pp = sub_full_data.copy()

In [None]:
#sub_full_data_pp['totals_pageviews_log'] = np.log1p(sub_full_data_pp['totals_pageviews'])
#sub_full_data_pp['totals_hits_log'] = np.log1p(sub_full_data_pp['totals_hits'])

In [None]:
#sub_full_data_pp['totals_pageviews_log'].isna().sum(), sub_full_data_pp['totals_hits_log'].isna().sum()

In [None]:
#sub_full_data_pp['PredictedLogRevenue']\
#[(sub_full_data_pp['totals_pageviews_log'] / sub_full_data_pp['totals_hits_log'] < 0.4)].mean()

In [None]:
#sub_full_data_pp['PredictedLogRevenue']\
#[(sub_full_data_pp['totals_pageviews_log'] / sub_full_data_pp['totals_hits_log']) < 0.4]= 0.0

In [None]:
#sub_full_data_pp['PredictedLogRevenue'] \
#[(sub_full_data_pp['totals_pageviews_log'] / sub_full_data_pp['totals_hits_log']) < 0.4].mean()

In [None]:
# Write the submission file; index=True keeps the frame's index (the
# fullVisitorId groupby keys) as the first CSV column.
sub_full_data[['PredictedLogRevenue']].to_csv('new_test_pp.csv', index=True)

In [None]:
# LB score for lightgbm with hits per pageview:  1.4350
# LB score for lightgbm with hits per pageview 10 Fold: 1.4356

# LB score for lightgbm with hits per pageview 10 Fold and nonzero prob : 1.4663
# LB score for lightgbm with hits per pageview  5 Fold and nonzero prob : 1.4351    non_zero_probability not helping

# LB score for lightgbm with more complex features: 1.4281 (probably overfitting the LB)