In [1]:
import os
import pandas as pd
import numpy as np
import json
from pandas import json_normalize
from ast import literal_eval
import warnings
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Suppress every warning category for the rest of the session.
# NOTE: this also hides deprecation warnings raised by pandas / sklearn / lightgbm.
warnings.simplefilter('ignore')

# Directory prefix (including the trailing slash) of the input CSV files.
data_path = './dataset/'

def read_df(path, file_name, nrows=None):
    """Load a CSV file into a DataFrame.

    Args:
        path: Directory prefix of the file; it is concatenated with
            ``file_name`` as-is, so it must end with a path separator.
        file_name: Name of the CSV file.
        nrows: Maximum number of data rows to read. ``None`` (the default)
            reads the whole file.

    Returns:
        pandas.DataFrame in which ``fullVisitorId`` and ``visitId`` are read
        as strings.
    """
    # The ID columns are forced to str so values such as '0000196310838896290'
    # keep their leading zeros instead of being parsed as integers.
    # `nrows` must be passed as `nrows`: forwarding it as `chunksize` makes
    # read_csv return a TextFileReader iterator rather than a DataFrame.
    df = pd.read_csv(
        path + file_name,
        dtype={'fullVisitorId': 'str', 'visitId': 'str'},
        nrows=nrows,
    )
    return df

# Load the raw train / test tables.
train_df = read_df(data_path, 'train.csv')
test_df = read_df(data_path, 'test.csv')

# Drop the visit id / visit start time columns from both frames;
# 'campaignCode' is dropped from the training frame only.
train_df = train_df.drop(columns=['visitId', 'visitStartTime', 'campaignCode'])
test_df = test_df.drop(columns=['visitId', 'visitStartTime'])

# Replace every missing value with 0.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# Label-encode each non-numeric column (except the visitor id) with an encoder
# fitted on the union of the train and test values, so both frames share one
# integer mapping per column.
non_numerics = train_df.select_dtypes(exclude=np.number).columns.tolist()
for col in non_numerics:
    if col == 'fullVisitorId':
        continue
    print(col)
    # Plain Python lists are passed on purpose, exactly as before.
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    lb = preprocessing.LabelEncoder()
    lb.fit(train_values + test_values)
    train_df[col] = lb.transform(train_values)
    test_df[col] = lb.transform(test_values)

channelGrouping
date
browser
operatingSystem
isMobile
mobileDeviceModel
deviceCategory
continent
subContinent
country
region
metro
city
networkDomain
campaign
source
medium
keyword
referralPath
isTrueDirect
value
month


In [46]:
# Split DF
# Fixed seed so the train/validation split (and therefore the reported
# validation scores) is reproducible across kernel restarts.
SEED = 42

# Columns removed from the feature matrix: the visitor id, the target itself,
# and the 'index' / 'campaign' columns.
drop_cols = ['fullVisitorId', 'totalTransactionRevenue', 'index', 'campaign']

# The model is trained on log1p(revenue); predictions are mapped back with expm1.
train_x = train_df.drop(drop_cols, axis=1)
train_y = np.log1p(train_df["totalTransactionRevenue"].values)
trn_x, val_x, trn_y, val_y = train_test_split(
    train_x, train_y, test_size=0.2, random_state=SEED
)
test_x = test_df.drop(drop_cols, axis=1)
test_y = np.log1p(test_df["totalTransactionRevenue"].values)

# Transform to lgb dataset
train_data = lgb.Dataset(trn_x, label=trn_y)
# NOTE: despite its name, `test_data` wraps the *validation* split (val_x / val_y);
# it is the set monitored for early stopping.
test_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

# Model parameters
# 'is_unbalance' was dropped: LightGBM only uses it for binary / multiclassova
# objectives, so it had no effect on this regression task.
parameters = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'num_leaves': 35,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.1,
}

# Train
# Early stopping is configured through the callback API: the
# `early_stopping_rounds` keyword of lgb.train() was removed in LightGBM 4.0,
# while the callback is accepted by both old and new releases.
model = lgb.train(
    parameters,
    train_data,
    num_boost_round=300,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(100)],
)

# Predict log1p(revenue) for every test row using the best iteration found.
preds = model.predict(test_x, num_iteration=model.best_iteration)


[1]	valid_0's rmse: 1.69232
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.52391
[3]	valid_0's rmse: 1.51624
[4]	valid_0's rmse: 1.51004
[5]	valid_0's rmse: 1.35996
[6]	valid_0's rmse: 1.32859
[7]	valid_0's rmse: 1.19615
[8]	valid_0's rmse: 1.19343
[9]	valid_0's rmse: 1.07581
[10]	valid_0's rmse: 1.05535
[11]	valid_0's rmse: 1.03616
[12]	valid_0's rmse: 1.02212
[13]	valid_0's rmse: 0.922866
[14]	valid_0's rmse: 0.914056
[15]	valid_0's rmse: 0.904288
[16]	valid_0's rmse: 0.815682
[17]	valid_0's rmse: 0.736784
[18]	valid_0's rmse: 0.72991
[19]	valid_0's rmse: 0.661192
[20]	valid_0's rmse: 0.656565
[21]	valid_0's rmse: 0.594408
[22]	valid_0's rmse: 0.538475
[23]	valid_0's rmse: 0.535743
[24]	valid_0's rmse: 0.487699
[25]	valid_0's rmse: 0.443049
[26]	valid_0's rmse: 0.402143
[27]	valid_0's rmse: 0.366532
[28]	valid_0's rmse: 0.366487
[29]	valid_0's rmse: 0.33322
[30]	valid_0's rmse: 0.333182
[31]	valid_0's rmse: 0.303686
[32]	valid_0's rmse: 0.278888
[

[264]	valid_0's rmse: 0.0594799
[265]	valid_0's rmse: 0.059396
[266]	valid_0's rmse: 0.0593933
[267]	valid_0's rmse: 0.0593811
[268]	valid_0's rmse: 0.0593675
[269]	valid_0's rmse: 0.0592868
[270]	valid_0's rmse: 0.0593036
[271]	valid_0's rmse: 0.0592885
[272]	valid_0's rmse: 0.0592781
[273]	valid_0's rmse: 0.0592769
[274]	valid_0's rmse: 0.0592615
[275]	valid_0's rmse: 0.0592489
[276]	valid_0's rmse: 0.0591932
[277]	valid_0's rmse: 0.0591791
[278]	valid_0's rmse: 0.0591615
[279]	valid_0's rmse: 0.0591128
[280]	valid_0's rmse: 0.0590588
[281]	valid_0's rmse: 0.0590504
[282]	valid_0's rmse: 0.0591016
[283]	valid_0's rmse: 0.0590921
[284]	valid_0's rmse: 0.0590625
[285]	valid_0's rmse: 0.0584114
[286]	valid_0's rmse: 0.0584119
[287]	valid_0's rmse: 0.0583859
[288]	valid_0's rmse: 0.0578309
[289]	valid_0's rmse: 0.0577468
[290]	valid_0's rmse: 0.0577457
[291]	valid_0's rmse: 0.0577456
[292]	valid_0's rmse: 0.057737
[293]	valid_0's rmse: 0.0577163
[294]	valid_0's rmse: 0.0577041
[295]	vali

In [47]:
# Revenue cannot be negative: clamp negative log-revenue predictions to zero.
# This intentionally mutates `preds` in place.
preds[preds < 0] = 0

# One row per test visit: actual revenue and predicted revenue (back on the
# original scale via expm1, the inverse of the log1p applied to the target).
val_pred_df = pd.DataFrame({
    "fullVisitorId": test_df["fullVisitorId"].values,
    "transactionRevenue": test_df["totalTransactionRevenue"].values,
    "PredictedRevenue": np.expm1(preds),
})

# Sum revenue per visitor. The columns are selected with a *list*: selecting
# several columns from a GroupBy with a bare tuple is rejected by pandas >= 2.0.
val_pred_df = (
    val_pred_df
    .groupby("fullVisitorId")[["transactionRevenue", "PredictedRevenue"]]
    .sum()
    .reset_index()
)

# RMSE between log1p of the summed actual and summed predicted revenue per visitor.
print(np.sqrt(metrics.mean_squared_error(
    np.log1p(val_pred_df["transactionRevenue"].values),
    np.log1p(val_pred_df["PredictedRevenue"].values),
)))

5.686086439253874


In [39]:
# Per-visit predicted revenue on the original scale. Negative log-predictions
# are clipped here too, so this cell gives the same result whether or not
# another cell has already clamped `preds` (clipping is idempotent). Without it,
# a visitor's summed revenue could fall below -1 and log1p would return NaN.
visit_revenue = np.expm1(np.clip(preds, 0, None))

# Build a fresh frame instead of adding a column to a slice of `test_df`,
# then sum the predicted revenue of all visits belonging to the same visitor.
submission = pd.DataFrame({
    'fullVisitorId': test_df['fullVisitorId'].values,
    'PredictedLogRevenue': visit_revenue,
}).groupby('fullVisitorId').sum()

# Only after aggregation is the column converted to log1p(total revenue).
submission["PredictedLogRevenue"] = np.log1p(submission["PredictedLogRevenue"])
print(submission.head())

                     PredictedLogRevenue
fullVisitorId                           
0000196310838896290             0.017288
0000268499301061358             1.416643
0000900085223565423             1.391973
000101445922736554              1.388680
0001180870711546826             1.387567
