In [1]:
import os
import pandas as pd
import numpy as np
import json
from pandas import json_normalize
from ast import literal_eval
import warnings
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Directory holding train.csv / test.csv, relative to the notebook.
data_path = './dataset/'

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

def read_df(path, file_name, nrows=None):
    """Load a CSV file into a DataFrame, keeping visitor/visit ids as strings.

    Args:
        path: Directory containing the file (with or without a trailing
            separator).
        file_name: Name of the CSV file inside ``path``.
        nrows: Optional maximum number of data rows to read. ``None`` reads
            the whole file.

    Returns:
        pandas.DataFrame with ``fullVisitorId`` and ``visitId`` read as ``str``
        so leading zeros are preserved.
    """
    # `nrows` must go to read_csv's `nrows`; passing it as `chunksize` would
    # return a TextFileReader iterator instead of a DataFrame.
    return pd.read_csv(
        os.path.join(path, file_name),
        dtype={'fullVisitorId': 'str', 'visitId': 'str'},
        nrows=nrows,
    )

# Load the raw train / test frames (reloaded on every run, so the cell is re-runnable).
train_df = read_df(data_path, 'train.csv')
test_df = read_df(data_path, 'test.csv')

# Drop
# 'campaignCode' is removed from train only; presumably it is absent from
# test.csv -- TODO confirm.
train_df = train_df.drop(columns=['visitId', 'visitStartTime', 'campaignCode'])
test_df = test_df.drop(columns=['visitId', 'visitStartTime'])

# Fill NA
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# Encode non-numerics
# Every non-numeric column except the visitor id is replaced by integer codes
# from a LabelEncoder fitted on the union of train and test values, so both
# frames share one mapping.
non_numerics = train_df.select_dtypes(exclude=np.number).columns.tolist()
for col in non_numerics:
    if col == 'fullVisitorId':
        continue
    print(col)
    # fillna(0) above can leave integer 0 next to strings in an object column;
    # LabelEncoder rejects mixed str/int input, so encode the string form.
    train_values = train_df[col].astype(str)
    test_values = test_df[col].astype(str)
    lb = preprocessing.LabelEncoder()
    lb.fit(pd.concat([train_values, test_values], ignore_index=True))
    train_df[col] = lb.transform(train_values)
    test_df[col] = lb.transform(test_values)

channelGrouping
date
browser
operatingSystem
isMobile
mobileDeviceModel
deviceCategory
continent
subContinent
country
region
metro
city
networkDomain
campaign
source
medium
keyword
referralPath
isTrueDirect
value
month


In [8]:
# Split DF
# Columns removed from the feature matrices: the visitor id, the target, and
# the other columns this notebook excludes from modelling.
NON_FEATURE_COLS = [
    'fullVisitorId', 'totalTransactionRevenue', 'index', 'campaign',
    'date', 'mobileDeviceModel', 'month', 'value',
]

train_x = train_df.drop(columns=NON_FEATURE_COLS)
# Target is log1p of the per-row transaction revenue.
train_y = np.log1p(train_df["totalTransactionRevenue"].values)
# shuffle=False: the last 20% of rows (in file order) form the validation split.
trn_x, val_x, trn_y, val_y = train_test_split(train_x, train_y, test_size=0.2, shuffle=False)

# Select the test features by name in the training column order. LightGBM does
# not validate feature names at predict time by default, so a different column
# order would silently feed the wrong features. Raises KeyError if a training
# feature is missing from the test frame.
test_x = test_df.drop(columns=NON_FEATURE_COLS)[train_x.columns]
test_y = np.log1p(test_df["totalTransactionRevenue"].values)

# Transform to lgb dataset
train_data = lgb.Dataset(trn_x, label=trn_y)
# Despite its name, test_data wraps the validation split (val_x / val_y) that
# drives early stopping; the name is kept for any code that refers to it.
test_data = lgb.Dataset(val_x, label=val_y)

# Model parameters
# 'is_unbalance' was dropped: LightGBM only uses it for binary / multiclassova
# objectives, so it had no effect on this regression objective.
parameters = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'num_leaves': 35,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.1,
}

# Train
# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train was removed in LightGBM 4.0,
# while the callback is accepted by both older and newer releases.
model = lgb.train(
    parameters,
    train_data,
    num_boost_round=300,
    valid_sets=[test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

[1]	valid_0's rmse: 1.80572
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.7733
[3]	valid_0's rmse: 1.73802
[4]	valid_0's rmse: 1.56582
[5]	valid_0's rmse: 1.41023
[6]	valid_0's rmse: 1.27084
[7]	valid_0's rmse: 1.14487
[8]	valid_0's rmse: 1.12802
[9]	valid_0's rmse: 1.01724
[10]	valid_0's rmse: 1.0083
[11]	valid_0's rmse: 0.996219
[12]	valid_0's rmse: 0.985279
[13]	valid_0's rmse: 0.98369
[14]	valid_0's rmse: 0.972969
[15]	valid_0's rmse: 0.878061
[16]	valid_0's rmse: 0.871043
[17]	valid_0's rmse: 0.789139
[18]	valid_0's rmse: 0.78675
[19]	valid_0's rmse: 0.782932
[20]	valid_0's rmse: 0.778548
[21]	valid_0's rmse: 0.704191
[22]	valid_0's rmse: 0.638287
[23]	valid_0's rmse: 0.638103
[24]	valid_0's rmse: 0.637954
[25]	valid_0's rmse: 0.580278
[26]	valid_0's rmse: 0.579804
[27]	valid_0's rmse: 0.531518
[28]	valid_0's rmse: 0.530584
[29]	valid_0's rmse: 0.485343
[30]	valid_0's rmse: 0.484796
[31]	valid_0's rmse: 0.484764
[32]	valid_0's rmse: 0.48425
[3

[266]	valid_0's rmse: 0.0449246
[267]	valid_0's rmse: 0.0448714
[268]	valid_0's rmse: 0.0448646
[269]	valid_0's rmse: 0.0448154
[270]	valid_0's rmse: 0.044786
[271]	valid_0's rmse: 0.0447722
[272]	valid_0's rmse: 0.0447219
[273]	valid_0's rmse: 0.0446129
[274]	valid_0's rmse: 0.0446056
[275]	valid_0's rmse: 0.0445618
[276]	valid_0's rmse: 0.0444917
[277]	valid_0's rmse: 0.0444896
[278]	valid_0's rmse: 0.0444606
[279]	valid_0's rmse: 0.0444498
[280]	valid_0's rmse: 0.0444204
[281]	valid_0's rmse: 0.0444099
[282]	valid_0's rmse: 0.0443964
[283]	valid_0's rmse: 0.0443365
[284]	valid_0's rmse: 0.0443189
[285]	valid_0's rmse: 0.0441127
[286]	valid_0's rmse: 0.0441131
[287]	valid_0's rmse: 0.0440965
[288]	valid_0's rmse: 0.0440309
[289]	valid_0's rmse: 0.0439981
[290]	valid_0's rmse: 0.043993
[291]	valid_0's rmse: 0.0439732
[292]	valid_0's rmse: 0.0439624
[293]	valid_0's rmse: 0.0439506
[294]	valid_0's rmse: 0.0438738
[295]	valid_0's rmse: 0.0438069
[296]	valid_0's rmse: 0.0437683
[297]	vali

In [9]:
from sklearn.metrics import mean_squared_error

# Score the hold-out features using the boosting round kept by early stopping.
# NOTE(review): the recorded validation RMSE (~0.044) and the hold-out RMSE
# printed here (~4.94) differ by two orders of magnitude -- check for target
# leakage or a train/test feature mismatch before trusting either number.
preds = model.predict(test_x, num_iteration=model.best_iteration)
holdout_mse = mean_squared_error(test_y, preds)
print('The rmse of prediction is:', holdout_mse ** 0.5)

The rmse of prediction is: 4.936944908912973


In [7]:
# Build the per-visitor submission: undo the log1p on each row-level
# prediction, sum the revenue per visitor, then re-apply log1p to the total.
# Predictions are clipped at 0 first: the training target is a log1p value
# (assumed non-negative -- revenue should not be negative), and a negative
# prediction could push a visitor's summed revenue to -1 or below, where
# log1p returns NaN / -inf.
visit_revenue = np.expm1(np.clip(preds, 0, None))

# .assign() returns a new frame, so no column is written onto a slice of
# test_df (the original chained assignment triggered SettingWithCopyWarning).
submission = (
    test_df[['fullVisitorId']]
    .assign(PredictedLogRevenue=visit_revenue)
    .groupby('fullVisitorId')
    .sum()
)
submission["PredictedLogRevenue"] = np.log1p(submission["PredictedLogRevenue"])
print(submission.head())

                     PredictedLogRevenue
fullVisitorId                           
0000196310838896290             0.009220
0000268499301061358             7.154482
0000900085223565423             7.171304
000101445922736554              7.261824
0001180870711546826             7.224999
