In [1]:
import os
import pandas as pd
import numpy as np
import json
from pandas import json_normalize
from ast import literal_eval
import warnings
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# imported libraries - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Directory (relative path) that holds the train.csv / test.csv inputs.
data_path = './dataset/'

def read_df(path, file_name, nrows=None):
    """Load a CSV file into a DataFrame, keeping the ID columns as strings.

    Args:
        path: Directory that contains the file.
        file_name: Name of the CSV file inside ``path``.
        nrows: Optional maximum number of data rows to read. ``None``
            (the default) reads the whole file.

    Returns:
        pandas.DataFrame whose ``fullVisitorId`` and ``visitId`` columns are
        parsed with dtype ``str`` rather than as numbers.
    """
    # The row limit has to go to ``nrows``. Forwarding it as ``chunksize``
    # makes read_csv return a chunk iterator instead of a DataFrame, which
    # breaks every caller that expects DataFrame methods.
    return pd.read_csv(
        os.path.join(path, file_name),
        dtype={'fullVisitorId': 'str', 'visitId': 'str'},
        nrows=nrows,
    )

# Load each table, discard the columns that are not used downstream, then
# replace every missing value (in any column) with 0.
train_df = (
    read_df(data_path, 'train.csv')
    .drop(columns=['visitId', 'visitStartTime', 'campaignCode', 'date'])
    .fillna(0)
)
test_df = (
    read_df(data_path, 'test.csv')
    # Unlike the train table, 'campaignCode' is not in this drop list.
    .drop(columns=['visitId', 'visitStartTime', 'date'])
    .fillna(0)
)

# Replace every non-numeric column with integer codes shared by both tables.
non_numerics = list(train_df.select_dtypes(exclude=np.number).columns)
for col in non_numerics:
    # The visitor ID is left unencoded.
    if col == 'fullVisitorId':
        continue
    print(col)
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    # Fit on the values of both tables so that a label appearing in only one
    # of them still receives a code.
    lb = preprocessing.LabelEncoder().fit(train_values + test_values)
    train_df[col] = lb.transform(train_values)
    test_df[col] = lb.transform(test_values)

channelGrouping
browser
operatingSystem
isMobile
mobileDeviceModel
deviceCategory
continent
subContinent
country
region
metro
city
networkDomain
campaign
source
medium
keyword
referralPath
isTrueDirect
value
month


In [7]:
# Split DF
# Features are every column except the visitor ID and the target; the target
# is modelled on a log1p scale.
train_x = train_df.drop(['fullVisitorId', 'totalTransactionRevenue'], axis=1)
train_y = np.log1p(train_df["totalTransactionRevenue"].values)
# Hold out 20% of the training rows for validation (fixed seed).
trn_x, val_x, trn_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
test_x = test_df.drop(['fullVisitorId', 'totalTransactionRevenue'], axis=1)
test_y = np.log1p(test_df["totalTransactionRevenue"].values)

# Transform to lgb dataset
# NOTE: despite its name, test_data wraps the validation split (val_x / val_y),
# not the test table; it is what the training loop evaluates on.
train_data = lgb.Dataset(trn_x, label=trn_y)
test_data = lgb.Dataset(val_x, label=val_y, reference=train_data)

# Model parameters
# 'is_unbalance' was removed: LightGBM documents it as used only by the binary
# and multiclassova objectives, so it had no effect on this regression.
parameters = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'num_leaves': 35,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.1,
}

# Train
# Early stopping is configured through a callback: the `early_stopping_rounds`
# keyword of lgb.train() was removed in LightGBM 4.0, while the callback form
# is accepted by both older and newer releases.
# NOTE(review): the patience (100) equals num_boost_round (100), so training
# can never stop before the final round - raise num_boost_round or lower the
# patience if real early stopping is wanted.
train_callbacks = [lgb.early_stopping(stopping_rounds=100)]
# Newer releases only print the per-round metric when this callback is given;
# releases that lack it print the metric by default.
if hasattr(lgb, 'log_evaluation'):
    train_callbacks.append(lgb.log_evaluation(period=1))

model = lgb.train(
    parameters,
    train_data,
    valid_sets=[test_data],
    num_boost_round=100,
    callbacks=train_callbacks,
)

# Predict on the test features with the best validation iteration.
# NOTE(review): test_y is computed above but preds_test is never scored against
# it in this cell. The recorded validation RMSE also drops very steeply
# (1.68 -> 0.11 in 32 rounds) - worth checking that no feature is derived from
# the target.
preds_test = model.predict(test_x, num_iteration=model.best_iteration)


[1]	valid_0's rmse: 1.67649
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.50988
[3]	valid_0's rmse: 1.35898
[4]	valid_0's rmse: 1.22319
[5]	valid_0's rmse: 1.10099
[6]	valid_0's rmse: 0.990978
[7]	valid_0's rmse: 0.89267
[8]	valid_0's rmse: 0.803526
[9]	valid_0's rmse: 0.723312
[10]	valid_0's rmse: 0.703631
[11]	valid_0's rmse: 0.634185
[12]	valid_0's rmse: 0.571669
[13]	valid_0's rmse: 0.563092
[14]	valid_0's rmse: 0.50725
[15]	valid_0's rmse: 0.497042
[16]	valid_0's rmse: 0.488415
[17]	valid_0's rmse: 0.44073
[18]	valid_0's rmse: 0.397602
[19]	valid_0's rmse: 0.359398
[20]	valid_0's rmse: 0.324947
[21]	valid_0's rmse: 0.32064
[22]	valid_0's rmse: 0.289893
[23]	valid_0's rmse: 0.262363
[24]	valid_0's rmse: 0.237827
[25]	valid_0's rmse: 0.214958
[26]	valid_0's rmse: 0.195024
[27]	valid_0's rmse: 0.176551
[28]	valid_0's rmse: 0.16005
[29]	valid_0's rmse: 0.145214
[30]	valid_0's rmse: 0.13201
[31]	valid_0's rmse: 0.120546
[32]	valid_0's rmse: 0.11037