In [1]:
import os
import pandas as pd
import numpy as np
import json
from pandas import json_normalize
from ast import literal_eval
import warnings
from sklearn import preprocessing, metrics
from sklearn.model_selection import train_test_split
import lightgbm as lgb

warnings.filterwarnings('ignore')

data_path = './dataset/'

def read_df(path, file_name, nrows=None):
    df = pd.read_csv(path + file_name, dtype={'fullVisitorId': 'str', 'visitId': 'str'}, chunksize=nrows)
    return df

train_df = read_df(data_path, 'train.csv')
test_df = read_df(data_path, 'test.csv')

# Drop
train_df = train_df.drop(['visitId', 'visitStartTime', 'campaignCode'], axis=1)
test_df = test_df.drop(['visitId', 'visitStartTime'], axis=1)

# Fill NA
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# Encode non-numerics
def func(df):
    cols = df.columns.values
    for col in cols:
        digit_vals={}
        def convert_to_int(val):
            return digit_vals[val]
        if df[col].dtype != np.int64 and df[col].dtype != np.float64:
            cont = df[col].values.tolist()
            uniques = set(cont)
            x = 0
            for unique in uniques:
                if unique not in digit_vals:
                    digit_vals[unique] = x
                    x+=1
            df[col] = list(map(convert_to_int, df[col]))
    return df

train_df = func(train_df)
test_df = func(test_df)

"non_numerics = train_df.select_dtypes(exclude=np.number).columns.tolist()\nfor col in non_numerics:\n    if col in ['fullVisitorId']: continue\n    print(col)\n    lb = preprocessing.LabelEncoder()\n    lb.fit(list(train_df[col].values) + list(test_df[col].values))\n    train_df[col] = lb.transform(list(train_df[col].values))\n    test_df[col] = lb.transform(list(test_df[col].values))\n"

In [6]:
# Split DF
train_x = train_df.drop(['fullVisitorId', 'totalTransactionRevenue','index','campaign','date','mobileDeviceModel','month','value'], axis = 1)
train_y = np.log1p(train_df["totalTransactionRevenue"].values)
trn_x, val_x, trn_y, val_y = train_test_split(train_x, train_y, test_size = 0.2, shuffle = False)
test_x = test_df.drop(['fullVisitorId', 'totalTransactionRevenue','index','campaign','date','mobileDeviceModel','month','value'], axis = 1)
test_y = np.log1p(test_df["totalTransactionRevenue"].values)

# Transform to lgb dataset
train_data = lgb.Dataset(trn_x, label = trn_y)
test_data = lgb.Dataset(val_x, label = val_y)

# Model parameters
parameters = {
    'objective': 'regression',
    'metric': 'rmse',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 35,
    'feature_fraction': 0.3,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.1,
}

# Train
model = lgb.train(parameters, train_data, valid_sets=test_data, num_boost_round=300, early_stopping_rounds=100)

[1]	valid_0's rmse: 1.80494
Training until validation scores don't improve for 100 rounds
[2]	valid_0's rmse: 1.77276
[3]	valid_0's rmse: 1.73829
[4]	valid_0's rmse: 1.56597
[5]	valid_0's rmse: 1.41034
[6]	valid_0's rmse: 1.27107
[7]	valid_0's rmse: 1.14507
[8]	valid_0's rmse: 1.12844
[9]	valid_0's rmse: 1.01759
[10]	valid_0's rmse: 1.00875
[11]	valid_0's rmse: 0.996446
[12]	valid_0's rmse: 0.985934
[13]	valid_0's rmse: 0.984298
[14]	valid_0's rmse: 0.973267
[15]	valid_0's rmse: 0.878527
[16]	valid_0's rmse: 0.871687
[17]	valid_0's rmse: 0.789706
[18]	valid_0's rmse: 0.787349
[19]	valid_0's rmse: 0.78371
[20]	valid_0's rmse: 0.77934
[21]	valid_0's rmse: 0.704614
[22]	valid_0's rmse: 0.638714
[23]	valid_0's rmse: 0.638557
[24]	valid_0's rmse: 0.638368
[25]	valid_0's rmse: 0.580685
[26]	valid_0's rmse: 0.580148
[27]	valid_0's rmse: 0.531823
[28]	valid_0's rmse: 0.530911
[29]	valid_0's rmse: 0.485593
[30]	valid_0's rmse: 0.485035
[31]	valid_0's rmse: 0.48501
[32]	valid_0's rmse: 0.484496


[270]	valid_0's rmse: 0.0441912
[271]	valid_0's rmse: 0.044172
[272]	valid_0's rmse: 0.0440671
[273]	valid_0's rmse: 0.0439978
[274]	valid_0's rmse: 0.0439907
[275]	valid_0's rmse: 0.0439596
[276]	valid_0's rmse: 0.0439147
[277]	valid_0's rmse: 0.043903
[278]	valid_0's rmse: 0.0437604
[279]	valid_0's rmse: 0.0437536
[280]	valid_0's rmse: 0.043714
[281]	valid_0's rmse: 0.043671
[282]	valid_0's rmse: 0.0436661
[283]	valid_0's rmse: 0.0435993
[284]	valid_0's rmse: 0.0435598
[285]	valid_0's rmse: 0.0433799
[286]	valid_0's rmse: 0.0433769
[287]	valid_0's rmse: 0.043357
[288]	valid_0's rmse: 0.0433092
[289]	valid_0's rmse: 0.0432778
[290]	valid_0's rmse: 0.0432709
[291]	valid_0's rmse: 0.0432413
[292]	valid_0's rmse: 0.0432224
[293]	valid_0's rmse: 0.0431974
[294]	valid_0's rmse: 0.0431316
[295]	valid_0's rmse: 0.0430601
[296]	valid_0's rmse: 0.0430129
[297]	valid_0's rmse: 0.0428788
[298]	valid_0's rmse: 0.0428389
[299]	valid_0's rmse: 0.042711
[300]	valid_0's rmse: 0.0426913
Did not meet e

In [7]:
from sklearn.metrics import mean_squared_error
preds = model.predict(test_x, num_iteration=model.best_iteration)
print('The rmse of prediction is:', mean_squared_error(test_y, preds) ** 0.5)

The rmse of prediction is: 4.974281649921838


In [7]:
submission = test_df[['fullVisitorId']]
submission['PredictedLogRevenue'] = np.expm1(preds)
submission = submission[['fullVisitorId', 'PredictedLogRevenue']].groupby('fullVisitorId').sum()
submission["PredictedLogRevenue"] = np.log1p(submission["PredictedLogRevenue"])
print(submission.head())

                     PredictedLogRevenue
fullVisitorId                           
0000196310838896290             0.009220
0000268499301061358             7.154482
0000900085223565423             7.171304
000101445922736554              7.261824
0001180870711546826             7.224999
