In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
#import statsmodels.api as sm
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [2]:
def load_data(datapath, n_preview=10):
    """Read a CSV file into a DataFrame and show a quick preview of it.

    Parameters
    ----------
    datapath : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    n_preview : int, default 10
        Maximum number of randomly sampled rows to display.

    Returns
    -------
    pandas.DataFrame
        The loaded table, exactly as parsed by ``pd.read_csv``.

    Notes
    -----
    ``display`` is the rich-output helper that IPython/Jupyter injects into
    the notebook namespace; it is not imported explicitly in this notebook.
    """
    data = pd.read_csv(datapath)
    # Dimensions
    print('Shape:', data.shape)
    # Preview a random handful of rows. The sample size is capped at the row
    # count because DataFrame.sample (replace=False) raises ValueError when
    # asked for more rows than the frame holds.
    display(data.sample(min(n_preview, len(data))))
    return data
    
# change the path accordingly    
# Load both CSVs; each call also prints the frame's shape and displays a random row sample.
train_df = load_data('./dataset/train.csv')
test_df = load_data('./dataset/test.csv')

Shape: (913000, 4)


Unnamed: 0,date,store,item,sales
628905,2015-02-01,5,35,48
879845,2017-03-20,2,49,32
765452,2013-12-25,10,42,19
524567,2014-05-21,8,29,89
837250,2015-08-01,9,46,95
731454,2015-11-21,1,41,27
417785,2016-12-28,9,23,26
468360,2015-06-24,7,26,39
183659,2015-11-26,1,11,65
308899,2013-11-02,10,17,44


Shape: (45000, 4)


Unnamed: 0,id,date,store,item
42753,42753,2018-01-04,6,48
15567,15567,2018-03-29,3,18
35948,35948,2018-02-08,10,40
22110,22110,2018-03-02,6,25
27672,27672,2018-02-12,8,31
1135,1135,2018-02-25,3,2
23548,23548,2018-02-28,2,27
19576,19576,2018-02-16,8,22
2521,2521,2018-01-02,9,3
29646,29646,2018-02-06,10,33


In [3]:
def Time_visualization(data, store_id=10, item_id=40):
    """Plot the sales time series of a single store/item pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least ``date``, ``store``, ``item`` and ``sales`` columns.
    store_id : int, default 10
        Store whose rows are kept.
    item_id : int, default 40
        Item whose rows are kept.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows matching both ``store_id`` and
        ``item_id``; ``data`` itself is not modified.
    """
    print('Before filter:', data.shape)
    # Filter first and copy only the surviving rows, instead of duplicating
    # the whole input frame up front. Bracket access is used because a column
    # name such as "item" can be shadowed by a same-named attribute.
    wanted = (data['store'] == store_id) & (data['item'] == item_id)
    store_item_df = data[wanted].copy()
    print('After filter:', store_item_df.shape)

    # Let us plot this now
    store_item_ts_data = [go.Scatter(
        x=store_item_df['date'],
        y=store_item_df['sales'])]
    py.iplot(store_item_ts_data)
    return store_item_df

# Plot the training-set sales history for store 10 / item 40 and keep the filtered rows.
store_item_df = Time_visualization(train_df)

Before filter: (913000, 4)
After filter: (1826, 4)


In [4]:
def sales_monthly(data, store_ids=(1, 1, 1, 1), item_ids=(10, 20, 30, 40)):
    """Plot the sales series of several (store, item) pairs on one figure.

    ``store_ids`` and ``item_ids`` are read pairwise: the i-th store is
    combined with the i-th item. Despite the function name, no monthly
    aggregation is performed yet (see the TODO below); the plotted values
    are the rows' ``sales`` as they appear in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least ``date``, ``store``, ``item`` and ``sales`` columns.
    store_ids : sequence of int, default (1, 1, 1, 1)
        Store of each pair.
    item_ids : sequence of int, default (10, 20, 30, 40)
        Item of each pair.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows belonging to any requested pair, in
        their original order; ``data`` itself is not modified.

    Raises
    ------
    ValueError
        If ``store_ids`` and ``item_ids`` differ in length.
    """
    store_ids = list(store_ids)
    item_ids = list(item_ids)
    if len(store_ids) != len(item_ids):
        raise ValueError('store_ids and item_ids must have the same length')
    pairs = list(zip(store_ids, item_ids))

    print('Before filter:', data.shape)
    # One boolean mask per pair. Filtering store and item independently with
    # isin() would also keep unrequested combinations (e.g. store 1 / item 20
    # when asking for (1, 10) and (2, 20)), so rows are matched pairwise.
    pair_masks = [
        ((data['store'] == st) & (data['item'] == it)).to_numpy()
        for st, it in pairs
    ]
    keep = np.zeros(len(data), dtype=bool)
    for pair_mask in pair_masks:
        keep |= pair_mask
    multi_store_item_df = data[keep].copy()
    print('After filter:', multi_store_item_df.shape)
    # TODO Monthly avg sales

    # Let us plot this now: one trace per requested pair
    multi_store_item_ts_data = []
    for (st, it), pair_mask in zip(pairs, pair_masks):
        flt = data[pair_mask]
        multi_store_item_ts_data.append(go.Scatter(x=flt['date'], y=flt['sales'], name="Store:" + str(st) + ",Item:" + str(it)))
    py.iplot(multi_store_item_ts_data)
    return multi_store_item_df

# Overlay the training-set sales of several items from store 1 and keep the filtered rows.
multi_store_item_df = sales_monthly(train_df)

Before filter: (913000, 4)
After filter: (7304, 4)


In [20]:
def preprocessing_data(train_data,test_data):
    """Add calendar features to both frames and split the training data.

    Both frames are modified IN PLACE: ``date`` is converted to datetime and
    ``month``, ``day`` and ``year`` columns are added. Later cells index
    ``test_data`` with the returned column list, so they rely on that side
    effect. Note that ``day`` holds the day of the week (``dt.dayofweek``,
    Monday == 0), not the day of the month.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame with a ``date`` column and the target column ``sales``.
    test_data : pandas.DataFrame
        Frame to predict on, with a ``date`` column; its columns define the
        feature list.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y, col)`` where ``test_x`` and
        ``test_y`` are the 20% hold-out slice of ``train_data`` (not rows of
        ``test_data``) and ``col`` is the list of feature column names.
    """
    y = 'sales'

    # Identical feature engineering for both frames.
    for frame in (train_data, test_data):
        frame['date'] = pd.to_datetime(frame['date'])
        frame['month'] = frame['date'].dt.month
        frame['day'] = frame['date'].dt.dayofweek
        frame['year'] = frame['date'].dt.year

    # Features are whatever the test frame offers, minus identifiers and the
    # target itself. The target must be excluded explicitly: once predictions
    # have been written back into test_data as a 'sales' column, re-running
    # this function would otherwise feed the label to the model as a feature.
    col = [i for i in test_data.columns if i not in ('date', 'id', y)]

    # NOTE(review): this is a random row split, so validation rows are
    # interleaved in time with training rows - confirm that is intended for
    # a forecasting task.
    train_x, test_x, train_y, test_y = train_test_split(train_data[col],train_data[y], test_size=0.2, random_state=2018)
    return (train_x, test_x, train_y, test_y,col)

# Add calendar features to train_df/test_df in place and split the training rows 80/20.
# Note: test_x / test_y are the held-out slice of train_df, not rows from test_df.
train_x, test_x, train_y, test_y,col = preprocessing_data(train_df,test_df)

In [6]:
%%time

def train_model(train_x,train_y,test_x,test_y,col):
    """Fit a LightGBM regressor and return the object produced by ``lgb.train``.

    Parameters
    ----------
    train_x, train_y
        Features and target used for fitting.
    test_x, test_y
        Hold-out features and target, monitored during training.
    col
        Accepted for call-site compatibility; not used inside this function.

    Returns
    -------
    The trained model returned by ``lgb.train``.

    Notes
    -----
    Training runs for at most 3000 boosting rounds, stops early after 50
    rounds without improvement, and logs metrics every 50 rounds. Both the
    training set and the hold-out set are passed as ``valid_sets``.
    """
    # Earlier trial values, kept for reference: max_depth 9, num_leaves 39,
    # feature_fraction 0.8108472661400657, bagging_fraction 0.9837558288375402,
    # lambda_l1 0.06, lambda_l2 0.1.
    params = dict(
        nthread=10,
        max_depth=5,
        task='train',
        boosting_type='gbdt',
        objective='regression_l1',
        metric='mape',  # this is abs(a-e)/max(1,a)
        num_leaves=64,
        learning_rate=0.2,
        feature_fraction=0.9,
        bagging_fraction=0.8,
        bagging_freq=5,
        lambda_l1=3.097758978478437,
        lambda_l2=2.9482537987198496,
        verbose=1,
        min_child_weight=6.996211413900573,
        min_split_gain=0.037310344962162616,
    )

    train_set = lgb.Dataset(train_x, train_y)
    valid_set = lgb.Dataset(test_x, test_y)
    monitored = [train_set, valid_set]

    booster = lgb.train(
        params,
        train_set,
        3000,
        valid_sets=monitored,
        early_stopping_rounds=50,
        verbose_eval=50,
    )
    return booster

Wall time: 997 µs


In [7]:
%%time
# Fit the model on the train/hold-out split, then predict for test_df using the feature columns in col.
model = train_model(train_x,train_y,test_x,test_y,col)
y_test = model.predict(test_df[col])


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 730400, number of used features: 5
[LightGBM] [Info] Start training from score 47.000000
Training until validation scores don't improve for 50 rounds
[50]	training's mape: 0.189406	valid_1's mape: 0.189652
[100]	training's mape: 0.153821	valid_1's mape: 0.154188
[150]	training's mape: 0.145924	valid_1's mape: 0.146462
[200]	training's mape: 0.141446	valid_1's mape: 0.142132
[250]	training's mape: 0.137947	valid_1's mape: 0.138808
[300]	training's mape: 0.135904	valid_1's mape: 0.136844
[350]	training's mape: 0.134705	valid_1's mape: 0.135737
[400]	training's mape: 0.134076	valid_1's mape: 0.135189
[450]	training's mape: 0.133576	valid_1's mape: 0.134794
[500]	training's mape: 0.133127	valid_1's mape: 0.134443
[550]	training's mape: 0.132816	valid_1's mape: 0.134214
[600]	traini

[1100]	training's mape: 0.131017	valid_1's mape: 0.133021
[1150]	training's mape: 0.13096	valid_1's mape: 0.133008
Early stopping, best iteration is:
[1115]	training's mape: 0.13099	valid_1's mape: 0.133004
[12.50386182 12.43945988 13.98833984 ... 78.94035431 79.40602315
 84.41339122]
Wall time: 49.8 s


In [18]:
# Store the predictions on test_df as a 'sales' column so the plotting helper, which reads 'sales', can be reused.
test_df['sales'] = y_test

In [19]:
# Plot the predicted sales; the returned frame is the cell's last expression, so it is rendered as output.
sales_monthly(test_df)

Before filter: (45000, 8)
After filter: (360, 8)


Unnamed: 0,id,date,store,item,month,day,year,sales
8100,8100,2018-01-01,1,10,1,0,2018,40.671779
8101,8101,2018-01-02,1,10,1,1,2018,47.670344
8102,8102,2018-01-03,1,10,1,2,2018,47.926499
8103,8103,2018-01-04,1,10,1,3,2018,49.864408
8104,8104,2018-01-05,1,10,1,4,2018,54.359824
...,...,...,...,...,...,...,...,...
35185,35185,2018-03-27,1,40,3,1,2018,24.054300
35186,35186,2018-03-28,1,40,3,2,2018,24.853202
35187,35187,2018-03-29,1,40,3,3,2018,25.923080
35188,35188,2018-03-30,1,40,3,4,2018,26.355870
