In [1]:
import os
import sys

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Directory the input CSV files are read from (relative path).
PATH_DATASET = '../Dataset/'
# Directory under which the submission CSV is written (relative path).
PATH_RESULTS = '../Results/'

In [2]:
# Item metadata, keyed by I100; preprocess() joins it onto the sales rows.
item_csv = os.path.join(PATH_DATASET, 'meta-item.csv')
df_item = pd.read_csv(item_csv)
df_item.head(2)

Unnamed: 0,I100,I101,I102,I103
0,0,2,1,1
1,1,2,1,1


In [3]:
# Store metadata, keyed by S100; preprocess() joins it onto the sales rows.
store_csv = os.path.join(PATH_DATASET, 'meta-store.csv')
df_store = pd.read_csv(store_csv)
df_store.head(2)

Unnamed: 0,S100,S101,S102,S103
0,0,1,17,10
1,1,1,18,10


In [4]:
# Historical sales: the training data, with QTT as the target column.
sales_csv = os.path.join(PATH_DATASET, 'sales.csv')
df_sales = pd.read_csv(sales_csv)
print(df_sales.columns)
df_sales.head(2)

Index(['DATE', 'S100', 'I100', 'C100', 'C101', 'QTT'], dtype='object')


Unnamed: 0,DATE,S100,I100,C100,C101,QTT
0,2017-01-08,0,0,12,76,2
1,2017-01-08,0,0,12,149,3


In [5]:
# Submission template: one ID per row to predict, plus a QTT column to fill in.
submission_csv = os.path.join(PATH_DATASET, 'submission_sample.csv')
df_submission_sample = pd.read_csv(submission_csv)
df_submission_sample.head(2)

Unnamed: 0,ID,QTT
0,2021-10-03_0_1_12_140,-1
1,2021-10-03_0_1_12_164,-1


In [6]:
# The ID is '<DATE>_<S100>_<I100>_<C100>_<C101>'; unpack it into the same key
# columns the sales data uses so both frames can share one feature pipeline.
id_parts = df_submission_sample['ID'].str.split('_', expand=True)
df_submission_sample[['DATE', 'S100', 'I100', 'C100', 'C101']] = id_parts


In [7]:
# Check that the ID was unpacked into DATE / S100 / I100 / C100 / C101.
df_submission_sample.head(2)

Unnamed: 0,ID,QTT,DATE,S100,I100,C100,C101
0,2021-10-03_0_1_12_140,-1,2021-10-03,0,1,12,140
1,2021-10-03_0_1_12_164,-1,2021-10-03,0,1,12,164


In [8]:
def preprocess(df, items=None, stores=None):
    """Add calendar features and item/store metadata to a sales-like frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DATE`` column that ``pd.to_datetime`` can parse and the
        key columns ``S100``, ``I100``, ``C100`` and ``C101`` (integers or
        strings holding integers). The frame itself is not modified.
    items : pandas.DataFrame, optional
        Item metadata keyed by ``I100``. Defaults to the notebook-level
        ``df_item``.
    stores : pandas.DataFrame, optional
        Store metadata keyed by ``S100``. Defaults to the notebook-level
        ``df_store``.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly one row per input row, in input order, with
        integer ``year``/``month``/``day`` columns, integer key columns, the
        metadata columns appended, and ``DATE`` normalised to ``YYYY-MM-DD``
        strings. Rows whose item or store has no metadata are kept, with NaN
        in the corresponding metadata columns.

    Raises
    ------
    pandas.errors.MergeError
        If a metadata table lists the same key more than once, which would
        otherwise silently duplicate rows.
    """
    items = df_item if items is None else items
    stores = df_store if stores is None else stores

    # Work on a copy: the caller's frame must not gain columns as a side effect.
    out = df.copy()

    dates = pd.to_datetime(out['DATE'])
    out['year'] = dates.dt.year.astype(int)
    out['month'] = dates.dt.month.astype(int)
    out['day'] = dates.dt.day.astype(int)
    # Normalise DATE before merging; merge resets the index, so assigning the
    # parsed dates afterwards could misalign on a non-default input index.
    out['DATE'] = dates.dt.strftime('%Y-%m-%d')

    # Keys may arrive as strings (e.g. split out of a submission ID).
    for key in ['S100', 'I100', 'C100', 'C101']:
        out[key] = out[key].astype(str).astype(int)

    # Left joins keep every input row (an inner join would drop rows with
    # unknown items/stores, losing submission IDs); validate guards against
    # duplicated metadata keys multiplying rows.
    out = out.merge(items, on=['I100'], how='left', validate='many_to_one')
    out = out.merge(stores, on=['S100'], how='left', validate='many_to_one')

    return out

In [9]:
# Run the training data and the submission rows through the same feature
# pipeline so both expose the columns the model is trained on.
# NOTE: both names are rebound to the processed frames, so run this cell once
# after loading; re-running it would feed already-processed frames back into
# preprocess().
df_sales = preprocess(df_sales)
df_submission_sample = preprocess(df_submission_sample)

In [10]:
# Inspect one processed row: date parts and item/store metadata should be present.
df_sales.head(1)

Unnamed: 0,DATE,S100,I100,C100,C101,QTT,year,month,day,I101,I102,I103,S101,S102,S103
0,2017-01-08,0,0,12,76,2,2017,1,8,2,1,1,1,17,10


In [11]:
# Model inputs: raw keys, calendar parts, then item and store metadata.
features_names = [
    'S100', 'I100', 'C100', 'C101',
    'month', 'year', 'day',
    'I101', 'I102', 'I103',
    'S101', 'S102', 'S103',
]

# Target first, then the two feature matrices (independent copies of the
# selected columns, so later edits cannot touch the source frames).
y_train = df_sales['QTT']
X_train = df_sales.loc[:, features_names].copy()
X_submission = df_submission_sample.loc[:, features_names].copy()

In [12]:
# 70 % train / 15 % validation / 15 % test.
# Split from df_sales directly instead of from X_train/y_train: this cell
# rebinds those names, so splitting them "into themselves" would shrink the
# training set a little more on every re-run. Sourcing from df_sales keeps the
# cell idempotent while producing the same partition.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df_sales[features_names], df_sales['QTT'], test_size=0.3, random_state=42)
# Halve the 30 % hold-out into validation (early stopping) and test (final score).
X_validation, X_test, y_validation, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42)
print('X_train      :',X_train.shape)
print('X_validation :',X_validation.shape)
print('X_test       :',X_test.shape)


X_train      : (1924558, 13)
X_validation : (412405, 13)
X_test       : (412406, 13)


In [26]:
# Keyword arguments forwarded to lgb.LGBMRegressor.
hyper_params = dict(
    task='train',
    boosting_type='gbdt',
    objective='regression',
    metric=['rmse'],
    learning_rate=0.1,
    # Column / row subsampling; rows are re-sampled every `bagging_freq` rounds.
    feature_fraction=0.9,
    bagging_fraction=0.7,
    bagging_freq=10,
    verbose=0,
    # Tree shape limits.
    max_depth=10,
    num_leaves=128,
    max_bin=512,
    # Upper bound on boosting rounds; fit() uses early stopping on the
    # validation set, so training normally ends well before this.
    num_iterations=8000,
)

In [27]:
# Build the (unfitted) scikit-learn style LightGBM regressor from hyper_params.
gbm = lgb.LGBMRegressor(**hyper_params)

In [28]:
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were
# deprecated in LightGBM 3.3 and removed in 4.0.
# `log_evaluation` replaced `print_evaluation` in 3.3; fall back to the old
# name so the cell also runs on earlier releases.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

gbm.fit(
    X_train, y_train,
    eval_set=[(X_validation, y_validation)],
    eval_metric='rmse',
    callbacks=[
        # Stop once validation RMSE has not improved for 30 rounds.
        lgb.early_stopping(stopping_rounds=30),
        # Print the validation metric every 50 rounds.
        log_evaluation(period=50),
    ],
)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[50]	valid_0's rmse: 2.35995
[100]	valid_0's rmse: 2.29117
[150]	valid_0's rmse: 2.25971
[200]	valid_0's rmse: 2.23708
[250]	valid_0's rmse: 2.22312
[300]	valid_0's rmse: 2.20523
[350]	valid_0's rmse: 2.19034
[400]	valid_0's rmse: 2.18184
[450]	valid_0's rmse: 2.17554
[500]	valid_0's rmse: 2.16553
[550]	valid_0's rmse: 2.15663
[600]	valid_0's rmse: 2.15219
[650]	valid_0's rmse: 2.14766
[700]	valid_0's rmse: 2.14186
[750]	valid_0's rmse: 2.13992
[800]	valid_0's rmse: 2.13724
[850]	valid_0's rmse: 2.13418
[900]	valid_0's rmse: 2.13189
[950]	valid_0's rmse: 2.12973
[1000]	valid_0's rmse: 2.12773
[1050]	valid_0's rmse: 2.12616
[1100]	valid_0's rmse: 2.12428


LGBMRegressor(bagging_fraction=0.7, bagging_freq=10, feature_fraction=0.9,
              max_bin=512, max_depth=10, metric=['rmse'], num_iterations=8000,
              num_leaves=128, objective='regression', task='train', verbose=0)

In [29]:
# Score every split (and the submission rows) with the model truncated at the
# iteration early stopping selected.
best_iteration = gbm.best_iteration_
X_train_pred, X_validation_pred, X_test_pred, X_submission_pred = (
    gbm.predict(features, num_iteration=best_iteration)
    for features in (X_train, X_validation, X_test, X_submission)
)

In [32]:
# RMSE per split, computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated and later removed from scikit-learn,
# whereas this form works on every version.
df_train_rmse = np.sqrt(mean_squared_error(y_train, X_train_pred))
df_val_rmse = np.sqrt(mean_squared_error(y_validation, X_validation_pred))
df_test_rmse = np.sqrt(mean_squared_error(y_test, X_test_pred))

# The reported values are root-mean-squared errors, so label them as such.
print('Final score RMSE')
print('Score train:',df_train_rmse)
print('Score val  :',df_val_rmse)
print('Score test :',df_test_rmse)

Final score mean_squared_error
Score train: 1.9949069554186738
Score val  : 2.1238996593838664
Score test : 2.3451908017645593


In [33]:
# Replace the QTT column of the submission frame with the model predictions.
# X_submission was selected from this same frame, so the prediction array lines
# up with its rows by position.
df_submission_sample['QTT'] = X_submission_pred
df_submission_sample.head()

Unnamed: 0,ID,QTT,DATE,S100,I100,C100,C101,year,month,day,I101,I102,I103,S101,S102,S103
0,2021-10-03_0_1_12_140,1.262511,2021-10-03,0,1,12,140,2021,10,3,2,1,1,1,17,10
1,2021-10-03_0_1_12_164,1.200621,2021-10-03,0,1,12,164,2021,10,3,2,1,1,1,17,10
2,2021-10-03_0_1_12_339,1.065869,2021-10-03,0,1,12,339,2021,10,3,2,1,1,1,17,10
3,2021-10-03_0_1_13_128,1.444024,2021-10-03,0,1,13,128,2021,10,3,2,1,1,1,17,10
4,2021-10-03_0_1_14_164,1.43507,2021-10-03,0,1,14,164,2021,10,3,2,1,1,1,17,10


In [38]:
# The local test RMSE (2 decimals) is embedded in the file name so submissions
# from different runs can be told apart.
filename = os.path.join(PATH_RESULTS,'submissions','submission_'+str(np.round(df_test_rmse,2))+'_local.csv')
# to_csv does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)
print('Saving submission ',filename)
# Only ID and QTT belong in the submission; the row index is not written.
df_submission_sample[['ID','QTT']].to_csv(filename,index=False)

Saving submission  ../Results/submissions/submission_2.35_local.csv


In [39]:
# Sanity check: (rows, columns) of the submission frame after preprocessing.
df_submission_sample.shape

(69121, 16)