In [1]:
pip install lightgbm holidays

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-manylinux1_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m64.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting holidays
  Downloading holidays-0.15-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.3/181.3 KB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Collecting korean-lunar-calendar
  Downloading korean_lunar_calendar-0.2.1-py3-none-any.whl (8.0 kB)
Collecting convertdate>=2.3.0
  Downloading convertdate-2.4.0-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.9/47.9 KB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hijri-converter
  Downloading hijri_converter-2.2.4-py3-none-any.whl (14 kB)
Collecting pymeeus<=1,>=0.3.13
  Downloading PyMeeus-0.5.11.tar.gz (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━

In [59]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import holidays
import time

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor


s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/train.csv'
file_key_2 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/test.csv'
file_key_3 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/sample_submission.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
train['date'] = pd.to_datetime(train['date'], format = '%Y-%m-%d')

test = pd.read_csv(file_content_stream_2)
test['date'] = pd.to_datetime(test['date'], format = '%Y-%m-%d')

submission = pd.read_csv(file_content_stream_3)

## Basic feature engineering 
train['weekday'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['weekend'] = np.where(train['weekday'] >= 5, 1, 0)
train['dayOfMonth'] = train['date'].dt.day
train['dayOfYear'] = train['date'].dt.dayofyear

test['weekday'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['weekend'] = np.where(test['weekday'] >= 5, 1, 0)
test['dayOfMonth'] = test['date'].dt.day
test['dayOfYear'] = test['date'].dt.dayofyear

## Extracting holidays
be_holidays = holidays.BE(years = [2017, 2018, 2019, 2020, 2021])
fr_holidays = holidays.FR(years = [2017, 2018, 2019, 2020, 2021])
de_holidays = holidays.DE(years = [2017, 2018, 2019, 2020, 2021])
it_holidays = holidays.IT(years = [2017, 2018, 2019, 2020, 2021])
pl_holidays = holidays.PL(years = [2017, 2018, 2019, 2020, 2021])
es_holidays = holidays.ES(years = [2017, 2018, 2019, 2020, 2021])

train_list = list()
test_list = list()
countries = ['Belgium', 'France', 'Germany', 'Italy', 'Poland', 'Spain']

for i in range(0, len(countries)):
    
    train_temp = train[train['country'] == countries[i]].reset_index(drop = True)
    train_temp['is_holiday'] = np.nan
    
    test_temp = test[test['country'] == countries[i]].reset_index(drop = True)
    test_temp['is_holiday'] = np.nan
    
    if (i == 0):
        
        holiday_to_use = be_holidays
        
    elif (i == 1):
        
        holiday_to_use = fr_holidays
        
    elif (i == 2):
        
        holiday_to_use = de_holidays
        
    elif (i == 3):
        
        holiday_to_use = it_holidays
        
    elif (i == 4):
        
        holiday_to_use = pl_holidays
        
    else:
        
        holiday_to_use = es_holidays
    
    for j in range(0, train_temp.shape[0]):
        
        train_temp['is_holiday'][j] = np.where(train_temp['date'][j] in holiday_to_use, 1, 0)
        
    train_list.append(train_temp)
    
    for k in range(0, test_temp.shape[0]):
        
        test_temp['is_holiday'][k] = np.where(test_temp['date'][k] in holiday_to_use, 1, 0)
        
    test_list.append(test_temp)
    
## Putting train and test in the right format
train = pd.concat(train_list)
train['is_holiday'] = train['is_holiday'].astype(int)

test = pd.concat(test_list)
test['is_holiday'] = test['is_holiday'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_temp['is_holiday'][j] = np.where(train_temp['date'][j] in holiday_to_use, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_temp['is_holiday'][k] = np.where(test_temp['date'][k] in holiday_to_use, 1, 0)


In [61]:
train.head(10)

Unnamed: 0,row_id,date,country,store,product,num_sold,weekday,month,weekend,dayOfMonth,dayOfYear,is_holiday
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663,6,1,1,1,1,1
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615,6,1,1,1,1,1
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480,6,1,1,1,1,1
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710,6,1,1,1,1,1
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240,6,1,1,1,1,1
5,5,2017-01-01,Belgium,KaggleRama,Kaggle Getting Started,187,6,1,1,1,1,1
6,6,2017-01-01,Belgium,KaggleRama,Kaggle Recipe Book,158,6,1,1,1,1,1
7,7,2017-01-01,Belgium,KaggleRama,Kaggle for Kids: One Smart Goose,267,6,1,1,1,1,1
8,48,2017-01-02,Belgium,KaggleMart,Kaggle Advanced Techniques,514,0,1,0,2,2,0
9,49,2017-01-02,Belgium,KaggleMart,Kaggle Getting Started,408,0,1,0,2,2,0


In [62]:
test.head(10)

Unnamed: 0,row_id,date,country,store,product,weekday,month,weekend,dayOfMonth,dayOfYear,is_holiday
0,70128,2021-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,4,1,0,1,1,1
1,70129,2021-01-01,Belgium,KaggleMart,Kaggle Getting Started,4,1,0,1,1,1
2,70130,2021-01-01,Belgium,KaggleMart,Kaggle Recipe Book,4,1,0,1,1,1
3,70131,2021-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,4,1,0,1,1,1
4,70132,2021-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,4,1,0,1,1,1
5,70133,2021-01-01,Belgium,KaggleRama,Kaggle Getting Started,4,1,0,1,1,1
6,70134,2021-01-01,Belgium,KaggleRama,Kaggle Recipe Book,4,1,0,1,1,1
7,70135,2021-01-01,Belgium,KaggleRama,Kaggle for Kids: One Smart Goose,4,1,0,1,1,1
8,70176,2021-01-02,Belgium,KaggleMart,Kaggle Advanced Techniques,5,1,1,2,2,0
9,70177,2021-01-02,Belgium,KaggleMart,Kaggle Getting Started,5,1,1,2,2,0


In [63]:
from sklearn.preprocessing import LabelEncoder

all_data = pd.concat([train, test], axis = 0)

le = LabelEncoder()
cols = ['country', 'store', 'product']
for col in cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col])

In [64]:
all_data = all_data.drop(['date', 'row_id'], axis = 1)
train = all_data.iloc[:70128,:]
test = all_data.iloc[70128:,:].drop(['num_sold'], axis = 1)

X = train.drop(['num_sold'], axis = 1)
Y = np.log(train['num_sold'])

In [None]:
t1 = time.time()
kf = KFold(n_splits = 4, shuffle = True, random_state = 888)
score_list_lgb = []
test_preds_lgb = []
fold = 1

for train_index, test_index in kf.split(X, Y):
    
    ## Splitting the data
    X_train , X_val = X.iloc[train_index], X.iloc[test_index]  
    Y_train, Y_val = Y.iloc[train_index], Y.iloc[test_index]    
    
    print("X_train shape is :", X_train.shape, "X_val shape is", X_val.shape)
    y_pred_list = []
    
    model_lgb = LGBMRegressor(n_estimators = 5000, 
                              learning_rate = 0.01,
                              num_leaves = 40,
                              max_depth = 9, 
                              lambda_l1 = 3, 
                              lambda_l2 = 1, 
                              bagging_fraction = 0.95, 
                              feature_fraction = 0.96)

    model = model_lgb.fit(X_train, Y_train)
    result = model_lgb.predict(X_val)
    
    result = pd.DataFrame(result)
    result.iloc[:, 0] = [0 if i <= 0 else i for i in result.iloc[:,0]]
    
    score = np.sqrt(mean_squared_error(Y_val, result))
    print('Fold ', str(fold), ' result is:', score, '\n')
    score_list_lgb.append(score)

    test_preds_lgb.append(model_lgb.predict(test))
    fold +=1

t2 = time.time()
print("LGBM model with cross validation take : {:.3f} sn.".format(t2-t1))

X_train shape is : (52596, 9) X_val shape is (17532, 9)
Fold  1  result is: 0.16741492190606994 

X_train shape is : (52596, 9) X_val shape is (17532, 9)
Fold  2  result is: 0.16802381937281866 

X_train shape is : (52596, 9) X_val shape is (17532, 9)
Fold  3  result is: 0.17118008066856474 

X_train shape is : (52596, 9) X_val shape is (17532, 9)


In [70]:
mean = sum(score_list_lgb) / len(score_list_lgb)
variance = sum([((x - mean) ** 2) for x in score_list_lgb]) / len(score_list_lgb)
res = variance ** 0.5
print("Cross validation mean score:", sum(score_list_lgb) / len(score_list_lgb))
print("Cross validation score's Standart deviation is:", res)

Cross validation mean score: 0.16836193225141136
Cross validation score's Standart deviation is: 0.002159624447095771


In [20]:
test_preds_lgb = pd.DataFrame(test_preds_lgb)
print(test_preds_lgb.shape)

test_preds_lgb = test_preds_lgb.mean(axis = 0)
print(test_preds_lgb.head(5))

(4, 17520)
0    6.304793
1    6.019955
2    5.862366
3    6.342428
4    5.261072
dtype: float64


In [21]:
submission['num_sold'] = np.exp(test_preds_lgb)
submission.to_csv('LightGBM_submission_6.csv', index = False)
submission.head()

Unnamed: 0,row_id,num_sold
0,70128,547.188256
1,70129,411.559895
2,70130,351.555074
3,70131,568.174115
4,70132,192.687941
