In [2]:
pip install holidays

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting holidays
  Using cached holidays-0.16-py3-none-any.whl (184 kB)
Collecting korean-lunar-calendar
  Using cached korean_lunar_calendar-0.3.1-py3-none-any.whl (9.0 kB)
Collecting hijri-converter
  Using cached hijri_converter-2.2.4-py3-none-any.whl (14 kB)
Collecting convertdate>=2.3.0
  Using cached convertdate-2.4.0-py3-none-any.whl (47 kB)
Collecting pymeeus<=1,>=0.3.13
  Using cached PyMeeus-0.5.11-py3-none-any.whl
Installing collected packages: pymeeus, korean-lunar-calendar, hijri-converter, convertdate, holidays
Successfully installed convertdate-2.4.0 hijri-converter-2.2.4 holidays-0.16 korean-lunar-calendar-0.3.1 pymeeus-0.5.11
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/tensorflow2_p38/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import holidays
import time

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import tensorflow as tf


s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/train.csv'
file_key_2 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/test.csv'
file_key_3 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/sample_submission.csv'
file_key_4 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/TPSSEP22_GDP_data_2017_to_2021.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

bucket_object_4 = bucket.Object(file_key_4)
file_object_4 = bucket_object_4.get()
file_content_stream_4 = file_object_4.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
train['date'] = pd.to_datetime(train['date'], format = '%Y-%m-%d')

test = pd.read_csv(file_content_stream_2)
test['date'] = pd.to_datetime(test['date'], format = '%Y-%m-%d')

submission = pd.read_csv(file_content_stream_3)
country_gdp = pd.read_csv(file_content_stream_4)

## Basic feature engineering 
train['weekday'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['weekend'] = np.where(train['weekday'] >= 5, 1, 0)
train['dayOfMonth'] = train['date'].dt.day
train['dayOfYear'] = train['date'].dt.dayofyear
train['year'] = train['date'].dt.year

test['weekday'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['weekend'] = np.where(test['weekday'] >= 5, 1, 0)
test['dayOfMonth'] = test['date'].dt.day
test['dayOfYear'] = test['date'].dt.dayofyear
test['year'] = test['date'].dt.year

## Appending GDP
train = pd.merge(train, country_gdp, on = ['country', 'year'], how = 'left')
train = train.drop(columns = ['year'], axis = 1)

test = pd.merge(test, country_gdp, on = ['country', 'year'], how = 'left')
test = test.drop(columns = ['year'], axis = 1)

## Extracting holidays
be_holidays = holidays.BE(years = [2017, 2018, 2019, 2020, 2021])
fr_holidays = holidays.FR(years = [2017, 2018, 2019, 2020, 2021])
de_holidays = holidays.DE(years = [2017, 2018, 2019, 2020, 2021])
it_holidays = holidays.IT(years = [2017, 2018, 2019, 2020, 2021])
pl_holidays = holidays.PL(years = [2017, 2018, 2019, 2020, 2021])
es_holidays = holidays.ES(years = [2017, 2018, 2019, 2020, 2021])

train_list = list()
test_list = list()
countries = ['Belgium', 'France', 'Germany', 'Italy', 'Poland', 'Spain']

for i in range(0, len(countries)):
    
    train_temp = train[train['country'] == countries[i]].reset_index(drop = True)
    train_temp['is_holiday'] = np.nan
    
    test_temp = test[test['country'] == countries[i]].reset_index(drop = True)
    test_temp['is_holiday'] = np.nan
    
    if (i == 0):
        
        holiday_to_use = be_holidays
        
    elif (i == 1):
        
        holiday_to_use = fr_holidays
        
    elif (i == 2):
        
        holiday_to_use = de_holidays
        
    elif (i == 3):
        
        holiday_to_use = it_holidays
        
    elif (i == 4):
        
        holiday_to_use = pl_holidays
        
    else:
        
        holiday_to_use = es_holidays
    
    for j in range(0, train_temp.shape[0]):
        
        train_temp['is_holiday'][j] = np.where(train_temp['date'][j] in holiday_to_use, 1, 0)
        
    train_list.append(train_temp)
    
    for k in range(0, test_temp.shape[0]):
        
        test_temp['is_holiday'][k] = np.where(test_temp['date'][k] in holiday_to_use, 1, 0)
        
    test_list.append(test_temp)
    
## Putting train and test in the right format
train = pd.concat(train_list)
train['is_holiday'] = train['is_holiday'].astype(int)

test = pd.concat(test_list)
test['is_holiday'] = test['is_holiday'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_temp['is_holiday'][j] = np.where(train_temp['date'][j] in holiday_to_use, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_temp['is_holiday'][k] = np.where(test_temp['date'][k] in holiday_to_use, 1, 0)


In [4]:
all_data = pd.concat([train, test], axis = 0)

le = LabelEncoder()
cols = ['country', 'store', 'product']
for col in cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col])

scaler = MinMaxScaler()    

all_data = all_data.drop(['date', 'row_id'], axis = 1)
train = all_data.iloc[:70128,:]
test = all_data.iloc[70128:,:].drop(['num_sold'], axis = 1)
test = pd.DataFrame(scaler.fit_transform(test), columns = test.columns)

X = train.drop(['num_sold'], axis = 1)
Y = np.log(train['num_sold'])

t1 = time.time()
kf = KFold(n_splits = 4, shuffle = True, random_state = 888)
score_list_tf = []
test_preds_tf = []
fold = 1

## Defining model 
model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(16, input_dim = 10, activation = 'relu'),
        tf.keras.layers.Dense(16, activation = 'relu'),
        tf.keras.layers.Dense(1)
])

model.compile(optimizer = 'adam', loss = 'mean_squared_error')


for train_index, test_index in kf.split(X, Y):
    
    ## Splitting the data
    X_train , X_val = X.iloc[train_index], X.iloc[test_index]  
    Y_train, Y_val = Y.iloc[train_index], Y.iloc[test_index]    
    
    print("X_train shape is :", X_train.shape, "X_val shape is", X_val.shape)
    y_pred_list = []
    
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_val = pd.DataFrame(scaler.fit_transform(X_val), columns = X_val.columns)

    model.fit(X_train, Y_train, verbose = 1, epochs = 20, validation_data = (X_val, Y_val))
    result = model.predict(X_val)
    
    result = pd.DataFrame(result)
    result.iloc[:, 0] = [0 if i <= 0 else i for i in result.iloc[:,0]]
    
    score = np.sqrt(mean_squared_error(Y_val, result))
    print('Fold ', str(fold), ' result is:', score, '\n')
    score_list_tf.append(score)

    test_preds_tf.append(model.predict(test))
    fold += 1

t2 = time.time()
print('TF model with cross validation take : {:.3f} sn.'.format(t2-t1))

2022-09-17 15:56:34.221618: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-09-17 15:56:34.221686: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (ip-172-16-89-68.ec2.internal): /proc/driver/nvidia/version does not exist
2022-09-17 15:56:34.223634: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


X_train shape is : (52596, 10) X_val shape is (17532, 10)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold  1  result is: 0.22458589892699388 

X_train shape is : (52596, 10) X_val shape is (17532, 10)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold  2  result is: 0.2181329994732533 

X_train shape is : (52596, 10) X_val shape is (17532, 10)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold  3  result is: 0.2107229327707961 

X_train s

In [5]:
mean = sum(score_list_tf) / len(score_list_tf)
variance = sum([((x - mean) ** 2) for x in score_list_tf]) / len(score_list_tf)
res = variance ** 0.5
print("Cross validation mean score:", sum(score_list_tf) / len(score_list_tf))
print("Cross validation score's Standart deviation is:", res)

Cross validation mean score: 0.21325685613896875
Cross validation score's Standart deviation is: 0.009293118115544122


In [6]:
test_preds_tf = pd.DataFrame(np.concatenate(test_preds_tf, axis = 1))
print(test_preds_tf.shape)

test_preds_tf = test_preds_tf.mean(axis = 1)
print(test_preds_tf.head())

(17520, 4)
0    6.168720
1    5.864390
2    5.729867
3    6.246539
4    5.156485
dtype: float32


In [7]:
submission['num_sold'] = np.exp(test_preds_tf)
submission.to_csv('TF_submission_2.csv', index = False)
submission.head()

Unnamed: 0,row_id,num_sold
0,70128,477.574554
1,70129,352.267181
2,70130,307.928284
3,70131,516.2229
4,70132,173.55336
