In [None]:
pip install holidays

In [9]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import holidays
import time

from Help_Funs import smape, is_holiday

from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import tensorflow as tf


s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/train.csv'
file_key_2 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/test.csv'
file_key_3 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/sample_submission.csv'
file_key_4 = 'Tabular-Playground-Series/Tabular-Playground-Sep-2022/TPSSEP22_GDP_data_2017_to_2021.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

bucket_object_4 = bucket.Object(file_key_4)
file_object_4 = bucket_object_4.get()
file_content_stream_4 = file_object_4.get('Body')

## Reading data-files
train = pd.read_csv(file_content_stream_1)
train['date'] = pd.to_datetime(train['date'], format = '%Y-%m-%d')

test = pd.read_csv(file_content_stream_2)
test['date'] = pd.to_datetime(test['date'], format = '%Y-%m-%d')

submission = pd.read_csv(file_content_stream_3)
country_gdp = pd.read_csv(file_content_stream_4)

## Basic feature engineering 
train['weekday'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['weekend'] = np.where(train['weekday'] >= 5, 1, 0)
train['dayOfMonth'] = train['date'].dt.day
train['dayOfYear'] = train['date'].dt.dayofyear
train['quarter'] = train['date'].dt.quarter
train['year'] = train['date'].dt.year

test['weekday'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['weekend'] = np.where(test['weekday'] >= 5, 1, 0)
test['dayOfMonth'] = test['date'].dt.day
test['dayOfYear'] = test['date'].dt.dayofyear
test['quarter'] = test['date'].dt.quarter
test['year'] = test['date'].dt.year

## Appending GDP
train = pd.merge(train, country_gdp, on = ['country', 'year'], how = 'left')
train = train.drop(columns = ['year'], axis = 1)

test = pd.merge(test, country_gdp, on = ['country', 'year'], how = 'left')
test = test.drop(columns = ['year'], axis = 1)

## Appending holidays
data_holidays = is_holiday(train, test)
train = data_holidays[0]
test = data_holidays[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_temp['is_holiday'][j] = np.where(train_temp['date'][j] in holiday_to_use, 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_temp['holiday_season'][j] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_temp['holiday_season'][j] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_temp

In [10]:
all_data = pd.concat([train, test], axis = 0)

le = LabelEncoder()
cols = ['country', 'store', 'product']
for col in cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col])

scaler = MinMaxScaler()    

all_data = all_data.drop(['date', 'row_id'], axis = 1)
train = all_data.iloc[:70128,:]
test = all_data.iloc[70128:,:].drop(['num_sold'], axis = 1)
test = pd.DataFrame(scaler.fit_transform(test), columns = test.columns)

X = train.drop(['num_sold'], axis = 1)
Y = train['num_sold']

In [11]:
t1 = time.time()
kf = KFold(n_splits = 4, shuffle = True, random_state = 888)
score_list_tf = []
test_preds_tf = []
fold = 1

## Defining model 
model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(16, input_dim = 12, activation = 'relu'),
        tf.keras.layers.Dense(16, activation = 'relu'),
        tf.keras.layers.Dense(1)
])

model.compile(optimizer = 'adam', loss = 'mean_absolute_error')


for train_index, test_index in kf.split(X, Y):
    
    ## Splitting the data
    X_train , X_val = X.iloc[train_index], X.iloc[test_index]  
    Y_train, Y_val = Y.iloc[train_index], Y.iloc[test_index]    
    
    print("X_train shape is :", X_train.shape, "X_val shape is", X_val.shape)
    y_pred_list = []
    
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
    X_val = pd.DataFrame(scaler.fit_transform(X_val), columns = X_val.columns)

    model.fit(X_train, Y_train, verbose = 1, epochs = 20, batch_size = 32, validation_data = (X_val, Y_val))
    result = model.predict(X_val)
    print(result)
    
    result = pd.DataFrame({'pred_num_sold': result})
    result['pred_num_sold'] = [0 if i <= 0 else i for i in result['pred_num_sold']]
    
    Y_val = pd.DataFrame(Y_val.reset_index(drop = True))
    score = smape(Y_val['num_sold'], result['pred_num_sold'])
    print('Fold ', str(fold), ' result is:', score, '\n')
    score_list_tf.append(score)

    test_preds_tf.append(model.predict(test))
    fold += 1

t2 = time.time()
print('TF model with cross validation take : {:.3f} sn.'.format(t2-t1))

X_train shape is : (52596, 12) X_val shape is (17532, 12)
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
[[368.27823 ]
 [ 86.535194]
 [144.74878 ]
 ...
 [225.33092 ]
 [108.736115]
 [224.54227 ]]


ValueError: If using all scalar values, you must pass an index

In [None]:
mean = sum(score_list_tf) / len(score_list_tf)
variance = sum([((x - mean) ** 2) for x in score_list_tf]) / len(score_list_tf)
res = variance ** 0.5
print("Cross validation mean score:", sum(score_list_tf) / len(score_list_tf))
print("Cross validation score's Standart deviation is:", res)

In [None]:
test_preds_tf = pd.DataFrame(np.concatenate(test_preds_tf, axis = 1))
print(test_preds_tf.shape)

test_preds_tf = test_preds_tf.mean(axis = 1)
print(test_preds_tf.head())

In [None]:
submission['num_sold'] = np.exp(test_preds_tf)
submission.to_csv('TF_submission_5.csv', index = False)
submission.head()