In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-storm-30/validation_data.csv
/kaggle/input/data-storm-30/train_data.csv
/kaggle/input/data-storm-30/test_data.csv
/kaggle/input/data-storm-30/DataStorm3.0_KaggleProblemStatement.pdf


In [2]:
retail_file_path = "/kaggle/input/data-storm-30/train_data.csv"
df = pd.read_csv(retail_file_path)
df.describe()

Unnamed: 0,ItemCode,DailySales
count,19921.0,19921.0
mean,611061.0,7.35189
std,454986.8,14.605342
min,3418.0,1.0
25%,117610.0,2.0
50%,837943.0,3.0
75%,1058713.0,7.0
max,1105027.0,434.0


In [3]:
data_one = df.loc[df['ItemCode'] == 145978]
data_one

Unnamed: 0,CategoryCode,ItemCode,DateID,DailySales
6,category_1,145978,10/30/2021,3
55,category_1,145978,12/9/2021,4
231,category_1,145978,10/23/2021,4
410,category_1,145978,12/7/2021,11
426,category_1,145978,1/24/2022,3
...,...,...,...,...
19230,category_1,145978,11/7/2021,3
19377,category_1,145978,10/10/2021,4
19514,category_1,145978,11/5/2021,1
19665,category_1,145978,10/2/2021,3


In [4]:
dates_col = pd.to_datetime(df["DateID"], format = "%m/%d/%Y")
added = pd.DataFrame({"month": dates_col.dt.month,
              "day": dates_col.dt.day,
              "yearday": dates_col.dt.dayofyear,
              "weekday": dates_col.dt.dayofweek
             })
is_weekend = added["weekday"].map(lambda x: int(x in [5,6]))
added["isweekend"] = is_weekend
added

Unnamed: 0,month,day,yearday,weekday,isweekend
0,11,6,310,5,1
1,11,18,322,3,0
2,1,24,24,0,0
3,10,30,303,5,1
4,10,30,303,5,1
...,...,...,...,...,...
19916,10,4,277,0,0
19917,10,12,285,1,0
19918,10,1,274,4,0
19919,10,1,274,4,0


In [5]:
df_with_dates = pd.concat([df, added], axis = 1)
df_with_dates = df_with_dates.drop("DateID", axis = 1)

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [7]:
object_cols = ["CategoryCode"]

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(df_with_dates[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = df_with_dates.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = df_with_dates.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
new_df = pd.concat([num_X_train, OH_cols_train], axis=1)

output = new_df["DailySales"]
new_df = new_df.drop("DailySales", axis = 1)

In [8]:
scaler = MinMaxScaler()
scaled_inp_df = pd.DataFrame(scaler.fit_transform(new_df))
scaled_inp_df.columns = new_df.columns
scaled_inp_df.head()



Unnamed: 0,ItemCode,month,day,yearday,weekday,isweekend,0,1,2,3
0,0.103659,0.909091,0.166667,0.848901,0.833333,1.0,0.0,1.0,0.0,0.0
1,0.756317,0.909091,0.566667,0.881868,0.5,0.0,0.0,0.0,0.0,1.0
2,0.332947,0.0,0.766667,0.063187,0.0,0.0,1.0,0.0,0.0,0.0
3,0.153561,0.818182,0.966667,0.82967,0.833333,1.0,0.0,1.0,0.0,0.0
4,0.910115,0.818182,0.966667,0.82967,0.833333,1.0,0.0,1.0,0.0,0.0


In [9]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

In [10]:
class SalesDataset(Dataset):
    
    def __init__(self, df, output = None, mode = 'train'):
        self.mode = mode
        self.df = df
        
        print(df.head())
        self.inp = df.values
        
        if self.mode == 'train':
            self.oup = output.values.reshape((len(df), 1))
            
    def __len__(self):
        return len(self.inp)
    
    def __getitem__(self, idx):
        result = {}
        result['inp'] = torch.Tensor(self.inp[idx])
        if self.mode == 'train':
            result['oup'] = torch.Tensor(self.oup[idx])
            
        return result
    
    def column_count(self):
        return self.inp.shape[1]

In [11]:
BATCH_SIZE = 32
EPOCHS = 200
VALID_SIZE = 0

data = SalesDataset(scaled_inp_df, output)
sub_train, sub_valid = torch.utils.data.random_split(data, [len(data) - VALID_SIZE, VALID_SIZE])
data_train = DataLoader(dataset = sub_train, batch_size = BATCH_SIZE, shuffle = False)
#data_valid = DataLoader(dataset = sub_valid, batch_size = VALID_SIZE, shuffle = False)

   ItemCode     month       day   yearday   weekday  isweekend    0    1    2  \
0  0.103659  0.909091  0.166667  0.848901  0.833333        1.0  0.0  1.0  0.0   
1  0.756317  0.909091  0.566667  0.881868  0.500000        0.0  0.0  0.0  0.0   
2  0.332947  0.000000  0.766667  0.063187  0.000000        0.0  1.0  0.0  0.0   
3  0.153561  0.818182  0.966667  0.829670  0.833333        1.0  0.0  1.0  0.0   
4  0.910115  0.818182  0.966667  0.829670  0.833333        1.0  0.0  1.0  0.0   

     3  
0  0.0  
1  1.0  
2  0.0  
3  0.0  
4  0.0  


In [12]:
layers = [data.column_count(), 32, 32, 4, 1]

class Network(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(layers[0], layers[1])
        self.fc2 = nn.Linear(layers[1], layers[2])
        self.fc3 = nn.Linear(layers[2], layers[3])
        self.fc4 = nn.Linear(layers[3], layers[4])
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return x

In [13]:
def train(model, x, y, optimizer, criterion):
    
    # set all gradients to 0
    model.zero_grad()
    # forward pass
    output = model(x)   # this implicitly calls forward()
    # calculate loss
    loss = criterion(output, y)
    # backward pass
    loss.backward()
    # update parameters
    optimizer.step()
    
    return loss, output

In [14]:
def predict(model, x):
    return model(x)
    

In [15]:
model = Network()
batch = next(iter(data_train))
x = batch['inp']
y = batch['oup']
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
criterion = nn.MSELoss()     # loss function

In [16]:
model = Network()

optimizer = torch.optim.Adam(model.parameters(), lr = 0.005)
criterion = nn.MSELoss()     # loss function

# Training loop
losses = []
for epoch in range(50):
    epoch_loss = 0
    correct = 0
    
    for bidx, batch in enumerate(data_train):
        x_train, y_train = batch['inp'], batch['oup']
        loss, predictions = train(model, x_train, y_train, optimizer, criterion)
        
        correct += ((y_train - predictions).abs() / y_train.abs()).sum()
        epoch_loss += loss
        
    accuracy = correct / len(sub_train)
    losses.append(float(epoch_loss))
    if (epoch + 1)%1 == 0:
        print('Epoch {} Accuracy: {} Loss: {}'.format(epoch + 1, accuracy, epoch_loss))
        

Epoch 1 Accuracy: 1.8597270250320435 Loss: 122471.1796875
Epoch 2 Accuracy: 1.8655589818954468 Loss: 115995.7109375
Epoch 3 Accuracy: 1.860190749168396 Loss: 115379.515625
Epoch 4 Accuracy: 1.856542944908142 Loss: 115028.4609375
Epoch 5 Accuracy: 1.8540433645248413 Loss: 114764.109375
Epoch 6 Accuracy: 1.856067180633545 Loss: 114550.1328125
Epoch 7 Accuracy: 1.8599222898483276 Loss: 114399.3046875
Epoch 8 Accuracy: 1.8626254796981812 Loss: 114317.8203125
Epoch 9 Accuracy: 1.8622721433639526 Loss: 114232.0625
Epoch 10 Accuracy: 1.8628348112106323 Loss: 114151.109375
Epoch 11 Accuracy: 1.862570881843567 Loss: 114069.78125
Epoch 12 Accuracy: 1.8651140928268433 Loss: 113879.6484375
Epoch 13 Accuracy: 1.8764081001281738 Loss: 112772.53125
Epoch 14 Accuracy: 1.8875840902328491 Loss: 109933.78125
Epoch 15 Accuracy: 1.8624402284622192 Loss: 106708.2734375
Epoch 16 Accuracy: 1.8507050275802612 Loss: 103274.0546875
Epoch 17 Accuracy: 1.8285667896270752 Loss: 99446.1953125
Epoch 18 Accuracy: 1.81

In [17]:
predictions

tensor([[ 7.8405],
        [ 5.3172],
        [ 5.0098],
        [ 4.7122],
        [ 4.9364],
        [ 4.7939],
        [ 7.5320],
        [13.3868],
        [ 6.5893],
        [ 3.8189],
        [ 3.8189],
        [ 6.2803],
        [12.2454],
        [ 3.9222],
        [ 8.9310],
        [ 9.1811],
        [ 4.9535]], grad_fn=<AddmmBackward>)

In [18]:
'''
w1: 14th February 2022 – 20th February 2022
ii. w2: 21st February 2022 – 27th February 2022
iii. w3: 28th February 2022 – 6
th March 2022
iv. w4: 7th March 2022 – 13th March 2022
'''

weeks = {'w1': pd.date_range('2022-02-14', '2022-02-20'),
'w2': pd.date_range('2022-02-21', '2022-02-27'),
'w3': pd.date_range('2022-02-28', '2022-03-06'),
'w4': pd.date_range('2022-03-07', '2022-03-13')}


In [19]:
def prepare_test(path, mode = 'valid'):
    val_df = pd.read_csv(path)

    rows_list = []
    for i, row in val_df.iterrows():
        for date in weeks[row['Week']]:
            rowdata = {}
            rowdata['CategoryCode'] = row['CategoryCode']
            rowdata['ItemCode'] = row['ItemCode']
            rowdata['DateID'] = date
            rows_list.append(rowdata)

    df = pd.DataFrame(rows_list)

    dates_col = df["DateID"]
    added = pd.DataFrame({"month": dates_col.dt.month,
                  "day": dates_col.dt.day,
                  "yearday": dates_col.dt.dayofyear,
                  "weekday": dates_col.dt.dayofweek
                 })
    is_weekend = added["weekday"].map(lambda x: int(x in [5,6]))
    added["isweekend"] = is_weekend

    df_with_dates = pd.concat([df, added], axis = 1)
    df_with_dates = df_with_dates.drop("DateID", axis = 1)
    
    object_cols = ["CategoryCode"]

    OH_cols_train = pd.DataFrame(OH_encoder.transform(df_with_dates[object_cols]))

    # One-hot encoding removed index; put it back
    OH_cols_train.index = df_with_dates.index

    # Remove categorical columns (will replace with one-hot encoding)
    num_X_train = df_with_dates.drop(object_cols, axis=1)

    # Add one-hot encoded columns to numerical features
    new_df = pd.concat([num_X_train, OH_cols_train], axis=1)
    
    scaled_inp_df = pd.DataFrame(scaler.transform(new_df))
    scaled_inp_df.columns = new_df.columns
    
    print(scaled_inp_df.head())
    
    return val_df, scaled_inp_df



In [20]:
def convert_to_weeks(val_df, predictions):
    weekly = []
    for i in range(0, len(predictions), 7):
        weekly.append(round(float(predictions[i: i+7].sum())))
    val_df['WeeklySales'] = weekly
        

In [21]:
val_df, scaled_inp_df = prepare_test('/kaggle/input/data-storm-30/test_data.csv')
val_data = SalesDataset(scaled_inp_df, mode = 'test')
data_valid = DataLoader(dataset = val_data, batch_size = len(val_data), shuffle = False)

   ItemCode     month       day   yearday   weekday  isweekend    0    1    2  \
0  0.036601  0.181818  0.200000  0.178571  0.000000        0.0  1.0  0.0  0.0   
1  0.036601  0.181818  0.233333  0.181319  0.166667        0.0  1.0  0.0  0.0   
2  0.036601  0.181818  0.266667  0.184066  0.333333        0.0  1.0  0.0  0.0   
3  0.036601  0.181818  0.300000  0.186813  0.500000        0.0  1.0  0.0  0.0   
4  0.036601  0.181818  0.333333  0.189560  0.666667        0.0  1.0  0.0  0.0   

     3  
0  0.0  
1  0.0  
2  0.0  
3  0.0  
4  0.0  
   ItemCode     month       day   yearday   weekday  isweekend    0    1    2  \
0  0.036601  0.181818  0.200000  0.178571  0.000000        0.0  1.0  0.0  0.0   
1  0.036601  0.181818  0.233333  0.181319  0.166667        0.0  1.0  0.0  0.0   
2  0.036601  0.181818  0.266667  0.184066  0.333333        0.0  1.0  0.0  0.0   
3  0.036601  0.181818  0.300000  0.186813  0.500000        0.0  1.0  0.0  0.0   
4  0.036601  0.181818  0.333333  0.189560  0.666667   



In [22]:
valid = next(iter(data_valid))
x_valid = valid['inp']
predictions = predict(model, x_valid)/3

In [23]:
convert_to_weeks(val_df, predictions)
val_df['final'] = val_df.apply(lambda row: row['CategoryCode'] + '_' + str(row['ItemCode']) + '_' + row['Week'], axis = 1)
val_df


Unnamed: 0,CategoryCode,ItemCode,Week,PredictedSales,WeeklySales,final
0,category_1,43738,w4,,13,category_1_43738_w4
1,category_2,1006090,w1,,9,category_2_1006090_w1
2,category_2,1076929,w4,,9,category_2_1076929_w4
3,category_1,1081321,w3,,11,category_1_1081321_w3
4,category_2,216151,w4,,9,category_2_216151_w4
...,...,...,...,...,...,...
372,category_2,1101571,w1,,9,category_2_1101571_w1
373,category_2,1090258,w4,,9,category_2_1090258_w4
374,category_2,906595,w1,,9,category_2_906595_w1
375,category_2,32245,w1,,11,category_2_32245_w1


In [24]:
submission_df = pd.DataFrame({'ID': val_df['final'], 'WeeklySales': val_df['WeeklySales']})
submission_df.to_csv('submission.csv', index=False)