In [22]:
import pandas as pd
import matplotlib.pyplot as plt
# Modeling
from neuralforecast import NeuralForecast
from neuralforecast.models import LSTM #DeepAR,NHITS,RNN,

import torch
from typing import Union
from neuralforecast.losses.pytorch import MSE
from neuralforecast.losses.pytorch import BasePointLoss
from neuralforecast import NeuralForecast
from neuralforecast.models import LSTM

from neuralforecast.utils import augment_calendar_df

from utils.losses import customLoss

In [57]:
# Load the raw sales history; the preview below shows one row per
# Country / Product / Month with its Quantity.
df = pd.read_csv('data/01_input_history.csv')
df.head()

Unnamed: 0,Country,Product,Month,Quantity
0,Japan,MorningMint,Jan2004,0
1,Japan,MorningMint,Feb2004,0
2,Japan,MorningMint,Mar2004,0
3,Japan,MorningMint,Apr2004,0
4,Japan,MorningMint,May2004,0


In [58]:
# Pre-process the raw history with the project helper, which returns four frames.
# NOTE(review): judging by the names these are the null / inactive / active training
# series plus a validation set; the split rules live in utils.preprocess - confirm there.
from utils import preprocess
df_train_null, df_train_inactive, df_train_active, df_validation = preprocess.preprocess_ex1(df)

In [25]:
# Combine the active and inactive training series into one frame.
# All listed columns act as join keys, so the outer join keeps the rows of both inputs.
merge_keys = ['unique_id', 'ds', 'Quantity', 'Country', 'Product']
df_train_merged = df_train_active.merge(df_train_inactive, how='outer', on=merge_keys)

# Static (time-invariant) features: one row per series, Country and Product one-hot encoded.
series_attributes = df_train_merged[['unique_id', 'Country', 'Product']]
df_train_static = series_attributes.drop_duplicates().reset_index(drop=True)
df_train_static = pd.get_dummies(df_train_static, columns=['Country', 'Product'], drop_first=True)

# Sanity check: the static frame must hold exactly one row per series.
n_series = df_train_merged['unique_id'].nunique()
assert df_train_static.shape[0] == n_series, 'The number of unique_id in static and merged dataframes do not match!'
# df_static.head()

In [26]:
# Add calendar features with neuralforecast's helper; [0] keeps the augmented frame
# and discards the second element of the returned pair.
# Both names are rebound to their augmented versions.
df_train_merged = augment_calendar_df(df_train_merged, freq='M')[0]
df_validation = augment_calendar_df(df_validation, freq='M')[0]

In [27]:
# Number of monthly steps to forecast; the LSTM input window is sized from it as well.
FORECASTING_HORIZON = 12 # one year ahead forecast








In [28]:
# Loss: the project's custom criterion from utils.losses.
custom_loss = customLoss()

# Model 1: Long-short term memory
lstm_settings = dict(
    h=FORECASTING_HORIZON,
    input_size=FORECASTING_HORIZON * 2,  # look back twice the forecast horizon
    loss=custom_loss,
    encoder_n_layers=2,
    encoder_hidden_size=128,
    decoder_hidden_size=128,
    decoder_layers=2,
    futr_exog_list=['month'],
    # every static column after the first one (unique_id)
    stat_exog_list=list(df_train_static.columns[1:]),
    batch_size=128,
    learning_rate=1e-3,
    max_steps=2000,
    scaler_type=None,
    random_seed=42,
)

# Model 2:

# Monthly series stamped at month start ('MS').
nf = NeuralForecast(models=[LSTM(**lstm_settings)], freq='MS')

Seed set to 42


In [29]:
# nf.fit(
#     df = df_train_merged[['unique_id','ds','Quantity',  'month']],
#     static_df= df_train_static,
#     #val_size= 12
#     # id_col = 'unique_id'
#     target_col = 'Quantity'
# )

In [30]:
# save the model 
# nf.save('models/lstm_model')

In [31]:
# Load the previously trained model from disk instead of re-fitting
# (the fit and save cells above are commented out).
# This rebinds nf, replacing the unfitted NeuralForecast object built earlier.
nf = NeuralForecast.load('models/lstm_model')

Seed set to 42


In [32]:
# Build the frame of future (unique_id, ds) rows to be forecast.
future_df = nf.make_future_dataframe()

In [33]:
# Add the 'month' calendar feature, which the LSTM configured above lists as a
# future exogenous input; [0] keeps the augmented frame.
future_df = augment_calendar_df(future_df, freq='M')[0]

In [34]:
# Inspect the future frame: unique_id, ds and the scaled month feature.
future_df

Unnamed: 0,unique_id,ds,month
0,Australia_BrightBreeze Insect Repellent,2023-01-01,-0.500000
1,Australia_BrightBreeze Insect Repellent,2023-02-01,-0.409091
2,Australia_BrightBreeze Insect Repellent,2023-03-01,-0.318182
3,Australia_BrightBreeze Insect Repellent,2023-04-01,-0.227273
4,Australia_BrightBreeze Insect Repellent,2023-05-01,-0.136364
...,...,...,...
6595,United Kingdom_SunShield SPF 50 Lotion,2023-08-01,0.136364
6596,United Kingdom_SunShield SPF 50 Lotion,2023-09-01,0.227273
6597,United Kingdom_SunShield SPF 50 Lotion,2023-10-01,0.318182
6598,United Kingdom_SunShield SPF 50 Lotion,2023-11-01,0.409091


In [35]:
# Forecast with the loaded model, supplying the future exogenous values.
y_hat = nf.predict(
    futr_df = future_df,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Predicting: |          | 0/? [00:00<?, ?it/s]

In [36]:
# Keep only the forecasts of series that belong to the active training set.
active_ids = df_train_active['unique_id'].unique()
y_hat = y_hat[y_hat['unique_id'].isin(active_ids)]

In [37]:
# Build the LSTM submission frame: one row per validation (unique_id, ds) pair.
# Forecasts are joined on series id and date instead of being assigned by row position,
# so the result no longer depends on both frames sharing the same row order and row
# count per series. Rows without a forecast (y_hat only holds active series) get 0.
forecast_keys = ['unique_id', 'ds']
df_forecast = df_validation[forecast_keys].merge(
    y_hat[forecast_keys + ['LSTM']],
    on=forecast_keys,
    how='left',
    validate='many_to_one',  # fail loudly if y_hat holds duplicate forecasts for a key
)
df_forecast.index = df_validation.index  # keep the validation index, as before

# Guard against a silent all-zero submission caused by mismatching dates.
assert y_hat.empty or df_forecast['LSTM'].notna().any(), \
    'No LSTM forecast matched the validation rows on unique_id/ds!'

# Quantities cannot be negative: clip at 0 (as the GLM forecast path does), then truncate to int.
df_forecast['Quantity'] = df_forecast['LSTM'].fillna(0).clip(lower=0).astype(int)
df_forecast = df_forecast.drop(columns='LSTM')
    

In [38]:
def restore_original_format(date_column):
    """Render a datetime Series as '<abbreviated month><4-digit year>' strings, e.g. 'Jan2004'.

    Args:
        date_column: Series with a datetime dtype (the ``.dt`` accessor is used).

    Returns:
        Series of strings in the same month format as the raw input file.
    """
    month_labels = date_column.dt.strftime('%b%Y')
    return month_labels

In [39]:
def submission_formatter(df):
    """Convert a frame keyed by ``unique_id``/``ds`` into the submission layout.

    ``unique_id`` is split into ``Country`` and ``Product`` at the FIRST underscore
    only, so a product name that itself contains an underscore is kept intact.
    ``ds`` is rendered as a ``Month`` string via ``restore_original_format``.

    Args:
        df: DataFrame with a string ``unique_id`` column ('<Country>_<Product>')
            and a datetime ``ds`` column. It is not modified.

    Returns:
        A new DataFrame with ``Country``, ``Product`` and ``Month`` added and
        ``unique_id`` / ``ds`` removed; all other columns are carried over.
    """
    # n=1 limits the split to the first separator; splitting once also avoids repeated work.
    id_parts = df['unique_id'].str.split('_', n=1)
    restored_df = df.assign(
        Country=id_parts.str[0],
        Product=id_parts.str[1],
        Month=restore_original_format(df['ds']),
    )
    return restored_df.drop(columns=['unique_id', 'ds'])

    
# Convert both frames to the submission layout (Country / Product / Month).
# NOTE: this rebinds df_forecast and df_validation to frames WITHOUT 'unique_id' / 'ds'
# (submission_formatter drops them). The cell therefore cannot be re-run as-is, and any
# later cell that reads df_validation['unique_id'] or ['ds'] needs a freshly
# pre-processed df_validation.
df_forecast = submission_formatter(df_forecast)
df_validation = submission_formatter(df_validation)

In [40]:
# Save the LSTM forecast and the formatted validation set as CSV (index not written).
df_forecast.to_csv('outputs/01_output_prediction_1239.csv', index=False)
df_validation.to_csv('outputs/01_output_validation.csv', index=False)

In [41]:
# New experiment: use a GLM instead of the neural network

In [42]:
import statsmodels

In [43]:
# Fit a GLM on df_train_active,
# using Quantity as the response variable
# and a negative binomial family.

In [88]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families.family import NegativeBinomial
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrices

In [89]:

# Parse the date column, then derive calendar features from it.
# These columns are written back onto df_train_active itself.
df_train_active['ds'] = pd.to_datetime(df_train_active['ds'])
for calendar_part in ('year', 'month', 'quarter'):
    df_train_active[calendar_part] = getattr(df_train_active['ds'].dt, calendar_part)

# One-hot encode the categorical features.
df_model = pd.get_dummies(df_train_active, columns=['Country', 'Product'])

# Make the column names usable in a formula: spaces and hyphens become
# underscores, parentheses are removed.
name_cleanup = str.maketrans({' ': '_', '-': '_', '(': None, ')': None})
df_model.columns = [name.translate(name_cleanup) for name in df_model.columns]

# Dummy columns that enter the model formula. After the clean-up above none of
# them can still contain a space, hyphen or parenthesis.
feature_cols = [
    name for name in df_model.columns
    if name.startswith(('Country_', 'Product_'))
]

In [90]:
# Model formula: linear year trend, categorical month and quarter, plus every
# Country_/Product_ dummy column.
# NOTE(review): quarter is a function of month, and the dummies were created without
# drop_first, so (with the formula's default intercept) the design looks rank-deficient;
# individual coefficients may not be identifiable - confirm this is acceptable.
formula = 'Quantity ~ year + C(month) + C(quarter) + ' + ' + '.join(feature_cols)

In [91]:
# Negative binomial GLM with the dispersion parameter fixed at alpha=1.0.
nb_family = sm.families.NegativeBinomial(alpha=1.0)
nb_model = smf.glm(formula=formula, data=df_model, family=nb_family)
nb_results = nb_model.fit()

print(nb_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               Quantity   No. Observations:                49776
Model:                            GLM   Df Residuals:                    49656
Model Family:        NegativeBinomial   Df Model:                          119
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -4.0244e+05
Date:                Sun, 06 Apr 2025   Deviance:                       7414.6
Time:                        10:19:53   Pearson chi2:                 6.08e+03
No. Iterations:                    10   Pseudo R-squ. (CS):            0.04513
Covariance Type:            nonrobust                                         
                                                               coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------

In [48]:
# Poisson GLM on the same formula and data, for comparison with the negative binomial fit.
poisson_family = sm.families.Poisson()
poisson_model = smf.glm(formula=formula, data=df_model, family=poisson_family)
poisson_results = poisson_model.fit()
print(poisson_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               Quantity   No. Observations:                49776
Model:                            GLM   Df Residuals:                    49656
Model Family:                 Poisson   Df Model:                          119
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.7984e+06
Date:                Sun, 06 Apr 2025   Deviance:                   7.1564e+06
Time:                        10:13:04   Pearson chi2:                 6.87e+06
No. Iterations:                     5   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                                                               coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------

In [92]:
# best prediction, check if the product is active. if it is predict with glm, else leave it as 0
df_forecast_glm = df_validation.copy()

# When the notebook is run top to bottom, df_validation has already been passed
# through submission_formatter, which replaces 'unique_id' / 'ds' with
# Country / Product / Month. The GLM cells below need the original keys, so rebuild
# them when they are missing; on an unformatted frame this is a no-op.
if 'unique_id' not in df_forecast_glm.columns:
    df_forecast_glm['unique_id'] = df_forecast_glm['Country'] + '_' + df_forecast_glm['Product']
if 'ds' not in df_forecast_glm.columns:
    # Inverse of the '%b%Y' formatting applied by restore_original_format.
    df_forecast_glm['ds'] = pd.to_datetime(df_forecast_glm['Month'], format='%b%Y')

In [93]:
# Sanity check: both totals are identical at this point because df_forecast_glm
# is still a plain copy of df_validation (it holds the actual quantities).
df_forecast_glm['Quantity'].sum(), df_validation['Quantity'].sum()

(np.int64(5852515), np.int64(5852515))

In [94]:
# now use the glm to predict the values

# Calendar features, derived the same way as for the training frame.
df_forecast_glm['year'] = df_forecast_glm['ds'].dt.year
df_forecast_glm['month'] = df_forecast_glm['ds'].dt.month
df_forecast_glm['quarter'] = df_forecast_glm['ds'].dt.quarter

# unique_id has the form '<Country>_<Product>': split at the first underscore only,
# so a product name containing an underscore still matches its training dummy.
id_parts = df_forecast_glm['unique_id'].str.split('_', n=1)
df_forecast_glm['Country'] = id_parts.str[0]
df_forecast_glm['Product'] = id_parts.str[1]

# Create dummy variables for categorical features and clean column names
df_forecast_glm = pd.get_dummies(df_forecast_glm, columns=['Country', 'Product'])
# Clean column names - replace spaces and special characters with underscores
df_forecast_glm.columns = [col.replace(' ', '_').replace('-', '_').replace('(', '').replace(')', '')
                   for col in df_forecast_glm.columns]

# The fitted formula refers to the TRAINING dummy columns held in `feature_cols`, so that
# list is deliberately not rebuilt from the forecast frame here. Any training dummy
# that does not occur in this frame is added as an all-zero column (same dtype as in
# df_model); otherwise predict() would fail on the missing name.
for col in feature_cols:
    if col not in df_forecast_glm.columns:
        df_forecast_glm[col] = df_model[col].dtype.type(0)

# Predict the values
# NOTE(review): predictions come from the Poisson fit, not the negative binomial one - confirm intended.
df_forecast_glm['predicted'] = poisson_results.predict(df_forecast_glm)
df_forecast_glm['predicted'] = df_forecast_glm['predicted'].clip(lower=0)
df_forecast_glm['predicted'] = df_forecast_glm['predicted'].astype(int)  # truncates toward zero

In [95]:
# Keep the GLM prediction only for series that were active in training; every other
# series is forecast as 0. Quantity is rebuilt from scratch because df_forecast_glm
# started as a copy of df_validation, so untouched rows would otherwise keep the
# actual validation quantities instead of 0.
is_active = df_forecast_glm['unique_id'].isin(df_train_active['unique_id'].unique())
df_forecast_glm['Quantity'] = df_forecast_glm['predicted'].where(is_active, 0)

In [96]:
# Ensure the 'ds' column is in datetime format
# (submission_formatter formats it through the .dt accessor).
df_forecast_glm['ds'] = pd.to_datetime(df_forecast_glm['ds'])

In [97]:
# Convert to the submission layout (Country / Product / Month) before saving.
# This rebinds df_forecast_glm to a frame without 'unique_id' / 'ds'.
df_forecast_glm = submission_formatter(df_forecast_glm)

In [98]:
# Keep only the submission columns, in output order.
df_forecast_glm_to_save = df_forecast_glm[['Country', 'Product', 'Month', 'Quantity']]

In [99]:
# Write the GLM-based forecast to disk (index not written).
df_forecast_glm_to_save.to_csv('outputs/01_output_prediction_glm.csv', index=False)