In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import os

output_path = "../outputs/m2/"
os.makedirs(output_path, exist_ok=True)

sns.set(style="whitegrid")
pd.set_option('display.max_columns', None)


In [2]:
orders = pd.read_csv('../data/raw/olist_orders_dataset.csv')
order_items = pd.read_csv("../data/raw/olist_order_items_dataset.csv")
products = pd.read_csv('../data/raw/olist_products_dataset.csv')
category_translation = pd.read_csv('../data/raw/product_category_name_translation.csv')


In [3]:
orders['order_purchase_timestamp'] = pd.to_datetime(
    orders['order_purchase_timestamp']
)

orders['year_month'] = orders['order_purchase_timestamp'].dt.to_period('M')

In [4]:
# Merge order_items with orders
df = order_items.merge(
    orders[['order_id', 'year_month']],
    on='order_id',
    how='left'
)

# Merge products
df = df.merge(
    products[['product_id', 'product_category_name']],
    on='product_id',
    how='left'
)

# Translate categories to English
df = df.merge(
    category_translation,
    on='product_category_name',
    how='left'
)

df['category'] = df['product_category_name_english']


In [5]:
monthly_demand = df.groupby(
    ['year_month', 'category']
).size().reset_index(name='order_count')

monthly_demand['year_month'] = monthly_demand['year_month'].dt.to_timestamp()

monthly_demand.head()

monthly_demand.to_csv(
    output_path + "monthly_demand_data.csv",
    index=False
)


In [6]:
top_categories = (
    monthly_demand.groupby('category')['order_count']
    .sum()
    .sort_values(ascending=False)
    .head(3)
    .index.tolist()
)

top_categories


['bed_bath_table', 'health_beauty', 'sports_leisure']

In [7]:
def moving_average_forecast(train, test, window=3):
    forecast_value = train.rolling(window).mean().iloc[-1]
    forecast = np.repeat(forecast_value, len(test))
    return forecast


In [8]:
def holt_linear_forecast(train, test):
    model = ExponentialSmoothing(
        train,
        trend='add',
        seasonal=None
    )
    fit = model.fit()
    forecast = fit.forecast(len(test))
    return forecast


In [9]:
def safe_mape(actual, forecast):
    actual, forecast = np.array(actual), np.array(forecast)
    
    mask = actual != 0   # ignore zero-demand months
    
    return np.mean(
        np.abs((actual[mask] - forecast[mask]) / actual[mask])
    ) * 100


In [10]:
results = []

for cat in top_categories:
    
    ts = monthly_demand[
        monthly_demand['category'] == cat
    ].set_index('year_month')['order_count']
    
    ts = ts.sort_index()
    
    train = ts[:-3]
    test = ts[-3:]
    
    # Moving Average
    ma_forecast = moving_average_forecast(train, test)
    
    # Holt Linear
    holt_forecast = holt_linear_forecast(train, test)
    
    for name, forecast in [('MA_3', ma_forecast),
                           ('Holt_Linear', holt_forecast)]:
        
        mae = mean_absolute_error(test, forecast)
        rmse = sqrt(mean_squared_error(test, forecast))
        mape = safe_mape(test, forecast)

        
        results.append([cat, name, mae, rmse, mape])


  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


In [11]:
forecast_results = pd.DataFrame(
    results,
    columns=['Category', 'Method', 'MAE', 'RMSE', 'MAPE']
)

forecast_results
forecast_results.to_csv(
    output_path + "forecast_evaluation_results.csv",
    index=False
)



In [12]:
for cat in top_categories:
    
    ts = monthly_demand[
        monthly_demand['category'] == cat
    ].set_index('year_month')['order_count']
    
    ts = ts.sort_index()
    
    train = ts[:-3]
    test = ts[-3:]
    
    holt_forecast = holt_linear_forecast(train, test)
    
    plt.figure(figsize=(10,5))
    
    plt.plot(train.index, train, label='Train')
    plt.plot(test.index, test, label='Actual', marker='o')
    plt.plot(test.index, holt_forecast, label='Forecast', marker='o')
    
    plt.title(f"{cat} - Monthly Demand Forecast")
    plt.xlabel("Month")
    plt.ylabel("Order Count")
    plt.legend()
    
    # ðŸ”¥ SAVE IMAGE
    plt.savefig(output_path + f"{cat}_forecast.png", bbox_inches='tight')
    
    plt.close()


  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
