# In [None]:
import itertools
import warnings
from math import sqrt
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed  # For parallelization
from sklearn.metrics import mean_squared_error
from statsmodels.tools.sm_exceptions import ConvergenceWarning
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from tqdm import tqdm  # Use regular tqdm

# Silence the convergence and generic user warnings raised while fitting
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.simplefilter('ignore', UserWarning)

# Read the processed inventory table (adjust the path to your layout)
data = pd.read_csv('../data/processed/inventory.csv')

# Keep only rows whose 'Type' is 'Material' and whose 'BFP' equals 0
data = data.loc[(data['Type'] == 'Material') & (data['BFP'] == 0)].copy()

# Parse 'Date' as datetime and promote it to the index
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

# One sub-frame per distinct material code
material_codes = data['Material Code'].unique()
material_data = {
    code: data.loc[data['Material Code'] == code]
    for code in material_codes
}

# Candidate orders for the SARIMAX grid search
p, d, q = range(5), range(2), range(5)
P, D, Q = range(2), range(2), range(2)
s = 7  # Assuming weekly seasonality with daily data
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(*combo, s) for combo in itertools.product(P, D, Q)]

# Empty frame declaring the layout of the per-material results
sarimax_results = pd.DataFrame(
    columns=[
        'Material Code',
        'p', 'd', 'q',
        'P', 'D', 'Q', 's',
        'AIC', 'BIC', 'RMSE',
    ]
)

# Function to process each material (for parallelization)
def process_material(material, df):
    """Grid-search SARIMAX orders for one material and report the best model.

    Every combination of the module-level ``pdq`` and ``seasonal_pdq`` grids
    is fitted to the material's daily 'Material Issued' series, and the
    combination with the lowest AIC is kept.

    Args:
        material: Material code; used in messages and in the returned record.
        df: Date-indexed DataFrame containing a 'Material Issued' column.

    Returns:
        A dict with keys 'Material Code', 'p', 'd', 'q', 'P', 'D', 'Q', 's',
        'AIC', 'BIC' and 'RMSE' describing the lowest-AIC model, or ``None``
        when there is no usable data, preprocessing fails, or no candidate
        model could be fitted.

    Notes:
        * RMSE is computed from the in-sample one-step-ahead predictions on
          the dates that actually have an observation. It is a goodness-of-fit
          figure, not an out-of-sample forecast error.
        * When the ADF test flags the series as non-stationary it is
          differenced once before fitting, so AIC/BIC/RMSE then refer to the
          differenced series and the reported ``d`` is applied on top of that
          manual difference.
    """
    # Drop rows with any missing value
    df = df.dropna()
    if df.empty:
        print(f"No data available for material {material}")
        return None  # Return None if no data

    try:
        # Differencing is only meaningful on chronologically ordered data.
        series = df['Material Issued'].sort_index()

        # Check for stationarity and apply differencing if necessary
        adf_pvalue = adfuller(series)[1]
        if adf_pvalue > 0.05:
            print(f"Data for material {material} is non-stationary. Applying differencing.")
            # Work on the Series itself: assigning the result back into the
            # frame would re-align on the index and keep the leading NaN.
            series = series.diff().dropna()
    except Exception as e:
        print(f"Error performing ADF test or differencing for material {material}: {e}")
        return None  # Return None on error

    try:
        # Regular daily grid; days without a record become NaN, which the
        # state-space model treats as missing observations.
        series = series.asfreq('D')
    except (ValueError, TypeError) as e:
        # e.g. duplicate dates or a non-datetime index; fail this material
        # only instead of aborting the whole parallel run.
        print(f"Error building a daily series for material {material}: {e}")
        return None

    # Positions that carry a real observation; RMSE is evaluated only there
    # because mean_squared_error rejects NaN input.
    observed = series.notna().to_numpy()
    actual = series.to_numpy()[observed]

    # Initialize best metrics
    best_aic = float('inf')
    best_bic = float('inf')
    best_rmse = float('inf')
    best_params = None
    best_seasonal_params = None

    for param in pdq:
        for seasonal_param in seasonal_pdq:
            try:
                # Fit SARIMAX model
                model = SARIMAX(series, order=param, seasonal_order=seasonal_param)
                model_fit = model.fit(disp=False)

                aic = model_fit.aic
                bic = model_fit.bic

                # In-sample one-step-ahead predictions, aligned by position
                # with the fitted series (one value per row of `series`).
                fitted = np.asarray(model_fit.fittedvalues)
                rmse = sqrt(mean_squared_error(actual, fitted[observed]))
            except Exception as e:
                # Best-effort grid search: a failing combination is reported
                # and skipped rather than ending the search.
                print(f"Error fitting SARIMAX model for material {material} with params {param} and seasonal params {seasonal_param}: {e}")
                continue

            # Update the best parameters if the current model is better
            if aic < best_aic:
                best_aic = aic
                best_bic = bic
                best_rmse = rmse
                best_params = param
                best_seasonal_params = seasonal_param

    if best_params is None:
        print(f"No valid SARIMAX model found for material {material}")
        return None  # Return None if no valid model

    result_dict = {
        'Material Code': material,
        'p': best_params[0],
        'd': best_params[1],
        'q': best_params[2],
        'P': best_seasonal_params[0],
        'D': best_seasonal_params[1],
        'Q': best_seasonal_params[2],
        's': best_seasonal_params[3],
        'AIC': best_aic,
        'BIC': best_bic,
        'RMSE': best_rmse
    }
    print(result_dict)  # Print result for current material
    return result_dict

# Run one grid search per material across all available cores. The tqdm bar
# wraps the task iterator, so it advances as tasks are handed to joblib
# (not necessarily as they finish).
results = Parallel(n_jobs=-1)(
    delayed(process_material)(material, df)
    for material, df in tqdm(
        material_data.items(),
        total=len(material_data),
        desc="Processing Material Codes",
    )
)

# Remove None results (from errors or empty data)
results = [result for result in results if result is not None]

# Build the results table with explicit columns so the schema (and the CSV
# header) is kept even when no material produced a valid model.
result_columns = ['Material Code', 'p', 'd', 'q', 'P', 'D', 'Q', 's', 'AIC', 'BIC', 'RMSE']
sarimax_results = pd.DataFrame(results, columns=result_columns)

# Display the SARIMAX model metrics DataFrame
display(sarimax_results)

# Save the table; create the output directory first so a long search is not
# lost to a missing folder.
output_path = Path('../result/sarimax/sarimax_results.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
sarimax_results.to_csv(output_path, index=False)