# Import Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from azureml.opendatasets import OjSalesSimulated
from sklearn.metrics import mean_squared_error
import pmdarima as pm # Used for AutoARIMA model

# Use the dark-grid seaborn theme for every figure drawn by this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
# Keep cell output readable while the models are fitted.
# NOTE: no module or category is given, so this filter hides warnings from
# every library for the rest of the session, not only those from pmdarima.
import warnings
warnings.filterwarnings(action="ignore")

print("--- Starting Orange Juice Sales Forecasting Notebook ---")

# Import Data



In [None]:
# --- 1. Data Importing and Initial Inspection ---

# Step 1.1 & 1.2: Fetch the OjSalesSimulated dataset and load it into a Pandas DataFrame.
# `get_file_dataset()` returns a FileDataset, i.e. a reference to the dataset's
# files. A FileDataset has no `to_pandas_dataframe()` method (that method belongs
# to TabularDataset), so the files are first downloaded to a local folder and
# then read with pandas and stacked into one DataFrame.
# NOTE: this downloads every file of the dataset. To experiment on a subset,
# call `.take(n)` on the FileDataset before downloading.
print("\n1.1 & 1.2 Fetching OjSalesSimulated dataset and loading into Pandas DataFrame...")
oj_sales_files = OjSalesSimulated.get_file_dataset()
oj_sales_download_dir = 'oj_sales_data'  # local folder that receives the downloaded files
# `download` returns the local path of each downloaded file.
oj_sales_csv_paths = oj_sales_files.download(oj_sales_download_dir, overwrite=True)
oj_sales_df = pd.concat(
    (pd.read_csv(csv_path) for csv_path in oj_sales_csv_paths),
    ignore_index=True,  # each file has its own 0..n index; build one continuous index
)




1.1 & 1.2 Fetching OjSalesSimulated dataset and loading into Pandas DataFrame...


NameError: name 'OjSalesSimulated' is not defined

In [None]:
# Step 1.3: Preview the first rows of the DataFrame.
# A quick look at the column names and the kind of values each column holds.
print("\n--- 1.3 Initial Data Head ---")
first_rows = oj_sales_df.head()
print(first_rows)


In [None]:

# Step 1.4: Summarize the DataFrame's structure.
# 'info()' reports each column's dtype and non-null count plus memory usage,
# which makes missing values and wrongly typed columns easy to spot.
info_heading = "\n--- 1.4 DataFrame Info ---"
print(info_heading)
oj_sales_df.info()

In [None]:
# Step 1.5: Show descriptive statistics for the numerical columns.
# 'describe()' reports count, mean, standard deviation, min, max, and
# quartiles, giving a quick view of each numeric column's range and spread.
print("\n--- 1.5 Descriptive Statistics ---")
summary_statistics = oj_sales_df.describe()
print(summary_statistics)

# --- Data Preparation ---

In [None]:
# Step 2.1: Parse the 'WeekStarting' column into datetime values.
# Time-based comparisons and date arithmetic in the later cells need real
# datetimes rather than the raw column values.
print("\n2.1 Converting 'WeekStarting' to datetime...")
week_starting_parsed = pd.to_datetime(oj_sales_df['WeekStarting'])
oj_sales_df['WeekStarting'] = week_starting_parsed

# Step 2.2: Order the rows by series, then by time.
# After this sort, the rows of every Store-Brand pair are contiguous and in
# chronological order, which the positional (iloc-based) forecasts rely on.
print("2.2 Sorting data by Store, Brand, and WeekStarting...")
series_key_columns = ['Store', 'Brand']
oj_sales_df = oj_sales_df.sort_values(by=series_key_columns + ['WeekStarting'])

# Step 2.3: List the distinct Store-Brand pairs.
# Each pair is treated as an independent time series with its own models.
distinct_store_brand = oj_sales_df[series_key_columns].drop_duplicates()
unique_series_ids = distinct_store_brand.reset_index(drop=True)
print(f"\nTotal unique Store-Brand combinations found: {len(unique_series_ids)}")

# --- Configuration for Forecasting ---
# Column to predict.
target_column = 'Quantity'
# Column that holds the time axis.
time_column = 'WeekStarting'
# Number of weeks to hold out and forecast for each series.
forecast_horizon = 10
# Periods per seasonal cycle (52 weekly observations per year).
season_length = 52

# --- List to store aggregated results ---
# Receives one results DataFrame per forecast series.
all_forecast_results = []


# --- Iterate: Train, Predict, Evaluate, and Plot the Computed Data ---

In [None]:
# --- 3. Iterate, Train, Predict, Evaluate, and Plot for Each Time Series ---
#
# For every Store-Brand pair in `unique_series_ids`, this cell:
#   1. isolates that pair's rows from `oj_sales_df`,
#   2. holds out the final `forecast_horizon` weeks as a test set,
#   3. scores Naive, Seasonal Naive, and AutoARIMA forecasts by RMSE,
#   4. appends a per-series results DataFrame to `all_forecast_results`,
#   5. draws one figure comparing the actual values with each forecast.

print("\n--- 3. Starting Model Training and Prediction for Each Series ---")
print("    (This process will be very time-consuming for all series and will generate many plots.)")
print("    Consider using 'unique_series_ids = unique_series_ids.head(5)' to test with a small subset.")

# Uncomment the line below to test with a smaller number of series (e.g., 5 for quick check)
# unique_series_ids = unique_series_ids.head(5) # REMOVE OR COMMENT FOR FULL RUN

# Start from an empty accumulator so that re-running this cell replaces the
# previous results instead of appending a second copy of every series.
all_forecast_results = []

# Step 3.1: Loop through each unique time series (Store-Brand combination).
for index, row in unique_series_ids.iterrows():
    current_store = row['Store']
    current_brand = row['Brand']

    # Step 3.2: Filter the DataFrame for the current store and brand.
    # This isolates the data for a single, independent time series.
    single_series_df = oj_sales_df[
        (oj_sales_df['Store'] == current_store) &
        (oj_sales_df['Brand'] == current_brand)
    ].copy()  # .copy() avoids warnings about modifying a slice

    # Step 3.3: Basic check for sufficient data.
    # At least two forecast horizons of history are required so that the
    # training set is no shorter than the test set.
    if len(single_series_df) < 2 * forecast_horizon:
        print(f"  Skipping Store: {current_store}, Brand: {current_brand} - Not enough data ({len(single_series_df)} points).")
        continue

    # Step 3.4: Data Splitting (Time-based).
    # Rows dated within the last `forecast_horizon` weeks of the series form
    # the test set; everything on or before the split date is training data.
    split_date = single_series_df[time_column].max() - pd.Timedelta(weeks=forecast_horizon)
    train_data = single_series_df[single_series_df[time_column] <= split_date].copy()
    test_data = single_series_df[single_series_df[time_column] > split_date].copy()

    # Actual target values that every forecast is compared against.
    actual_values = test_data[target_column].values

    # Guard against a split that leaves nothing to train on.
    if train_data.empty:
        print(f"  Skipping Store: {current_store}, Brand: {current_brand} - Empty training data after split.")
        continue

    print(f"\nProcessing Store: {current_store}, Brand: {current_brand}")
    print(f"  Train data points: {len(train_data)}")
    print(f"  Test data points (forecast horizon): {len(test_data)}")

    # --- 4. Model Training, Prediction, and Evaluation for Each Model Type ---

    # --- Model Type 1: Naive Forecast ---
    # Every forecast step repeats the last value observed in the training data.
    last_train_value = train_data[target_column].iloc[-1]
    naive_predictions = np.full(shape=len(test_data), fill_value=last_train_value)

    # Root Mean Squared Error (RMSE) of the Naive model.
    rmse_naive = np.sqrt(mean_squared_error(actual_values, naive_predictions))
    print(f"  Naive Forecast RMSE: {rmse_naive:.2f}")

    # --- Model Type 2: Seasonal Naive Forecast ---
    # Forecast step i repeats the value observed `season_length` periods
    # earlier, i.e. element i of the last full season of training data.
    if len(train_data) >= season_length:
        # The last `season_length` training values form the seasonal template.
        seasonal_train_values = train_data[target_column].iloc[-season_length:].values
    else:
        # Fallback: without a full season of history the template is the last
        # observed value repeated, which makes this forecast equal to Naive.
        print(f"    Warning: Not enough data for full season ({season_length} weeks) for Seasonal Naive. Using last available value for some forecasts.")
        seasonal_train_values = np.full(shape=season_length, fill_value=last_train_value)

    # Walk through the template, wrapping around with modulo if the test
    # period is longer than one season.
    seasonal_naive_predictions = np.array([
        seasonal_train_values[i % len(seasonal_train_values)]
        for i in range(len(test_data))
    ])

    # RMSE of the Seasonal Naive model.
    rmse_seasonal_naive = np.sqrt(mean_squared_error(actual_values, seasonal_naive_predictions))
    print(f"  Seasonal Naive Forecast RMSE: {rmse_seasonal_naive:.2f}")

    # --- Model Type 3: AutoARIMA Forecast ---
    # AutoARIMA (from pmdarima) automatically searches for the best ARIMA (p,d,q)(P,D,Q)m
    # parameters for the given time series based on information criteria (e.g., AIC).
    y_train_arima = train_data[target_column].values

    # Defaults used when the model cannot be fitted: NaN forecasts and NaN RMSE.
    arima_predictions = np.full(shape=len(test_data), fill_value=np.nan)
    rmse_arima = np.nan

    try:
        # Fit the AutoARIMA model.
        arima_model = pm.auto_arima(y_train_arima,
                                    start_p=1, start_q=1, # Starting non-seasonal AR/MA orders
                                    max_p=5, max_q=5,     # Maximum non-seasonal AR/MA orders
                                    m=season_length,      # Seasonal period
                                    seasonal=True,        # Enable search for seasonal components
                                    d=None, D=None,       # Let auto_arima determine differencing orders
                                    trace=False,          # Do not print search progress for each trial
                                    error_action='ignore',# Ignore errors if a particular ARIMA order fails
                                    suppress_warnings=True, # Suppress warnings from internal statsmodels
                                    stepwise=True)        # Use stepwise (greedy) search for efficiency

        # Make predictions for the forecast horizon using the trained ARIMA model.
        arima_predictions = arima_model.predict(n_periods=len(test_data))

        # Calculate RMSE for AutoARIMA model.
        rmse_arima = np.sqrt(mean_squared_error(actual_values, arima_predictions))
        print(f"  AutoARIMA Forecast RMSE: {rmse_arima:.2f}")

        # Note: In a real-world scenario (especially with Azure ML Pipelines),
        # you would save this 'arima_model' using joblib.dump for later deployment.
        # Example: joblib.dump(arima_model, f'./models/arima_store_{current_store}_brand_{current_brand}.pkl')
        # For this local notebook, we just train and predict.

    except Exception as e:
        # Deliberate best-effort: a failure on one series (e.g. flat data or
        # convergence issues) is reported and the loop moves on; the RMSE
        # for this series stays NaN.
        print(f"  AutoARIMA failed for Store: {current_store}, Brand: {current_brand}. Error: {e}")

    # --- 5. Store and Visualize Results for the Current Series ---

    # Step 5.1: Create a DataFrame to hold predictions and actuals for the current series.
    # The scalar Store/Brand values are broadcast to every test-period row.
    series_forecasts_df = pd.DataFrame({
        'Store': current_store,
        'Brand': current_brand,
        'WeekStarting': test_data[time_column],
        'ActualQuantity': actual_values,
        'NaiveForecast': naive_predictions,
        'SeasonalNaiveForecast': seasonal_naive_predictions,
        'AutoARIMAForecast': arima_predictions
    })
    # Per-series RMSEs, repeated on every row of this series.
    series_forecasts_df['RMSE_Naive'] = rmse_naive
    series_forecasts_df['RMSE_SeasonalNaive'] = rmse_seasonal_naive
    series_forecasts_df['RMSE_AutoARIMA'] = rmse_arima

    # Append the results of the current series to the aggregated list.
    all_forecast_results.append(series_forecasts_df)

    # Step 5.2: Plotting Actual vs. Forecasted Quantities for the current series.
    # One figure is drawn per series, so plot only a small subset of series
    # during development to avoid flooding the display.
    series_figure = plt.figure(figsize=(14, 7))
    plt.plot(train_data[time_column], train_data[target_column], label='Training Data', color='blue', alpha=0.7)
    plt.plot(test_data[time_column], actual_values, label='Actual Test Data', color='green', linewidth=2)
    plt.plot(test_data[time_column], naive_predictions, label='Naive Forecast', linestyle='--', color='red')
    plt.plot(test_data[time_column], seasonal_naive_predictions, label='Seasonal Naive Forecast', linestyle=':', color='purple')
    if not np.isnan(rmse_arima):  # Only plot ARIMA if it successfully ran
        plt.plot(test_data[time_column], arima_predictions, label='AutoARIMA Forecast', linestyle='-.', color='orange')

    plt.title(f'Sales Forecast for Store {current_store}, Brand {current_brand}')
    plt.xlabel('Week Starting')
    plt.ylabel('Quantity')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()  # Adjust layout to prevent overlapping
    plt.show()  # Display the plot for the current series
    # Release the figure so open figures do not pile up across the loop
    # on backends that do not close them after show().
    plt.close(series_figure)


# Summarization

In [None]:

# --- 6. Aggregate and Summarize Overall Performance ---

# Step 6.1: Concatenate all individual forecast results into a single DataFrame.
# Fail with a clear message when there is nothing to aggregate; pd.concat on an
# empty list would raise a less helpful ValueError.
if not all_forecast_results:
    raise ValueError(
        "No forecast results to aggregate: every series was skipped or the "
        "forecasting cell has not been run."
    )
final_forecast_output = pd.concat(all_forecast_results, ignore_index=True)

print("\n--- 6.1 All Forecasting Completed and Results Aggregated ---")
print("\nFinal Forecasts (first 5 rows of aggregated output):")
print(final_forecast_output.head())

print("\nFinal Forecasts (DataFrame Info):")
final_forecast_output.info()

# Step 6.2: Calculate average RMSE for each model type across all series.
# The RMSE columns repeat one per-series value on every forecast row, so keep a
# single row per Store-Brand pair. Averages and counts are then per series
# rather than per forecast row.
per_series_rmse = final_forecast_output.drop_duplicates(subset=['Store', 'Brand'])
avg_rmse_naive = per_series_rmse['RMSE_Naive'].mean()
avg_rmse_seasonal_naive = per_series_rmse['RMSE_SeasonalNaive'].mean()

# Series where AutoARIMA failed carry a NaN RMSE; drop them so the length of
# the remaining values is the number of successful AutoARIMA runs.
filtered_arima_rmse = per_series_rmse['RMSE_AutoARIMA'].dropna()
avg_rmse_arima = filtered_arima_rmse.mean()

print("\n--- 6.3 Overall Model Performance (Average RMSE) ---")
print(f"Average RMSE for Naive Forecast: {avg_rmse_naive:.2f}")
print(f"Average RMSE for Seasonal Naive Forecast: {avg_rmse_seasonal_naive:.2f}")
print(f"Average RMSE for AutoARIMA Forecast: {avg_rmse_arima:.2f} (based on {len(filtered_arima_rmse)} successful runs)")


# --- 7. Save Aggregated Results (Optional) ---
# Write the combined row-level forecasts and metrics to a CSV file in the
# current working directory.
output_csv_path = 'oj_sales_all_forecast_results.csv'
final_forecast_output.to_csv(output_csv_path, index=False)
print(f"\nAggregated forecast results saved to '{output_csv_path}'")

print("\n--- Notebook Execution Complete ---")