In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA


In [6]:
# Load the merged Barstov dataset (comma-separated) from the project's data folder.
barstov_merged = pd.read_csv(
    '../Capstone Project/Data/barstov_merged.csv',
    sep=',',
)



In [3]:
# Load the seasonality analysis table (comma-separated); later cells read its
# `article_id`, `product_type_name`, `sale_date` and `total_sales` columns.
seasonal_data = pd.read_csv(
    '../Capstone Project/Data/seasonality_analysis.csv',
    sep=',',
)


In [7]:
# Load the product analysis table (comma-separated) from the project's data folder.
product_data = pd.read_csv(
    '../Capstone Project/Data/product_analysis.csv',
    sep=',',
)


In [None]:
# Print a summary of `seasonal_data` (columns, dtypes, non-null counts, memory use).
seasonal_data.info()

In [None]:
# Print a summary of `product_data` (columns, dtypes, non-null counts, memory use).
product_data.info()

In [None]:
# Print a summary of `barstov_merged` (columns, dtypes, non-null counts, memory use).
barstov_merged.info()

In [4]:
# Total `total_sales` per (article_id, product_type_name) pair, returned as a
# flat frame with the two grouping keys as ordinary columns.
total_sales_article_id = (
    seasonal_data
    .groupby(['article_id', 'product_type_name'])['total_sales']
    .sum()
    .reset_index()
)


In [5]:
# Rank the per-article totals from best seller to worst seller.
total_sales_id = total_sales_article_id.sort_values(
    'total_sales',
    ascending=False,
)


In [6]:
# Size of the "top 30%" slice of best-selling articles.

# Ranking of every article from highest to lowest `total_sales`
# (the forecasting cells take their article IDs from this frame).
total_sales_id_sorted = total_sales_article_id.sort_values(
    'total_sales',
    ascending=False,
)

# Number of rows that make up 30% of the ranking; int() truncates the
# fractional part, so the slice never exceeds 30%.
top_30_percent = int(0.3 * len(total_sales_id))

In [None]:
# Display the best sellers: the first `top_30_percent` rows of the ranking
# sorted by descending `total_sales`.
total_sales_id.head(top_30_percent)

In [25]:
# Parse `sale_date` into pandas datetimes so it can serve as a time index later.
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])

# Article IDs of the best sellers: the first `top_30_percent` rows of the ranking.
top_30_article_ids = total_sales_id['article_id'].head(top_30_percent).values

# Keep only the sales rows that belong to one of those articles.
is_top_seller = seasonal_data['article_id'].isin(top_30_article_ids)
top_selling_data = seasonal_data[is_top_seller]

In [27]:
#ARIMA BY DAILY FREQUENCY PREDICTIONS
#
# Fits a fixed ARIMA(1, 1, 1) to the daily sales of every top-30% article and
# writes a 30-step forecast per article (one row per article_id) to
# "top_30_article_forecasts.csv".

import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")  # Suppress non-critical warnings (note: this hides every warning category)

# Forecast horizon. Defined before the loop so the column labels built after
# the loop exist even when no article could be modelled (previously this was
# only assigned inside the loop and raised NameError in that case).
forecast_steps = 30
forecast_columns = [f"day_{i+1}" for i in range(forecast_steps)]

# Ensure `sale_date` is in datetime format
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])

# Article IDs of the top 30% sellers, best seller first
top_30_article_ids = total_sales_id_sorted.head(top_30_percent)['article_id'].values

# Step 1: Filter `seasonal_data` to include only top 30% article IDs
top_selling_data = seasonal_data[seasonal_data['article_id'].isin(top_30_article_ids)]

# Step 2: Forecasts keyed by article_id
forecasts = {}

# Step 3: Loop through each article in the top 30% and apply ARIMA
for article_id in top_30_article_ids:
    # Rows of the current article only
    article_data = top_selling_data[top_selling_data['article_id'] == article_id]

    # Build a gap-free daily series. resample('D').sum() yields 0 for days
    # without any row and adds up rows that share a date, whereas asfreq('D')
    # raises as soon as one date occurs twice.
    ts_data = (
        article_data
        .set_index('sale_date')
        .sort_index()['total_sales']
        .resample('D')
        .sum()
    )

    # Skip articles whose daily series is too short to fit a model
    if len(ts_data) < 10:  # Adjust the minimum threshold as needed
        continue

    # Step 4: Fit ARIMA Model (Using basic (1, 1, 1) configuration, can be tuned)
    try:
        model = ARIMA(ts_data, order=(1, 1, 1))
        model_fit = model.fit()

        # Forecast the next `forecast_steps` days
        forecast = model_fit.forecast(steps=forecast_steps)

        # Keep only the values; the date index differs from article to article
        forecasts[article_id] = forecast.values

    except Exception as e:
        # Best effort: report the failure and carry on with the next article
        print(f"ARIMA failed for article_id {article_id}: {e}")
        continue

# Step 5: One row per article_id, one column per forecast day. from_dict with
# explicit columns also works for an empty `forecasts` dict, where assigning
# 30 column names to a 0-column frame used to raise ValueError.
forecast_df = pd.DataFrame.from_dict(forecasts, orient='index', columns=forecast_columns)

# Save forecast to a CSV file
forecast_df.to_csv("top_30_article_forecasts.csv")

print("Forecasting completed and saved for each article_id in the top 30% best-sellers.")


Forecasting completed and saved for each article_id in the top 30% best-sellers.


In [None]:
# Seasonal auto_arima forecasts (yearly seasonality on daily data) for the
# top-30% best-selling articles. Writes a 30-step forecast per article (one
# row per article_id) to "autoarima_top_30_article_forecasts.csv".

import pandas as pd
from pmdarima import auto_arima
import warnings
warnings.filterwarnings("ignore")  # Suppress non-critical warnings (note: this hides every warning category)

# Forecast horizon. Defined before the loop so the column labels built after
# the loop exist even when no article could be modelled (previously this was
# only assigned inside the loop and raised NameError in that case).
forecast_steps = 30
forecast_columns = [f"day_{i+1}" for i in range(forecast_steps)]

# Ensure `sale_date` is in datetime format
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])

# Article IDs of the top 30% sellers, best seller first
top_30_article_ids = total_sales_id_sorted.head(top_30_percent)['article_id'].values

# Step 1: Filter `seasonal_data` to include only top 30% article IDs
top_selling_data = seasonal_data[seasonal_data['article_id'].isin(top_30_article_ids)]

# Step 2: Forecasts keyed by article_id
forecasts = {}

# Step 3: Loop through each article in the top 30% and apply optimized auto_arima with yearly seasonality
for article_id in top_30_article_ids:
    # Rows of the current article only
    article_data = top_selling_data[top_selling_data['article_id'] == article_id]

    # Build a gap-free daily series. resample('D').sum() yields 0 for days
    # without any row and adds up rows that share a date, whereas asfreq('D')
    # raises as soon as one date occurs twice.
    ts_data = (
        article_data
        .set_index('sale_date')
        .sort_index()['total_sales']
        .resample('D')
        .sum()
    )

    # Skip articles whose daily series is too short to fit a model
    if len(ts_data) < 10:  # Adjust the minimum threshold as needed
        continue

    # Step 4: Use auto_arima with seasonal parameters for yearly seasonality
    # NOTE(review): m=365 makes every candidate fit very expensive, and series
    # much shorter than a year presumably cannot support it - confirm runtime
    # and consider weekly aggregation before running on all articles.
    try:
        model = auto_arima(ts_data,
                           seasonal=True,          # Enable seasonal ARIMA
                           m=365,                  # Seasonal period for yearly seasonality in daily data
                           stepwise=True,          # Use stepwise search to speed up
                           suppress_warnings=True,
                           max_p=2, max_q=2,       # Limit p and q to a maximum of 2
                           max_d=1,                # Limit differencing to 1 (or adjust based on data)
                           max_P=1, max_Q=1, max_D=1, max_order=5)  # Seasonal parameter limits

        # Forecast the next `forecast_steps` days
        forecast = model.predict(n_periods=forecast_steps)

        # Keep only the values. Depending on the pmdarima version, predict()
        # may return a date-indexed Series; storing those directly would make
        # pandas align the articles on their (different) forecast dates.
        forecasts[article_id] = list(forecast)

    except Exception as e:
        # Best effort: report the failure and carry on with the next article
        print(f"ARIMA failed for article_id {article_id}: {e}")
        continue

# Step 5: One row per article_id, one column per forecast day. from_dict with
# explicit columns also works for an empty `forecasts` dict, where assigning
# 30 column names to a 0-column frame used to raise ValueError.
forecast_df = pd.DataFrame.from_dict(forecasts, orient='index', columns=forecast_columns)

# Save forecast to a CSV file
forecast_df.to_csv("autoarima_top_30_article_forecasts.csv")

print("Forecasting completed and saved for each article_id in the top 30% best-sellers.")


In [7]:
###AUTO ARIMA BASED ON DAILY OBSERVATIONS - OLD CODE 1187 Minutes 15.4 Seconds
#
# Non-seasonal auto_arima forecasts for the top-30% best-selling articles.
# Writes a 30-step forecast per article (one row per article_id) to
# "auto_arima_top_30_article_forecasts.csv".

import pandas as pd
from pmdarima import auto_arima
import warnings
warnings.filterwarnings("ignore")  # Suppress non-critical warnings (note: this hides every warning category)

# Forecast horizon. Defined before the loop so the column labels built after
# the loop exist even when no article could be modelled (previously this was
# only assigned inside the loop and raised NameError in that case).
forecast_steps = 30
forecast_columns = [f"day_{i+1}" for i in range(forecast_steps)]

# Ensure `sale_date` is in datetime format
seasonal_data['sale_date'] = pd.to_datetime(seasonal_data['sale_date'])

# Article IDs of the top 30% sellers, best seller first
top_30_article_ids = total_sales_id_sorted.head(top_30_percent)['article_id'].values

# Step 1: Filter `seasonal_data` to include only top 30% article IDs
top_selling_data = seasonal_data[seasonal_data['article_id'].isin(top_30_article_ids)]

# Step 2: Forecasts keyed by article_id
forecasts = {}

# Step 3: Loop through each article in the top 30% and apply auto_arima
for article_id in top_30_article_ids:
    # Rows of the current article only
    article_data = top_selling_data[top_selling_data['article_id'] == article_id]

    # Build a gap-free daily series. resample('D').sum() yields 0 for days
    # without any row and adds up rows that share a date, whereas asfreq('D')
    # raises as soon as one date occurs twice.
    ts_data = (
        article_data
        .set_index('sale_date')
        .sort_index()['total_sales']
        .resample('D')
        .sum()
    )

    # Skip articles whose daily series is too short to fit a model
    if len(ts_data) < 10:  # Adjust the minimum threshold as needed
        continue

    # Step 4: Automatically select the best ARIMA model using auto_arima
    try:
        # seasonal=False fits a plain ARIMA; set it to True if seasonality is detected.
        # trace=True prints every candidate model of the stepwise search, which
        # produces a very long cell output when many articles are processed.
        model = auto_arima(ts_data, seasonal=False, stepwise=True, suppress_warnings=True, trace=True)

        # Forecast the next `forecast_steps` days
        forecast = model.predict(n_periods=forecast_steps)

        # Keep only the values. Depending on the pmdarima version, predict()
        # may return a date-indexed Series; storing those directly would make
        # pandas align the articles on their (different) forecast dates.
        forecasts[article_id] = list(forecast)

    except Exception as e:
        # Best effort: report the failure and carry on with the next article
        print(f"ARIMA failed for article_id {article_id}: {e}")
        continue

# Step 5: One row per article_id, one column per forecast day. from_dict with
# explicit columns also works for an empty `forecasts` dict, where assigning
# 30 column names to a 0-column frame used to raise ValueError.
forecast_df = pd.DataFrame.from_dict(forecasts, orient='index', columns=forecast_columns)

# Save forecast to a CSV file
forecast_df.to_csv("auto_arima_top_30_article_forecasts.csv")

print("Forecasting completed and saved for each article_id in the top 30% best-sellers.")


Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=2106.309, Time=0.31 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=3141.791, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=2201.644, Time=0.04 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=2630.811, Time=0.04 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=2120.560, Time=0.13 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=2203.823, Time=0.12 sec
 ARIMA(3,0,2)(0,0,0)[0]             : AIC=2105.205, Time=0.28 sec
 ARIMA(3,0,1)(0,0,0)[0]             : AIC=2109.238, Time=0.19 sec
 ARIMA(4,0,2)(0,0,0)[0]             : AIC=2112.325, Time=0.50 sec
 ARIMA(3,0,3)(0,0,0)[0]             : AIC=2108.223, Time=0.44 sec
 ARIMA(2,0,3)(0,0,0)[0]             : AIC=2103.533, Time=0.41 sec
 ARIMA(1,0,3)(0,0,0)[0]             : AIC=2113.059, Time=0.18 sec
 ARIMA(2,0,4)(0,0,0)[0]             : AIC=2114.426, Time=0.45 sec
 ARIMA(1,0,4)(0,0,0)[0]             : AIC=2108.371, Time=0.25 sec
 ARIMA(3,0,4)(0,0,0)[0]          

KeyboardInterrupt: 