# Demand Forecasting using PySpark and Prophet

## Question 1

### 1. Ingest Data from Source

Install Dependencies

In [None]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install PySpark
!pip install pyspark

# Install Prophet
!pip install prophet




Set Environment Variables and Import Libraries

In [None]:
import os

# Tell Spark where to find Java. The path is presumably where the
# openjdk-8-jdk-headless package from the install cell lands -- verify if the
# runtime image changes. It is set before the SparkSession is created below.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql.functions as F
# The star import supplies StructType, StructField, DateType, IntegerType and
# FloatType used when the forecast output schema is declared.
from pyspark.sql.types import *

import pandas as pd
import numpy as np

import logging
# Only let py4j log records of level ERROR or higher through
logging.getLogger('py4j').setLevel(logging.ERROR)


Create SparkSession



In [None]:
# Build the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName('DemandForecasting')
    .getOrCreate()
)

# Handle to the session's SparkContext (used later to read the default parallelism)
sc = spark.sparkContext


Download and Read Data


In [None]:
# Download the data
!wget https://storage.googleapis.com/bdt-demand-forecast/sales-data.csv
# Read the CSV file into a Spark DataFrame
df = spark.read.csv('sales-data.csv', header=True, inferSchema=True)


--2024-10-27 12:20:17--  https://storage.googleapis.com/bdt-demand-forecast/sales-data.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.143.207, 173.194.69.207, 173.194.79.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.143.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17333449 (17M) [text/csv]
Saving to: ‘sales-data.csv’


2024-10-27 12:20:18 (110 MB/s) - ‘sales-data.csv’ saved [17333449/17333449]



Verify Data Schema


In [None]:
# Print the column names and the types inferred while reading the CSV
df.printSchema()


root
 |-- date: date (nullable = true)
 |-- store: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- sales: integer (nullable = true)



Convert Date Column to DateType


In [None]:
from pyspark.sql.functions import to_date

# Parse the `date` column with an explicit yyyy-MM-dd pattern. The schema
# printed before this cell already lists `date` as a date type, so this mainly
# makes the expected format explicit.
# NOTE(review): confirm whether this step is still needed.
df = df.withColumn('date', to_date('date', 'yyyy-MM-dd'))


Verify Data Schema

In [None]:
# Print the schema again to check the `date` column after the conversion
df.printSchema()


root
 |-- date: date (nullable = true)
 |-- store: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- sales: integer (nullable = true)



In [None]:
# Register the DataFrame as the temporary view `sales_data` so that it can be
# queried with spark.sql
df.createOrReplaceTempView('sales_data')


### Prepare Data by Partitioning


In [None]:
# Aggregate sales to one row per store, item and day. The date is exposed as
# `ds` and the summed sales as `y`, the column names that forecast_store_item
# reads from each group.
sql_statement = '''
  SELECT
    store,
    item,
    CAST(date as date) as ds,
    SUM(sales) as y
  FROM sales_data
  GROUP BY store, item, ds
  ORDER BY store, item, ds
  '''

# Run the query against the `sales_data` temporary view
store_item_history = spark.sql(sql_statement)


Get the Default Parallelism for the Data Aggregated at Store-Item-Date Level


In [None]:
# Get the default parallelism of the SparkContext; it is used as the partition
# count when the history DataFrame is repartitioned
default_parallelism = sc.defaultParallelism
print("Default parallelism (number of partitions to use):", default_parallelism)


Default parallelism (number of partitions to use): 2


Repartition the DataFrame


In [None]:
# Redistribute the history by (store, item) into `default_parallelism`
# partitions and mark the result for caching
store_item_history = (
    store_item_history
    .repartition(default_parallelism, 'store', 'item')
    .cache()
)


### Question 2: Number of Partitions

In [None]:
# Read the partition count from the RDD underlying the repartitioned DataFrame
num_partitions = store_item_history.rdd.getNumPartitions()
print("Number of partitions:", num_partitions)


Number of partitions: 2


### Apply Model Fit and Forecast to Each Store-Item Combination


Define Schema for Forecast Output

In [None]:
# Schema of the rows returned by forecast_store_item: the date, the group key,
# the observed value and the prediction with its upper and lower bounds.
result_schema = StructType([
  StructField(field_name, field_type)
  for field_name, field_type in [
    ('ds', DateType()),
    ('store', IntegerType()),
    ('item', IntegerType()),
    ('y', FloatType()),
    ('yhat', FloatType()),
    ('yhat_upper', FloatType()),
    ('yhat_lower', FloatType()),
  ]
])


Define the forecast_store_item Function


In [None]:
def forecast_store_item( history_pd ):
  """Fit a Prophet model to one store-item history and forecast 90 days ahead.

  The function is applied per group through
  ``groupBy('store', 'item').applyInPandas``, so it keeps a single positional
  parameter.

  Args:
    history_pd: pandas DataFrame with the rows of one store-item combination
      and the columns ``ds`` (date), ``store``, ``item`` and ``y`` (observed
      value). Rows containing missing values are ignored.

  Returns:
    pandas DataFrame with the columns ``ds``, ``store``, ``item``, ``y``,
    ``yhat``, ``yhat_upper`` and ``yhat_lower``: one row per date in the
    forecast frame (history plus 90 future days). ``y`` is NaN on rows that
    have no matching historical date.
  """

  from prophet import Prophet

  # Remove missing values and convert `ds` to real datetimes. The forecast
  # frame is joined to the history on `ds` below; converting here means both
  # sides of the join share a datetime index, instead of relying on pandas to
  # promote an object column of `datetime.date` values implicitly.
  history_pd = history_pd.dropna().assign(
    ds=lambda frame: pd.to_datetime(frame['ds'])
    )

  # Configure the model
  model = Prophet(
    interval_width=0.95,
    growth='linear',
    daily_seasonality=False,
    weekly_seasonality=True,
    yearly_seasonality=True,
    seasonality_mode='multiplicative'
    )

  # Train the model
  model.fit( history_pd )

  # Make predictions for the historical dates plus the next 90 days
  future_pd = model.make_future_dataframe(
    periods=90,
    freq='D',
    include_history=True
    )
  forecast_pd = model.predict( future_pd )

  # Get relevant fields from forecast
  f_pd = forecast_pd[ ['ds','yhat', 'yhat_upper', 'yhat_lower'] ].set_index('ds')

  # Only `y` is needed from the history; store and item are set as constants below
  h_pd = history_pd[ ['ds','y'] ].set_index('ds')

  # Left join keeps every forecast row; future dates get NaN for `y`
  results_pd = f_pd.join( h_pd, how='left' ).reset_index()

  # Get store & item from incoming data set
  results_pd['store'] = history_pd['store'].iloc[0]
  results_pd['item'] = history_pd['item'].iloc[0]

  # Return expected dataset
  return results_pd[ ['ds', 'store', 'item', 'y', 'yhat', 'yhat_upper', 'yhat_lower'] ]


Apply the Forecast Function to Each Group


In [None]:
from pyspark.sql.functions import current_date

# Run forecast_store_item once per (store, item) group; the returned frames
# must match result_schema
forecasts_by_group = store_item_history.groupBy('store', 'item').applyInPandas(
  forecast_store_item, schema=result_schema
)

# Tag every forecast row with the current date as its training date
results = forecasts_by_group.withColumn('training_date', current_date())


### Persist the Forecasts for Evaluation


In [None]:
# Write the forecasts as CSV with a header row to the local path
# 'forecasts.csv', overwriting any output already present there
results.write.csv('forecasts.csv', header=True, mode='overwrite')


### Apply the Model Evaluation Function to Each Model Result


Define the evaluate_forecast Function

In [None]:
def evaluate_forecast( evaluation_pd ):
  """Compute MAE, MSE and RMSE of ``yhat`` against ``y`` for one group of rows.

  Args:
    evaluation_pd: pandas DataFrame with at least the columns ``store``,
      ``item``, ``y`` and ``yhat``. It is expected to hold the rows of a single
      store-item combination, as delivered by ``groupBy(...).applyInPandas``.

  Returns:
    A one-row pandas DataFrame with the columns ``store``, ``item``, ``mae``,
    ``mse`` and ``rmse``; ``store`` and ``item`` are taken from the first row.
  """
  from math import sqrt
  from sklearn.metrics import mean_absolute_error, mean_squared_error

  actual = evaluation_pd['y']
  predicted = evaluation_pd['yhat']

  # Error metrics between observed and predicted values
  mae = mean_absolute_error( actual, predicted )
  mse = mean_squared_error( actual, predicted )
  rmse = sqrt( mse )

  # Identify the group by the key values found in its first row
  store_id = evaluation_pd['store'].iloc[0]
  item_id = evaluation_pd['item'].iloc[0]

  # Assemble the single-row result set
  metrics = {
    'store': [store_id],
    'item': [item_id],
    'mae': [mae],
    'mse': [mse],
    'rmse': [rmse],
  }
  return pd.DataFrame( metrics )


Apply the Evaluation Function


In [None]:
# Limit evaluation to periods where we have historical data
historical_results = results.filter(F.col('ds') < F.lit('2018-01-01'))

# Compute one row of error metrics per (store, item) group
eval_results = historical_results.groupBy('store', 'item').applyInPandas(
  evaluate_forecast,
  schema='store int, item int, mae double, mse double, rmse double',
)


### Print the Evaluation Results


In [None]:
# Print the per store-item error metrics as a text table
eval_results.show()


+-----+----+------------------+------------------+------------------+
|store|item|               mae|               mse|              rmse|
+-----+----+------------------+------------------+------------------+
|    1|   1| 3.486720561981201|19.388858795166016|4.4032781873470155|
|    1|   2| 6.057506084442139| 58.63664627075195| 7.657456906228853|
|    1|   3| 4.644035339355469| 34.85238265991211| 5.903590658227594|
|    1|   4|  3.64391827583313| 20.51084327697754| 4.528889850391323|
|    1|   7| 6.077047348022461| 58.45271301269531| 7.645437398389664|
|    1|   8|7.0304155349731445| 77.56575775146484| 8.807142428249065|
|    1|   9| 5.609813213348389| 49.88712692260742|7.0630819705428465|
|    1|  11| 6.539265155792236| 67.67538452148438| 8.226505000392596|
|    1|  16| 3.902801990509033|24.225801467895508| 4.921971298971125|
|    1|  19| 5.102372646331787| 40.71337127685547| 6.380703039388016|
|    1|  20| 5.218245029449463| 43.94417953491211| 6.629040619494808|
|    1|  22|      7.

## Question 3: Parallelise the Workload


### Demonstrate Parallel Execution


In [None]:
# Import time to measure execution time
import time

# perf_counter is a monotonic clock intended for measuring elapsed time
start_time = time.perf_counter()

# Re-define the forecast application. Spark transformations are lazy, so this
# statement alone only builds the query plan and returns almost immediately.
results = (
  store_item_history
    .groupBy('store', 'item')
      .applyInPandas(forecast_store_item, schema=result_schema)
    .withColumn('training_date', current_date() )
    )

# Trigger an action inside the timed region so the models are actually fitted;
# without it the timer only measures plan construction, not the forecasting.
results.count()

end_time = time.perf_counter()

print("Execution time:", end_time - start_time, "seconds")

# To showcase the parallelisation, compare this wall-clock time between runs
# that use different partition counts / numbers of cores.

Execution time: 0.06285619735717773 seconds
