## Import Libraries

In [38]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.sql.types import StringType, DoubleType
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col, dayofmonth, month, year,  to_date, to_timestamp, weekofyear, dayofweek

In [39]:
# Initialize SparkSession
spark = SparkSession.builder.appName("ModelComparison").getOrCreate()

# Importing sales data
sales_data = spark.read.csv(
    "Online Retail.csv", header=True, inferSchema=True, sep=",")

# Convert InvoiceDate to datetime 
sales_data = sales_data.withColumn("InvoiceDate", to_date(
    to_timestamp(col("InvoiceDate"), "d/M/yyyy H:mm")))

# Aggregate data into daily intervals
daily_sales_data = sales_data.groupBy("Country", "StockCode", "InvoiceDate", "Year", "Month", "Day", "Week", "DayOfWeek").agg({"Quantity": "sum",                                                                                                           "UnitPrice": "avg"})
# Rename the target column
daily_sales_data = daily_sales_data.withColumnRenamed(
    "sum(Quantity)", "Quantity")

In [40]:
split_date_train_test = "2011-09-25"

# Creating the train and test datasets
train_data = daily_sales_data.filter(
    col("InvoiceDate") <= split_date_train_test)

test_data = daily_sales_data.filter(
    col("InvoiceDate") > split_date_train_test)

pd_daily_train_data = train_data.toPandas()
pd_daily_train_data.head()

Unnamed: 0,Country,StockCode,InvoiceDate,Year,Month,Day,Week,DayOfWeek,avg(UnitPrice),Quantity
0,United Kingdom,22912,2010-01-12,2010,1,12,2,1,4.95,3
1,France,22659,2010-01-12,2010,1,12,2,1,1.95,24
2,United Kingdom,21544,2010-01-12,2010,1,12,2,1,0.85,12
3,United Kingdom,21098,2010-01-12,2010,1,12,2,1,1.25,16
4,Norway,85150,2010-01-12,2010,1,12,2,1,2.55,12


## Define a fuction to evaluate each model

In [41]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor

def train_and_evaluate_regression_model(train_data, test_data, model_type):
    
    # Creating indexer for categorical columns
    country_indexer = StringIndexer(
        inputCol="Country", outputCol="CountryIndex").setHandleInvalid("keep")
    stock_code_indexer = StringIndexer(
        inputCol="StockCode", outputCol="StockCodeIndex").setHandleInvalid("keep")
    
    # Define feature columns
    feature_cols = ["CountryIndex", "StockCodeIndex", "Month", "Year", "DayOfWeek", "Day", "Week"]

    # Assemble features
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

    # Initialize the specified regression model
    if model_type == "LinearRegression":
        model = LinearRegression(labelCol="Quantity", featuresCol="features")
    elif model_type == "DecisionTree":
        model = DecisionTreeRegressor(labelCol="Quantity", featuresCol="features", maxBins=4000)
    elif model_type == "RandomForest":
        model = RandomForestRegressor(labelCol="Quantity", featuresCol="features", maxBins=4000)
    elif model_type == "GBT":
        model = GBTRegressor(labelCol="Quantity", featuresCol="features", maxBins=4000)
    else:
        raise ValueError("Invalid model type. Choose from 'LinearRegression', 'DecisionTree', 'RandomForest', or 'GBT'.")

    # Create a pipeline for the model
    pipeline = Pipeline(stages=[country_indexer, stock_code_indexer, assembler, model])

    # Train the model
    trained_model = pipeline.fit(train_data)

    # Make predictions on test data
    test_predictions = trained_model.transform(test_data)

    # Initialize evaluators
    evaluators = {
        "rmse": RegressionEvaluator(labelCol="Quantity", predictionCol="prediction", metricName="rmse"),
        "r2": RegressionEvaluator(labelCol="Quantity", predictionCol="prediction", metricName="r2"),
        "mse": RegressionEvaluator(labelCol="Quantity", predictionCol="prediction", metricName="mse"),
        "mae": RegressionEvaluator(labelCol="Quantity", predictionCol="prediction", metricName="mae")
    }

    # Initialize results dictionary
    results = {}

    # Obtain evaluation metrics
    for metric, evaluator in evaluators.items():
        results[metric] = evaluator.evaluate(test_predictions)

    return results


In [42]:
import pandas as pd

# Initialize an empty dictionary to store evaluation results
evaluation_results_dict = {}

# Define model types
model_types = ["RandomForest", "GBT", "LinearRegression", "DecisionTree"]

# Loop through each model type
for model_type in model_types:
    # Assuming train_and_evaluate_regression_model returns a dictionary of evaluation results
    evaluation_results = train_and_evaluate_regression_model(train_data, test_data, model_type)
    # Store evaluation results in the dictionary
    evaluation_results_dict[model_type] = evaluation_results

# Create a DataFrame from the dictionary
df = pd.DataFrame(evaluation_results_dict)

df


24/03/25 15:16:54 WARN DAGScheduler: Broadcasting large task binary with size 1615.0 KiB
24/03/25 15:16:55 WARN DAGScheduler: Broadcasting large task binary with size 2.5 MiB


                                                                                

24/03/25 15:17:04 WARN DAGScheduler: Broadcasting large task binary with size 1002.8 KiB
24/03/25 15:17:04 WARN DAGScheduler: Broadcasting large task binary with size 1083.2 KiB
24/03/25 15:17:04 WARN DAGScheduler: Broadcasting large task binary with size 1122.6 KiB
24/03/25 15:17:04 WARN DAGScheduler: Broadcasting large task binary with size 1123.5 KiB
24/03/25 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1192.0 KiB
24/03/25 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1226.0 KiB
24/03/25 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1296.5 KiB
24/03/25 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1333.6 KiB
24/03/25 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1334.5 KiB
24/03/25 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1365.4 KiB
24/03/25 15:17:05 WARN DAGScheduler: Broadcasting large task binary with size 1451.4 KiB
24/03/25 15:17:06 WAR

Unnamed: 0,RandomForest,GBT,LinearRegression,DecisionTree
rmse,15.277836,18.591713,17.863819,14.943716
r2,0.276143,-0.071933,0.010359,0.307458
mse,233.412266,345.651795,319.116026,223.31466
mae,9.402512,11.292199,11.501712,8.940368


In [43]:
best_model = df.transpose()
best_model = best_model.sort_values("mae")
best_model

Unnamed: 0,rmse,r2,mse,mae
DecisionTree,14.943716,0.307458,223.31466,8.940368
RandomForest,15.277836,0.276143,233.412266,9.402512
GBT,18.591713,-0.071933,345.651795,11.292199
LinearRegression,17.863819,0.010359,319.116026,11.501712


Hence, the best model in performance is Decision Tree though higher computational cost than RandomForest

In [44]:
from pyspark.ml.regression import DecisionTreeRegressor

# Creating indexer for categorical columns
country_indexer = StringIndexer(
    inputCol="Country", outputCol="CountryIndex").setHandleInvalid("keep")
stock_code_indexer = StringIndexer(
    inputCol="StockCode", outputCol="StockCodeIndex").setHandleInvalid("keep")

# Selectiong features columns
feature_cols = ["CountryIndex", "StockCodeIndex", "Month", "Year",
                "DayOfWeek", "Day", "Week"]

# Using vector assembler to combine features
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

best_model = DecisionTreeRegressor(labelCol="Quantity", featuresCol="features", maxBins=4000)

# Create a pipeline for staging the processes
pipeline = Pipeline(stages=[country_indexer, stock_code_indexer, assembler, best_model])

# Training the model
model = pipeline.fit(train_data)

In [46]:
model.save("best_model")

In [None]:
import pickle
import threading

# Save the model using pickle
model._lock = threading.Lock()  # Add this line to create a lock object
with open("best_model.pkl", "wb") as f:
    pickle.dump(model, f)

TypeError: cannot pickle '_thread.RLock' object

In [47]:
# Stop the Spark session

spark.stop()