# Collaborative Filtering With ALS

#### Initialize Spark Session

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells visible in this notebook read from the mounted
# drive (the dataset is uploaded through files.upload() instead) — confirm the
# mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Initialize the Spark session.
# Driver and executor memory are both set to 8g, and the driver JVM is asked to
# use UTF-8 as its default file encoding.
spark = (
    SparkSession.builder
    .appName("RecommendationSystem")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.extraJavaOptions", "-Dfile.encoding=UTF-8")
    .getOrCreate()
)

#### Review Transactions Data

In [None]:
# Upload the dataset archive to Colab through the browser file picker.
# `uploaded` is keyed by the uploaded file name(s); the unzip cell reads the
# first key to locate the archive in the working directory.
from google.colab import files
uploaded = files.upload()

Saving transactions_data.zip to transactions_data.zip


In [None]:
import os
import zipfile

# Fail with an actionable message if the upload was cancelled: with an empty
# `uploaded` mapping, next(iter(uploaded)) would raise a bare StopIteration.
if not uploaded:
    raise RuntimeError(
        "No file was uploaded; re-run the upload cell and select the dataset zip."
    )

# Name of the first uploaded file (the archive saved in the working directory)
zip_file_name = next(iter(uploaded))

# Extract the archive contents into the current working directory
with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall('.')

# Remove the archive once extracted to save disk space.
# Note: re-running this cell without uploading again will fail because the
# archive no longer exists.
os.remove(zip_file_name)

In [None]:
# Load the transactions from Parquet (a columnar format: fast reads/writes and
# efficient compression) and keep only what collaborative filtering needs:
# who rated (customer_id), what was rated (product_name) and the rating itself.
# Rows with a null in any of those three columns are discarded.
df = (
    spark.read.parquet("transactions_data")
    .select("customer_id", "product_name", "rating")
    .dropna()
)
df.show(5)

+-----------+------------+------+
|customer_id|product_name|rating|
+-----------+------------+------+
|      C1041|       Flour|     1|
|      C1006|     Chicken|     5|
|      C1027|  Toothpaste|     1|
|      C1008|        Beef|     4|
|      C1015|Paper Towels|     2|
+-----------+------------+------+
only showing top 5 rows



#### Prepare Transactions Data for Collaborative Filtering

In [None]:
# ALS requires numeric IDs, so string identifiers are converted to numeric indexes.
def index_column(df, column_name):
    """Return `df` with an added `<column_name>_index` column.

    A StringIndexer is fitted on `column_name` of `df` and then applied to that
    same DataFrame; the result is the input plus the new index column.
    """
    indexed_name = column_name + "_index"
    fitted_indexer = StringIndexer(inputCol=column_name, outputCol=indexed_name).fit(df)
    return fitted_indexer.transform(df)

# Index the customer identifier first, then the product name, so the index
# columns are appended in that order.
for id_column in ("customer_id", "product_name"):
    df = index_column(df, id_column)
df.show(5)

+-----------+------------+------+-----------------+------------------+
|customer_id|product_name|rating|customer_id_index|product_name_index|
+-----------+------------+------+-----------------+------------------+
|      C1041|       Flour|     1|             43.0|              20.0|
|      C1006|     Chicken|     5|             10.0|              31.0|
|      C1027|  Toothpaste|     1|             38.0|               7.0|
|      C1008|        Beef|     4|             45.0|              17.0|
|      C1015|Paper Towels|     2|             12.0|               2.0|
+-----------+------------+------+-----------------+------------------+
only showing top 5 rows



#### Split Data into Training and Test

In [None]:
# Hold out roughly 20% of the rows for testing and train on the remaining 80%.
# The fixed seed keeps the random assignment repeatable between runs.
train_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=42)

#### Define and Train ALS Model

In [None]:
# Define the baseline ALS model for explicit ratings.
# coldStartStrategy is left at its default here; the "Implement Cold Start
# Strategy" section switches it to "drop".
als = ALS(
    userCol="customer_id_index",
    itemCol="product_name_index",
    ratingCol="rating",
    nonnegative=True,     # Constrain the learned latent factors to be non-negative
    implicitPrefs=False,  # Treat ratings as explicit feedback
    seed=42,              # Fixed seed so training is repeatable, like the seeded train/test split
)

# Train ALS Model
model = als.fit(train_data)

# Generate Predictions for the held-out rows
predictions = model.transform(test_data)
predictions.show(5)

+-----------+------------+------+-----------------+------------------+----------+
|customer_id|product_name|rating|customer_id_index|product_name_index|prediction|
+-----------+------------+------+-----------------+------------------+----------+
|      C1001|      Apples|     1|             22.0|              34.0| 2.8361955|
|      C1001|      Apples|     2|             22.0|              34.0| 2.8361955|
|      C1001|      Apples|     2|             22.0|              34.0| 2.8361955|
|      C1001|      Apples|     3|             22.0|              34.0| 2.8361955|
|      C1001|      Apples|     4|             22.0|              34.0| 2.8361955|
+-----------+------------+------+-----------------+------------------+----------+
only showing top 5 rows



#### Evaluate Trained Model

In [None]:
# Score the held-out predictions with Root Mean Square Error (RMSE):
# the lower the value, the closer predicted ratings are to the observed ones.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Square Error (RMSE): {rmse}")

Root Mean Square Error (RMSE): 1.422258566252489


#### Implement Cold Start Strategy

In [None]:
# Define the ALS model with an explicit cold-start strategy.
als = ALS(
    userCol="customer_id_index",
    itemCol="product_name_index",
    ratingCol="rating",
    nonnegative=True,          # Constrain the learned latent factors to be non-negative
    coldStartStrategy="drop",  # Drop prediction rows that come out as NaN (user/item unseen in training)
    implicitPrefs=False,       # Treat ratings as explicit feedback
    seed=42,                   # Fixed seed so training is repeatable, like the seeded train/test split
)

# Train ALS Model
model = als.fit(train_data)

# Generate Predictions for the held-out rows
predictions = model.transform(test_data)

# Evaluate Model Performance Using RMSE
# Root Mean Square Error (RMSE) measures how far predictions are from the observed ratings.
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Square Error (RMSE): {rmse}")

Root Mean Square Error (RMSE): 1.422258566252486


#### Hyperparameter Tuning with `ParamGridBuilder` and `CrossValidator`

In [None]:
# Define parameter grid for hyperparameter tuning.
# 3 ranks x 3 regularization values x 2 iteration counts = 18 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(als.rank, [5, 10, 15])           # Number of latent factors
    .addGrid(als.regParam, [0.01, 0.1, 0.5])  # Regularization parameter
    .addGrid(als.maxIter, [10, 20])           # Number of iterations
    .build()
)

# Define evaluator for RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# Set up cross-validator
crossval = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,  # 3-fold cross-validation
    seed=42,     # Fixed seed so fold assignment is repeatable, like the seeded train/test split
)

In [None]:
# Train ALS model using cross-validation (every grid combination is fitted on every fold)
cvModel = crossval.fit(train_data)

# Get the best model from cross-validation
bestModel = cvModel.bestModel

# Recover the winning hyperparameters through the public API instead of reaching
# into the private `_java_obj`: avgMetrics[i] is the cross-validated metric for
# getEstimatorParamMaps()[i], and the best entry is the min or max depending on
# the evaluator's metric direction.
# Assumes the averaged metrics contain no NaN values.
avg_metrics = cvModel.avgMetrics
pick_best = max if cvModel.getEvaluator().isLargerBetter() else min
best_index = pick_best(range(len(avg_metrics)), key=avg_metrics.__getitem__)
best_params = {
    param.name: value
    for param, value in cvModel.getEstimatorParamMaps()[best_index].items()
}

# Print best hyperparameters
print(f"Best Rank: {bestModel.rank}")
print(f"Best Regularization: {best_params['regParam']}")
print(f"Best Iterations: {best_params['maxIter']}")

# Evaluate on test data
predictions = bestModel.transform(test_data)

rmse = evaluator.evaluate(predictions)

print(f"Optimized RMSE: {rmse}")

Best Rank: 5
Best Regularization: 0.01
Best Iterations: 20
Optimized RMSE: 1.4173119141981854
