In [5]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sys
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.tuning import CrossValidator
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import FloatType, IntegerType, LongType

from recommenders.datasets import movielens
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation
from recommenders.tuning.parameter_sweep import generate_param_grid
from recommenders.datasets.spark_splitters import spark_random_split

print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"PySpark version: {pyspark.__version__}")

System version: 3.12.3 (main, Feb  4 2025, 14:48:35) [GCC 13.3.0]
Pandas version: 2.2.3
PySpark version: 3.5.4


In [6]:
RANK = 10
MAX_ITER = 15
REG_PARAM = 0.05

k = 10

In [7]:
# Processing of behavior dataset

import re

def aggregateRating(x): 
  items = x.split(" ")
  clicked = re.findall(r"\w+-1", x)
  return len(items) // len(clicked)

def getClicked(x):
  return re.findall(r"(\w+)-1", x)

def str2int(x):
  m = re.search(r"\d+", x)
  if not m:
    raise Exception("didnt find number in the index")

  return int(m.group())

def load_impression(path):

  impressions = pd.read_csv(
      path,
      sep='\t',
      header=None,
      names=["impressionId", "userId", "time", "history", "impressions"]
    )

  impressions = impressions.dropna()




  impressions["rating"] = impressions["impressions"].apply(aggregateRating)
  impressions["newsId"] = impressions["impressions"].apply(getClicked)
  impressions = impressions.explode("newsId").reset_index()



  # Convert ids to int for als
  impressions["user"] = impressions["userId"].apply(str2int)
  impressions["item"] = impressions["newsId"].apply(str2int)

  impressions = impressions[["user", "item", "rating"]]

  return impressions

In [8]:
impression_train = load_impression("../MIND/train/behaviors.tsv")
impression_test = load_impression("../MIND/test/behaviors.tsv")

print("train")
print(impression_train.info())
print("\ntest\n")
print(impression_test.info())

train
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231530 entries, 0 to 231529
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    231530 non-null  int64
 1   item    231530 non-null  int64
 2   rating  231530 non-null  int64
dtypes: int64(3)
memory usage: 5.3 MB
None

test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107968 entries, 0 to 107967
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   user    107968 non-null  int64
 1   item    107968 non-null  int64
 2   rating  107968 non-null  int64
dtypes: int64(3)
memory usage: 2.5 MB
None


In [9]:
# Import SparkSession if you haven't already
from pyspark.sql import SparkSession

# Create a SparkSession if not already created
spark = SparkSession.builder \
    .appName("Recommendation System") \
    .getOrCreate()

# Convert your pandas DataFrame to a Spark DataFrame
impressions_spark_train = spark.createDataFrame(impression_train)
impressions_spark_test = spark.createDataFrame(impression_test)

# Now use the Spark DataFrame with ALS
als = ALS(
    maxIter=MAX_ITER,
    rank=RANK,
    regParam=REG_PARAM,
    userCol="user",
    itemCol="item",
    ratingCol="rating",
    coldStartStrategy="drop"
)

# Use the Spark DataFrame, not the pandas DataFrame
model = als.fit(impressions_spark_train)

25/04/03 10:58:46 WARN Utils: Your hostname, sondre-ThinkPad-E580 resolves to a loopback address: 127.0.1.1; using 10.252.38.200 instead (on interface wlp5s0)
25/04/03 10:58:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/03 10:58:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/04/03 10:59:03 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
25/04/03 10:59:14 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

## Prediction

In [10]:
prediction = model.transform(impressions_spark_train).drop("rating")

print(prediction.columns)

['user', 'item', 'prediction']


In [20]:
evaluations = SparkRatingEvaluation(
  impressions_spark_test,
  prediction,
  col_user="user",
  col_item="item",
  col_rating="rating",
  col_prediction="prediction"
)

print(
    f"RMSE score = {evaluations.rmse()}",
    f"MAE score = {evaluations.mae()}",
    f"R2 score = {evaluations.rsquared()}",
    f"Explained variance score = {evaluations.exp_var()}",
    sep="\n"
)

[Stage 548:>                                                        (0 + 1) / 1]

RMSE score = 35.43585540187076
MAE score = 24.79759323332045
R2 score = -0.1470159775856399
Explained variance score = -0.14337300674306608


                                                                                

[Stage 548:>                                                        (0 + 1) / 1]

In [None]:
COL_USER = "user"
COL_ITEM = "item"

dfs_train = impressions_spark_train





# Get the cross join of all user-item pairs and score them.
users = dfs_train.select(COL_USER).distinct().limit(1000)  # Limit to 1000 users
items = dfs_train.select(COL_ITEM).distinct().limit(1000)  # Limit to 1000 items
user_item = users.crossJoin(items)
dfs_pred = model.transform(user_item)


# Remove seen items.
dfs_train = dfs_train.selectExpr("user as train_user", "item as train_item", "rating as train_rating")

dfs_pred_exclude_train = dfs_pred.alias("pred").join(
    dfs_train.alias("train"),
    (col("pred.user") == col("train.train_user")) & (col("pred.item") == col("train.train_item")),
    how="outer"
)

dfs_pred_final = dfs_pred_exclude_train.filter(col("train.train_rating").isNull()) \
    .selectExpr("pred.user", "pred.item", "pred.prediction")

dfs_pred_final.show()


[Stage 548:>                (0 + 1) / 1][Stage 999:>                (0 + 1) / 1]

+----+-----+----------+
|user| item|prediction|
+----+-----+----------+
|  19| 2869|-57.652687|
|  19| 4034|  36.51317|
|  19| 4637| 2.4916067|
|  19| 5497| 11.431451|
|  19| 5582| 2.9806218|
|  19| 8509|-5.3680344|
|  19| 8855| -58.02987|
|  19| 9392|  5.129809|
|  19| 9464|  -8.60764|
|  19|10754| -68.82276|
|  19|10929|  4.175267|
|  19|11830| 16.848124|
|  19|13299|-30.541786|
|  19|14329|  -23.0581|
|  19|14436|  45.06962|
|  19|14977| 44.303524|
|  19|15279|  9.623946|
|  19|16804| 46.107254|
|  19|17312| 1.7360251|
|  19|18144| 14.178882|
+----+-----+----------+
only showing top 20 rows



                                                                                

[Stage 548:>                                                        (0 + 1) / 1]

In [23]:
dfs_test = impressions_spark_test
COL_RATING="rating"
COL_PREDICTION="prediction"

K = 10

evaluations = SparkRankingEvaluation(
    dfs_test, 
    dfs_pred_final,
    col_user=COL_USER,
    col_item=COL_ITEM,
    col_rating=COL_RATING,
    col_prediction=COL_PREDICTION,
    k=K
)

print(
    f"Precision@k = {evaluations.precision_at_k()}",
    f"Recall@k = {evaluations.recall_at_k()}",
    f"NDCG@k = {evaluations.ndcg_at_k()}",
    f"Mean average precision = {evaluations.map_at_k()}",
    sep="\n"
)

[Stage 548:>                (0 + 1) / 1][Stage 1101:>               (0 + 1) / 1]

Precision@k = 0.0
Recall@k = 0.0
NDCG@k = 0.0
Mean average precision = 0.0


                                                                                

[Stage 548:>                                                        (0 + 1) / 1]