# Recomendation Systems #

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [26]:
spark = SparkSession.builder.appName("RecommendationSystemData").getOrCreate()
data = [
    (1, 101, 4.5, "2023-01-01 10:00:00"),
    (2, 105, 3.0, "2023-01-01 11:15:00"),
    (1, 106, 5.0, "2023-01-02 09:45:00"),
    (3, 103, 2.0, "2023-01-02 15:20:00"),
    (2, 102, 4.0, "2023-01-03 13:35:00"),
    (3, 104, 4.0, "2023-01-03 17:00:00"),
    (1, 105, 3.5, "2023-01-04 08:00:00"),
    (4, 101, 2.0, "2023-01-04 10:00:00"),
    (4, 102, 3.5, "2023-01-05 12:15:00"),
    (5, 103, 5.0, "2023-01-05 14:30:00"),
    (6, 104, 4.5, "2023-01-06 16:00:00"),
    (5, 106, 2.0, "2023-01-07 09:15:00"),
    (7, 107, 4.0, "2023-01-07 11:45:00"),
    (8, 108, 3.5, "2023-01-08 13:25:00"),
    (9, 109, 4.0, "2023-01-08 15:40:00"),
    (10, 110, 5.0, "2023-01-09 08:50:00"),
    (1, 111, 3.0, "2023-01-10 10:20:00"),
    (2, 112, 4.0, "2023-01-10 12:30:00"),
    (3, 113, 2.5, "2023-01-11 09:00:00"),
    (4, 114, 3.5, "2023-01-11 14:45:00"),
    (5, 115, 4.5, "2023-01-12 15:15:00"),
    (6, 116, 2.0, "2023-01-12 17:30:00"),
    (7, 117, 5.0, "2023-01-13 18:00:00"),
    (8, 118, 4.0, "2023-01-14 10:30:00"),
    (9, 119, 3.0, "2023-01-15 11:00:00"),
    (10, 120, 4.0, "2023-01-15 12:15:00"),
    (1, 121, 3.5, "2023-01-16 08:20:00"),
    (2, 122, 2.5, "2023-01-16 14:30:00"),
    (3, 123, 5.0, "2023-01-17 09:50:00"),
    (4, 124, 4.5, "2023-01-18 10:10:00"),
    (5, 125, 3.0, "2023-01-19 12:20:00"),
    (6, 126, 2.5, "2023-01-20 15:35:00"),
    (7, 127, 4.0, "2023-01-21 16:00:00"),
    (8, 128, 5.0, "2023-01-22 10:45:00"),
    (9, 129, 3.5, "2023-01-23 11:10:00"),
    (10, 130, 4.0, "2023-01-24 12:30:00"),
    (1, 131, 2.0, "2023-01-25 14:50:00"),
    (2, 132, 3.5, "2023-01-26 09:15:00"),
    (3, 133, 4.5, "2023-01-27 10:00:00"),
    (4, 134, 3.0, "2023-01-28 13:30:00"),
    (5, 135, 4.0, "2023-01-29 15:00:00"),
    (6, 136, 2.5, "2023-01-30 17:20:00"),
    (7, 137, 5.0, "2023-01-31 08:00:00"),
    (8, 138, 3.5, "2023-02-01 09:45:00"),
    (9, 139, 4.0, "2023-02-02 11:00:00"),
    (10, 140, 3.0, "2023-02-03 14:00:00"),
]

# The DataFrame creation code would remain the same

columns = ["user_id", "item_id", "rating", "timestamp"]
df = spark.createDataFrame(data, schema=columns)
df.printSchema()
df.show()


root
 |-- user_id: long (nullable = true)
 |-- item_id: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: string (nullable = true)

+-------+-------+------+-------------------+
|user_id|item_id|rating|          timestamp|
+-------+-------+------+-------------------+
|      1|    101|   4.5|2023-01-01 10:00:00|
|      2|    105|   3.0|2023-01-01 11:15:00|
|      1|    106|   5.0|2023-01-02 09:45:00|
|      3|    103|   2.0|2023-01-02 15:20:00|
|      2|    102|   4.0|2023-01-03 13:35:00|
|      3|    104|   4.0|2023-01-03 17:00:00|
|      1|    105|   3.5|2023-01-04 08:00:00|
|      4|    101|   2.0|2023-01-04 10:00:00|
|      4|    102|   3.5|2023-01-05 12:15:00|
|      5|    103|   5.0|2023-01-05 14:30:00|
|      6|    104|   4.5|2023-01-06 16:00:00|
|      5|    106|   2.0|2023-01-07 09:15:00|
|      7|    107|   4.0|2023-01-07 11:45:00|
|      8|    108|   3.5|2023-01-08 13:25:00|
|      9|    109|   4.0|2023-01-08 15:40:00|
|     10|    110|   5.0|2023-01

In [28]:
df = df.drop("timestamp")
(train_data, test_data) = df.randomSplit([0.8, 0.2], seed=42)
als = ALS(userCol="user_id",
          itemCol="item_id",
          ratingCol="rating",
          rank=10,
          maxIter=5,
          regParam=0.01,
          coldStartStrategy="drop")
model = als.fit(train_data)
predictions = model.transform(test_data)
predictions = predictions.dropna(subset=["prediction"])
if predictions.count() == 0:
    print("No predictions available for evaluation.")
else:
    # Evaluate the model using RMSE
    evaluator_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator_rmse.evaluate(predictions)
    print(f"Root-mean-square error (RMSE) = {rmse}")

    # Evaluate the model using MAE
    evaluator_mae = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")
    mae = evaluator_mae.evaluate(predictions)
    print(f"Mean Absolute Error (MAE) = {mae}")

    # Evaluate the model using R-squared
    evaluator_r2 = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")
    r2 = evaluator_r2.evaluate(predictions)
    print(f"R-squared = {r2}")

    # Show the predictions
    predictions.show()


Root-mean-square error (RMSE) = 3.2702302865081565
Mean Absolute Error (MAE) = 3.2163821359475455
R-squared = -63.16643676077132
+-------+-------+------+-----------+
|user_id|item_id|rating| prediction|
+-------+-------+------+-----------+
|      2|    102|   4.0| 0.89628565|
|      1|    105|   3.5|-0.48994514|
|      6|    104|   4.5|  1.9445131|
+-------+-------+------+-----------+



24/11/05 15:23:37 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1980556 ms exceeds timeout 120000 ms
24/11/05 15:23:37 WARN SparkContext: Killing executors is not supported by current scheduler.
24/11/05 15:23:37 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$