# **Required Dependencies**

# **Task 1**

In [1]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate() #Reuse the active Spark session, or create one if none exists

#Explicit schemas so every column is read with a fixed type instead of being inferred
#https://sparkbyexamples.com/pyspark/pyspark-read-csv-file-into-dataframe/
RatingsSchema = StructType([
    StructField("UserID", IntegerType(), True),
    StructField("ItemID", IntegerType(), True),
    StructField("Rating", FloatType(), True)
])

TrustSchema = StructType([
    StructField("TrustorUserID", IntegerType(), True),
    StructField("TrusteeUserID", IntegerType(), True),
    StructField("TrustValue", IntegerType(), True)
])


def _load_and_clean(session, path, schema, key_columns):
    """Read a space-separated file and return it with incomplete and duplicate rows removed.

    The row count is printed after each stage (as loaded, after dropping rows
    that contain nulls, after de-duplicating) so the effect of every cleaning
    step can be compared.

    Parameters
    ----------
    session : SparkSession
        Session used to read the file.
    path : str
        Location of the space-separated text file.
    schema : StructType
        Column names and types applied while reading.
    key_columns : list of str
        Columns that identify a row; only one row is kept per distinct
        combination of these columns.

    Returns
    -------
    DataFrame
        The cleaned data.
    """
    loaded = session.read.load(path, format='csv', sep=" ", schema=schema)
    print("There are", loaded.count(), "rows of data as standard")

    complete = loaded.dropna() #Remove rows with empty data present
    print("There are", complete.count(), "rows of data after dropping empty data")

    unique = complete.dropDuplicates(key_columns) #Keep a single row per key combination
    print("There are", unique.count(), "rows of data after dropping duplicates")
    return unique


#One rating is kept per (user, item) pair and one trust statement per (trustor, trustee) pair
RatingsDF = _load_and_clean(spark, "ratings.txt", RatingsSchema, ['UserID', 'ItemID'])
TrustDF = _load_and_clean(spark, "trust.txt", TrustSchema, ['TrustorUserID', 'TrusteeUserID'])

import matplotlib.pyplot as plt
import numpy as np

#Bring the cleaned ratings onto the driver as a pandas DataFrame and pick out the Rating column
RatingsPandas = RatingsDF.toPandas()
RatingsOnly = RatingsPandas['Rating']

plt.rcParams['figure.dpi'] = 100

#Bin edges sit 0.25 either side of 0.0, 0.5, ..., 4.0 so every bar is centred on one of those values
bins = [-0.25 + 0.5 * step for step in range(10)]

#Draw the histogram on an explicit Axes and label each bar with its count
fig, ax = plt.subplots(figsize=(9, 9))
counts, edges, bars = ax.hist(RatingsOnly, bins=bins, edgecolor="black")
ax.set_xlabel('Rating')
ax.set_ylabel('Times the Rating was given')
ax.bar_label(bars)
plt.show()
#https://docs.kanaries.net/topics/PySpark/pyspark-dataframe-column-list
#https://www.geeksforgeeks.org/how-to-plot-histogram-from-list-of-data-in-matplotlib/
#https://stackoverflow.com/questions/39841733/matplotlib-histogram-how-to-display-the-count-over-the-bar
#https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.hist.html


#Per-film statistics: the average rating and the number of ratings received.
#Only films with more than 10 ratings are kept.
dfR = (
    RatingsDF
    .groupBy("ItemID")
    .agg(
        mean('Rating').alias("AverageRating"),
        count('ItemID').alias("NumOfRatings"),
    )
    .where(col("NumOfRatings") > 10)
)

print("Top 5 films")
dfR.orderBy("AverageRating", ascending=False).show(5)
print("Bottom 5 films")
dfR.orderBy("AverageRating", ascending=True).show(5)

print("The top 10 most rated films")
dfR.orderBy("NumOfRatings", ascending=False).show(10)

#Number of rows in which each user appears as the trustee; users with 5 or fewer are left out.
dfTrustee = (
    TrustDF
    .groupBy("TrusteeUserID")
    .agg(count('TrusteeUserID').alias("NumOfRatings"))
    .where(col("NumOfRatings") > 5)
)

print("Top 15 users to be rated by other users")
dfTrustee.orderBy("NumOfRatings", ascending=False).show(15)

#Number of ratings each user has submitted
dfRUsers = RatingsDF.groupBy("UserID").agg(count('UserID').alias("RatingsGiven"))

print("The top 10 users with the most ratings given")
dfRUsers.orderBy("RatingsGiven", ascending=False).show(10)

ModuleNotFoundError: No module named 'pyspark'

Part 3

In [4]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

#https://medium.com/@brunoborges_38708/recommender-system-using-als-in-pyspark-10329e1d1ee1
#https://api-docs.databricks.com/python/pyspark/latest/api/pyspark.ml.recommendation.ALS.html

#Split the ratings 90/10 into training and test sets using a fixed seed
train_data, test_data = RatingsDF.randomSplit([0.9, 0.1], seed=1234)

#Both splits are used by several actions (fit, transform, evaluate), so mark them
#for caching instead of re-deriving them from the source file each time
train_data = train_data.cache()
test_data = test_data.cache()

#Configure the ALS model
als = ALS(
    maxIter=10,
    regParam=0.1,
    itemCol="ItemID",
    userCol="UserID",
    ratingCol="Rating",
    coldStartStrategy="drop", #coldStartStrategy is set so rows with NaN predictions are dropped before scoring
    nonnegative=True
)

#Fit the model on the training data and predict ratings for the held-out test data
model = als.fit(train_data)
predictions = model.transform(test_data)

#Create the evaluator that compares the "prediction" column against the true "Rating"
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction"
)

#Use RMSE as the mathematical measure of the model's overall accuracy
rmse=evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

#Make 5 recommendations for each user
userRecommendations = model.recommendForAllUsers(5)
userRecommendations.show(truncate=False)


#Testing the optimal number of iterations for ALS:
#fit one model per maxIter value and record the RMSE on both the training and the test split
tries=[2,3,5,10,15,20,25]
test_rmse=np.zeros((len(tries),1))
train_rmse=np.zeros((len(tries),1))

#The evaluator is identical for every run, so it is built once outside the loop
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction"
)

#NOTE: `als` and `model` are reassigned on every pass, so after the loop they hold
#the last configuration in `tries`
for i, iterations in enumerate(tries):
    als = ALS(
        maxIter=iterations,
        regParam=0.1,
        itemCol="ItemID",
        userCol="UserID",
        ratingCol="Rating",
        coldStartStrategy="drop",
        nonnegative=True
    )

    model = als.fit(train_data)
    train_predictions = model.transform(train_data)
    test_predictions = model.transform(test_data)

    train_rmse[i]=evaluator.evaluate(train_predictions)
    test_rmse[i]=evaluator.evaluate(test_predictions)

#Plot both error curves; the legend identifies each line and the red marker at 10 iterations
plt.figure()
plt.title("ALS error against number of iterations")
plt.xlabel("Iterations")
plt.ylabel("Root Mean Squared Error")
plt.plot(tries, train_rmse, 'b', label="Training RMSE")
plt.plot(tries, test_rmse, 'g', label="Test RMSE")
plt.axvline(x=10, color='r', label="maxIter = 10")
plt.legend()
plt.show()


25/01/07 17:00:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/01/07 17:00:48 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Root Mean Squared Error (RMSE) = 0.8452041954555893


                                                                                

+------+--------------------------------------------------------------------------------------------+
|UserID|recommendations                                                                             |
+------+--------------------------------------------------------------------------------------------+
|1     |[{1517, 4.9735093}, {1353, 4.61414}, {162, 4.500662}, {97, 4.500662}, {1111, 4.449911}]     |
|3     |[{319, 4.098793}, {68, 4.030685}, {158, 3.9604828}, {145, 3.9604828}, {107, 3.9604828}]     |
|5     |[{944, 3.3017063}, {487, 3.1185508}, {969, 3.0695858}, {208, 2.895045}, {1183, 2.8861465}]  |
|6     |[{162, 5.239632}, {97, 5.239632}, {944, 5.1879244}, {1517, 5.0094934}, {1245, 4.954931}]    |
|9     |[{312, 4.8559732}, {319, 4.824621}, {68, 4.824369}, {888, 4.479307}, {162, 4.376497}]       |
|12    |[{1353, 5.128199}, {162, 4.938905}, {97, 4.938905}, {1535, 4.9277897}, {1517, 4.8858047}]   |
|13    |[{1517, 4.9222584}, {162, 4.283011}, {97, 4.283011}, {464, 4.2118573}, {11

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/codespace/.python/current/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.python/current/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.python/current/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 