# Collaborative FIltering

In [1]:
# Import Sparksession
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("CF").getOrCreate()

In [2]:
# Print PySpark and Python versions
import sys
print('Python version: '+sys.version)
print('Spark version: '+spark.version)

Python version: 3.8.4 | packaged by conda-forge | (default, Jul 17 2020, 15:16:46) 
[GCC 7.5.0]
Spark version: 3.0.0


## Load Data

In [25]:
# Read data
file_location = "cf_data.csv"
file_type = "csv"
infer_schema = "false"
first_row_is_header = "true"


df = spark.read.format(file_type)\
.option("inferSchema", infer_schema)\
.option("header", first_row_is_header)\
.load(file_location)


In [26]:
# Print Metadata
df.printSchema()

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)



In [27]:
#  Count data
df.count()
print('The total number of records in the credit card dataset are '+str(df.count()))

The total number of records in the credit card dataset are 98468


# Import appropriate libraries


In [33]:
# Import appropriate libraries
from pyspark.sql.types import *
import pyspark.sql.functions as sql_fun
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics
from pyspark.ml.evaluation import RegressionEvaluator
import re

## Model building

In [34]:
# Casting variables
int_vars=['userId','movieId']
for column in int_vars:
	df=df.withColumn(column,df[column].cast(IntegerType()))
float_vars=['rating']
for column in float_vars:
	df=df.withColumn(column,df[column].cast(FloatType()))

(training, test) = df.randomSplit([0.8, 0.2])

als = ALS(rank=15,maxIter=2, regParam=0.01, 
          userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop",
          implicitPrefs=False) 
model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))


Root-mean-square error = 1.475354940517705


In [35]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.count()
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)
movieRecs.count()

8947

In [36]:
userRecs_df = userRecs.toPandas()
print(userRecs_df.shape)

movieRecs_df = movieRecs.toPandas()
print(movieRecs_df.shape)

(600, 2)
(8947, 2)


In [37]:
userRecs_df.head()

Unnamed: 0,userId,recommendations
0,190174,"[(3436, 9.908001899719238), (54023, 9.61083316..."
1,190227,"[(4771, 7.483070373535156), (8767, 7.309700012..."
2,190387,"[(8767, 12.672131538391113), (1336, 12.2035512..."
3,190540,"[(3277, 9.617005348205566), (33789, 9.22812843..."
4,190348,"[(3436, 8.795571327209473), (3859, 8.479754447..."
