In [1]:
import findspark
# Locate the local Spark installation and make pyspark importable.
# Keep this cell ahead of the pyspark imports in the next cell.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, LongType, IntegerType
from pyspark.ml.recommendation import ALS

In [3]:
# Attach to an existing Spark session if one is active, otherwise start one
# under the application name shown in the Spark UI.
spark_session = (
    SparkSession.builder
    .appName('Movie Recommendation')
    .getOrCreate()
)

In [4]:
# Explicit column layout for the tab-separated ratings file.
# Every column is declared nullable (third argument True).
schema = (
    StructType()
    .add('user_id', IntegerType(), True)
    .add('movie_id', IntegerType(), True)
    .add('rating', IntegerType(), True)
    .add('timestamp', LongType(), True)
)

# Read the ratings with the schema above instead of letting Spark infer types.
data = spark_session.read.csv(
    '../../data/ml-100k/u.data',
    schema=schema,
    sep='\t',
)
data.show(10)

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
|    298|     474|     4|884182806|
|    115|     265|     2|881171488|
|    253|     465|     5|891628467|
|    305|     451|     3|886324817|
|      6|      86|     3|883603013|
+-------+--------+------+---------+
only showing top 10 rows



In [6]:
def load_movie_names(path='../../data/ml-100k/u.item'):
    """Build a movie-ID -> title lookup from a pipe-delimited item file.

    Each non-blank line is split on ``|``; the first field is parsed as the
    integer movie ID and the second field is stored as the title.

    Args:
        path: Location of the item file. Defaults to the ML-100k ``u.item``
            path used by this notebook.

    Returns:
        dict mapping ``int`` movie ID to ``str`` title.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the first field of a line is not an integer.
        IndexError: if a non-blank line contains no ``|`` separator.
    """
    movie_names = {}

    # ISO-8859-1 maps every byte to a character, so decoding cannot fail and
    # no error handler is needed.
    with open(path, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            # Drop the line terminator so a record that ends right after the
            # title does not keep a trailing newline in the stored name.
            record = line.rstrip('\r\n')

            # Tolerate blank lines (e.g. a trailing empty line at EOF).
            if not record.strip():
                continue

            # Only the first two fields are needed; stop splitting after them.
            fields = record.split('|', 2)
            movie_names[int(fields[0])] = fields[1]

    return movie_names

# Build the ID -> title lookup once; the printing cell uses it to turn
# recommended movie IDs into readable titles.
movie_names = load_movie_names()

In [7]:
print('Training recommendation model...')

# ALS matrix factorisation over the (user_id, movie_id, rating) columns.
# The seed is set explicitly so the factor initialisation, and therefore the
# recommendations printed further down, do not depend on a per-run default.
als_model = ALS(
    maxIter=10,
    regParam=0.01,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
    seed=42,
)

trained_model = als_model.fit(data)

Training recommendation model...


In [10]:
# Who to recommend for, and how many titles to ask for.
user_id = 123
num_of_recommendations = 20

# recommendForUserSubset takes a DataFrame of users, so wrap the single ID in
# a one-row, one-column frame whose column name matches the model's user column.
user_schema = StructType([StructField('user_id', IntegerType(), True)])
users = spark_session.createDataFrame([[user_id]], user_schema)

# Collect to the driver: the result is one row per requested user.
recommendations = (
    trained_model
    .recommendForUserSubset(users, num_of_recommendations)
    .collect()
)

In [13]:
print(f'Top {num_of_recommendations} recommendations for user ID {user_id}')

# Field 1 of each collected row holds that user's recommendation list; each
# entry carries the movie ID at position 0 and the predicted rating at 1.
for row in recommendations:
    for rec in row[1]:
        print(f'{movie_names[rec[0]]}, rating: {rec[1]}')


Top 20 recommendations for user ID 123
City of Industry (1997), rating: 8.021982192993164
Crooklyn (1994), rating: 7.767877578735352
Beautiful Thing (1996), rating: 6.995762825012207
Nénette et Boni (1996), rating: 6.717072486877441
Carried Away (1996), rating: 6.711720943450928
Misérables, Les (1995), rating: 6.637094497680664
Cabin Boy (1994), rating: 6.6296868324279785
When the Cats Away (Chacun cherche son chat) (1996), rating: 6.5875725746154785
Grace of My Heart (1996), rating: 6.368398666381836
Deconstructing Harry (1997), rating: 6.326350212097168
My Man Godfrey (1936), rating: 6.290832042694092
Caught (1996), rating: 6.269672870635986
Ballad of Narayama, The (Narayama Bushiko) (1958), rating: 6.231607913970947
Soul Food (1997), rating: 6.206301689147949
Before the Rain (Pred dozhdot) (1994), rating: 6.01909065246582
Big Blue, The (Grand bleu, Le) (1988), rating: 6.010014057159424
Delicatessen (1991), rating: 6.002135276794434
Ciao, Professore! (1993), rating: 5.945582866668701

In [5]:
# Shut the Spark session down; cells that use spark_session cannot run after this.
spark_session.stop()