In [1]:
import findspark
findspark.init()

import codecs

from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructField, StructType, IntegerType, LongType

In [2]:
# Reuse the active Spark session if one exists; otherwise create one under this app name.
spark_session = (
    SparkSession.builder
    .appName('Get most popular movies')
    .getOrCreate()
)

In [3]:
# Explicit schema for the ratings file: four nullable numeric columns,
# declared in the order they appear in each row.
schema = (
    StructType()
    .add('user_id', IntegerType(), True)
    .add('movie_id', IntegerType(), True)
    .add('rating', IntegerType(), True)
    .add('timestamp', LongType(), True)
)

In [4]:
# Read the tab-separated ratings file with the explicit schema, then preview it.
data = spark_session.read.csv(
    '../../data/ml-100k/u.data',
    schema=schema,
    sep='\t',
)
data.show(5)

+-------+--------+------+---------+
|user_id|movie_id|rating|timestamp|
+-------+--------+------+---------+
|    196|     242|     3|881250949|
|    186|     302|     3|891717742|
|     22|     377|     1|878887116|
|    244|      51|     2|880606923|
|    166|     346|     1|886397596|
+-------+--------+------+---------+
only showing top 5 rows



In [5]:
# Count the ratings each movie received and order the result so the
# most-rated movies come first.
most_popular_movies = (
    data
    .groupBy('movie_id')
    .agg(func.count('movie_id').alias('num_of_ratings'))
    .orderBy(func.col('num_of_ratings').desc())
)
most_popular_movies.show(5)

+--------+--------------+
|movie_id|num_of_ratings|
+--------+--------------+
|      50|           583|
|     258|           509|
|     100|           508|
|     181|           507|
|     294|           485|
+--------+--------------+
only showing top 5 rows



In [6]:
def get_movie_name_dict(path='../../data/ml-100k/u.item', encoding='ISO-8859-1'):
    """Build a ``{movie_id: movie_title}`` lookup from a pipe-delimited item file.

    Every non-blank line is expected to look like ``<id>|<title>|...``; only
    the first two fields are used.

    Args:
        path: Location of the item file. Defaults to the ``u.item`` file this
            notebook has always read, so existing zero-argument calls behave
            as before.
        encoding: Text encoding used to decode the file. The default is
            ISO-8859-1 (Latin-1), which is the encoding the MovieLens 100k
            ``u.item`` file is distributed in. Decoding that file as UTF-8
            with ``errors='ignore'`` silently removes accented characters
            from titles.

    Returns:
        dict: Integer movie id mapped to its title string.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not start with an integer id.
        IndexError: If a non-blank line contains no ``|`` separator.
    """
    movie_names = {}
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            # Drop the line terminator so a two-field line does not leave a
            # trailing newline on the title.
            line = line.rstrip('\r\n')
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing on int('').
            if not line.strip():
                continue
            fields = line.split('|')
            movie_names[int(fields[0])] = fields[1]

    return movie_names

In [7]:
# Wrap the id -> title dictionary in a Spark broadcast variable.
name_dict = spark_session.sparkContext.broadcast(
    get_movie_name_dict()
)
# Preview the first ten (id, title) pairs of the broadcast dictionary.
[pair for pair, _ in zip(name_dict.value.items(), range(10))]

[(1, 'Toy Story (1995)'),
 (2, 'GoldenEye (1995)'),
 (3, 'Four Rooms (1995)'),
 (4, 'Get Shorty (1995)'),
 (5, 'Copycat (1995)'),
 (6, 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)'),
 (7, 'Twelve Monkeys (1995)'),
 (8, 'Babe (1995)'),
 (9, 'Dead Man Walking (1995)'),
 (10, 'Richard III (1995)')]

In [8]:
def look_up_movie_name(id):
    """Return the title stored for a movie id in the broadcast ``name_dict``.

    Args:
        id: Movie id to look up. The parameter name is kept as-is for
            backward compatibility even though it shadows the builtin.

    Returns:
        The title mapped to ``id``, or ``None`` when the id is not present in
        the dictionary (this includes ``id`` being ``None``). Returning
        ``None`` instead of raising ``KeyError`` means one unknown or null id
        does not raise from inside the lookup.
    """
    return name_dict.value.get(id)

In [10]:
# Expose the Python lookup as a Spark UDF so it can be applied to a column.
look_up_movie_name_UDF = func.udf(look_up_movie_name)

# Replace each movie id with its title, keeping the rating count next to it.
most_popular_movies_named = most_popular_movies.select(
    look_up_movie_name_UDF(func.col('movie_id')).alias('movie_name'),
    'num_of_ratings',
)

most_popular_movies_named.show(10)

+--------------------+--------------+
|          movie_name|num_of_ratings|
+--------------------+--------------+
|    Star Wars (1977)|           583|
|      Contact (1997)|           509|
|        Fargo (1996)|           508|
|Return of the Jed...|           507|
|    Liar Liar (1997)|           485|
|English Patient, ...|           481|
|       Scream (1996)|           478|
|    Toy Story (1995)|           452|
|Air Force One (1997)|           431|
|Independence Day ...|           429|
+--------------------+--------------+
only showing top 10 rows



In [11]:
# Stop the Spark session that was created at the start of the notebook.
spark_session.stop()