In [1]:
import codecs
import collections

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import StringType, DoubleType, IntegerType, StructType, StructField

In [2]:
# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("DF-Popular-Movie")
    .getOrCreate()
)

### Broadcast Variables

Broadcast variables ship a read-only object to every executor, so that it is always available wherever it is needed. Use `sc.broadcast()` to send the object and the `.value` attribute to get it back. The broadcast object can then be used inside map functions and UDFs.

In [3]:
def loadMovieNames(path='resources/u.item'):
    """Build a movie-id -> movie-title lookup from a pipe-delimited item file.

    Each record is expected to look like ``<movieId>|<title>|...``; only the
    first two fields are used.

    Args:
        path: Location of the item file. Defaults to ``resources/u.item``,
            resolved against the working directory (the ratings file in this
            notebook is located the same way), instead of a machine-specific
            absolute path.

    Returns:
        dict mapping ``int`` movie id to ``str`` movie title.

    Raises:
        OSError: if ``path`` cannot be opened.
        ValueError: if the first field of a record is not an integer.
    """
    movieNames = {}
    # Built-in open() only ends a line at \n, \r or \r\n. codecs.open() also
    # breaks lines at characters such as U+0085, which ISO-8859-1 produces
    # from byte 0x85 and which would cut a record in half.
    with open(path, 'r', encoding='ISO-8859-1', errors='ignore') as file:
        for line in file:
            fields = line.rstrip('\r\n').split('|')
            # Skip blank or incomplete lines instead of failing on int('').
            if len(fields) < 2:
                continue
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [4]:
# Build the id -> title dictionary on the driver, then broadcast it so that
# code running on the executors can read it through nameDict.value.
nameDict = spark.sparkContext.broadcast(
    loadMovieNames()
)

# Load and Verify Data

In [5]:
# Explicit schema used when reading u.data: it names and types the four
# columns of the ratings file. The pyspark.sql.types classes are imported in
# the notebook's import cell.
# NOTE(review): timestamp looks like Unix epoch seconds; it is kept as
# DoubleType so the recorded results stay unchanged -- confirm whether an
# integral type would be preferable.
schema = StructType([
    StructField('userId', IntegerType(), True),
    StructField('movieId', IntegerType(), True),
    StructField('rating', IntegerType(), True),
    StructField('timestamp', DoubleType(), True),
])

In [6]:
# Read the tab-separated, header-less ratings file using the explicit schema.
data = (
    spark.read
    .schema(schema)
    .option('sep', '\t')
    .option('header', False)
    .csv('resources/u.data')
)

In [7]:
# Print the column names and types to confirm the explicit schema was applied.
data.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: double (nullable = true)



In [8]:
# Peek at the first three ratings as Row objects.
data.take(3)

[Row(userId=196, movieId=242, rating=3, timestamp=881250949.0),
 Row(userId=186, movieId=302, rating=3, timestamp=891717742.0),
 Row(userId=22, movieId=377, rating=1, timestamp=878887116.0)]

In [9]:
# List the column names of the ratings DataFrame.
data.columns

['userId', 'movieId', 'rating', 'timestamp']

In [10]:
# Summary statistics (count, mean, stddev, min, max) for every column.
(
    data
    .describe()
    .show()
)

+-------+------------------+------------------+------------------+-----------------+
|summary|            userId|           movieId|            rating|        timestamp|
+-------+------------------+------------------+------------------+-----------------+
|  count|            100000|            100000|            100000|           100000|
|   mean|         462.48475|         425.53013|           3.52986|8.8352885148862E8|
| stddev|266.61442012750905|330.79835632558473|1.1256735991443214|5343856.189502848|
|    min|                 1|                 1|                 1|      8.7472471E8|
|    max|               943|              1682|                 5|     8.93286638E8|
+-------+------------------+------------------+------------------+-----------------+



# Create UDF

In [11]:
# Create a user-defined function to look up movie names from our broadcast dictionary
def lookupName(movieID):
    """Return the title stored for ``movieID`` in the broadcast dictionary.

    Args:
        movieID: Movie id used as the key into ``nameDict.value``.

    Returns:
        The title stored under that id, or ``None`` when the id has no entry
        (the UDF then yields a null for that row).
    """
    # .get() rather than [] so that an id without a title produces a null
    # instead of a KeyError that would fail the whole Spark task.
    return nameDict.value.get(movieID)


# No returnType is passed, so f.udf falls back to its default (StringType).
lookupNameUDF = f.udf(lookupName)

# Display most popular movies

In [12]:
# Count how many ratings each movie received (one row per movieId).
movieCounts = (
    data
    .groupBy('movieId')
    .count()
)
movieCounts.show(n=3)

+-------+-----+
|movieId|count|
+-------+-----+
|    496|  231|
|    471|  221|
|    463|   71|
+-------+-----+
only showing top 3 rows



In [13]:
# Attach a title to every movieId by running the lookup UDF over the id column.
movieWithNames = movieCounts.withColumn(
    colName='movieTitle',
    col=lookupNameUDF(f.col("movieId")),
)
movieWithNames.show(n=3)

+-------+-----+--------------------+
|movieId|count|          movieTitle|
+-------+-----+--------------------+
|    496|  231|It's a Wonderful ...|
|    471|  221|Courage Under Fir...|
|    463|   71|Secret of Roan In...|
+-------+-----+--------------------+
only showing top 3 rows



In [14]:
# Show the ten movies with the most ratings, titles displayed without truncation.
(
    movieWithNames
    .select(['movieId', 'movieTitle', 'count'])
    .orderBy('count', ascending=False)
    .show(10, truncate=False)
)

+-------+-----------------------------+-----+
|movieId|movieTitle                   |count|
+-------+-----------------------------+-----+
|50     |Star Wars (1977)             |583  |
|258    |Contact (1997)               |509  |
|100    |Fargo (1996)                 |508  |
|181    |Return of the Jedi (1983)    |507  |
|294    |Liar Liar (1997)             |485  |
|286    |English Patient, The (1996)  |481  |
|288    |Scream (1996)                |478  |
|1      |Toy Story (1995)             |452  |
|300    |Air Force One (1997)         |431  |
|121    |Independence Day (ID4) (1996)|429  |
+-------+-----------------------------+-----+
only showing top 10 rows



In [15]:
# Stop the Spark session now that the analysis is finished.
spark.stop()