# Question 1

## Initiate Spark

In [1]:
from pyspark.sql import SparkSession
from sparkmeasure import StageMetrics
from pyspark.sql.functions import lower

# Create a new Spark Session
spark = SparkSession \
    .builder \
    .appName("Movies") \
    .config("spark.jars", "spark-measure_2.11-0.17.jar") \
    .getOrCreate()

# Create spark metrics object
stagemetrics = StageMetrics(spark)

In [2]:
spark

## Load Datasets To Dataframes

### Movie Dataframe

In [47]:
movie_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true")
            .load("datasets/movie.csv")
           )

In [4]:
movie_df.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



### Rating Dataframe

In [50]:
rating_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true")
            .load("datasets/rating.csv")
           )

In [51]:
rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [6]:
rating_df.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



### Tag Dataframe

In [48]:
tag_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true") 
            .load("datasets/tag.csv")
           )

In [8]:
tag_df.show(5)

+------+-------+-------------+-------------------+
|userId|movieId|          tag|          timestamp|
+------+-------+-------------+-------------------+
|    18|   4141|  Mark Waters|2009-04-24 18:19:40|
|    65|    208|    dark hero|2013-05-10 01:41:18|
|    65|    353|    dark hero|2013-05-10 01:41:19|
|    65|    521|noir thriller|2013-05-10 01:39:43|
|    65|    592|    dark hero|2013-05-10 01:41:18|
+------+-------+-------------+-------------------+
only showing top 5 rows



### Genome Tags Dataframe

In [49]:
genome_tags_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true") 
            .load("datasets/genome_tags.csv")
           )

In [10]:
genome_tags_df.show(5)

+-----+------------+
|tagId|         tag|
+-----+------------+
|    1|         007|
|    2|007 (series)|
|    3|18th century|
|    4|       1920s|
|    5|       1930s|
+-----+------------+
only showing top 5 rows



## Queries

### Query 1

In [11]:
# Start measuring performance
stagemetrics.begin()

# Get the id of the movie "Jumanji"
jumanji_id = movie_df.filter(movie_df.title.contains("Jumanji")) \
            .select("movieId") \
            .collect()[0]["movieId"]

# Get the number of users that watched "Jumanji"
watched_jumanji = rating_df.filter(rating_df["movieId"] == jumanji_id) \
                 .count()

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
print(stagemetrics.report().split('\n')[6])

elapsedTime => 36913 (37 s)


In [12]:
watched_jumanji

22243

### Query 2

In [52]:
# Start measuring performance
stagemetrics.begin()

# Get the movieIds with tags containing the word "boring"
unique_boring_movieIds = tag_df.filter(lower(tag_df["tag"]).contains("boring")) \
                        .select("movieId") \
                        .dropDuplicates()

# Get the corresponding movie titles from movieIds in alphabetical order
unique_boring_movie_titles = unique_boring_movieIds \
                            .join(movie_df, "movieId", "inner") \
                            .select(movie_df.title) \
                            .sort(movie_df.title)

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
stagemetrics.print_report()
#print(stagemetrics.report().split('\n')[6])


Scheduling mode = FIFO
Spark Context default degree of parallelism = 4
 no data to report 


In [53]:
unique_boring_movie_titles.show(5)

+--------------------+
|               title|
+--------------------+
|(500) Days of Sum...|
|101 Reykjavik (10...|
|12 Years a Slave ...|
|         1408 (2007)|
|1492: Conquest of...|
+--------------------+
only showing top 5 rows



### Query 3

In [57]:
# Start measuring performance
stagemetrics.begin()

# Get the userIds and movieIds with tags containing the word "bollywood"
bollywood_userIds_movieIds = tag_df.filter(lower(tag_df["tag"]).contains("bollywood")) \
                                   .select(["userId", "movieId", "tag"])

# Get all userIds and movieIds with rating above 3
above_3_rating = rating_df.filter(rating_df.rating > 3) \
                          .select(["userId", "movieId", "rating"])

# Inner join based on unique combination of userId and movieId
query_3_result = bollywood_userIds_movieIds \
                .join(above_3_rating, ["userId", "movieId"], "inner") \
                .select("userId") \
                .dropDuplicates() \
                .sort(above_3_rating.userId)

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
stagemetrics.print_report()
#print(stagemetrics.report().split('\n')[6])


Scheduling mode = FIFO
Spark Context default degree of parallelism = 4
 no data to report 


In [58]:
query_3_result.show(5)

+------+
|userId|
+------+
| 10573|
| 19837|
| 23333|
| 25004|
| 31338|
+------+
only showing top 5 rows



 ### Query 4

In [124]:
from pyspark.sql.functions import lower, year

# Start measuring performance
stagemetrics.begin()

with_year = rating_df.withColumn("year", year(rating_df["timestamp"])) \
            .groupBy(["year", "movieId"]) \
            .agg({"rating": "mean"}) \
            .sort(["year", "avg(rating)"], ascending = [True, False]) \

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
stagemetrics.print_report()
#print(stagemetrics.report().split('\n')[6])


Scheduling mode = FIFO
Spark Context default degree of parallelism = 4
 no data to report 


In [125]:
with_year.show(50)

+----+-------+------------------+
|year|movieId|       avg(rating)|
+----+-------+------------------+
|1995|     47|               5.0|
|1995|   1176|               4.0|
|1995|     21|               3.0|
|1995|   1079|               3.0|
|1996|   1384| 4.708333333333333|
|1996|   1369| 4.538461538461538|
|1996|   1148| 4.519774011299435|
|1996|    527|4.5161642205474015|
|1996|   1415|               4.5|
|1996|   1400|               4.5|
|1996|    720| 4.488428745432399|
|1996|    318|4.4781818181818185|
|1996|   1399| 4.388888888888889|
|1996|    912| 4.386454183266932|
|1996|    745| 4.369969040247678|
|1996|   1198| 4.365409622886866|
|1996|   1057| 4.357142857142857|
|1996|    115| 4.333333333333333|
|1996|   1404| 4.333333333333333|
|1996|   1357| 4.328571428571428|
|1996|    904| 4.306233062330623|
|1996|    800| 4.306172839506173|
|1996|   1397|               4.3|
|1996|   1406|               4.3|
|1996|     50| 4.299451969806638|
|1996|   1133| 4.285714285714286|
|1996|    778|