# Question 1

## Initiate Spark

In [1]:
from pyspark.sql import SparkSession
from sparkmeasure import StageMetrics
from pyspark.sql.functions import lower

# Create a new Spark Session
spark = SparkSession \
    .builder \
    .appName("Movies") \
    .config("spark.jars", "spark-measure_2.11-0.17.jar") \
    .getOrCreate()

# Create spark metrics object
stagemetrics = StageMetrics(spark)

In [2]:
spark

## Load Datasets To Dataframes

### Movie Dataframe

In [47]:
movie_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true")
            .load("datasets/movie.csv")
           )

In [4]:
movie_df.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



### Rating Dataframe

In [50]:
rating_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true")
            .load("datasets/rating.csv")
           )

In [51]:
rating_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [6]:
rating_df.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



### Tag Dataframe

In [48]:
tag_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true") 
            .load("datasets/tag.csv")
           )

In [8]:
tag_df.show(5)

+------+-------+-------------+-------------------+
|userId|movieId|          tag|          timestamp|
+------+-------+-------------+-------------------+
|    18|   4141|  Mark Waters|2009-04-24 18:19:40|
|    65|    208|    dark hero|2013-05-10 01:41:18|
|    65|    353|    dark hero|2013-05-10 01:41:19|
|    65|    521|noir thriller|2013-05-10 01:39:43|
|    65|    592|    dark hero|2013-05-10 01:41:18|
+------+-------+-------------+-------------------+
only showing top 5 rows



### Genome Tags Dataframe

In [49]:
genome_tags_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .option("inferSchema", "true") 
            .load("datasets/genome_tags.csv")
           )

In [10]:
genome_tags_df.show(5)

+-----+------------+
|tagId|         tag|
+-----+------------+
|    1|         007|
|    2|007 (series)|
|    3|18th century|
|    4|       1920s|
|    5|       1930s|
+-----+------------+
only showing top 5 rows



## Queries

### Query 1

In [11]:
# Start measuring performance
stagemetrics.begin()

# Get the id of the movie "Jumanji"
jumanji_id = movie_df.filter(movie_df.title.contains("Jumanji")) \
            .select("movieId") \
            .collect()[0]["movieId"]

# Get the number of users that watched "Jumanji"
watched_jumanji = rating_df.filter(rating_df["movieId"] == jumanji_id) \
                 .count()

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
print(stagemetrics.report().split('\n')[6])

elapsedTime => 36913 (37 s)


In [12]:
watched_jumanji

22243

### Query 2

In [52]:
# Start measuring performance
stagemetrics.begin()

# Get the movieIds with tags containing the word "boring"
unique_boring_movieIds = tag_df.filter(lower(tag_df["tag"]).contains("boring")) \
                        .select("movieId") \
                        .dropDuplicates()

# Get the corresponding movie titles from movieIds in alphabetical order
unique_boring_movie_titles = unique_boring_movieIds \
                            .join(movie_df, "movieId", "inner") \
                            .select(movie_df.title) \
                            .sort(movie_df.title)

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
stagemetrics.print_report()
#print(stagemetrics.report().split('\n')[6])


Scheduling mode = FIFO
Spark Context default degree of parallelism = 4
 no data to report 


In [53]:
unique_boring_movie_titles.show(5)

+--------------------+
|               title|
+--------------------+
|(500) Days of Sum...|
|101 Reykjavik (10...|
|12 Years a Slave ...|
|         1408 (2007)|
|1492: Conquest of...|
+--------------------+
only showing top 5 rows



### Query 3

In [57]:
# Start measuring performance
stagemetrics.begin()

# Get the userIds and movieIds with tags containing the word "bollywood"
bollywood_userIds_movieIds = tag_df.filter(lower(tag_df["tag"]).contains("bollywood")) \
                                   .select(["userId", "movieId", "tag"])

# Get all userIds and movieIds with rating above 3
above_3_rating = rating_df.filter(rating_df.rating > 3) \
                          .select(["userId", "movieId", "rating"])

# Inner join based on unique combination of userId and movieId
query_3_result = bollywood_userIds_movieIds \
                .join(above_3_rating, ["userId", "movieId"], "inner") \
                .select("userId") \
                .dropDuplicates() \
                .sort(above_3_rating.userId)

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
stagemetrics.print_report()
#print(stagemetrics.report().split('\n')[6])


Scheduling mode = FIFO
Spark Context default degree of parallelism = 4
 no data to report 


In [58]:
query_3_result.show(5)

+------+
|userId|
+------+
| 10573|
| 19837|
| 23333|
| 25004|
| 31338|
+------+
only showing top 5 rows



 ### Query 4

In [153]:
from pyspark.sql.functions import lower, year
from pyspark.sql.functions import rank, col, row_number
from pyspark.sql.window import Window

# Start measuring performance
stagemetrics.begin()

# Group by year and movieId and find the mean value of rating
grouped_data = rating_df.withColumn("year", year(rating_df["timestamp"])) \
              .groupBy(["year", "movieId"]) \
              .agg({"rating": "mean"})
              
# Create a window to limit ratings
window = Window.partitionBy(grouped_data["year"]) \
        .orderBy(grouped_data["avg(rating)"] \
        .desc())

# Limit the ratings in each group
grouped_data_limited = grouped_data.select('*', row_number().over(window).alias('row_number')) \
                      .filter(col('row_number') <= 10) \
                      .sort(["year", "avg(rating)"], ascending = [True, False])

# Get the movie titles
query_4_result = grouped_data_limited.join(movie_df, "movieId", "inner") \
                .select(["year", "title", "avg(rating)"])

# Show results
query_4_result.show()

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
print(stagemetrics.report().split('\n')[6])

+----+--------------------+-----------+
|year|               title|avg(rating)|
+----+--------------------+-----------+
|2005|Not Love, Just Fr...|        5.0|
|2005|Life Is Rosy (a.k...|        5.0|
|2005|Paris Was a Woman...|        5.0|
|2005|Married to It (1991)|        5.0|
|2005|Too Much Sleep (1...|        5.0|
|2005|Fear Strikes Out ...|        5.0|
|2005|Before the Fall (...|        5.0|
|2005|Gate of Heavenly ...|        5.0|
|2005|Take Care of My C...|        5.0|
|2005|   Dancemaker (1998)|        5.0|
+----+--------------------+-----------+


Scheduling mode = FIFO
Spark Context default degree of parallelism = 4
Aggregated Spark stage metrics:
numStages => 7
numTasks => 609
elapsedTime => 123180 (2.1 min)
stageDuration => 122907 (2.0 min)
executorRunTime => 353706 (5.9 min)
executorCpuTime => 328093 (5.5 min)
executorDeserializeTime => 2009 (2 s)
executorDeserializeCpuTime => 1683 (2 s)
resultSerializationTime => 20 (20 ms)
jvmGCTime => 5703 (6 s)
shuffleFetchWaitTime => 0

In [154]:
query_4_result.filter(query_4_result.year == 2005).show(truncate=50)

+----+--------------------------------------------------+-----------+
|year|                                             title|avg(rating)|
+----+--------------------------------------------------+-----------+
|2005|Not Love, Just Frenzy (Más que amor, frenesí) (...|        5.0|
|2005|Life Is Rosy (a.k.a. Life Is Beautiful) (Vie es...|        5.0|
|2005|                          Paris Was a Woman (1995)|        5.0|
|2005|                              Married to It (1991)|        5.0|
|2005|                             Too Much Sleep (1997)|        5.0|
|2005|                           Fear Strikes Out (1957)|        5.0|
|2005|Before the Fall (NaPolA - Elite für den Führer)...|        5.0|
|2005|                Gate of Heavenly Peace, The (1995)|        5.0|
|2005| Take Care of My Cat (Goyangileul butaghae) (2001)|        5.0|
|2005|                                 Dancemaker (1998)|        5.0|
+----+--------------------------------------------------+-----------+



### Query 5

In [172]:
from pyspark.sql.functions import *

# Start measuring performance
stagemetrics.begin()

# Get the id and title of 2015 movies
movies_2015 = movie_df.filter(movie_df.title.contains("(2015)")) \
             .select(["movieId", "title"])

# Get the tags of 2015 movies
joined = movies_2015.join(tag_df, "movieId", "inner") \
                    .select(["title", "tag"]) \
                    .sort("title")

# Group tags by movie title and concatenate them
query_5_result = joined.groupby("title") \
                .agg(concat_ws(", ", collect_list(joined.tag)).alias("tags"))

# Show results
query_5_result.show(truncate=50)

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
print(stagemetrics.report().split('\n')[6])

+--------------------------------------------------+--------------------------------------------------+
|                                             title|                                              tags|
+--------------------------------------------------+--------------------------------------------------+
|                           A Grain of Truth (2015)|Borys Lankosz, Abel Korzeniowski, Borys Lankosz...|
|                        A Walk in the Woods (2015)|                                        Ken Kwapis|
|                               Advantageous (2015)|                                    Jennifer Phang|
|                        As We Were Dreaming (2015)|                                   based on a book|
|                            Average Italian (2015)|                           Marcello Macchia, drugs|
|                     Beaver Trilogy Part IV (2015)|                       Brad Besser, movie business|
|                                   Blackhat (2015)|            

In [174]:
query_5_result.show(5, truncate=50)

+--------------------------+--------------------------------------------------+
|                     title|                                              tags|
+--------------------------+--------------------------------------------------+
|   A Grain of Truth (2015)|Borys Lankosz, Abel Korzeniowski, Borys Lankosz...|
|A Walk in the Woods (2015)|                                        Ken Kwapis|
|       Advantageous (2015)|                                    Jennifer Phang|
|As We Were Dreaming (2015)|                                   based on a book|
|    Average Italian (2015)|                           Marcello Macchia, drugs|
+--------------------------+--------------------------------------------------+
only showing top 5 rows



### Query 6

In [178]:
# Start measuring performance
stagemetrics.begin()

# Group rating by movieId and get movie title from inner join
query_6_result = rating_df \
                .groupBy("movieId") \
                .agg({"rating": "count"}) \
                .join(movie_df, "movieId", "inner") \
                .select(["title", "count(rating)"]) \
                .sort("count(rating)", ascending=False)

# Show results
query_6_result.show()

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
print(stagemetrics.report().split('\n')[6])

+--------------------+-------------+
|               title|count(rating)|
+--------------------+-------------+
| Pulp Fiction (1994)|        67310|
| Forrest Gump (1994)|        66172|
|Shawshank Redempt...|        63366|
|Silence of the La...|        63299|
|Jurassic Park (1993)|        59715|
|Star Wars: Episod...|        54502|
|   Braveheart (1995)|        53769|
|Terminator 2: Jud...|        52244|
|  Matrix, The (1999)|        51334|
|Schindler's List ...|        50054|
|    Toy Story (1995)|        49695|
|Fugitive, The (1993)|        49581|
|    Apollo 13 (1995)|        47777|
|Independence Day ...|        47048|
|Usual Suspects, T...|        47006|
|Star Wars: Episod...|        46839|
|       Batman (1989)|        46054|
|Star Wars: Episod...|        45313|
|American Beauty (...|        44987|
|Twelve Monkeys (a...|        44980|
+--------------------+-------------+
only showing top 20 rows

elapsedTime => 32063 (32 s)


In [180]:
# Show result for report
query_6_result.show(5, truncate=50)

+--------------------------------+-------------+
|                           title|count(rating)|
+--------------------------------+-------------+
|             Pulp Fiction (1994)|        67310|
|             Forrest Gump (1994)|        66172|
|Shawshank Redemption, The (1994)|        63366|
|Silence of the Lambs, The (1991)|        63299|
|            Jurassic Park (1993)|        59715|
+--------------------------------+-------------+
only showing top 5 rows



### Query 7

In [186]:
# Start measuring performance
stagemetrics.begin()

# Group by year and userId and count the ratings for each group
grouped_data = rating_df.withColumn("year", year(rating_df["timestamp"])) \
              .groupBy(["year", "userId"]) \
              .agg({"rating": "count"})

# Create a window to limit ratings
window = Window.partitionBy(grouped_data["year"]) \
        .orderBy(grouped_data["count(rating)"] \
        .desc())

# Limit the ratings in each group
query_7_results = grouped_data.select('*', row_number().over(window).alias('row_number')) \
                              .filter(col('row_number') <= 10) \
                              .select(["year", "userId", "count(rating)"]) \
                              .sort(["year", "count(rating)"], ascending = [True, False])

# Show results
query_7_results.show()

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
print(stagemetrics.report().split('\n')[6])

+----+------+-------------+
|year|userId|count(rating)|
+----+------+-------------+
|1995|131160|            3|
|1995| 28507|            1|
|1996| 25878|          800|
|1996|  1931|          722|
|1996| 46663|          669|
|1996|107732|          657|
|1996| 24214|          624|
|1996| 41389|          605|
|1996| 19067|          605|
|1996|  4548|          589|
|1996| 46146|          570|
|1996| 81218|          510|
|1997|124052|         1352|
|1997|128653|         1141|
|1997|  5814|          849|
|1997| 64778|          655|
|1997| 83343|          583|
|1997|101971|          559|
|1997| 34962|          526|
|1997| 19954|          522|
+----+------+-------------+
only showing top 20 rows



In [188]:
# Show results for report
query_7_results.filter(query_7_results.year == 1995).sort(["year", "userId"], ascending = [True, True]).show()

+----+------+-------------+
|year|userId|count(rating)|
+----+------+-------------+
|1995| 28507|            1|
|1995|131160|            3|
+----+------+-------------+



### Query 8

In [244]:
# Start measuring performance
stagemetrics.begin()

first_genre = movie_df \
             .select('*', split(movie_df.genres, '[|]')[0].alias("genre")) \
             .drop("genres")

# Get total ratings by movie
total_ratings_by_movie = first_genre \
                        .join(rating_df, "movieId", "inner") \
                        .groupBy(["genre", "title"]) \
                        .agg({"*": "count"}) \
                        .withColumnRenamed("count(1)", "total_ratings")

# Create a window to limit total ratings
window = Window.partitionBy(total_ratings_by_movie["genre"]) \
        .orderBy(total_ratings_by_movie["total_ratings"] \
        .desc())

# Limit the total ratings in each group
query_8_results = total_ratings_by_movie \
                  .select('*', row_number().over(window).alias('row_number')) \
                  .filter(col('row_number') <= 1) \
                  .select(["genre", "title", "total_ratings"]) \
                  .sort(["genre"]) \
                  .filter(total_ratings_by_movie.genre != "(no genres listed)")

# Show results
query_8_results.show()

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
print(stagemetrics.report().split('\n')[6])

+-----------+--------------------+-------------+
|      genre|               title|total_ratings|
+-----------+--------------------+-------------+
|     Action|Jurassic Park (1993)|        59715|
|  Adventure|    Toy Story (1995)|        49695|
|  Animation|Beauty and the Be...|        35138|
|   Children|E.T. the Extra-Te...|        32685|
|     Comedy| Pulp Fiction (1994)|        67310|
|      Crime|Shawshank Redempt...|        63366|
|Documentary|Bowling for Colum...|        12280|
|      Drama|Schindler's List ...|        50054|
|    Fantasy|       Brazil (1985)|        13957|
|  Film-Noir|Maltese Falcon, T...|        12144|
|     Horror|        Alien (1979)|        30933|
|       IMAX|Encounter in the ...|           30|
|    Musical|Sound of Music, T...|        14049|
|    Mystery|Twelve Monkeys (a...|        44980|
|    Romance|Meet Joe Black (1...|         5210|
|     Sci-Fi|Mission to Mars (...|         6365|
|   Thriller|Fugitive, The (1993)|        49581|
|        War|Run Sil

In [245]:
# Show results for report
query_8_results.drop("total_ratings").show(5, truncate=100)

+---------+---------------------------------+
|    genre|                            title|
+---------+---------------------------------+
|   Action|             Jurassic Park (1993)|
|Adventure|                 Toy Story (1995)|
|Animation|      Beauty and the Beast (1991)|
| Children|E.T. the Extra-Terrestrial (1982)|
|   Comedy|              Pulp Fiction (1994)|
+---------+---------------------------------+
only showing top 5 rows

