# Query 1

## Initiate Spark

In [14]:
from pyspark.sql import SparkSession
from sparkmeasure import StageMetrics
from pyspark.sql.functions import lower

# Create a new Spark Session
spark = SparkSession \
    .builder \
    .appName("Movies2") \
    .config("spark.jars", "../spark-measure_2.11-0.17.jar") \
    .getOrCreate()

# Create spark metrics object
stagemetrics = StageMetrics(spark)

In [15]:
spark

## Load Datasets To Dataframes

### Movie Dataframe

In [16]:
movie_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .load("../datasets/movie.csv")
           )

In [17]:
movie_df.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



### Rating Dataframe

In [18]:
rating_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .load("../datasets/rating.csv")
           )

In [19]:
rating_df.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



### Tag Dataframe

In [20]:
tag_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .load("../datasets/tag.csv")
           )

In [21]:
tag_df.show(5)

+------+-------+-------------+-------------------+
|userId|movieId|          tag|          timestamp|
+------+-------+-------------+-------------------+
|    18|   4141|  Mark Waters|2009-04-24 18:19:40|
|    65|    208|    dark hero|2013-05-10 01:41:18|
|    65|    353|    dark hero|2013-05-10 01:41:19|
|    65|    521|noir thriller|2013-05-10 01:39:43|
|    65|    592|    dark hero|2013-05-10 01:41:18|
+------+-------+-------------+-------------------+
only showing top 5 rows



### Genome Tags Dataframe

In [22]:
genome_tags_df = (spark.read
            .format("csv")
            .option("header", "true")
            .option("delimiter", ",")
            .load("../datasets/genome_tags.csv")
           )

In [23]:
genome_tags_df.show(5)

+-----+------------+
|tagId|         tag|
+-----+------------+
|    1|         007|
|    2|007 (series)|
|    3|18th century|
|    4|       1920s|
|    5|       1930s|
+-----+------------+
only showing top 5 rows



## Query

In [26]:
# Start measuring performance
stagemetrics.begin()

unique_boring_movieIds = tag_df.filter(lower(tag_df["tag"]).contains("boring")) \
                        .select("movieId") \
                        .dropDuplicates()


unique_boring_movie_titles = unique_boring_movieIds \
                            .join(movie_df, unique_boring_movieIds.movieId == movie_df.movieId, 'inner') \
                            .select(movie_df.title).sort(movie_df.title)

# Stop measuring performance
stagemetrics.end()

# Print performance metrics
stagemetrics.print_report()
#print(stagemetrics.report().split('\n')[6])


Scheduling mode = FIFO
Spark Context default degree of parallelism = 4
 no data to report 


## Result

In [12]:
unique_boring_movie_titles.count()

674

In [13]:
spark.stop()