In [1]:
from pyspark.sql import SparkSession
from sparkmeasure import StageMetrics
from pyspark.sql.functions import year, row_number, col, count, split, explode, rank, dayofyear, hour, sum, concat_ws, collect_list, lower
from pyspark.sql.window import Window

spark = SparkSession \
    .builder \
    .appName('vaseis-2') \
    .config("spark.jars", "<path-to-jar>/spark-measure_2.12-0.17.jar") \
    .master("spark://antonis:7077") \
    .getOrCreate()

rating_df = (spark.read
    .format("csv")
    .option('header','true') #csv files have headers
    .option('delimiter', ',') #delimiter of the csv files that are processed
    .option('inferSchema', 'true') #if false, all values are strings
    .load("rating.csv") #file to be processed
)
movie_df = (spark.read
    .format("csv")
    .option('header','true') #csv files have headers
    .option('delimiter', ',') #delimiter of the csv files that are processed
    .option('inferSchema', 'true') #if false, all values are strings
    .load("movie.csv") #file to be processed
)
tag_df = (spark.read
    .format("csv")
    .option('header','true') #csv files have headers
    .option('delimiter', ',') #delimiter of the csv files that are processed
    .option('inferSchema', 'true') #if false, all values are strings
    .load("tag.csv") #file to be processed
)
stagemetrics = StageMetrics(spark)

In [2]:
# QUERY 1
stagemetrics.begin()

movie = movie_df.filter(movie_df["title"] == "Jumanji (1995)") #get rows with title jumanji
movieID = movie.collect()[0][0] #get movieID
q1 = rating_df.filter(rating_df["movieId"] == movieID).select(rating_df["userId"]).count() #count how many people have seen the movie jumanji
print(q1)

stagemetrics.end()
stagemetrics.print_report()

22243

Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 3
numTasks => 14
elapsedTime => 1768 (2 s)
stageDuration => 1632 (2 s)
executorRunTime => 17412 (17 s)
executorCpuTime => 15257 (15 s)
executorDeserializeTime => 74 (74 ms)
executorDeserializeCpuTime => 37 (37 ms)
resultSerializationTime => 0 (0 ms)
jvmGCTime => 483 (0.5 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 12 (12 ms)
resultSize => 24312 (23.0 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 0
recordsRead => 22244
bytesRead => 693171282 (661.0 MB)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 12
shuffleTotalBlocksFetched => 12
shuffleLocalBlocksFetched => 12
shuffleRemoteBlocksFetched => 0
shuffleTotalBytesRead => 708 (708 Bytes)
shuffleLocalBytesRead => 708 (708 Bytes)
shuffleRemoteBytesRead => 0 (0 Bytes)
shuffleRemoteBytesReadToDisk => 0 (0 Bytes)
shuffleBytesWritten => 708 (

In [17]:
# QUERY 2
stagemetrics.begin()

boring_movies = tag_df.select('userId','movieId', lower(col('tag')).alias('tag')).filter(tag_df["tag"].contains('boring')).drop_duplicates(subset=['movieId']).select("movieId")
boring_movie_names = boring_movies.join(movie_df, boring_movies.movieId == movie_df.movieId).orderBy(movie_df['title']).select(movie_df['title'])
boring_movie_names.show()

stagemetrics.end()
stagemetrics.print_report()

+--------------------+
|               title|
+--------------------+
|(500) Days of Sum...|
|101 Reykjavik (10...|
|12 Years a Slave ...|
|         1408 (2007)|
|1492: Conquest of...|
|2001: A Space Ody...|
|2010: The Year We...|
|         2046 (2004)|
|     21 Grams (2003)|
|24 Hour Party Peo...|
|3-Iron (Bin-jip) ...|
|40-Year-Old Virgi...|
|    6 Bullets (2012)|
| 633 Squadron (1964)|
| 7 Plus Seven (1970)|
|      8 Women (2002)|
|A.I. Artificial I...|
|  About a Boy (2002)|
|According to Gret...|
|   Adaptation (2002)|
+--------------------+
only showing top 20 rows


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 3
numTasks => 207
elapsedTime => 266 (0.3 s)
stageDuration => 222 (0.2 s)
executorRunTime => 548 (0.5 s)
executorCpuTime => 569 (0.6 s)
executorDeserializeTime => 258 (0.3 s)
executorDeserializeCpuTime => 223 (0.2 s)
resultSerializationTime => 2 (2 ms)
jvmGCTime => 41 (41 ms)
shuffleFetchWaitTime => 0 (

In [18]:
# QUERY 3
stagemetrics.begin()

bollywood_tag = tag_df.select('userId','movieId', lower(col('tag')).alias('tag')).filter(tag_df["tag"] == ('bollywood')).select('userId','movieId')
user_bollywood_good_rating = bollywood_tag.join(rating_df.select('userId','rating'), ["userId"], 'inner').distinct()
user_bollywood_good_rating = user_bollywood_good_rating.filter(col("rating")>3.0).dropDuplicates(['userId']).orderBy(col('userId')).select('userId')
user_bollywood_good_rating.show()


stagemetrics.end()
stagemetrics.print_report()

+------+
|userId|
+------+
|    65|
|   910|
|  1741|
|  2001|
|  7434|
|  7671|
|  8513|
| 14517|
| 17877|
| 19837|
| 20066|
| 20388|
| 23333|
| 24112|
| 25004|
| 33323|
| 35170|
| 36334|
| 37264|
| 37355|
+------+
only showing top 20 rows


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 4
numTasks => 418
elapsedTime => 2261 (2 s)
stageDuration => 2231 (2 s)
executorRunTime => 24243 (24 s)
executorCpuTime => 21703 (22 s)
executorDeserializeTime => 335 (0.3 s)
executorDeserializeCpuTime => 430 (0.4 s)
resultSerializationTime => 0 (0 ms)
jvmGCTime => 1393 (1 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 848 (0.8 s)
resultSize => 1594388 (1557.0 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 484442112
recordsRead => 12195679
bytesRead => 713730828 (680.0 MB)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 872
shuffleTotalBlocksFetched => 811


In [19]:
# QUERY 4
stagemetrics.begin()

q4_df = rating_df.withColumn('year', year(rating_df['timestamp'])).groupBy('year','movieId').avg('rating') #.join(movie_df.select('movieId','title'),['movieId'])
window = Window.partitionBy(q4_df['year']).orderBy(q4_df['avg(rating)'].desc())

q4_df = q4_df.select('*', row_number().over(window).alias('row_num')).filter(col('row_num') <= 10).join(movie_df.select('movieId','title'),['movieId']).orderBy(movie_df['title'].asc()).filter(col('year') == 2005).select(col('title'))
q4_df.show()

stagemetrics.end()
stagemetrics.print_report()

+--------------------+
|               title|
+--------------------+
|Before the Fall (...|
|   Dancemaker (1998)|
|Fear Strikes Out ...|
|Gate of Heavenly ...|
|Life Is Rosy (a.k...|
|Married to It (1991)|
|Not Love, Just Fr...|
|Paris Was a Woman...|
|Take Care of My C...|
|Too Much Sleep (1...|
+--------------------+


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 4
numTasks => 413
elapsedTime => 3775 (4 s)
stageDuration => 3712 (4 s)
executorRunTime => 41478 (41 s)
executorCpuTime => 33567 (34 s)
executorDeserializeTime => 372 (0.4 s)
executorDeserializeCpuTime => 428 (0.4 s)
resultSerializationTime => 0 (0 ms)
jvmGCTime => 3832 (4 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 761 (0.8 s)
resultSize => 1415096 (1381.0 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 643497984
recordsRead => 20027541
bytesRead => 693171282 (661.0 MB)
recordsWritten => 0
bytesW

In [26]:
# QUERY 5
stagemetrics.begin()

tag_df_2015 = tag_df.filter(year(tag_df['timestamp']) == 2015).select('movieId', 'tag').join(movie_df.select('movieId', 'title'), ['movieId'])
tag_df_2015 = tag_df_2015.groupBy('movieId','title').agg(concat_ws(", ", collect_list(tag_df_2015.tag)).alias('tags')).orderBy(tag_df_2015['title'].asc())
tag_df_2015.select(col('title'), col('tags')).show()

stagemetrics.end()
stagemetrics.print_report()

+--------------------+--------------------+
|               title|                tags|
+--------------------+--------------------+
|""Great Performan...|                BD-R|
|  'burbs, The (1989)|1980's, black com...|
|(500) Days of Sum...|annoying, artisti...|
|...tick... tick.....|                BD-R|
|            1 (2014)|             Sukumar|
|10 Things I Hate ...|chick flick, Heat...|
|    10,000 BC (2008)|historically inac...|
|101 Reykjavik (10...|             Iceland|
|10th Kingdom, The...|SERIE DE TV, fant...|
|      11 x 14 (1977)|       James Benning|
|11-11-11 (11-11-1...|PG-13:some distur...|
|        11:14 (2003)|multiple storylin...|
| 12 Angry Men (1957)|group psychology,...|
| 12 Angry Men (1997)|             Bob*ola|
|12 Years a Slave ...|based on a book, ...|
|        12:01 (1993)|           time loop|
|     12:01 PM (1990)|Jonathan Heap, ea...|
|13 Going on 30 (2...|Aging, Friends As...|
|     13 Lakes (2004)|       James Benning|
|13th Warrior, The...|          

In [21]:
# QUERY 6
stagemetrics.begin()

q6_df = rating_df.groupBy('movieId').count()
q6_df = q6_df.join(movie_df.select('movieId','title'),['movieId']).orderBy(q6_df['count'].desc())
q6_df.select(col('title'),col('count')).show()

stagemetrics.end()
stagemetrics.print_report()

+--------------------+-----+
|               title|count|
+--------------------+-----+
| Pulp Fiction (1994)|67310|
| Forrest Gump (1994)|66172|
|Shawshank Redempt...|63366|
|Silence of the La...|63299|
|Jurassic Park (1993)|59715|
|Star Wars: Episod...|54502|
|   Braveheart (1995)|53769|
|Terminator 2: Jud...|52244|
|  Matrix, The (1999)|51334|
|Schindler's List ...|50054|
|    Toy Story (1995)|49695|
|Fugitive, The (1993)|49581|
|    Apollo 13 (1995)|47777|
|Independence Day ...|47048|
|Usual Suspects, T...|47006|
|Star Wars: Episod...|46839|
|       Batman (1989)|46054|
|Star Wars: Episod...|45313|
|American Beauty (...|44987|
|Twelve Monkeys (a...|44980|
+--------------------+-----+
only showing top 20 rows


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 3
numTasks => 213
elapsedTime => 2058 (2 s)
stageDuration => 2015 (2 s)
executorRunTime => 22140 (22 s)
executorCpuTime => 20500 (21 s)
executorDeserializeTime 

In [23]:
# QUERY 7
stagemetrics.begin()

q7_df = rating_df.withColumn('year', year(rating_df['timestamp'])).groupBy('year','userId').count()
window = Window.partitionBy(q7_df['year']).orderBy(q7_df['count'].desc())
q7_df = q7_df.select('*', row_number().over(window).alias('row_num')).filter(col('row_num') <= 10)
q7_df.filter(q7_df['year'] == 1995).select(col('userId')).orderBy(col('userId').asc()).show()

stagemetrics.end()
stagemetrics.print_report()

+------+
|userId|
+------+
| 28507|
|131160|
+------+


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 3
numTasks => 412
elapsedTime => 3013 (3 s)
stageDuration => 3011 (3 s)
executorRunTime => 33901 (34 s)
executorCpuTime => 30522 (31 s)
executorDeserializeTime => 347 (0.3 s)
executorDeserializeCpuTime => 412 (0.4 s)
resultSerializationTime => 0 (0 ms)
jvmGCTime => 1821 (2 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 5 (5 ms)
resultSize => 1377440 (1345.0 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpilled => 0 (0 Bytes)
peakExecutionMemory => 54525952
recordsRead => 20000263
bytesRead => 691677634 (659.0 MB)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 4
shuffleTotalBlocksFetched => 4
shuffleLocalBlocksFetched => 4
shuffleRemoteBlocksFetched => 0
shuffleTotalBytesRead => 278 (278 Bytes)
shuffleLocalBytesRead => 278 (278 Bytes)
shuffleRemoteBytesRead => 0 (0 Bytes)
shuffleRemot

In [27]:
# QUERY 8
stagemetrics.begin()

q8_df = movie_df.withColumn('genre',explode(split('genres',"[|]")))
q8_df = q8_df.join(rating_df,['movieId'],'left')
q8_df = q8_df.groupBy('genre','title').count()
window = Window.partitionBy(q8_df['genre']).orderBy(q8_df['count'].desc())
q8_df = q8_df.select('*', row_number().over(window).alias('rank')).filter(col('rank') == 1).orderBy(col('genre').asc())
q8_df.select(col('genre'), col('title')).show()

stagemetrics.end()
stagemetrics.print_report()

+------------------+--------------------+
|             genre|               title|
+------------------+--------------------+
|(no genres listed)|Doctor Who: The T...|
|            Action|Jurassic Park (1993)|
|         Adventure|Jurassic Park (1993)|
|         Animation|    Toy Story (1995)|
|          Children|    Toy Story (1995)|
|            Comedy| Pulp Fiction (1994)|
|             Crime| Pulp Fiction (1994)|
|       Documentary|Bowling for Colum...|
|             Drama| Pulp Fiction (1994)|
|           Fantasy|    Toy Story (1995)|
|         Film-Noir|L.A. Confidential...|
|            Horror|Silence of the La...|
|              IMAX|    Apollo 13 (1995)|
|           Musical|      Aladdin (1992)|
|           Mystery|Usual Suspects, T...|
|           Romance| Forrest Gump (1994)|
|            Sci-Fi|Jurassic Park (1993)|
|          Thriller| Pulp Fiction (1994)|
|               War| Forrest Gump (1994)|
|           Western|Dances with Wolve...|
+------------------+--------------

In [30]:
# QUERY 9
stagemetrics.begin()

q9_df = rating_df.withColumn('year', year(rating_df['timestamp']))
q9_df = q9_df.withColumn('doy', dayofyear(rating_df['timestamp']))
q9_df = q9_df.withColumn('hour', hour(rating_df['timestamp']))
q9_df = q9_df.groupBy(q9_df['movieId'],q9_df['year'],q9_df['doy'],q9_df['hour']).count()
q9 = q9_df.filter(q9_df['count']>1).select(sum('count').alias('count'))
q9.show()

stagemetrics.end()
stagemetrics.print_report()

+-------+
|  count|
+-------+
|4281178|
+-------+


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 3
numTasks => 213
elapsedTime => 8795 (9 s)
stageDuration => 8793 (9 s)
executorRunTime => 102101 (1.7 min)
executorCpuTime => 83665 (1.4 min)
executorDeserializeTime => 342 (0.3 s)
executorDeserializeCpuTime => 336 (0.3 s)
resultSerializationTime => 0 (0 ms)
jvmGCTime => 13616 (14 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 556 (0.6 s)
resultSize => 838537 (818.0 KB)
diskBytesSpilled => 238769112 (227.0 MB)
memoryBytesSpilled => 2152726528 (2.0 GB)
peakExecutionMemory => 2123366400
recordsRead => 20000263
bytesRead => 691677634 (659.0 MB)
recordsWritten => 0
bytesWritten => 0 (0 Bytes)
shuffleRecordsRead => 19632382
shuffleTotalBlocksFetched => 2600
shuffleLocalBlocksFetched => 2600
shuffleRemoteBlocksFetched => 0
shuffleTotalBytesRead => 276246804 (263.0 MB)
shuffleLocalBytesRead => 276246804 (263.0 MB)
sh

In [25]:
# QUERY 10
stagemetrics.begin()

q10_df = rating_df.join(tag_df, ['movieId'], 'inner')
q10_df = q10_df.select("movieId","tag").where((col('tag') == "funny") & (col('rating') > 3.5)).dropDuplicates()
q10_df = q10_df.join(movie_df, ['movieId'], 'inner')
q10_df = q10_df.withColumn("genre",explode(split("genres","[|]")))
q10_df = q10_df = q10_df.groupBy(q10_df['genre']).count().orderBy(q10_df['genre'].asc())
q10_df.show()

stagemetrics.end()
stagemetrics.print_report()

+-----------+-----+
|      genre|count|
+-----------+-----+
|     Action|  123|
|  Adventure|  128|
|  Animation|   83|
|   Children|   97|
|     Comedy|  527|
|      Crime|   82|
|Documentary|   15|
|      Drama|  182|
|    Fantasy|   78|
|  Film-Noir|    3|
|     Horror|   44|
|       IMAX|   22|
|    Musical|   39|
|    Mystery|   20|
|    Romance|  138|
|     Sci-Fi|   60|
|   Thriller|   69|
|        War|   12|
|    Western|    9|
+-----------+-----+


Scheduling mode = FIFO
Spark Context default degree of parallelism = 12
Aggregated Spark stage metrics:
numStages => 5
numTasks => 419
elapsedTime => 3236 (3 s)
stageDuration => 5572 (6 s)
executorRunTime => 34779 (35 s)
executorCpuTime => 27201 (27 s)
executorDeserializeTime => 623 (0.6 s)
executorDeserializeCpuTime => 600 (0.6 s)
resultSerializationTime => 0 (0 ms)
jvmGCTime => 4772 (5 s)
shuffleFetchWaitTime => 0 (0 ms)
shuffleWriteTime => 1113 (1 s)
resultSize => 1963839 (1917.0 KB)
diskBytesSpilled => 0 (0 Bytes)
memoryBytesSpi