In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
#PYSPARK_DRIVER_PYTHON = 3.85
#PYSPARK_PYTHON = 3.85
import os
import sys
#import pyspark as spark

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Same code as shown in SimpleApp.py


In [2]:
def open_with_spark(log_file="data/movies.csv", app_name="movieAnalysis"):
    spark = SparkSession.builder.appName(app_name).getOrCreate()
    df = spark.read.option("header",True).csv(log_file).cache()
    return df

In [25]:
import pyspark
"""
1. Summary statistics for each relevant data frame

"""
def summary_statistics(df: pyspark.sql.dataframe.DataFrame, cols= ["*"], stats = ["count", "mean", "stddev", "min", "25%", "75%", "max"]):
    df.select(*cols).summary(*stats).show()
# Do all
def summary_statistics_complete(file= "data/ratings.csv", cols= ["*"], stats = ["count", "mean", "stddev", "min", "25%", "75%", "max"]):
    summary_statistics(open_with_spark(file), cols, stats)

In [30]:
"""

Print out number of movies and number of unqiue users.

"""

summary_statistics_complete("data/movies.csv",["title"], ["count"])
# Get df of unique users
df_unique_users = open_with_spark("data/ratings.csv").select("UserID").distinct()
summary_statistics(df_unique_users,["userID"], ["count"])

23/04/16 15:54:57 WARN CacheManager: Asked to cache already cached data.
+-------+-----+
|summary|title|
+-------+-----+
|  count|58098|
+-------+-----+

23/04/16 15:54:58 WARN CacheManager: Asked to cache already cached data.
23/04/16 15:54:58 WARN MemoryStore: Not enough space to cache rdd_55_1 in memory! (computed 6.6 MiB so far)
23/04/16 15:54:58 WARN MemoryStore: Not enough space to cache rdd_55_0 in memory! (computed 6.6 MiB so far)




+-------+------+
|summary|userID|
+-------+------+
|  count|283228|
+-------+------+



                                                                                

In [27]:
summary_statistics_complete("data/ratings.csv",["rating"])

23/04/16 15:51:00 WARN CacheManager: Asked to cache already cached data.
23/04/16 15:51:01 WARN MemoryStore: Not enough space to cache rdd_55_0 in memory! (computed 13.0 MiB so far)
23/04/16 15:51:01 WARN MemoryStore: Not enough space to cache rdd_55_1 in memory! (computed 12.9 MiB so far)




+-------+------------------+
|summary|            rating|
+-------+------------------+
|  count|          27753444|
|   mean|3.5304452124932677|
| stddev| 1.066352750231989|
|    min|               0.5|
|    25%|               3.0|
|    75%|               4.0|
|    max|               5.0|
+-------+------------------+



                                                                                

In [3]:
# 2. Join Dataframes READABLE version, more memory needed
def join_ratings_and_movies_readable():  
    df_movie = open_with_spark()
    df_ratings = open_with_spark(log_file="data/ratings.csv", app_name="ratings")
    df_join = df_ratings.join(df_movie, "movieID")
    return df_join
    
# 2. Join Dataframes
def join_ratings_and_movies(): 
    return open_with_spark().join(open_with_spark(log_file="data/ratings.csv", app_name="ratings"), "movieID")

In [4]:
X = join_ratings_and_movies()
X.show(5)

23/04/14 12:18:58 WARN Utils: Your hostname, Coopers-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.16.108.70 instead (on interface en0)
23/04/14 12:18:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/14 12:18:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/14 12:19:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[Stage 3:>                                                          (0 + 1) / 1]

+-------+--------------------+--------------+------+------+----------+
|movieId|               title|        genres|userId|rating| timestamp|
+-------+--------------------+--------------+------+------+----------+
|    307|Three Colors: Blu...|         Drama|     1|   3.5|1256677221|
|    481|   Kalifornia (1993)|Drama|Thriller|     1|   3.5|1256677456|
|   1091|Weekend at Bernie...|        Comedy|     1|   1.5|1256677471|
|   1257|Better Off Dead.....|Comedy|Romance|     1|   4.5|1256677460|
|   1449|Waiting for Guffm...|        Comedy|     1|   4.5|1256677264|
+-------+--------------------+--------------+------+------+----------+
only showing top 5 rows



                                                                                

In [6]:
df_updated = X.groupby("title").agg(F.count("rating")).withColumnRenamed("count(rating)", "Num_ratings").sort("Num_ratings", ascending=False)
df_updated.limit(5)




23/04/13 17:40:44 WARN MemoryStore: Not enough space to cache rdd_32_1 in memory! (computed 35.3 MiB so far)
23/04/13 17:40:44 WARN BlockManager: Persisting block rdd_32_1 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_7 in memory! (computed 54.6 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_7 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_2 in memory! (computed 54.4 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_2 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_6 in memory! (computed 54.7 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_6 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_4 in memory! (computed 54.6 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_4 to disk instead.
23/04/13 17:40:51 WARN MemoryStore: Not enough space to

                                                                                

Row(title='Shawshank Redemption, The (1994)', Num_ratings=97999)

In [4]:
"""
Returns the top N movies with the most reviews (ratings)

"""
# (may be necessary for @param type)
import pyspark

# 3. Most-rated movies
def most_rated(df: pyspark.sql.dataframe.DataFrame, N=10):
    return df.groupby("title").agg(F.count("rating")).withColumnRenamed("count(rating)", "Num_ratings").sort("Num_ratings", ascending=False).limit(N)

# Function that does everything; can be used for timing or outputting or whatever
def most_rated_complete(N=10):
    JOINED = join_ratings_and_movies()
    TOP_N = most_rated(JOINED, N)
    return TOP_N


In [5]:
most_rated_complete(5).show(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/16 17:13:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/16 17:13:59 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[Stage 3:>                                                          (0 + 8) / 8]

23/04/16 17:14:22 WARN MemoryStore: Not enough space to cache rdd_32_5 in memory! (computed 54.7 MiB so far)
23/04/16 17:14:22 WARN BlockManager: Persisting block rdd_32_5 to disk instead.
23/04/16 17:14:22 WARN MemoryStore: Not enough space to cache rdd_32_2 in memory! (computed 54.4 MiB so far)
23/04/16 17:14:22 WARN BlockManager: Persisting block rdd_32_2 to disk instead.
23/04/16 17:14:22 WARN MemoryStore: Not enough space to cache rdd_32_1 in memory! (computed 54.3 MiB so far)
23/04/16 17:14:22 WARN MemoryStore: Not enough space to cache rdd_32_7 in memory! (computed 54.6 MiB so far)
23/04/16 17:14:22 WARN BlockManager: Persisting block rdd_32_7 to disk instead.
23/04/16 17:14:22 WARN BlockManager: Persisting block rdd_32_1 to disk instead.
23/04/16 17:14:22 WARN MemoryStore: Not enough space to cache rdd_32_0 in memory! (computed 54.4 MiB so far)
23/04/16 17:14:22 WARN BlockManager: Persisting block rdd_32_0 to disk instead.
23/04/16 17:14:22 WARN MemoryStore: Not enough space to



+--------------------+-----------+
|               title|Num_ratings|
+--------------------+-----------+
|Shawshank Redempt...|      97999|
| Forrest Gump (1994)|      97040|
| Pulp Fiction (1994)|      92406|
|Silence of the La...|      87899|
|  Matrix, The (1999)|      84545|
+--------------------+-----------+



                                                                                

In [4]:
"""
Returns the top N movies with the highest average reviews (ratings)

"""
# (may be necessary for @param type)
import pyspark

# 4. Highest-average-rated movies
def best_average_rated(df: pyspark.sql.dataframe.DataFrame, N=10, MIN_RATINGS=50):
    T = df.groupby("title").agg(F.mean("rating"))
    H = T.withColumnRenamed("avg(rating)", "Mean_rating")
    # H.show(5)
    G = df.groupby("title").agg(F.count("rating")).withColumnRenamed("count(rating)", "Num_ratings")
    C = H.join(G, "title")
    C = C.filter(C.Num_ratings >= MIN_RATINGS).select(["title", "Mean_rating"])
    J = C.sort("Mean_rating", ascending=False)
    K = J.limit(N).withColumn("Mean_rating", F.round("Mean_rating",3))
    return K

# Function that does everything; can be used for timing or outputting or whatever
def best_average_rated_complete(N=10, MIN_RATINGS=50):
    JOINED = join_ratings_and_movies()
    TOP_N = best_average_rated(JOINED, N, MIN_RATINGS)
    return TOP_N #.select("*", round("Mean_ratings"))

In [5]:
A = best_average_rated_complete()
A.show(10)

23/04/14 16:21:07 WARN Utils: Your hostname, Coopers-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.16.108.70 instead (on interface en0)
23/04/14 16:21:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/14 16:21:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/14 16:21:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


[Stage 2:>                  (0 + 1) / 1][Stage 3:>                  (0 + 1) / 1]

23/04/14 16:21:18 WARN BlockManager: Block rdd_23_0 already exists on this machine; not re-adding it


[Stage 4:>                  (0 + 8) / 8][Stage 5:>                  (0 + 0) / 8]

23/04/14 16:21:45 WARN MemoryStore: Not enough space to cache rdd_37_1 in memory! (computed 54.3 MiB so far)
23/04/14 16:21:45 WARN BlockManager: Persisting block rdd_37_1 to disk instead.
23/04/14 16:21:45 WARN MemoryStore: Not enough space to cache rdd_37_2 in memory! (computed 54.4 MiB so far)
23/04/14 16:21:45 WARN BlockManager: Persisting block rdd_37_2 to disk instead.
23/04/14 16:21:45 WARN MemoryStore: Not enough space to cache rdd_37_0 in memory! (computed 54.4 MiB so far)
23/04/14 16:21:45 WARN BlockManager: Persisting block rdd_37_0 to disk instead.
23/04/14 16:21:45 WARN MemoryStore: Not enough space to cache rdd_37_4 in memory! (computed 54.6 MiB so far)
23/04/14 16:21:45 WARN BlockManager: Persisting block rdd_37_4 to disk instead.
23/04/14 16:21:45 WARN MemoryStore: Not enough space to cache rdd_37_3 in memory! (computed 54.4 MiB so far)
23/04/14 16:21:45 WARN BlockManager: Persisting block rdd_37_3 to disk instead.
23/04/14 16:21:46 WARN MemoryStore: Not enough space to

[Stage 4:==>                (1 + 7) / 8][Stage 5:>                  (0 + 1) / 8]

23/04/14 16:21:56 WARN MemoryStore: Not enough space to cache rdd_37_0 in memory! (computed 13.0 MiB so far)




23/04/14 16:21:57 WARN MemoryStore: Not enough space to cache rdd_37_1 in memory! (computed 54.3 MiB so far)
23/04/14 16:21:57 WARN MemoryStore: Not enough space to cache rdd_37_2 in memory! (computed 54.4 MiB so far)




23/04/14 16:21:59 WARN MemoryStore: Not enough space to cache rdd_37_7 in memory! (computed 22.6 MiB so far)
23/04/14 16:21:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
23/04/14 16:21:59 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.




+--------------------+-----------+
|               title|Mean_rating|
+--------------------+-----------+
|Planet Earth II (...|      4.487|
| Planet Earth (2006)|      4.458|
|Shawshank Redempt...|      4.424|
|Band of Brothers ...|        4.4|
|Black Mirror: Whi...|      4.351|
|              Cosmos|      4.344|
|The Godfather Tri...|       4.34|
|Godfather, The (1...|      4.333|
|Usual Suspects, T...|      4.292|
|        Black Mirror|      4.264|
+--------------------+-----------+



                                                                                

In [14]:
"""
5. Popular genres: Find the top N popular genres by calculating the average rating for each genre.
"""
import pyspark
def popular_genres(df: pyspark.sql.dataframe.DataFrame, N=5, MIN_RATINGS=10):
    L = df.groupby("genres").agg(F.mean("rating"))
    L.show(5)
    M = L.withColumnRenamed("avg(rating)", "Mean_rating")
    G = df.groupby("genres").agg(F.count("rating")).withColumnRenamed("count(rating)", "Num_ratings")
    M.show(5)
    K = M.join(G, "genres")
    P = K.filter(K.Num_ratings >= MIN_RATINGS).select(["genres", "Mean_rating"])
    N = P.sort("Mean_rating", ascending=False).limit(N).withColumn("Mean_rating", F.round("Mean_rating",3))
    return N
def popular_genres_complete(N=5, MIN_RATINGS=10):
    JOINED = join_ratings_and_movies()
    TOP_N = popular_genres(JOINED, N)
    return TOP_N

In [13]:
W = popular_genres_complete(10)
W.show(10)


23/04/16 18:02:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/04/16 18:02:52 WARN CacheManager: Asked to cache already cached data.
23/04/16 18:02:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/04/16 18:02:53 WARN CacheManager: Asked to cache already cached data.
23/04/16 18:02:53 WARN MemoryStore: Not enough space to cache rdd_32_0 in memory! (computed 22.6 MiB so far)
23/04/16 18:02:53 WARN MemoryStore: Not enough space to cache rdd_32_2 in memory! (computed 22.5 MiB so far)
23/04/16 18:02:53 WARN MemoryStore: Not enough space to cache rdd_32_1 in memory! (computed 35.3 MiB so far)


                                                                                

+--------------------+------------------+
|              genres|       avg(rating)|
+--------------------+------------------+
|Comedy|Horror|Thr...| 3.288320727995902|
|Adventure|Sci-Fi|...| 3.212121212121212|
|Action|Adventure|...| 4.011721534573262|
| Action|Drama|Horror|3.7695176529090006|
|Action|Animation|...|  3.76522506619594|
+--------------------+------------------+
only showing top 5 rows

23/04/16 18:02:57 WARN MemoryStore: Not enough space to cache rdd_32_1 in memory! (computed 22.5 MiB so far)
23/04/16 18:02:57 WARN MemoryStore: Not enough space to cache rdd_32_0 in memory! (computed 22.6 MiB so far)
23/04/16 18:02:57 WARN MemoryStore: Not enough space to cache rdd_32_2 in memory! (computed 35.2 MiB so far)


                                                                                

+--------------------+------------------+
|              genres|       Mean_rating|
+--------------------+------------------+
|Comedy|Horror|Thr...| 3.288320727995902|
|Adventure|Sci-Fi|...| 3.212121212121212|
|Action|Adventure|...| 4.011721534573262|
| Action|Drama|Horror|3.7695176529090006|
|Action|Animation|...|  3.76522506619594|
+--------------------+------------------+
only showing top 5 rows

23/04/16 18:03:02 WARN MemoryStore: Not enough space to cache rdd_32_0 in memory! (computed 22.6 MiB so far)
23/04/16 18:03:02 WARN MemoryStore: Not enough space to cache rdd_32_1 in memory! (computed 22.5 MiB so far)
23/04/16 18:03:02 WARN MemoryStore: Not enough space to cache rdd_32_2 in memory! (computed 35.2 MiB so far)




23/04/16 18:03:04 WARN MemoryStore: Not enough space to cache rdd_32_1 in memory! (computed 22.5 MiB so far)
23/04/16 18:03:04 WARN MemoryStore: Not enough space to cache rdd_32_0 in memory! (computed 35.3 MiB so far)
23/04/16 18:03:04 WARN MemoryStore: Not enough space to cache rdd_32_2 in memory! (computed 22.5 MiB so far)


                                                                                

+--------------------+-----------+
|              genres|Mean_rating|
+--------------------+-----------+
|Action|Adventure|...|      4.201|
|Film-Noir|Romance...|      4.164|
|Action|Crime|Dram...|      4.163|
|Action|Adventure|...|      4.157|
|Action|Crime|Dram...|      4.156|
|Adventure|Animati...|      4.152|
|Animation|Childre...|      4.145|
|   Film-Noir|Mystery|      4.128|
|Crime|Film-Noir|M...|      4.127|
|Action|Adventure|...|       4.12|
+--------------------+-----------+

