In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
#PYSPARK_DRIVER_PYTHON = 3.85
#PYSPARK_PYTHON = 3.85
import os
import sys
#import pyspark as spark

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Same code as shown in SimpleApp.py


In [2]:
def open_with_spark(log_file="data/movies.csv", app_name="movieAnalysis"):
    spark = SparkSession.builder.appName(app_name).getOrCreate()
    df = spark.read.option("header",True).csv(log_file).cache()
    return df

In [3]:
# 2. Join Dataframes READABLE version, more memory needed
def join_ratings_and_movies_readable():  
    df_movie = open_with_spark()
    df_ratings = open_with_spark(log_file="data/ratings.csv", app_name="ratings")
    df_join = df_ratings.join(df_movie, "movieID")
    return df_join
    
# 2. Join Dataframes
def join_ratings_and_movies(): 
    return open_with_spark().join(open_with_spark(log_file="data/ratings.csv", app_name="ratings"), "movieID")

In [4]:
X = join_ratings_and_movies()
X.head(5)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/13 17:39:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/13 17:39:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

[Row(movieId='307', userId='1', rating='3.5', timestamp='1256677221', title='Three Colors: Blue (Trois couleurs: Bleu) (1993)', genres='Drama'),
 Row(movieId='481', userId='1', rating='3.5', timestamp='1256677456', title='Kalifornia (1993)', genres='Drama|Thriller'),
 Row(movieId='1091', userId='1', rating='1.5', timestamp='1256677471', title="Weekend at Bernie's (1989)", genres='Comedy'),
 Row(movieId='1257', userId='1', rating='4.5', timestamp='1256677460', title='Better Off Dead... (1985)', genres='Comedy|Romance'),
 Row(movieId='1449', userId='1', rating='4.5', timestamp='1256677264', title='Waiting for Guffman (1996)', genres='Comedy')]

In [6]:
df_updated = X.groupby("title").agg(F.count("rating")).withColumnRenamed("count(rating)", "Num_ratings").sort("Num_ratings", ascending=False)
df_updated.limit(5)




23/04/13 17:40:44 WARN MemoryStore: Not enough space to cache rdd_32_1 in memory! (computed 35.3 MiB so far)
23/04/13 17:40:44 WARN BlockManager: Persisting block rdd_32_1 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_7 in memory! (computed 54.6 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_7 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_2 in memory! (computed 54.4 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_2 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_6 in memory! (computed 54.7 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_6 to disk instead.
23/04/13 17:40:50 WARN MemoryStore: Not enough space to cache rdd_32_4 in memory! (computed 54.6 MiB so far)
23/04/13 17:40:50 WARN BlockManager: Persisting block rdd_32_4 to disk instead.
23/04/13 17:40:51 WARN MemoryStore: Not enough space to

                                                                                

Row(title='Shawshank Redemption, The (1994)', Num_ratings=97999)

In [6]:
"""
Returns the top N movies with the most reviews (ratings)

"""
# (may be necessary for @param type)
import pyspark

# 3. Most-rated movies
def most_rated(df: pyspark.sql.dataframe.DataFrame, N=10):
    return df.groupby("title").agg(F.count("rating")).withColumnRenamed("count(rating)", "Num_ratings").sort("Num_ratings", ascending=False).limit(N)

# Function that does everything; can be used for timing or outputting or whatever
def most_rated_complete(N=10):
    JOINED = join_ratings_and_movies()
    TOP_N = most_rated(JOINED, N)
    return TOP_N


In [9]:
A = most_rated_complete()
A.head(10)

23/04/13 17:48:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/04/13 17:48:03 WARN CacheManager: Asked to cache already cached data.
23/04/13 17:48:03 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
23/04/13 17:48:03 WARN CacheManager: Asked to cache already cached data.


[Stage 9:>                                                          (0 + 8) / 8]

23/04/13 17:48:30 WARN MemoryStore: Not enough space to cache rdd_92_1 in memory! (computed 54.3 MiB so far)
23/04/13 17:48:30 WARN BlockManager: Persisting block rdd_92_1 to disk instead.
23/04/13 17:48:30 WARN MemoryStore: Not enough space to cache rdd_92_4 in memory! (computed 54.6 MiB so far)
23/04/13 17:48:30 WARN BlockManager: Persisting block rdd_92_4 to disk instead.
23/04/13 17:48:31 WARN MemoryStore: Not enough space to cache rdd_92_6 in memory! (computed 54.7 MiB so far)
23/04/13 17:48:31 WARN BlockManager: Persisting block rdd_92_6 to disk instead.
23/04/13 17:48:31 WARN MemoryStore: Not enough space to cache rdd_92_7 in memory! (computed 54.6 MiB so far)
23/04/13 17:48:31 WARN BlockManager: Persisting block rdd_92_7 to disk instead.
23/04/13 17:48:31 WARN MemoryStore: Not enough space to cache rdd_92_3 in memory! (computed 54.4 MiB so far)
23/04/13 17:48:31 WARN BlockManager: Persisting block rdd_92_3 to disk instead.
23/04/13 17:48:31 WARN MemoryStore: Not enough space to

                                                                                

[Row(title='Shawshank Redemption, The (1994)', Num_ratings=97999),
 Row(title='Forrest Gump (1994)', Num_ratings=97040),
 Row(title='Pulp Fiction (1994)', Num_ratings=92406),
 Row(title='Silence of the Lambs, The (1991)', Num_ratings=87899),
 Row(title='Matrix, The (1999)', Num_ratings=84545),
 Row(title='Star Wars: Episode IV - A New Hope (1977)', Num_ratings=81815),
 Row(title='Jurassic Park (1993)', Num_ratings=76451),
 Row(title="Schindler's List (1993)", Num_ratings=71516),
 Row(title='Braveheart (1995)', Num_ratings=68803),
 Row(title='Toy Story (1995)', Num_ratings=68469)]

In [124]:
df_join = df_ratings.join(df_movie, "movieID")

In [125]:
df_join.head(5)

                                                                                

[Row(movieId='307', userId='1', rating='3.5', timestamp='1256677221', title='Three Colors: Blue (Trois couleurs: Bleu) (1993)', genres='Drama'),
 Row(movieId='481', userId='1', rating='3.5', timestamp='1256677456', title='Kalifornia (1993)', genres='Drama|Thriller'),
 Row(movieId='1091', userId='1', rating='1.5', timestamp='1256677471', title="Weekend at Bernie's (1989)", genres='Comedy'),
 Row(movieId='1257', userId='1', rating='4.5', timestamp='1256677460', title='Better Off Dead... (1985)', genres='Comedy|Romance'),
 Row(movieId='1449', userId='1', rating='4.5', timestamp='1256677264', title='Waiting for Guffman (1996)', genres='Comedy')]