In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Movie Dataset MapReduce").getOrCreate()

In [2]:
file_path = "TheMoviesDataset\movies_metadata.csv"
movies_df = spark.read.csv(file_path, header=True, inferSchema=True)

movies_df.show(5)

  file_path = "TheMoviesDataset\movies_metadata.csv"


+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------+--------------------+-----------------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|        release_date|             revenue|             runtime|    spoken_languages|  status|             tagline|               title|   video|        vote_average|       vote_count|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+-----

In [None]:
from pyspark.sql.functions import from_json, col, explode
from pyspark.sql.types import ArrayType, StructType, StructField, StringType, IntegerType

genres_schema = ArrayType(StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True)
]))

# Parse the genres column
movies_df = movies_df.withColumn("genres_parsed", from_json(col("genres"), genres_schema))

# Show the parsed genres
movies_df.select("genres", "genres_parsed").show(5, truncate=False)


+-------------------------------------------------------------------------------------------------+-------------------------------------------------+
|genres                                                                                           |genres_parsed                                    |
+-------------------------------------------------------------------------------------------------+-------------------------------------------------+
|[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}] |[{16, Animation}, {35, Comedy}, {10751, Family}] |
|[{'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 10751, 'name': 'Family'}]|[{12, Adventure}, {14, Fantasy}, {10751, Family}]|
|[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]                                 |[{10749, Romance}, {35, Comedy}]                 |
|[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]    |[

In [4]:
# Explode the genres_parsed column to get one row per genre
movies_genres = movies_df.select(explode(col("genres_parsed.name")).alias("genre"))

# Show the exploded genres
movies_genres.show(10, truncate=False)

+---------+
|genre    |
+---------+
|Animation|
|Comedy   |
|Family   |
|Adventure|
|Fantasy  |
|Family   |
|Romance  |
|Comedy   |
|Comedy   |
|Drama    |
+---------+
only showing top 10 rows



In [5]:
# Count movies per genre
genre_counts = movies_genres.groupBy("genre").count()

# Show the results
genre_counts.show(truncate=False)

+---------------+-----+
|genre          |count|
+---------------+-----+
|Crime          |4294 |
|Romance        |6722 |
|TV Movie       |764  |
|Thriller       |7612 |
|Adventure      |3477 |
|Foreign        |1620 |
|Drama          |20239|
|War            |1321 |
|Documentary    |3929 |
|Family         |2747 |
|Fantasy        |2308 |
|History        |1396 |
|Mystery        |2466 |
|Animation      |1922 |
|Music          |1591 |
|Science Fiction|3041 |
|Horror         |4656 |
|Western        |1040 |
|Comedy         |13123|
|Action         |6581 |
+---------------+-----+
only showing top 20 rows

