***Import pyspark libraries and load the Disney_plus datasets***

In [52]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, year, month,split, explode, countDistinct

# Initialize SparkSession
spark = SparkSession.builder.appName("DisneyPlusAnalysis").getOrCreate()

# Load the CSV file
df = spark.read.csv("disney_plus_titles.csv", header=True, inferSchema=True)

# Show the first few rows
df.show(5)

# Print the schema
df.printSchema()


+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|       A Spark Story|Jason Sterman, Le...|Apthon Corbin, Lo...|                NULL|September 24, 2021|        2021| TV-PG|   88 min|         Documentary|Two Pixar filmmak...|
|     s2|  Movie|      Spooky Buddies|        Robert Vince|Tucker Albrizzi, ...|United States, Ca...|September 24, 2021|        2011|     G|   93 min|Comedy, Fantasy, ...|The puppies go on...|
|     s3|  Movie|The Fault in Our .

In [53]:
# Show the null counts
null_counts = df.select([sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df.columns])
null_counts.show(vertical=True)


-RECORD 0-----------
 show_id      | 0   
 type         | 0   
 title        | 0   
 director     | 440 
 cast         | 174 
 country      | 174 
 date_added   | 3   
 release_year | 0   
 rating       | 2   
 duration     | 0   
 listed_in    | 1   
 description  | 0   



In [54]:
# handling missing and null values
df= df.fillna({'title':'Unknown','cast':'Unknown','director': 'Unknown','country': 'Unknown','rating':'0','listed_in':'Unknown','description':'Unknown'})


**DEMONSTRATE SCALABILITY**

In [55]:
# Example: Average runtime of movies
average_runtime = df.filter(col("type") == "Movie").agg(avg("duration")).collect()[0][0]
hours = int(average_runtime / 60)
remaining_minutes = average_runtime % 60
minutes = int(remaining_minutes)
seconds = ((remaining_minutes - minutes) * 60)

print(f"Average movie runtime: {hours}hr:{minutes}mins:{seconds}sec")

Average movie runtime: 33hr:40mins:0.0sec


In [15]:
#show top 10 actors and no. of movies acted
cast_df = df.filter(col("cast").isNotNull()).withColumn("actor", explode(split(col("cast"), ", ")))
actor_counts = cast_df.groupBy("actor").count().orderBy(col("count").desc())
actor_counts.show(10)

+--------------------+-----+
|               actor|count|
+--------------------+-----+
|        Jim Cummings|   32|
|         Walt Disney|   18|
|Larry the Cable Guy |   17|
|      Keith Ferguson|   16|
|        Corey Burton|   15|
|        Bob Peterson|   15|
|        Jeff Bennett|   15|
|         Bill Farmer|   15|
|        Pinto Colvig|   15|
|           Tim Allen|   14|
+--------------------+-----+
only showing top 10 rows



In [51]:
# Filter for movies and non-null directors
movie_directors = df.filter(col("type") == "Movie").filter(col("director").isNotNull())

# Count movies per director
director_movie_counts = movie_directors.groupBy("director").agg(count("*").alias("movie_count")).orderBy(col("movie_count").desc())

director_movie_counts.show()

+----------------+-----------+
|        director|movie_count|
+----------------+-----------+
|         Unknown|         63|
|     Jack Hannah|         17|
|   John Lasseter|         16|
|       Paul Hoen|         16|
| Charles Nichols|         12|
|Robert Stevenson|         12|
|    Bob Peterson|         10|
|Vincent McEveety|         10|
|     James Algar|          9|
| Wilfred Jackson|          9|
|    Kenny Ortega|          9|
|    Norman Tokar|          8|
|  Duwayne Dunham|          8|
|  Stuart Gillard|          8|
|     Jack Kinney|          7|
|  Jon Turteltaub|          6|
|  Clyde Geronimi|          6|
|  Ben Sharpsteen|          6|
|  Michael Hegner|          5|
|  Chris Columbus|          5|
+----------------+-----------+
only showing top 20 rows



In [None]:
#stop the spark session.
spark.stop()