In [1]:
from pyspark.sql import functions as F

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("StreamingAnalytics").getOrCreate()

# Load each dataset
hulu_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/hulu_titles.csv", header=True)
disney_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/disney_plus_titles.csv", header=True)
amazon_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/amazon_prime_titles.csv", header=True)
netflix_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/netflix_titles.csv", header=True)


                                                                                

In [3]:
# Show the first few rows of each DataFrame
hulu_df.show()
disney_df.show()
amazon_df.show()
netflix_df.show()

# Show the schema of each DataFrame
hulu_df.printSchema()
disney_df.printSchema()
amazon_df.printSchema()
netflix_df.printSchema()


+-------+-------+--------------------+--------+----+-------------+----------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|director|cast|      country|      date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------+----+-------------+----------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Ricky Velez: Here...|    null|null|         null|October 24, 2021|        2021| TV-MA|     null|    Comedy, Stand Up|​Comedian Ricky V...|
|     s2|  Movie|        Silent Night|    null|null|         null|October 23, 2021|        2020|  null|   94 min|Crime, Drama, Thr...|Mark, a low end S...|
|     s3|  Movie|        The Marksman|    null|null|         null|October 23, 2021|        2021| PG-13|  108 min|    Action, Thriller|A hardened Arizon...|
|     s4|  Movie|                Gaia|    null|null|         nul

+-------+-------+--------------------+--------------------+--------------------+--------------+--------------+------------+------+--------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|       country|    date_added|release_year|rating|duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------+--------------+------------+------+--------+--------------------+--------------------+
|     s1|  Movie| The Grand Seduction|        Don McKellar|Brendan Gleeson, ...|        Canada|March 30, 2021|        2014|  null| 113 min|       Comedy, Drama|A small fishing v...|
|     s2|  Movie|Take Care Good Night|        Girish Joshi|Mahesh Manjrekar,...|         India|March 30, 2021|        2018|   13+| 110 min|Drama, International|A Metro Family de...|
|     s3|  Movie|Secrets of Deception|         Josh Webber|Tom Sizemore, Lor...| United St

In [4]:
feature_columns = ['title', 'listed_in', 'rating', 'release_year', 'country']

In [5]:
# Importing required libraries
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("StreamingAnalytics").getOrCreate()

# Load the datasets
hulu_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/hulu_titles.csv", header=True)
disney_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/disney_plus_titles.csv", header=True)
amazon_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/amazon_prime_titles.csv", header=True)
netflix_df = spark.read.csv("hdfs://localhost:9000/user/hduser/streaming_datasets/netflix_titles.csv", header=True)

# Columns to keep
selected_columns = ['title', 'type', 'release_year', 'rating', 'listed_in']

# Filter the dataframes
hulu_filtered = hulu_df.select(selected_columns)
disney_filtered = disney_df.select(selected_columns)
amazon_filtered = amazon_df.select(selected_columns)
netflix_filtered = netflix_df.select(selected_columns)

# Show the first few rows to confirm
hulu_filtered.show()
disney_filtered.show()
amazon_filtered.show()
netflix_filtered.show()


+--------------------+-------+------------+------+--------------------+
|               title|   type|release_year|rating|           listed_in|
+--------------------+-------+------------+------+--------------------+
|Ricky Velez: Here...|  Movie|        2021| TV-MA|    Comedy, Stand Up|
|        Silent Night|  Movie|        2020|  null|Crime, Drama, Thr...|
|        The Marksman|  Movie|        2021| PG-13|    Action, Thriller|
|                Gaia|  Movie|        2021|     R|              Horror|
|            Settlers|  Movie|        2021|  null|Science Fiction, ...|
|The Halloween Can...|TV Show|        2021|  null|        Family, Kids|
|  The Evil Next Door|  Movie|        2020|  null|    Horror, Thriller|
|The Next Thing Yo...|TV Show|        2021|  null|Cooking & Food, D...|
|              Queens|TV Show|        2021| TV-14|        Drama, Music|
|    The Bachelorette|TV Show|        2003| TV-14|    Reality, Romance|
|The Real Queens o...|TV Show|        2021|  null|         Music

In [6]:
# Describe the data and show null values
for df in [hulu_filtered, disney_filtered, amazon_filtered, netflix_filtered]:
    df.describe().show()
    df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).show()


                                                                                

+-------+--------------------+-----------------+------------------+--------+---------+
|summary|               title|             type|      release_year|  rating|listed_in|
+-------+--------------------+-----------------+------------------+--------+---------+
|  count|                3079|             3083|              3073|    2553|     3073|
|   mean|               704.0|             null|2012.5675235925805|    null|     null|
| stddev|  1180.3935784305165|             null| 10.84406878215617|    null|     null|
|    min| and her mother m...| Mail Order Wife)|              1923|1 Season|   Action|
|    max|” Jimmy’s once ho...|          TV Show|              2021|   TV-Y7| Thriller|
+-------+--------------------+-----------------+------------------+--------+---------+

+-----+----+------------+------+---------+
|title|type|release_year|rating|listed_in|
+-----+----+------------+------+---------+
|   11|   7|          17|   537|       17|
+-----+----+------------+------+---------+



In [8]:
# Fill null values if any (replace 'value_to_fill' accordingly)
amazon_filtered = amazon_filtered.na.fill({'rating': 'Not Available', 'release_year': 'Not Available', 'type': 'Not Available', 'listed_in': 'Not Available'})
netflix_filtered = netflix_filtered.na.fill({'rating': 'Not Available', 'release_year': 'Not Available', 'type': 'Not Available', 'listed_in': 'Not Available'})

# Example: Count the number of titles by type for each platform
hulu_grouped = hulu_filtered.groupBy('type').count().show()
disney_grouped = disney_filtered.groupBy('type').count().show()
amazon_grouped = amazon_filtered.groupBy('type').count().show()
netflix_grouped = netflix_filtered.groupBy('type').count().show()



+--------------------+-----+
|                type|count|
+--------------------+-----+
| the early days o...|    1|
| “DAS BOOT” explo...|    1|
|  picaresque story!"|    1|
| her sister may d...|    1|
| who plays “Edgar...|    1|
|                null|    7|
| bizarre and inex...|    1|
|             TV Show| 1589|
| a curse flowing ...|    1|
|               Movie| 1484|
| brought on by th...|    1|
| his Persona awakens|    1|
|    Mail Order Wife)|    1|
+--------------------+-----+

+-------+-----+
|   type|count|
+-------+-----+
|TV Show|  398|
|  Movie| 1052|
+-------+-----+

+--------------------+-----+
|                type|count|
+--------------------+-----+
| 17-year-old Arat...|    1|
| Iori runs into a...|    1|
|             TV Show| 1854|
| and fight to gai...|    1|
| """"BANANA FISH"...|    1|
| once he sees one...|    2|
|               Movie| 7814|
| the threat of th...|    1|
|     the blazing sun|    1|
|       Not Available|   10|
| will Mahmut be a...|    1|
+--

In [9]:
# Example: Count the number of titles by year for each platform
hulu_year_grouped = hulu_filtered.groupBy('release_year').count().orderBy('release_year').show()
disney_year_grouped = disney_filtered.groupBy('release_year').count().orderBy('release_year').show()
amazon_year_grouped = amazon_filtered.groupBy('release_year').count().orderBy('release_year').show()
netflix_year_grouped = netflix_filtered.groupBy('release_year').count().orderBy('release_year').show()


+------------+-----+
|release_year|count|
+------------+-----+
|        null|   17|
|        1923|    1|
|        1933|    1|
|        1950|    3|
|        1951|    2|
|        1953|    4|
|        1954|    1|
|        1955|    2|
|        1957|    4|
|        1958|    2|
|        1959|    2|
|        1960|    1|
|        1961|    1|
|        1962|    3|
|        1963|    4|
|        1964|    2|
|        1965|    5|
|        1966|    2|
|        1967|    2|
|        1968|    1|
+------------+-----+
only showing top 20 rows

+----------------+-----+
|    release_year|count|
+----------------+-----+
|  Carrie Keranen|    1|
| Felicity Jones"|    1|
|            1928|    1|
|            1932|    3|
|            1933|    3|
|            1934|    4|
|            1935|    4|
|            1936|    6|
|            1937|    6|
|            1938|    5|
|            1939|    5|
|            1940|    7|
|            1941|    7|
|            1942|    6|
|            1943|    2|
|            1944|  

In [11]:
# Count the number of titles by year for each platform and save it
hulu_year_grouped = hulu_filtered.groupBy('release_year').count().orderBy('release_year')
disney_year_grouped = disney_filtered.groupBy('release_year').count().orderBy('release_year')
amazon_year_grouped = amazon_filtered.groupBy('release_year').count().orderBy('release_year')
netflix_year_grouped = netflix_filtered.groupBy('release_year').count().orderBy('release_year')

# Now, save these to CSV in your specified folder on your Ubuntu Desktop
hulu_year_grouped.write.csv("/home/hduser/Desktop/CCT-Dulin/hulu_year_grouped.csv")
disney_year_grouped.write.csv("/home/hduser/Desktop/CCT-Dulin/disney_year_grouped.csv")
amazon_year_grouped.write.csv("/home/hduser/Desktop/CCT-Dulin/amazon_year_grouped.csv")
netflix_year_grouped.write.csv("/home/hduser/Desktop/CCT-Dulin/netflix_year_grouped.csv")


In [12]:
hulu_pd = hulu_year_grouped.toPandas()
disney_pd = disney_year_grouped.toPandas()
amazon_pd = amazon_year_grouped.toPandas()
netflix_pd = netflix_year_grouped.toPandas()
