In [111]:
# Importing the necessary modules
import findspark
findspark.init()

# Initialize a SparkSession
from pyspark.sql import SparkSession

# Creating SparkSession
spark = SparkSession.builder.appName('TP').getOrCreate()

# Calling the session variable object
spark

Loading data sets

In [112]:
netflix_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/netflix_titles.csv"
disney_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/disney_plus_titles.csv"

from pyspark import SparkFiles
spark.sparkContext.addFile(netflix_url)
spark.sparkContext.addFile(disney_url)

netflix_df = spark.read.csv("file:///"+SparkFiles.get("netflix_titles.csv"), header=True, inferSchema= True)
disney_df = spark.read.csv("file:///"+SparkFiles.get("disney_plus_titles.csv"), header=True, inferSchema= True)

In [113]:
netflix_df.printSchema()
disney_df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [114]:
netflix_df.show(3)
disney_df.show(3)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         null|Septem

In [115]:
netflix_df.createOrReplaceTempView("Netflix")
disney_df.createOrReplaceTempView("Disney")

# Data Cleaning

In [116]:
# remove (do not select) show_id and date_added columns
# convert columns with multiple values to arrays
netflix_clean = spark.sql("SELECT type, \
                  title, \
                  SPLIT(director, ',') AS director, \
                  SPLIT(cast, ',') AS cast, \
                  SPLIT(country, ',') AS country, \
                  release_year, \
                  rating, \
                  duration, \
                  SPLIT(listed_in, ',') AS listed_in, \
                  description \
                  FROM Netflix")
netflix_clean.createOrReplaceTempView("Netflix1")

disney_clean = spark.sql("SELECT type, \
                  title, \
                  SPLIT(director, ',') AS director, \
                  SPLIT(cast, ',') AS cast, \
                  SPLIT(country, ',') AS country, \
                  release_year, \
                  rating, \
                  duration, \
                  SPLIT(listed_in, ',') AS listed_in, \
                  description \
                  FROM Disney")
disney_clean.createOrReplaceTempView("Disney1")

In [117]:
# test duration and listed_in columns to see if they vary between platforms
overlap = spark.sql("SELECT Netflix1.title, Netflix1.duration, Disney1.duration,\
                     Netflix1.listed_in, Disney1.listed_in FROM Netflix1 \
                     INNER JOIN Disney1 ON Netflix1.title = Disney1.title")
overlap.show(5, truncate=False)
# there is variation - worth keeping each dataset separate

+----------------+---------+---------+--------------------------------------------------------+-----------------------------------------------------+
|title           |duration |duration |listed_in                                               |listed_in                                            |
+----------------+---------+---------+--------------------------------------------------------+-----------------------------------------------------+
|PJ Masks        |3 Seasons|5 Seasons|[Kids' TV]                                              |[Action-Adventure,  Animation,  Kids]                |
|Once Upon a Time|1 Season |7 Seasons|[International TV Shows,  Romantic TV Shows,  TV Dramas]|[Action-Adventure,  Fantasy,  Soap Opera / Melodrama]|
|Gigantosaurus   |1 Season |2 Seasons|[Kids' TV]                                              |[Action-Adventure,  Animation,  Kids]                |
|Becoming        |89 min   |1 Season |[Documentaries]                                         |[Anth

In [118]:
netflix_nulls = spark.sql("SELECT COUNT (*) AS type_nulls FROM Netflix1 WHERE type IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS title_nulls FROM Netflix1 WHERE title IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS director_nulls FROM Netflix1 WHERE director IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS cast_nulls FROM Netflix1 WHERE cast IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS country_nulls FROM Netflix1 WHERE country IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS release_year_nulls FROM Netflix1 WHERE release_year IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS rating_nulls FROM Netflix1 WHERE rating IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS duration_nulls FROM Netflix1 WHERE duration IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS listed_in_nulls FROM Netflix1 WHERE listed_in IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS desctiption_nulls FROM Netflix1 WHERE description IS NULL")
netflix_nulls.show()

+----------+
|type_nulls|
+----------+
|         1|
+----------+

+-----------+
|title_nulls|
+-----------+
|          2|
+-----------+

+--------------+
|director_nulls|
+--------------+
|          2636|
+--------------+

+----------+
|cast_nulls|
+----------+
|       826|
+----------+

+-------------+
|country_nulls|
+-------------+
|          832|
+-------------+

+------------------+
|release_year_nulls|
+------------------+
|                 2|
+------------------+

+------------+
|rating_nulls|
+------------+
|           6|
+------------+

+--------------+
|duration_nulls|
+--------------+
|             5|
+--------------+

+---------------+
|listed_in_nulls|
+---------------+
|              3|
+---------------+

+-----------------+
|desctiption_nulls|
+-----------------+
|                3|
+-----------------+



In [119]:
# remove rows for columns having <10 null values
# director, cast & country columns not as important to be NOT NULL/
#  can be dealt with later
netflix_clean_nulls = spark.sql("SELECT type, \
                  title, \
                  director, \
                  cast, \
                  country, \
                  release_year, \
                  rating, \
                  duration, \
                  listed_in, \
                  description \
                  FROM Netflix1 \
                  WHERE type IS NOT NULL AND \
                  title IS NOT NULL AND \
                  release_year IS NOT NULL AND \
                  rating IS NOT NULL AND \
                  duration IS NOT NULL AND \
                  listed_in IS NOT NULL AND \
                  description IS NOT NULL")
netflix_clean_nulls.createOrReplaceTempView("Netflix2")

In [120]:
disney_nulls = spark.sql("SELECT COUNT (*) AS type_nulls FROM Disney1 WHERE type IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS title_nulls FROM Disney1 WHERE title IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS director_nulls FROM Disney1 WHERE director IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS cast_nulls FROM Disney1 WHERE cast IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS country_nulls FROM Disney1 WHERE country IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS release_year_nulls FROM Disney1 WHERE release_year IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS rating_nulls FROM Disney1 WHERE rating IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS duration_nulls FROM Disney1 WHERE duration IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS listed_in_nulls FROM Disney1 WHERE listed_in IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS desctiption_nulls FROM Disney1 WHERE description IS NULL")
disney_nulls.show()

+----------+
|type_nulls|
+----------+
|         0|
+----------+

+-----------+
|title_nulls|
+-----------+
|          0|
+-----------+

+--------------+
|director_nulls|
+--------------+
|           473|
+--------------+

+----------+
|cast_nulls|
+----------+
|       190|
+----------+

+-------------+
|country_nulls|
+-------------+
|          218|
+-------------+

+------------------+
|release_year_nulls|
+------------------+
|                 0|
+------------------+

+------------+
|rating_nulls|
+------------+
|           3|
+------------+

+--------------+
|duration_nulls|
+--------------+
|             0|
+--------------+

+---------------+
|listed_in_nulls|
+---------------+
|              1|
+---------------+

+-----------------+
|desctiption_nulls|
+-----------------+
|                0|
+-----------------+



In [121]:
# remove rows for columns having <10 null values
# director, cast & country columns not as important to be NOT NULL/
#  can be dealt with later
disney_clean_nulls = spark.sql("SELECT type, \
                  title, \
                  director, \
                  cast, \
                  country, \
                  release_year, \
                  rating, \
                  duration, \
                  listed_in, \
                  description \
                  FROM Disney1 \
                  WHERE rating IS NOT NULL AND \
                  listed_in IS NOT NULL")
disney_clean_nulls.createOrReplaceTempView("Disney2")

NETFLIX table: Netflix2

DISNEY table: Disney2

In [122]:
# table counts
spark.sql("SELECT COUNT (*) FROM Netflix2").show()
spark.sql("SELECT COUNT (*) FROM Disney2").show()

+--------+
|count(1)|
+--------+
|    8799|
+--------+

+--------+
|count(1)|
+--------+
|    1446|
+--------+



# Data Exploration and Analysis

In [123]:
spark.sql("SELECT * FROM Disney2").show()

+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|   type|               title|            director|                cast|             country|release_year|rating|  duration|           listed_in|         description|
+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|  Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|                null|        2016|  TV-G|    23 min|[Animation,  Family]|Join Mickey and t...|
|  Movie|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|                null|        1988|    PG|    91 min|            [Comedy]|Santa Claus passe...|
|  Movie|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|     [United States]|        2011|  TV-G|    23 min|[Animation,  Come...|Sid the Sloth is ...

In [124]:
spark.sql("SELECT COUNT (*) AS Number_of_PG_Movies_On_Netflix FROM Netflix2 WHERE rating == 'PG' AND type == 'Movie'").show()
spark.sql("SELECT COUNT (*) AS Number_of_PG_Movies_On_Disney FROM Disney2 WHERE rating == 'PG'AND type == 'Movie'").show()

+------------------------------+
|Number_of_PG_Movies_On_Netflix|
+------------------------------+
|                           286|
+------------------------------+

+-----------------------------+
|Number_of_PG_Movies_On_Disney|
+-----------------------------+
|                          234|
+-----------------------------+



In [125]:
mpaaNetflix = spark.sql("SELECT rating, COUNT (*) AS Number_of_Movies_On_Netflix FROM Netflix2 WHERE type == 'Movie' GROUP BY rating ORDER BY Number_of_Movies_On_Netflix DESC")

mpaaNetflix.show()

+-----------------+---------------------------+
|           rating|Number_of_Movies_On_Netflix|
+-----------------+---------------------------+
|            TV-MA|                       2052|
|            TV-14|                       1426|
|                R|                        794|
|            TV-PG|                        539|
|            PG-13|                        489|
|               PG|                        286|
|            TV-Y7|                        139|
|             TV-Y|                        131|
|             TV-G|                        126|
|               NR|                         75|
|                G|                         41|
|         TV-Y7-FV|                          5|
|               UR|                          3|
|            NC-17|                          3|
|             2021|                          2|
| November 1, 2020|                          1|
| Shavidee Trotter|                          1|
|    Maury Chaykin|                     

In [126]:
Disney_Movies = spark.sql("SELECT title, duration FROM Disney2 WHERE type == 'Movie'")

Disney_Movies.show()

+--------------------+--------+
|               title|duration|
+--------------------+--------+
|Duck the Halls: A...|  23 min|
|Ernest Saves Chri...|  91 min|
|Ice Age: A Mammot...|  23 min|
|The Queen Family ...|  41 min|
|   Becoming Cousteau|  94 min|
|A Muppets Christm...|  45 min|
|Adventure Thru th...|  59 min|
|  Puppy for Hanukkah|   4 min|
|     The Pixar Story|  91 min|
|America the Beaut...|   2 min|
|             Baymax!|   1 min|
|        Ciao Alberto|   8 min|
|           Enchanted| 110 min|
|               Feast|   8 min|
|        Frozen Fever|  11 min|
|        Get a Horse!|   7 min|
|Home Sweet Home A...|  95 min|
|       Jungle Cruise| 129 min|
|Limitless with Ch...|   2 min|
|Marvel Studios’ 2...|  14 min|
+--------------------+--------+
only showing top 20 rows



In [127]:
from pyspark.sql.functions import col, concat_ws

In [128]:
disneyEDA = disney_clean_nulls

In [140]:
disneyEDA = disneyEDA.withColumn("director", concat_ws(",", col("director")))
disneyEDA = disneyEDA.withColumn("cast", concat_ws(",", col("cast")))
disneyEDA = disneyEDA.withColumn("country", concat_ws(",", col("country")))
disneyEDA = disneyEDA.withColumn("listed_in", concat_ws(",", col("listed_in")))
disneyEDA.printSchema()

disneyEDA.createOrReplaceTempView("DisneyEDA")

root
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = false)
 |-- cast: string (nullable = false)
 |-- country: string (nullable = false)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = false)
 |-- description: string (nullable = true)



In [141]:
spark.sql("SELECT * FROM DisneyEDA").show()

+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|   type|               title|            director|                cast|             country|release_year|rating|  duration|           listed_in|         description|
+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|  Movie|Duck the Halls: A...|Alonso Ramirez Ra...|Chris Diamantopou...|                    |        2016|  TV-G|    23 min|  Animation,  Family|Join Mickey and t...|
|  Movie|Ernest Saves Chri...|         John Cherry|Jim Varney,  Noel...|                    |        1988|    PG|    91 min|              Comedy|Santa Claus passe...|
|  Movie|Ice Age: A Mammot...|        Karen Disher|Raymond Albert Ro...|       United States|        2011|  TV-G|    23 min|Animation,  Comed...|Sid the Sloth is ...

In [142]:
disneyEDA.where(col("director") == "John Cherry").show()

+-----+--------------------+-----------+--------------------+-------+------------+------+--------+---------+--------------------+
| type|               title|   director|                cast|country|release_year|rating|duration|listed_in|         description|
+-----+--------------------+-----------+--------------------+-------+------------+------+--------+---------+--------------------+
|Movie|Ernest Saves Chri...|John Cherry|Jim Varney,  Noel...|       |        1988|    PG|  91 min|   Comedy|Santa Claus passe...|
+-----+--------------------+-----------+--------------------+-------+------------+------+--------+---------+--------------------+



In [143]:
mpaaDisney = spark.sql("SELECT rating, COUNT (*) AS Number_of_Movies_On_Disney FROM Disney2 WHERE type == 'Movie' GROUP BY rating ORDER BY Number_of_Movies_On_Disney DESC")

mpaaDisney.show()

+-----------------+--------------------------+
|           rating|Number_of_Movies_On_Disney|
+-----------------+--------------------------+
|                G|                       253|
|               PG|                       234|
|             TV-G|                       233|
|            TV-PG|                       181|
|            PG-13|                        66|
|            TV-14|                        37|
|            TV-Y7|                        36|
|         TV-Y7-FV|                         6|
|             TV-Y|                         3|
|December 25, 2020|                         1|
+-----------------+--------------------------+



In [144]:
disneyEDA.where(col("listed_in") == "Comedy").show()

+-------+--------------------+------------+--------------------+--------------------+------------+------+---------+---------+--------------------+
|   type|               title|    director|                cast|             country|release_year|rating| duration|listed_in|         description|
+-------+--------------------+------------+--------------------+--------------------+------------+------+---------+---------+--------------------+
|  Movie|Ernest Saves Chri...| John Cherry|Jim Varney,  Noel...|                    |        1988|    PG|   91 min|   Comedy|Santa Claus passe...|
|  Movie|Just Roll With It...|            |Tobie Windham,  S...|                    |        2019|     G|   52 min|   Comedy|The Bennett-Blatt...|
|  Movie|Far Away From Rav...|            |Raven-Symoné,  Is...|                    |        2021|  TV-G|   11 min|   Comedy|Our gang is off f...|
|TV Show|  Wander Over Yonder|            |Jack McBrayer,  A...|United States,  C...|        2013| TV-Y7|2 Seasons|   

In [133]:
spark.sql("SELECT listed_in, COUNT (*) AS Numbers_per_Genre FROM DisneyEDA WHERE type == 'Movie' GROUP BY rating ORDER BY Number_of_Movies_On_Disney DESC").show()

AnalysisException: expression 'disneyeda.listed_in' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
'Sort ['Number_of_Movies_On_Disney DESC NULLS LAST], true
+- Aggregate [rating#3313], [listed_in#3942, count(1) AS Numbers_per_Genre#4083L]
   +- Filter (type#3306 = Movie)
      +- SubqueryAlias disneyeda
         +- View (`DisneyEDA`, [type#3306,title#3307,director#3909,cast#3920,country#3931,release_year#3312,rating#3313,duration#3314,listed_in#3942,description#3316])
            +- Project [type#3306, title#3307, director#3909, cast#3920, country#3931, release_year#3312, rating#3313, duration#3314, concat_ws(, , listed_in#3468) AS listed_in#3942, description#3316]
               +- Project [type#3306, title#3307, director#3909, cast#3920, concat_ws(, , country#3467) AS country#3931, release_year#3312, rating#3313, duration#3314, listed_in#3468, description#3316]
                  +- Project [type#3306, title#3307, director#3909, concat_ws(, , cast#3466) AS cast#3920, country#3467, release_year#3312, rating#3313, duration#3314, listed_in#3468, description#3316]
                     +- Project [type#3306, title#3307, concat_ws(, , director#3465) AS director#3909, cast#3466, country#3467, release_year#3312, rating#3313, duration#3314, listed_in#3468, description#3316]
                        +- Project [type#3306, title#3307, director#3465, cast#3466, country#3467, release_year#3312, rating#3313, duration#3314, listed_in#3468, description#3316]
                           +- Filter (isnotnull(rating#3313) AND isnotnull(listed_in#3468))
                              +- SubqueryAlias disney1
                                 +- View (`Disney1`, [type#3306,title#3307,director#3465,cast#3466,country#3467,release_year#3312,rating#3313,duration#3314,listed_in#3468,description#3316])
                                    +- Project [type#3306, title#3307, split(director#3308, ,, -1) AS director#3465, split(cast#3309, ,, -1) AS cast#3466, split(country#3310, ,, -1) AS country#3467, release_year#3312, rating#3313, duration#3314, split(listed_in#3315, ,, -1) AS listed_in#3468, description#3316]
                                       +- SubqueryAlias disney
                                          +- View (`Disney`, [show_id#3305,type#3306,title#3307,director#3308,cast#3309,country#3310,date_added#3311,release_year#3312,rating#3313,duration#3314,listed_in#3315,description#3316])
                                             +- Relation [show_id#3305,type#3306,title#3307,director#3308,cast#3309,country#3310,date_added#3311,release_year#3312,rating#3313,duration#3314,listed_in#3315,description#3316] csv


In [None]:
#Future iterations will have this query working so that we can search the 'listed_in' array for the total count of different types of genres per platform. 

#Cat_Disney = spark.sql("SELECT ARRAY_TO_STRING(listed_in, ', ') AS String_Categories FROM Disney2")

AnalysisException: Undefined function: 'ARRAY_TO_STRING'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7

# Machine learning method selection, model training, and model evaluation

In [None]:
# Importing necessary modules

from pyspark.ml.feature import (CountVectorizer, RegexTokenizer, StopWordsRemover, IDF, StringIndexer)

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

Disney Data

In [None]:
Disney3 = disney_clean_nulls.select("title", "director", "cast", "rating", "listed_in", "description")
Disney3.show(3)

+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|               title|            director|                cast|rating|           listed_in|         description|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|
|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|
|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+
only showing top 3 rows



In [None]:
Disney3.select("rating").distinct().show()

+-----------------+
|           rating|
+-----------------+
|             TV-Y|
|December 25, 2020|
|               PG|
|    United States|
|         TV-Y7-FV|
|            TV-PG|
|                G|
|            TV-14|
|             TV-G|
|            TV-Y7|
|            PG-13|
+-----------------+



In [None]:
Disney3.select("listed_in").distinct().show()

+--------------------+
|           listed_in|
+--------------------+
|[Musical,  Romanc...|
|[Family,  Fantasy...|
|[Biographical,  D...|
|   [Drama,  Fantasy]|
|[Coming of Age,  ...|
|[Comedy,  Coming ...|
|[Drama,  Music,  ...|
|[Comedy,  Family,...|
|[Action-Adventure...|
|[Documentary,  Fa...|
|[Animation,  Fami...|
|[Animation,  Family]|
|[Docuseries,  Fam...|
|[Animation,  Fami...|
|[Biographical,  H...|
|[Docuseries,  Sci...|
|[Animation,  Kids...|
|[Animation,  Kids...|
|[Action-Adventure...|
|              [2016]|
+--------------------+
only showing top 20 rows



Natural Language Processing

In [None]:
from pyspark.sql.functions import length

Disney4 = Disney3.withColumn('title_length', length(Disney3['title']))
Disney4.show(3)

+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
|               title|            director|                cast|rating|           listed_in|         description|title_length|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|
|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          22|
|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|          28|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
only showing top 3 rows



In [None]:
# Tokenize words in title column
tokenizer1 = RegexTokenizer(inputCol= 'title', outputCol='Disney_words', pattern='\\W')
tokenized = tokenizer1.transform(Disney4)
tokenized.show(3)

# Tokenize words in description column
tokenizer2 = RegexTokenizer(inputCol= 'description', outputCol='Disney_tokens', pattern='\\W')
tokenized2 = tokenizer2.transform(tokenized)
tokenized2.show(3, truncate=False)

+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+--------------------+
|               title|            director|                cast|rating|           listed_in|         description|title_length|        Disney_words|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+--------------------+
|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|[duck, the, halls...|
|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          22|[ernest, saves, c...|
|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|          28|[ice, age, a, mam...|
+--------------------+--------------------+--------------------+------+--------------------+--------------------

In [None]:
# Remove stopwords in tokenized Disney_words column
Disney_remover = StopWordsRemover(inputCol='Disney_words', outputCol='filtered_title')
removed_Dis = Disney_remover.transform(tokenized2)

# Processing title column
Disney_cv = CountVectorizer(inputCol='filtered_title', outputCol='vec_title')
Disney_idf = IDF(inputCol='vec_title', outputCol='Disney_tfidf')
Disney_numeric = StringIndexer(inputCol='rating', outputCol='label')
Disney_Assembler = VectorAssembler(inputCols=['Disney_tfidf','title_length'], outputCol='features')

In [None]:
from pyspark.ml import Pipeline

# Creaing data pipeline
Disney_pipeline = Pipeline(stages=[Disney_numeric, tokenizer1, Disney_remover, Disney_cv, Disney_idf, Disney_Assembler])
                        #tokenizer2, remover2, Disney_cv2, Disney_idf2, Disney_Assembler2])

In [None]:
Disney_fit = Disney_pipeline.fit(Disney4)
Disney_clean = Disney_fit.transform(Disney4)
Disney_cleaned = Disney_clean.select('label', 'features')

In [None]:
Disney4.printSchema()

root
 |-- title: string (nullable = true)
 |-- director: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cast: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: string (nullable = true)
 |-- listed_in: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- title_length: integer (nullable = true)



In [None]:
Disney_cleaned.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(1857,[8,14,37,55...|
|  3.0|(1857,[14,1574,16...|
|  0.0|(1857,[14,59,96,1...|
|  1.0|(1857,[58,173,590...|
|  6.0|(1857,[370,1579,1...|
|  5.0|(1857,[1035,1856]...|
|  5.0|(1857,[243,607,13...|
|  1.0|(1857,[18,122,139...|
|  2.0|(1857,[14,41,112,...|
|  1.0|(1857,[0,72,98,50...|
|  0.0|(1857,[273,1434,1...|
|  2.0|(1857,[6,153,1856...|
|  1.0|(1857,[77,464,111...|
|  1.0|(1857,[30,670,185...|
|  0.0|(1857,[681,1856],...|
|  1.0|(1857,[764,1710,1...|
|  1.0|(1857,[0,255,534,...|
|  1.0|(1857,[0,1630,185...|
|  3.0|(1857,[385,1856],...|
|  2.0|(1857,[1217,1856]...|
+-----+--------------------+
only showing top 20 rows



Naive Bayes Classifier

In [None]:
# Building the model
NB = NaiveBayes()

In [None]:
# Splitting data into training and testing datasets 
Disney_train, Disney_test = Disney_cleaned.randomSplit([0.7, 0.3])

In [None]:
# Fitting the Naive Bayes model
Disney_model = NB.fit(Disney_train)
Disney_results = Disney_model.transform(Disney_test)
Disney_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(1857,[0,2,3,83,8...|[-229.95900817051...|[1.0,4.8196007405...|       0.0|
|  0.0|(1857,[0,5,7,92,2...|[-183.18002788181...|[0.99999221576372...|       0.0|
|  0.0|(1857,[0,19,20,93...|[-213.16206845596...|[1.25255600188760...|       4.0|
|  0.0|(1857,[0,21,57,53...|[-162.49305310703...|[0.94508801908285...|       0.0|
|  0.0|(1857,[0,68,127,6...|[-292.98431580888...|[0.78644590774862...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



Model Evaluation

In [None]:
# Evaluating NB model
accuracy_eval = MulticlassClassificationEvaluator()



In [None]:
accuracy = accuracy_eval.evaluate(Disney_results)
print(accuracy)

0.31365669539226243


Building model on rating as label and description column as feature

In [None]:
# Remove stopwords in tokenized Disney_tokens column
remover2 = StopWordsRemover(inputCol='Disney_tokens', outputCol='filtered_description')
removed_Disney = remover2.transform(removed_Dis)

# Processing description column
Disney_cv2 = CountVectorizer(inputCol='filtered_description', outputCol='vec_description')
Disney_idf2 = IDF(inputCol='vec_description', outputCol='Disney_tfidf2')
Disney_numeric2 = StringIndexer(inputCol='rating', outputCol='label')
Disney_Assembler2 = VectorAssembler(inputCols=['Disney_tfidf2','title_length'], outputCol='features')

In [None]:
# Creaing data pipeline
Disney_pipeline = Pipeline(stages=[Disney_numeric2, tokenizer2, remover2, Disney_cv2, Disney_idf2, Disney_Assembler2])                        

In [None]:
Disney_fit2 = Disney_pipeline.fit(Disney4)
Disney_clean2 = Disney_fit2.transform(Disney4)
Disney_cleaned2 = Disney_clean2.select('label', 'features')

In [None]:
# Splitting data into training and testing datasets 
Disney_train2, Disney_test2 = Disney_cleaned2.randomSplit([0.7, 0.3])

In [None]:
# Fitting the Naive Bayes model
Disney_model2 = NB.fit(Disney_train2)
Disney_results2 = Disney_model2.transform(Disney_test2)
Disney_results2.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(4373,[0,1,2,5,67...|[-242.71330949420...|[0.99641633732020...|       0.0|
|  0.0|(4373,[0,1,2,9,22...|[-232.57516558639...|[1.0,6.6110834025...|       0.0|
|  0.0|(4373,[0,1,2,12,1...|[-501.15865178004...|[2.06211318685915...|       2.0|
|  0.0|(4373,[0,1,2,15,4...|[-250.02229672676...|[0.99984179332652...|       0.0|
|  0.0|(4373,[0,1,2,15,5...|[-373.99031338740...|[0.11321624378265...|       4.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [None]:
# Evaluating NB model
accuracy2 = accuracy_eval.evaluate(Disney_results2)
print(accuracy2)

0.3184445591365627


Netflix Data

In [None]:
Netflix3 = netflix_clean_nulls.select("title", "director", "cast", "rating", "listed_in", "description")
Netflix3.show()

+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|               title|            director|                cast|rating|           listed_in|         description|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|Dick Johnson Is Dead|   [Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|
|       Blood & Water|                null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|
|           Ganglands|   [Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|
|Jailbirds New Orl...|                null|                null| TV-MA|[Docuseries,  Rea...|Feuds, flirtation...|
|        Kota Factory|                null|[Mayur More,  Jit...| TV-MA|[International TV...|In a city of coac...|
|       Midnight Mass|     [Mike Flanagan]|[Kate Siegel,  Za...| TV-MA|[TV Dramas,  TV H

In [None]:
Netflix3.select("rating").distinct().show()

+--------------------+
|              rating|
+--------------------+
|    November 1, 2020|
|    Shavidee Trotter|
|       Adriane Lenox|
|                TV-Y|
|       Maury Chaykin|
|                2019|
|                2017|
|                  UR|
| Keppy Ekpenyong ...|
|      Benn Northover|
|                  PG|
|         Jide Kosoko|
|               TV-MA|
|     Jowharah Jones"|
|            TV-Y7-FV|
|                2006|
|      Itziar Aizpuru|
|                  NR|
|               TV-PG|
|               NC-17|
+--------------------+
only showing top 20 rows



In [None]:
Netflix3.select("listed_in").distinct().show(50)

+--------------------+
|           listed_in|
+--------------------+
|[Kids' TV,  Korea...|
|[Comedies,  Drama...|
|[Kids' TV,  TV Th...|
|[International TV...|
|[TV Dramas,  TV S...|
|[Action & Adventu...|
|[Classic Movies, ...|
|[Classic Movies, ...|
|            [71 min]|
|[ Janeane Garofalo"]|
|[Crime TV Shows, ...|
|[Comedies,  Inter...|
|[Kids' TV,  TV Sc...|
|[Comedies,  Music...|
|     [ Margaret Cho]|
|[Action & Adventure]|
|[Action & Adventu...|
|[Comedies,  Roman...|
|[Action & Adventu...|
|[Independent Movi...|
|[Reality TV,  TV ...|
|[Anime Features, ...|
|[Action & Adventu...|
|     [Documentaries]|
|[British TV Shows...|
|          [TV Shows]|
|[Crime TV Shows, ...|
|[TV Dramas,  TV S...|
|[British TV Shows...|
|[Comedies,  Cult ...|
|[Dramas,  Interna...|
|[Kids' TV,  Reali...|
|          [Comedies]|
|[Comedies,  LGBTQ...|
|[Action & Adventu...|
|[International Mo...|
|[Documentaries,  ...|
|[Children & Famil...|
|[Dramas,  Faith &...|
|[TV Comedies,  TV...|
|[Internati

Natural Language Processing

In [None]:
Netflix4 = Netflix3.withColumn('title_length', length(Netflix3['title']))
Netflix4.show(3)

+--------------------+-----------------+--------------------+------+--------------------+--------------------+------------+
|               title|         director|                cast|rating|           listed_in|         description|title_length|
+--------------------+-----------------+--------------------+------+--------------------+--------------------+------------+
|Dick Johnson Is Dead|[Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|          20|
|       Blood & Water|             null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|          13|
|           Ganglands|[Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|           9|
+--------------------+-----------------+--------------------+------+--------------------+--------------------+------------+
only showing top 3 rows



In [None]:
# Tokenize words in title column
reg_tokenizer3 = RegexTokenizer(inputCol="title", outputCol="Net_words", pattern='\\W')
reg_tokenized3 = reg_tokenizer3.transform(Netflix3)

# Tokenize words in description column
reg_tokenizer4 = RegexTokenizer(inputCol="description", outputCol="Net_tokens", pattern='\\W')
reg_tokenized4 = reg_tokenizer4.transform(reg_tokenized3)

reg_tokenized4.show(3)

+--------------------+-----------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|               title|         director|                cast|rating|           listed_in|         description|           Net_words|          Net_tokens|
+--------------------+-----------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|Dick Johnson Is Dead|[Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|[dick, johnson, i...|[as, her, father,...|
|       Blood & Water|             null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|      [blood, water]|[after, crossing,...|
|           Ganglands|[Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|         [ganglands]|[to, protect, his...|
+--------------------+-----------------+--------------------+------+--------------

Running model on Netflix title column

In [None]:
# Remove stopwords in tokenized words column
Netflix_remover = StopWordsRemover(inputCol='Net_words', outputCol='filtered_title')
Netflix_removed = Netflix_remover.transform(reg_tokenized4)

# Processing title column
Netflix_cv = CountVectorizer(inputCol='filtered_title', outputCol='vec_title')
Netflix_idf = IDF(inputCol='vec_title', outputCol='Netflix_tfidf')
Netflix_numeric = StringIndexer(inputCol='rating', outputCol='label')
Netflix_Assembler = VectorAssembler(inputCols=['Netflix_tfidf','title_length'], outputCol='features')

In [None]:
# Creaing data pipeline
Netflix_pipeline = Pipeline(stages=[Netflix_numeric, reg_tokenizer3, Netflix_remover, Netflix_cv, Netflix_idf, Netflix_Assembler])   

In [None]:
Netflix_fit = Netflix_pipeline.fit(Netflix4)
Netflix_clean = Netflix_fit.transform(Netflix4)
Netflix_cleaned = Netflix_clean.select('label', 'features')

In [None]:
# Splitting data into training and testing datasets 
Netflix_train, Netflix_test = Netflix_cleaned.randomSplit([0.7, 0.3])

In [None]:
# Fitting the Naive Bayes model 3
Netflix_model = NB.fit(Netflix_train)
Netflix_results = Netflix_model.transform(Netflix_test)
Netflix_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(8786,[0,20,647,8...|[-144.45849150106...|[0.12605895213653...|       3.0|
|  0.0|(8786,[0,66,2204,...|[-177.13516515002...|[4.07007478021616...|       3.0|
|  0.0|(8786,[0,84,8785]...|[-80.544169003394...|[0.52318537125980...|       0.0|
|  0.0|(8786,[0,118,252,...|[-359.23519907900...|[3.23195326480116...|       2.0|
|  0.0|(8786,[0,133,6712...|[-191.56860061772...|[2.39103688144098...|       1.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



Model Evaluation

In [None]:
# Evaluating NB model 3
accuracy3 = accuracy_eval.evaluate(Netflix_results)
print(accuracy3)

0.26000183701534946


Running NB model on Netflix description column

In [None]:
# Remove stopwords in tokenized Net_tokens column
Netflix_remover2 = StopWordsRemover(inputCol='Net_tokens', outputCol='filtered_description')
removed_Netflix = Netflix_remover2.transform(Netflix_removed)

# Processing Netflix description column
Netflix_cv2 = CountVectorizer(inputCol='filtered_description', outputCol='vec_description')
Netflix_idf2 = IDF(inputCol='vec_description', outputCol='Netflix_tfidf2')
Netflix_numeric2 = StringIndexer(inputCol='rating', outputCol='label')
Netflix_Assembler2 = VectorAssembler(inputCols=['Netflix_tfidf2','title_length'], outputCol='features')

In [None]:
# Creaing data pipeline
Netflix_pipeline2 = Pipeline(stages=[Netflix_numeric2, reg_tokenizer4, Netflix_remover2, Netflix_cv2, Netflix_idf2, Netflix_Assembler2]) 

In [None]:
Netflix_fit2 = Netflix_pipeline2.fit(Netflix4)
Netflix_clean2 = Netflix_fit2.transform(Netflix4)
Netflix_cleaned2 = Netflix_clean2.select('label', 'features')

In [None]:
# Splitting data into training and testing datasets 
Netflix_train2, Netflix_test2 = Netflix_cleaned2.randomSplit([0.7, 0.3])

In [None]:
# Fitting the Naive Bayes model 4
Netflix_model2 = NB.fit(Netflix_train2)
Netflix_results2 = Netflix_model2.transform(Netflix_test2)
Netflix_results2.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(18945,[0,1,2,31,...|[-766.77022193784...|[7.50329261185532...|       7.0|
|  0.0|(18945,[0,1,3,5,5...|[-475.13439675149...|[0.98916199485655...|       0.0|
|  0.0|(18945,[0,1,5,14,...|[-822.10202701101...|[4.46218591872811...|       1.0|
|  0.0|(18945,[0,1,6,7,2...|[-685.37529936071...|[0.99854775688450...|       0.0|
|  0.0|(18945,[0,1,7,71,...|[-922.98813040495...|[6.52090137644311...|       3.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



Model Evaluation

In [None]:
# Evaluating NB model 4
accuracy4 = accuracy_eval.evaluate(Netflix_results2)
print(accuracy4)

0.33649694273013475
