In [1]:
# Importing the necessary modules
import findspark
findspark.init()

# Initialize a SparkSession
from pyspark.sql import SparkSession

# Creating SparkSession
spark = SparkSession.builder.appName('TP').getOrCreate()

# Calling the session variable object
spark

Loading data sets

In [2]:
netflix_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/netflix_titles.csv"
disney_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/disney_plus_titles.csv"

from pyspark import SparkFiles
spark.sparkContext.addFile(netflix_url)
spark.sparkContext.addFile(disney_url)

netflix_df = spark.read.csv("file:///"+SparkFiles.get("netflix_titles.csv"), header=True, inferSchema= True)
disney_df = spark.read.csv("file:///"+SparkFiles.get("disney_plus_titles.csv"), header=True, inferSchema= True)

In [3]:
netflix_df.printSchema()
disney_df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [4]:
netflix_df.show(3)
disney_df.show(3)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         null|Septem

In [5]:
netflix_df.createOrReplaceTempView("Netflix")
disney_df.createOrReplaceTempView("Disney")

# Data Cleaning

In [6]:
# remove (do not select) show_id and date_added columns
# convert columns with multiple values to arrays
netflix_clean = spark.sql("SELECT type, \
                  title, \
                  SPLIT(director, ',') AS director, \
                  SPLIT(cast, ',') AS cast, \
                  SPLIT(country, ',') AS country, \
                  release_year, \
                  rating, \
                  duration, \
                  SPLIT(listed_in, ',') AS listed_in, \
                  description \
                  FROM Netflix")
netflix_clean.createOrReplaceTempView("Netflix1")

disney_clean = spark.sql("SELECT type, \
                  title, \
                  SPLIT(director, ',') AS director, \
                  SPLIT(cast, ',') AS cast, \
                  SPLIT(country, ',') AS country, \
                  release_year, \
                  rating, \
                  duration, \
                  SPLIT(listed_in, ',') AS listed_in, \
                  description \
                  FROM Disney")
disney_clean.createOrReplaceTempView("Disney1")

In [7]:
# test duration and listed_in columns to see if they vary between platforms
overlap = spark.sql("SELECT Netflix1.title, Netflix1.duration, Disney1.duration,\
                     Netflix1.listed_in, Disney1.listed_in FROM Netflix1 \
                     INNER JOIN Disney1 ON Netflix1.title = Disney1.title")
overlap.show(5, truncate=False)
# there is variation - worth keeping each dataset separate

+----------------+---------+---------+--------------------------------------------------------+-----------------------------------------------------+
|title           |duration |duration |listed_in                                               |listed_in                                            |
+----------------+---------+---------+--------------------------------------------------------+-----------------------------------------------------+
|PJ Masks        |3 Seasons|5 Seasons|[Kids' TV]                                              |[Action-Adventure,  Animation,  Kids]                |
|Once Upon a Time|1 Season |7 Seasons|[International TV Shows,  Romantic TV Shows,  TV Dramas]|[Action-Adventure,  Fantasy,  Soap Opera / Melodrama]|
|Gigantosaurus   |1 Season |2 Seasons|[Kids' TV]                                              |[Action-Adventure,  Animation,  Kids]                |
|Becoming        |89 min   |1 Season |[Documentaries]                                         |[Anth

In [8]:
netflix_nulls = spark.sql("SELECT COUNT (*) AS type_nulls FROM Netflix1 WHERE type IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS title_nulls FROM Netflix1 WHERE title IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS director_nulls FROM Netflix1 WHERE director IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS cast_nulls FROM Netflix1 WHERE cast IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS country_nulls FROM Netflix1 WHERE country IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS release_year_nulls FROM Netflix1 WHERE release_year IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS rating_nulls FROM Netflix1 WHERE rating IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS duration_nulls FROM Netflix1 WHERE duration IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS listed_in_nulls FROM Netflix1 WHERE listed_in IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS desctiption_nulls FROM Netflix1 WHERE description IS NULL")
netflix_nulls.show()

+----------+
|type_nulls|
+----------+
|         1|
+----------+

+-----------+
|title_nulls|
+-----------+
|          2|
+-----------+

+--------------+
|director_nulls|
+--------------+
|          2636|
+--------------+

+----------+
|cast_nulls|
+----------+
|       826|
+----------+

+-------------+
|country_nulls|
+-------------+
|          832|
+-------------+

+------------------+
|release_year_nulls|
+------------------+
|                 2|
+------------------+

+------------+
|rating_nulls|
+------------+
|           6|
+------------+

+--------------+
|duration_nulls|
+--------------+
|             5|
+--------------+

+---------------+
|listed_in_nulls|
+---------------+
|              3|
+---------------+

+-----------------+
|desctiption_nulls|
+-----------------+
|                3|
+-----------------+



In [9]:
# remove rows for columns having <10 null values
# director, cast & country columns not as important to be NOT NULL/
#  can be dealt with later
netflix_clean_nulls = spark.sql("SELECT type, \
                  title, \
                  director, \
                  cast, \
                  country, \
                  release_year, \
                  rating, \
                  duration, \
                  listed_in, \
                  description \
                  FROM Netflix1 \
                  WHERE type IS NOT NULL AND \
                  title IS NOT NULL AND \
                  release_year IS NOT NULL AND \
                  rating IS NOT NULL AND \
                  duration IS NOT NULL AND \
                  listed_in IS NOT NULL AND \
                  description IS NOT NULL")
netflix_clean_nulls.createOrReplaceTempView("Netflix2")

In [10]:
disney_nulls = spark.sql("SELECT COUNT (*) AS type_nulls FROM Disney1 WHERE type IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS title_nulls FROM Disney1 WHERE title IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS director_nulls FROM Disney1 WHERE director IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS cast_nulls FROM Disney1 WHERE cast IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS country_nulls FROM Disney1 WHERE country IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS release_year_nulls FROM Disney1 WHERE release_year IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS rating_nulls FROM Disney1 WHERE rating IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS duration_nulls FROM Disney1 WHERE duration IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS listed_in_nulls FROM Disney1 WHERE listed_in IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS desctiption_nulls FROM Disney1 WHERE description IS NULL")
disney_nulls.show()

+----------+
|type_nulls|
+----------+
|         0|
+----------+

+-----------+
|title_nulls|
+-----------+
|          0|
+-----------+

+--------------+
|director_nulls|
+--------------+
|           473|
+--------------+

+----------+
|cast_nulls|
+----------+
|       190|
+----------+

+-------------+
|country_nulls|
+-------------+
|          218|
+-------------+

+------------------+
|release_year_nulls|
+------------------+
|                 0|
+------------------+

+------------+
|rating_nulls|
+------------+
|           3|
+------------+

+--------------+
|duration_nulls|
+--------------+
|             0|
+--------------+

+---------------+
|listed_in_nulls|
+---------------+
|              1|
+---------------+

+-----------------+
|desctiption_nulls|
+-----------------+
|                0|
+-----------------+



In [11]:
# remove rows for columns having <10 null values
# director, cast & country columns not as important to be NOT NULL/
#  can be dealt with later
disney_clean_nulls = spark.sql("SELECT type, \
                  title, \
                  director, \
                  cast, \
                  country, \
                  release_year, \
                  rating, \
                  duration, \
                  listed_in, \
                  description \
                  FROM Disney1 \
                  WHERE rating IS NOT NULL AND \
                  listed_in IS NOT NULL")
disney_clean_nulls.createOrReplaceTempView("Disney2")

NETFLIX table: Netflix2

DISNEY table: Disney2

In [12]:
# table counts
spark.sql("SELECT COUNT (*) FROM Netflix2").show()
spark.sql("SELECT COUNT (*) FROM Disney2").show()

+--------+
|count(1)|
+--------+
|    8799|
+--------+

+--------+
|count(1)|
+--------+
|    1446|
+--------+



# Data Exploration and Analysis

In [13]:
spark.sql("SELECT * FROM Disney2").show()

+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|   type|               title|            director|                cast|             country|release_year|rating|  duration|           listed_in|         description|
+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|  Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|                null|        2016|  TV-G|    23 min|[Animation,  Family]|Join Mickey and t...|
|  Movie|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|                null|        1988|    PG|    91 min|            [Comedy]|Santa Claus passe...|
|  Movie|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|     [United States]|        2011|  TV-G|    23 min|[Animation,  Come...|Sid the Sloth is ...

In [14]:
spark.sql("SELECT COUNT (*) AS Number_of_PG_Movies_On_Netflix FROM Netflix2 WHERE rating == 'PG' AND type == 'Movie'").show()
spark.sql("SELECT COUNT (*) AS Number_of_PG_Movies_On_Disney FROM Disney2 WHERE rating == 'PG'AND type == 'Movie'").show()

+------------------------------+
|Number_of_PG_Movies_On_Netflix|
+------------------------------+
|                           286|
+------------------------------+

+-----------------------------+
|Number_of_PG_Movies_On_Disney|
+-----------------------------+
|                          234|
+-----------------------------+



In [15]:
mpaaNetflix = spark.sql("SELECT rating, COUNT (*) AS Number_of_Movies_On_Netflix FROM Netflix2 WHERE type == 'Movie' GROUP BY rating ORDER BY Number_of_Movies_On_Netflix DESC")

mpaaNetflix.show()

+-----------------+---------------------------+
|           rating|Number_of_Movies_On_Netflix|
+-----------------+---------------------------+
|            TV-MA|                       2052|
|            TV-14|                       1426|
|                R|                        794|
|            TV-PG|                        539|
|            PG-13|                        489|
|               PG|                        286|
|            TV-Y7|                        139|
|             TV-Y|                        131|
|             TV-G|                        126|
|               NR|                         75|
|                G|                         41|
|         TV-Y7-FV|                          5|
|               UR|                          3|
|            NC-17|                          3|
|             2021|                          2|
| November 1, 2020|                          1|
| Shavidee Trotter|                          1|
|    Maury Chaykin|                     

In [16]:
Disney_Movies = spark.sql("SELECT title, duration FROM Disney2 WHERE type == 'Movie'")

Disney_Movies.show()

+--------------------+--------+
|               title|duration|
+--------------------+--------+
|Duck the Halls: A...|  23 min|
|Ernest Saves Chri...|  91 min|
|Ice Age: A Mammot...|  23 min|
|The Queen Family ...|  41 min|
|   Becoming Cousteau|  94 min|
|A Muppets Christm...|  45 min|
|Adventure Thru th...|  59 min|
|  Puppy for Hanukkah|   4 min|
|     The Pixar Story|  91 min|
|America the Beaut...|   2 min|
|             Baymax!|   1 min|
|        Ciao Alberto|   8 min|
|           Enchanted| 110 min|
|               Feast|   8 min|
|        Frozen Fever|  11 min|
|        Get a Horse!|   7 min|
|Home Sweet Home A...|  95 min|
|       Jungle Cruise| 129 min|
|Limitless with Ch...|   2 min|
|Marvel Studios’ 2...|  14 min|
+--------------------+--------+
only showing top 20 rows



In [17]:
from pyspark.sql.functions import col, concat_ws

In [18]:
disneyEDA = disney_clean_nulls

In [19]:
disneyEDA = disneyEDA.withColumn("director", concat_ws(",", col("director")))
disneyEDA = disneyEDA.withColumn("cast", concat_ws(",", col("cast")))
disneyEDA = disneyEDA.withColumn("country", concat_ws(",", col("country")))
disneyEDA = disneyEDA.withColumn("listed_in", concat_ws(",", col("listed_in")))
disneyEDA.printSchema()

disneyEDA.createOrReplaceTempView("DisneyEDA")

root
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = false)
 |-- cast: string (nullable = false)
 |-- country: string (nullable = false)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = false)
 |-- description: string (nullable = true)



In [20]:
spark.sql("SELECT * FROM DisneyEDA").show()

+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|   type|               title|            director|                cast|             country|release_year|rating|  duration|           listed_in|         description|
+-------+--------------------+--------------------+--------------------+--------------------+------------+------+----------+--------------------+--------------------+
|  Movie|Duck the Halls: A...|Alonso Ramirez Ra...|Chris Diamantopou...|                    |        2016|  TV-G|    23 min|   Animation, Family|Join Mickey and t...|
|  Movie|Ernest Saves Chri...|         John Cherry|Jim Varney, Noell...|                    |        1988|    PG|    91 min|              Comedy|Santa Claus passe...|
|  Movie|Ice Age: A Mammot...|        Karen Disher|Raymond Albert Ro...|       United States|        2011|  TV-G|    23 min|Animation, Comedy...|Sid the Sloth is ...

In [21]:
disneyEDA.where(col("director") == "John Cherry").show()

+-----+--------------------+-----------+--------------------+-------+------------+------+--------+---------+--------------------+
| type|               title|   director|                cast|country|release_year|rating|duration|listed_in|         description|
+-----+--------------------+-----------+--------------------+-------+------------+------+--------+---------+--------------------+
|Movie|Ernest Saves Chri...|John Cherry|Jim Varney, Noell...|       |        1988|    PG|  91 min|   Comedy|Santa Claus passe...|
+-----+--------------------+-----------+--------------------+-------+------------+------+--------+---------+--------------------+



In [22]:
mpaaDisney = spark.sql("SELECT rating, COUNT (*) AS Number_of_Movies_On_Disney FROM Disney2 WHERE type == 'Movie' GROUP BY rating ORDER BY Number_of_Movies_On_Disney DESC")

mpaaDisney.show()

+-----------------+--------------------------+
|           rating|Number_of_Movies_On_Disney|
+-----------------+--------------------------+
|                G|                       253|
|               PG|                       234|
|             TV-G|                       233|
|            TV-PG|                       181|
|            PG-13|                        66|
|            TV-14|                        37|
|            TV-Y7|                        36|
|         TV-Y7-FV|                         6|
|             TV-Y|                         3|
|December 25, 2020|                         1|
+-----------------+--------------------------+



In [23]:
disneyEDA.where(col("listed_in") == "Comedy").show()

+-------+--------------------+------------+--------------------+--------------------+------------+------+---------+---------+--------------------+
|   type|               title|    director|                cast|             country|release_year|rating| duration|listed_in|         description|
+-------+--------------------+------------+--------------------+--------------------+------------+------+---------+---------+--------------------+
|  Movie|Ernest Saves Chri...| John Cherry|Jim Varney, Noell...|                    |        1988|    PG|   91 min|   Comedy|Santa Claus passe...|
|  Movie|Just Roll With It...|            |Tobie Windham, Su...|                    |        2019|     G|   52 min|   Comedy|The Bennett-Blatt...|
|  Movie|Far Away From Rav...|            |Raven-Symoné, Iss...|                    |        2021|  TV-G|   11 min|   Comedy|Our gang is off f...|
|TV Show|  Wander Over Yonder|            |Jack McBrayer, Ap...|United States, Ca...|        2013| TV-Y7|2 Seasons|   

In [24]:
spark.sql("SELECT listed_in, COUNT (*) AS Numbers_per_Genre FROM DisneyEDA WHERE type == 'Movie' GROUP BY rating ORDER BY Number_of_Movies_On_Disney DESC").show()

AnalysisException: expression 'disneyeda.listed_in' is neither present in the group by, nor is it an aggregate function. Add to group by or wrap in first() (or first_value) if you don't care which value you get.;
'Sort ['Number_of_Movies_On_Disney DESC NULLS LAST], true
+- Aggregate [rating#64], [listed_in#693, count(1) AS Numbers_per_Genre#885L]
   +- Filter (type#57 = Movie)
      +- SubqueryAlias disneyeda
         +- View (`DisneyEDA`, [type#57,title#58,director#660,cast#671,country#682,release_year#63,rating#64,duration#65,listed_in#693,description#67])
            +- Project [type#57, title#58, director#660, cast#671, country#682, release_year#63, rating#64, duration#65, concat_ws(,, listed_in#219) AS listed_in#693, description#67]
               +- Project [type#57, title#58, director#660, cast#671, concat_ws(,, country#218) AS country#682, release_year#63, rating#64, duration#65, listed_in#219, description#67]
                  +- Project [type#57, title#58, director#660, concat_ws(,, cast#217) AS cast#671, country#218, release_year#63, rating#64, duration#65, listed_in#219, description#67]
                     +- Project [type#57, title#58, concat_ws(,, director#216) AS director#660, cast#217, country#218, release_year#63, rating#64, duration#65, listed_in#219, description#67]
                        +- Project [type#57, title#58, director#216, cast#217, country#218, release_year#63, rating#64, duration#65, listed_in#219, description#67]
                           +- Filter (isnotnull(rating#64) AND isnotnull(listed_in#219))
                              +- SubqueryAlias disney1
                                 +- View (`Disney1`, [type#57,title#58,director#216,cast#217,country#218,release_year#63,rating#64,duration#65,listed_in#219,description#67])
                                    +- Project [type#57, title#58, split(director#59, ,, -1) AS director#216, split(cast#60, ,, -1) AS cast#217, split(country#61, ,, -1) AS country#218, release_year#63, rating#64, duration#65, split(listed_in#66, ,, -1) AS listed_in#219, description#67]
                                       +- SubqueryAlias disney
                                          +- View (`Disney`, [show_id#56,type#57,title#58,director#59,cast#60,country#61,date_added#62,release_year#63,rating#64,duration#65,listed_in#66,description#67])
                                             +- Relation [show_id#56,type#57,title#58,director#59,cast#60,country#61,date_added#62,release_year#63,rating#64,duration#65,listed_in#66,description#67] csv


In [None]:
#Future iterations will have this query working so that we can search the 'listed_in' array for the total count of different types of genres per platform. 

#Cat_Disney = spark.sql("SELECT ARRAY_TO_STRING(listed_in, ', ') AS String_Categories FROM Disney2")

AnalysisException: Undefined function: 'ARRAY_TO_STRING'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 7

# Machine learning method selection, model training, and model evaluation

In [25]:
# Importing necessary modules

from pyspark.ml.feature import (CountVectorizer, RegexTokenizer, StopWordsRemover, IDF, StringIndexer)

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col

Disney Data

In [26]:
# Selecting a subset of the Disney dataset 
Disney3 = disney_clean_nulls.select("type", "title", "director", "cast", "rating", "listed_in", "description")
Disney3.show()

+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|   type|               title|            director|                cast|rating|           listed_in|         description|
+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|  Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|
|  Movie|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|
|  Movie|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|
|  Movie|The Queen Family ...|   [Hamish Hamilton]|[Darren Criss,  A...| TV-PG|           [Musical]|This is real life...|
|  Movie|   Becoming Cousteau|        [Liz Garbus]|[Jacques Yves Cou...| PG-13|[Biographical,  D...|An inside look at...|
|TV Show|             Ha

In [27]:
# Checking distinct elements from the type feature and potentially use it as the label for NaiveBayes model
Disney3.select("type").distinct().show()

+-------+
|   type|
+-------+
|TV Show|
|  Movie|
+-------+



In [28]:
# Checking distinct elements from the rating feature and potentially use it as the label for NaiveBayes model
Disney3.select("rating").distinct().show()

+-----------------+
|           rating|
+-----------------+
|             TV-Y|
|December 25, 2020|
|               PG|
|    United States|
|         TV-Y7-FV|
|            TV-PG|
|                G|
|            TV-14|
|             TV-G|
|            TV-Y7|
|            PG-13|
+-----------------+



In [29]:
# Checking distinct elements from the genre feature and potentially use it as the label for NaiveBayes model
Disney3.select("listed_in").distinct().show()

+--------------------+
|           listed_in|
+--------------------+
|[Musical,  Romanc...|
|[Family,  Fantasy...|
|[Biographical,  D...|
|   [Drama,  Fantasy]|
|[Coming of Age,  ...|
|[Comedy,  Coming ...|
|[Drama,  Music,  ...|
|[Comedy,  Family,...|
|[Action-Adventure...|
|[Documentary,  Fa...|
|[Animation,  Fami...|
|[Animation,  Family]|
|[Docuseries,  Fam...|
|[Animation,  Fami...|
|[Biographical,  H...|
|[Docuseries,  Sci...|
|[Animation,  Kids...|
|[Animation,  Kids...|
|[Action-Adventure...|
|              [2016]|
+--------------------+
only showing top 20 rows



Natural Language Processing

In [30]:
from pyspark.sql.functions import length

# Using title length for transforming features later on
Disney4 = Disney3.withColumn('title_length', length(Disney3['title']))
Disney4.show(3)

+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
| type|               title|            director|                cast|rating|           listed_in|         description|title_length|
+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
|Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|
|Movie|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          22|
|Movie|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|          28|
+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
only showing top 3 rows



In [31]:
# Using description length for transforming features later on
Disney5 = Disney4.withColumn('description_length', length(Disney4['description']))
Disney5.show(3)

+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+
| type|               title|            director|                cast|rating|           listed_in|         description|title_length|description_length|
+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+
|Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|                48|
|Movie|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          22|                50|
|Movie|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|          28|                41|
+-----+--------------------+--------------------+--------------------+------+-----------

In [32]:
# Tokenize words in title column
tokenizer1 = RegexTokenizer(inputCol= 'title', outputCol='Disney_words', pattern='\\W')
tokenized = tokenizer1.transform(Disney5)
tokenized.show(5)

# Tokenize words in description column
tokenizer2 = RegexTokenizer(inputCol= 'description', outputCol='Disney_tokens', pattern='\\W')
tokenized2 = tokenizer2.transform(tokenized)
tokenized2.show()

+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+
| type|               title|            director|                cast|rating|           listed_in|         description|title_length|description_length|        Disney_words|
+-----+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+
|Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|                48|[duck, the, halls...|
|Movie|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          22|                50|[ernest, saves, c...|
|Movie|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|          28|   

In [33]:
# Remove stopwords in tokenized Disney_words column
Disney_remover = StopWordsRemover(inputCol='Disney_words', outputCol='filtered_title')
removed_Dis = Disney_remover.transform(tokenized2)

# Processing title column
Disney_cv = CountVectorizer(inputCol='filtered_title', outputCol='vec_title')
Disney_idf = IDF(inputCol='vec_title', outputCol='Disney_tfidf')
Disney_numeric = StringIndexer(inputCol='rating', outputCol='label')
Disney_Assembler = VectorAssembler(inputCols=['Disney_tfidf','title_length'], outputCol='features')

In [34]:
removed_Dis.show()

+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+--------------------+--------------------+
|   type|               title|            director|                cast|rating|           listed_in|         description|title_length|description_length|        Disney_words|       Disney_tokens|      filtered_title|
+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+--------------------+--------------------+
|  Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|                48|[duck, the, halls...|[join, mickey, an...|[duck, halls, mic...|
|  Movie|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          2

In [35]:
from pyspark.ml import Pipeline

# Creaing data pipeline
Disney_pipeline = Pipeline(stages=[Disney_numeric, tokenizer1, Disney_remover, Disney_cv, Disney_idf, Disney_Assembler])
                        #tokenizer2, remover2, Disney_cv2, Disney_idf2, Disney_Assembler2])

In [36]:
Disney_fit = Disney_pipeline.fit(Disney4)
Disney_clean = Disney_fit.transform(Disney4)
Disney_cleaned = Disney_clean.select('label', 'features')

In [37]:
Disney4.printSchema()

root
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cast: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: string (nullable = true)
 |-- listed_in: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- title_length: integer (nullable = true)



In [38]:
Disney_cleaned.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(1857,[8,14,37,55...|
|  3.0|(1857,[14,1574,16...|
|  0.0|(1857,[14,59,96,1...|
|  1.0|(1857,[58,173,590...|
|  6.0|(1857,[370,1579,1...|
|  5.0|(1857,[1035,1856]...|
|  5.0|(1857,[243,607,13...|
|  1.0|(1857,[18,122,139...|
|  2.0|(1857,[14,41,112,...|
|  1.0|(1857,[0,72,98,50...|
|  0.0|(1857,[273,1434,1...|
|  2.0|(1857,[6,153,1856...|
|  1.0|(1857,[77,464,111...|
|  1.0|(1857,[30,670,185...|
|  0.0|(1857,[681,1856],...|
|  1.0|(1857,[764,1710,1...|
|  1.0|(1857,[0,255,534,...|
|  1.0|(1857,[0,1630,185...|
|  3.0|(1857,[385,1856],...|
|  2.0|(1857,[1217,1856]...|
+-----+--------------------+
only showing top 20 rows



Naive Bayes Classifier

In [39]:
# Building the model
NB = NaiveBayes()

Model 1: Building on title column as feature and rating as label

In [40]:
# Splitting data into training and testing datasets 
Disney_train, Disney_test = Disney_cleaned.randomSplit([0.7, 0.3])

In [41]:
# Fitting the Naive Bayes model
Disney_model = NB.fit(Disney_train)
Disney_results = Disney_model.transform(Disney_test)
Disney_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(1857,[0,2,3,83,8...|[-250.26463649834...|[0.99999999999998...|       0.0|
|  0.0|(1857,[0,4,476,18...|[-95.768741046180...|[0.99999797919978...|       0.0|
|  0.0|(1857,[0,5,83,89,...|[-260.63130301867...|[0.99991545273035...|       0.0|
|  0.0|(1857,[0,5,471,48...|[-176.59500127053...|[4.04320022593981...|       4.0|
|  0.0|(1857,[0,16,475,1...|[-125.80367541815...|[4.45077599871312...|       1.0|
|  0.0|(1857,[0,33,206,1...|[-178.72335693063...|[8.11643088316265...|       1.0|
|  0.0|(1857,[0,45,1823,...|[-130.33636765100...|[0.00260460241839...|       1.0|
|  0.0|(1857,[0,68,127,6...|[-324.06766428117...|[5.43652794402738...|       2.0|
|  0.0|(1857,[0,184,194,...|[-391.17010746441...|[5.02431725408388...|       7.0|
|  0.0|(1857,[0,

Model Evaluation

In [43]:
# Evaluating NB model
accuracy_eval = MulticlassClassificationEvaluator()

accuracy = accuracy_eval.evaluate(Disney_results)
print(accuracy)

0.36342123533350773


Model 2: Building model on rating as label and description column as feature

In [44]:
# Remove stopwords in tokenized Disney_tokens column
remover2 = StopWordsRemover(inputCol='Disney_tokens', outputCol='filtered_description')
removed_Disney = remover2.transform(removed_Dis)

# Processing description column
Disney_cv2 = CountVectorizer(inputCol='filtered_description', outputCol='vec_description')
Disney_idf2 = IDF(inputCol='vec_description', outputCol='Disney_tfidf2')
Disney_numeric2 = StringIndexer(inputCol='rating', outputCol='label')
Disney_Assembler2 = VectorAssembler(inputCols=['Disney_tfidf2','description_length'], outputCol='features')

In [45]:
removed_Disney.show()

+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+--------------------+--------------------+--------------------+
|   type|               title|            director|                cast|rating|           listed_in|         description|title_length|description_length|        Disney_words|       Disney_tokens|      filtered_title|filtered_description|
+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+--------------------+--------------------+--------------------+
|  Movie|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|                48|[duck, the, halls...|[join, mickey, an...|[duck, halls, mic...|[join, mickey, ga...|
|  Movie|Ernest Saves Chri...|       [John Cherr

In [46]:
# Creaing data pipeline
Disney_pipeline = Pipeline(stages=[Disney_numeric2, tokenizer2, remover2, Disney_cv2, Disney_idf2, Disney_Assembler2])                        

In [47]:
Disney_fit2 = Disney_pipeline.fit(Disney5)
Disney_clean2 = Disney_fit2.transform(Disney5)
Disney_cleaned2 = Disney_clean2.select('label', 'features')

Disney_cleaned2.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(4373,[15,21,83,1...|
|  3.0|(4373,[4,121,144,...|
|  0.0|(4373,[121,1169,1...|
|  1.0|(4373,[6,114,445,...|
|  6.0|(4373,[6,49,134,2...|
|  5.0|(4373,[8,11,1438,...|
|  5.0|(4373,[8,314,937,...|
|  1.0|(4373,[6,47,124,5...|
|  2.0|(4373,[167,279,30...|
|  1.0|(4373,[3,103,108,...|
|  0.0|(4373,[61,304,614...|
|  2.0|(4373,[97,424,609...|
|  1.0|(4373,[75,89,185,...|
|  1.0|(4373,[63,72,74,1...|
|  0.0|(4373,[3,138,335,...|
|  1.0|(4373,[126,164,18...|
|  1.0|(4373,[47,124,160...|
|  1.0|(4373,[120,372,39...|
|  3.0|(4373,[5,47,95,10...|
|  2.0|(4373,[6,23,29,30...|
+-----+--------------------+
only showing top 20 rows



In [48]:
# Splitting data into training and testing datasets 
Disney_train2, Disney_test2 = Disney_cleaned2.randomSplit([0.7, 0.3])

In [49]:
# Fitting the Naive Bayes model
Disney_model2 = NB.fit(Disney_train2)
Disney_results2 = Disney_model2.transform(Disney_test2)
Disney_results2.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(4373,[0,1,2,10,1...|[-370.71417130580...|[0.99996863372767...|       0.0|
|  0.0|(4373,[0,1,2,15,3...|[-433.50051836201...|[0.00168744117837...|       1.0|
|  0.0|(4373,[0,1,2,15,5...|[-230.29199845379...|[0.99999999998630...|       0.0|
|  0.0|(4373,[0,1,2,15,5...|[-376.40130856950...|[0.99999999999997...|       0.0|
|  0.0|(4373,[0,1,2,24,4...|[-529.36860523520...|[0.99999999999999...|       0.0|
|  0.0|(4373,[0,1,2,24,5...|[-359.31344164380...|[0.99991390574259...|       0.0|
|  0.0|(4373,[0,1,2,24,5...|[-329.80746626618...|[2.02048251846267...|       2.0|
|  0.0|(4373,[0,1,2,24,1...|[-402.12882325415...|[2.56354734406296...|       2.0|
|  0.0|(4373,[0,1,2,24,2...|[-430.62260395438...|[4.07675365721737...|       1.0|
|  0.0|(4373,[0,

In [50]:
# Evaluating NB model
accuracy2 = accuracy_eval.evaluate(Disney_results2)
print(accuracy2)

0.34993173574575764


Netflix Data

In [51]:
Netflix3 = netflix_clean_nulls.select("type", "title", "director", "cast", "rating", "listed_in", "description")
Netflix3.show()

+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|   type|               title|            director|                cast|rating|           listed_in|         description|
+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|  Movie|Dick Johnson Is Dead|   [Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|
|TV Show|       Blood & Water|                null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|
|TV Show|           Ganglands|   [Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|
|TV Show|Jailbirds New Orl...|                null|                null| TV-MA|[Docuseries,  Rea...|Feuds, flirtation...|
|TV Show|        Kota Factory|                null|[Mayur More,  Jit...| TV-MA|[International TV...|In a city of coac...|
|TV Show|       Midnight

In [52]:
Netflix3.select("rating").distinct().show()

+--------------------+
|              rating|
+--------------------+
|    November 1, 2020|
|    Shavidee Trotter|
|       Adriane Lenox|
|                TV-Y|
|       Maury Chaykin|
|                2019|
|                2017|
|                  UR|
| Keppy Ekpenyong ...|
|      Benn Northover|
|                  PG|
|         Jide Kosoko|
|               TV-MA|
|     Jowharah Jones"|
|            TV-Y7-FV|
|                2006|
|      Itziar Aizpuru|
|                  NR|
|               TV-PG|
|               NC-17|
+--------------------+
only showing top 20 rows



In [53]:
Netflix3.select("listed_in").distinct().show()

+--------------------+
|           listed_in|
+--------------------+
|[Kids' TV,  Korea...|
|[Comedies,  Drama...|
|[Kids' TV,  TV Th...|
|[International TV...|
|[TV Dramas,  TV S...|
|[Action & Adventu...|
|[Classic Movies, ...|
|[Classic Movies, ...|
|            [71 min]|
|[ Janeane Garofalo"]|
|[Crime TV Shows, ...|
|[Comedies,  Inter...|
|[Kids' TV,  TV Sc...|
|[Comedies,  Music...|
|     [ Margaret Cho]|
|[Action & Adventure]|
|[Action & Adventu...|
|[Comedies,  Roman...|
|[Action & Adventu...|
|[Independent Movi...|
+--------------------+
only showing top 20 rows



Natural Language Processing

In [54]:
Netflix4 = Netflix3.withColumn('title_length', length(Netflix3['title']))
Netflix4.show()

+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
|   type|               title|            director|                cast|rating|           listed_in|         description|title_length|
+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
|  Movie|Dick Johnson Is Dead|   [Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|          20|
|TV Show|       Blood & Water|                null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|          13|
|TV Show|           Ganglands|   [Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|           9|
|TV Show|Jailbirds New Orl...|                null|                null| TV-MA|[Docuseries,  Rea...|Feuds, flirtation...|          21|
|TV Show|        Kota Factory|                null|[May

In [55]:
# Using description length for transforming features later on
Netflix5 = Netflix4.withColumn('description_length', length(Netflix4['description']))
Netflix5.show()

+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+
|   type|               title|            director|                cast|rating|           listed_in|         description|title_length|description_length|
+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+
|  Movie|Dick Johnson Is Dead|   [Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|          20|               152|
|TV Show|       Blood & Water|                null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|          13|               147|
|TV Show|           Ganglands|   [Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|           9|               146|
|TV Show|Jailbirds New Orl...|                null|                null| TV-

In [56]:
# Tokenize words in title column
reg_tokenizer3 = RegexTokenizer(inputCol="title", outputCol="Net_words", pattern='\\W')
reg_tokenized3 = reg_tokenizer3.transform(Netflix5)

# Tokenize words in description column
reg_tokenizer4 = RegexTokenizer(inputCol="description", outputCol="Net_tokens", pattern='\\W')
reg_tokenized4 = reg_tokenizer4.transform(reg_tokenized3)

reg_tokenized4.show()

+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+--------------------+
|   type|               title|            director|                cast|rating|           listed_in|         description|title_length|description_length|           Net_words|          Net_tokens|
+-------+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+------------------+--------------------+--------------------+
|  Movie|Dick Johnson Is Dead|   [Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|          20|               152|[dick, johnson, i...|[as, her, father,...|
|TV Show|       Blood & Water|                null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|          13|               147|      [blood, water]|[after, crossing,...|
|TV Show|           

Model 3: Running model on Netflix title column

In [57]:
# Remove stopwords in tokenized words column
Netflix_remover = StopWordsRemover(inputCol='Net_words', outputCol='filtered_title')
Netflix_removed = Netflix_remover.transform(reg_tokenized4)

# Processing title column
Netflix_cv = CountVectorizer(inputCol='filtered_title', outputCol='vec_title')
Netflix_idf = IDF(inputCol='vec_title', outputCol='Netflix_tfidf')
Netflix_numeric = StringIndexer(inputCol='rating', outputCol='label')
Netflix_Assembler = VectorAssembler(inputCols=['Netflix_tfidf','title_length'], outputCol='features')

In [58]:
# Creaing data pipeline
Netflix_pipeline = Pipeline(stages=[Netflix_numeric, reg_tokenizer3, Netflix_remover, Netflix_cv, Netflix_idf, Netflix_Assembler])   

In [59]:
Netflix_fit = Netflix_pipeline.fit(Netflix5)
Netflix_clean = Netflix_fit.transform(Netflix5)
Netflix_cleaned = Netflix_clean.select('label', 'features')

In [60]:
Netflix_cleaned.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  4.0|(8786,[87,853,140...|
|  0.0|(8786,[89,159,878...|
|  0.0|(8786,[4587,8785]...|
|  0.0|(8786,[27,2626,81...|
|  0.0|(8786,[479,4220,8...|
|  0.0|(8786,[218,7991,8...|
|  7.0|(8786,[9,27,281,3...|
|  0.0|(8786,[5022,8785]...|
|  1.0|(8786,[40,49,627,...|
|  4.0|(8786,[5318,8785]...|
|  0.0|(8786,[244,659,11...|
|  0.0|(8786,[158,344,87...|
|  0.0|(8786,[4031,6413,...|
|  2.0|(8786,[21,608,472...|
|  0.0|(8786,[37,107,326...|
|  0.0|(8786,[82,110,257...|
|  0.0|(8786,[2,358,587,...|
|  0.0|(8786,[6946,8473,...|
|  1.0|(8786,[3122,8785]...|
|  0.0|(8786,[6244,8785]...|
+-----+--------------------+
only showing top 20 rows



In [61]:
# Splitting data into training and testing datasets 
Netflix_train, Netflix_test = Netflix_cleaned.randomSplit([0.7, 0.3])

In [62]:
# Fitting the Naive Bayes model 3
Netflix_model = NB.fit(Netflix_train)
Netflix_results = Netflix_model.transform(Netflix_test)
Netflix_results.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(8786,[0,4,312,54...|[-422.55556751590...|[0.02054461564010...|       3.0|
|  0.0|(8786,[0,6,8785],...|[-72.425223320185...|[0.01457534079314...|       1.0|
|  0.0|(8786,[0,14,84,87...|[-119.62419440733...|[0.97948807490998...|       0.0|
|  0.0|(8786,[0,164,1431...|[-155.27498788950...|[0.99999543425453...|       0.0|
|  0.0|(8786,[0,253,319,...|[-230.03252039036...|[0.00123258439503...|       3.0|
|  0.0|(8786,[0,570,8785...|[-119.17176491163...|[6.58156795231826...|       1.0|
|  0.0|(8786,[0,825,8785...|[-92.903863744284...|[0.99999999832510...|       0.0|
|  0.0|(8786,[0,1883,878...|[-126.43116802309...|[0.00691842135593...|       1.0|
|  0.0|(8786,[0,4117,878...|[-128.65878605641...|[0.00240130998946...|       2.0|
|  0.0|(8786,[0,

In [63]:
Netflix_results.select("label").distinct().show()

+-----+
|label|
+-----+
|  8.0|
|  0.0|
|  7.0|
|  1.0|
| 25.0|
|  4.0|
| 31.0|
| 11.0|
| 22.0|
|  3.0|
| 19.0|
|  2.0|
| 10.0|
| 13.0|
|  6.0|
|  5.0|
|  9.0|
| 16.0|
+-----+



Model Evaluation

In [65]:
# Evaluating NB model 3
accuracy3 = accuracy_eval.evaluate(Netflix_results)
print(accuracy3)

0.26067613494454384


Model 4: Running NB model on Netflix description column

In [66]:
# Remove stopwords in tokenized Net_tokens column
Netflix_remover2 = StopWordsRemover(inputCol='Net_tokens', outputCol='filtered_description')
removed_Netflix = Netflix_remover2.transform(Netflix_removed)

# Processing Netflix description column
Netflix_cv2 = CountVectorizer(inputCol='filtered_description', outputCol='vec_description')
Netflix_idf2 = IDF(inputCol='vec_description', outputCol='Netflix_tfidf2')
Netflix_numeric2 = StringIndexer(inputCol='rating', outputCol='label')
Netflix_Assembler2 = VectorAssembler(inputCols=['Netflix_tfidf2','description_length'], outputCol='features')

In [67]:
# Creaing data pipeline
Netflix_pipeline2 = Pipeline(stages=[Netflix_numeric2, reg_tokenizer4, Netflix_remover2, Netflix_cv2, Netflix_idf2, Netflix_Assembler2]) 

In [68]:
Netflix_fit2 = Netflix_pipeline2.fit(Netflix5)
Netflix_clean2 = Netflix_fit2.transform(Netflix5)
Netflix_cleaned2 = Netflix_clean2.select('label', 'features')

In [69]:
Netflix_cleaned2.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  4.0|(18945,[0,16,22,4...|
|  0.0|(18945,[15,33,35,...|
|  0.0|(18945,[3,41,42,9...|
|  0.0|(18945,[2,11,56,1...|
|  0.0|(18945,[0,9,31,68...|
|  0.0|(18945,[1,33,258,...|
|  7.0|(18945,[121,163,1...|
|  0.0|(18945,[29,43,46,...|
|  1.0|(18945,[52,114,18...|
|  4.0|(18945,[0,8,14,62...|
|  0.0|(18945,[55,224,24...|
|  0.0|(18945,[5,10,147,...|
|  0.0|(18945,[1,3,8,25,...|
|  2.0|(18945,[2,15,238,...|
|  0.0|(18945,[40,55,85,...|
|  0.0|(18945,[0,105,178...|
|  0.0|(18945,[0,4,432,5...|
|  0.0|(18945,[17,73,95,...|
|  1.0|(18945,[2,17,47,7...|
|  0.0|(18945,[25,182,23...|
+-----+--------------------+
only showing top 20 rows



In [70]:
# Splitting data into training and testing datasets 
Netflix_train2, Netflix_test2 = Netflix_cleaned2.randomSplit([0.7, 0.3])

In [71]:
# Fitting the Naive Bayes model 4
Netflix_model2 = NB.fit(Netflix_train2)
Netflix_results2 = Netflix_model2.transform(Netflix_test2)
Netflix_results2.show()

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(18945,[0,1,2,7,1...|[-952.43473994728...|[1.09076549002074...|       1.0|
|  0.0|(18945,[0,1,3,11,...|[-684.70114788446...|[0.62922102577005...|       0.0|
|  0.0|(18945,[0,1,4,5,1...|[-970.53553121675...|[3.87261065726891...|       1.0|
|  0.0|(18945,[0,1,5,447...|[-875.80711268951...|[5.86659002565289...|       2.0|
|  0.0|(18945,[0,1,6,7,2...|[-772.14922556965...|[0.99999999999929...|       0.0|
|  0.0|(18945,[0,1,7,117...|[-792.09281212905...|[3.88752047821577...|       1.0|
|  0.0|(18945,[0,1,19,23...|[-808.96081916887...|[5.57810074752518...|       1.0|
|  0.0|(18945,[0,1,37,48...|[-741.82117087272...|[8.7849288147328E...|       1.0|
|  0.0|(18945,[0,2,5,14,...|[-649.29069434911...|[1.90660114104433...|       1.0|
|  0.0|(18945,[0

Model Evaluation

In [72]:
# Evaluating NB model 4
accuracy4 = accuracy_eval.evaluate(Netflix_results2)
print(accuracy4)

0.35312534580917576
