In [1]:
# Importing the necessary modules
import findspark
findspark.init()

# Initialize a SparkSession
from pyspark.sql import SparkSession

# Creating SparkSession
spark = SparkSession.builder.appName('TP').getOrCreate()

# Calling the session variable object
spark

Loading data sets

In [2]:
netflix_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/netflix_titles.csv"
disney_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/disney_plus_titles.csv"

from pyspark import SparkFiles
spark.sparkContext.addFile(netflix_url)
spark.sparkContext.addFile(disney_url)

netflix_df = spark.read.csv("file:///"+SparkFiles.get("netflix_titles.csv"), header=True, inferSchema= True)
disney_df = spark.read.csv("file:///"+SparkFiles.get("disney_plus_titles.csv"), header=True, inferSchema= True)

In [3]:
netflix_df.printSchema()
disney_df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [4]:
netflix_df.show(3)
disney_df.show(3)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         null|Septem

In [5]:
netflix_df.createOrReplaceTempView("Netflix")
disney_df.createOrReplaceTempView("Disney")

# Data Cleaning

In [6]:
# remove (do not select) show_id and date_added columns
# convert columns with multiple values to arrays
netflix_clean = spark.sql("SELECT type, \
                  title, \
                  SPLIT(director, ',') AS director, \
                  SPLIT(cast, ',') AS cast, \
                  SPLIT(country, ',') AS country, \
                  release_year, \
                  rating, \
                  duration, \
                  SPLIT(listed_in, ',') AS listed_in, \
                  description \
                  FROM Netflix")
netflix_clean.createOrReplaceTempView("Netflix1")

disney_clean = spark.sql("SELECT type, \
                  title, \
                  SPLIT(director, ',') AS director, \
                  SPLIT(cast, ',') AS cast, \
                  SPLIT(country, ',') AS country, \
                  release_year, \
                  rating, \
                  duration, \
                  SPLIT(listed_in, ',') AS listed_in, \
                  description \
                  FROM Disney")
disney_clean.createOrReplaceTempView("Disney1")

In [7]:
# test duration and listed_in columns to see if they vary between platforms
overlap = spark.sql("SELECT Netflix1.title, Netflix1.duration, Disney1.duration,\
                     Netflix1.listed_in, Disney1.listed_in FROM Netflix1 \
                     INNER JOIN Disney1 ON Netflix1.title = Disney1.title")
overlap.show(5, truncate=False)
# there is variation - worth keeping each dataset separate

+----------------+---------+---------+--------------------------------------------------------+-----------------------------------------------------+
|title           |duration |duration |listed_in                                               |listed_in                                            |
+----------------+---------+---------+--------------------------------------------------------+-----------------------------------------------------+
|PJ Masks        |3 Seasons|5 Seasons|[Kids' TV]                                              |[Action-Adventure,  Animation,  Kids]                |
|Once Upon a Time|1 Season |7 Seasons|[International TV Shows,  Romantic TV Shows,  TV Dramas]|[Action-Adventure,  Fantasy,  Soap Opera / Melodrama]|
|Gigantosaurus   |1 Season |2 Seasons|[Kids' TV]                                              |[Action-Adventure,  Animation,  Kids]                |
|Becoming        |89 min   |1 Season |[Documentaries]                                         |[Anth

In [8]:
netflix_nulls = spark.sql("SELECT COUNT (*) AS type_nulls FROM Netflix1 WHERE type IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS title_nulls FROM Netflix1 WHERE title IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS director_nulls FROM Netflix1 WHERE director IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS cast_nulls FROM Netflix1 WHERE cast IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS country_nulls FROM Netflix1 WHERE country IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS release_year_nulls FROM Netflix1 WHERE release_year IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS rating_nulls FROM Netflix1 WHERE rating IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS duration_nulls FROM Netflix1 WHERE duration IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS listed_in_nulls FROM Netflix1 WHERE listed_in IS NULL")
netflix_nulls.show()

netflix_nulls = spark.sql("SELECT COUNT (*) AS desctiption_nulls FROM Netflix1 WHERE description IS NULL")
netflix_nulls.show()

+----------+
|type_nulls|
+----------+
|         1|
+----------+

+-----------+
|title_nulls|
+-----------+
|          2|
+-----------+

+--------------+
|director_nulls|
+--------------+
|          2636|
+--------------+

+----------+
|cast_nulls|
+----------+
|       826|
+----------+

+-------------+
|country_nulls|
+-------------+
|          832|
+-------------+

+------------------+
|release_year_nulls|
+------------------+
|                 2|
+------------------+

+------------+
|rating_nulls|
+------------+
|           6|
+------------+

+--------------+
|duration_nulls|
+--------------+
|             5|
+--------------+

+---------------+
|listed_in_nulls|
+---------------+
|              3|
+---------------+

+-----------------+
|desctiption_nulls|
+-----------------+
|                3|
+-----------------+



In [9]:
# remove rows for columns having <10 null values
# director, cast & country columns not as important to be NOT NULL/
#  can be dealt with later
netflix_clean_nulls = spark.sql("SELECT type, \
                  title, \
                  director, \
                  cast, \
                  country, \
                  release_year, \
                  rating, \
                  duration, \
                  listed_in, \
                  description \
                  FROM Netflix1 \
                  WHERE type IS NOT NULL AND \
                  title IS NOT NULL AND \
                  release_year IS NOT NULL AND \
                  rating IS NOT NULL AND \
                  duration IS NOT NULL AND \
                  listed_in IS NOT NULL AND \
                  description IS NOT NULL")
netflix_clean_nulls.createOrReplaceTempView("Netflix2")

In [10]:
disney_nulls = spark.sql("SELECT COUNT (*) AS type_nulls FROM Disney1 WHERE type IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS title_nulls FROM Disney1 WHERE title IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS director_nulls FROM Disney1 WHERE director IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS cast_nulls FROM Disney1 WHERE cast IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS country_nulls FROM Disney1 WHERE country IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS release_year_nulls FROM Disney1 WHERE release_year IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS rating_nulls FROM Disney1 WHERE rating IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS duration_nulls FROM Disney1 WHERE duration IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS listed_in_nulls FROM Disney1 WHERE listed_in IS NULL")
disney_nulls.show()

disney_nulls = spark.sql("SELECT COUNT (*) AS desctiption_nulls FROM Disney1 WHERE description IS NULL")
disney_nulls.show()

+----------+
|type_nulls|
+----------+
|         0|
+----------+

+-----------+
|title_nulls|
+-----------+
|          0|
+-----------+

+--------------+
|director_nulls|
+--------------+
|           473|
+--------------+

+----------+
|cast_nulls|
+----------+
|       190|
+----------+

+-------------+
|country_nulls|
+-------------+
|          218|
+-------------+

+------------------+
|release_year_nulls|
+------------------+
|                 0|
+------------------+

+------------+
|rating_nulls|
+------------+
|           3|
+------------+

+--------------+
|duration_nulls|
+--------------+
|             0|
+--------------+

+---------------+
|listed_in_nulls|
+---------------+
|              1|
+---------------+

+-----------------+
|desctiption_nulls|
+-----------------+
|                0|
+-----------------+



In [11]:
# remove rows for columns having <10 null values
# director, cast & country columns not as important to be NOT NULL/
#  can be dealt with later
disney_clean_nulls = spark.sql("SELECT type, \
                  title, \
                  director, \
                  cast, \
                  country, \
                  release_year, \
                  rating, \
                  duration, \
                  listed_in, \
                  description \
                  FROM Disney1 \
                  WHERE rating IS NOT NULL AND \
                  listed_in IS NOT NULL")
disney_clean_nulls.createOrReplaceTempView("Disney2")

NETFLIX table: Netflix2

DISNEY table: Disney2

In [12]:
# table counts
spark.sql("SELECT COUNT (*) FROM Netflix2").show()
spark.sql("SELECT COUNT (*) FROM Disney2").show()

+--------+
|count(1)|
+--------+
|    8799|
+--------+

+--------+
|count(1)|
+--------+
|    1446|
+--------+



# Data Exploration and Analysis

# Machine learning method selection and model training

In [14]:
# Importing necessary modules

from pyspark.ml.feature import (CountVectorizer, RegexTokenizer, StopWordsRemover, IDF, StringIndexer)

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

Disney Data

In [15]:
Disney3 = disney_clean_nulls.select("title", "director", "cast", "rating", "listed_in", "description")
Disney3.show(3)

+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|               title|            director|                cast|rating|           listed_in|         description|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|
|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|
|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+
only showing top 3 rows



In [16]:
Disney3.select("rating").distinct().show()

+-----------------+
|           rating|
+-----------------+
|             TV-Y|
|December 25, 2020|
|               PG|
|    United States|
|         TV-Y7-FV|
|            TV-PG|
|                G|
|            TV-14|
|             TV-G|
|            TV-Y7|
|            PG-13|
+-----------------+



In [17]:
Disney3.select("listed_in").distinct().show()

+--------------------+
|           listed_in|
+--------------------+
|[Musical,  Romanc...|
|[Family,  Fantasy...|
|[Biographical,  D...|
|   [Drama,  Fantasy]|
|[Coming of Age,  ...|
|[Comedy,  Coming ...|
|[Drama,  Music,  ...|
|[Comedy,  Family,...|
|[Action-Adventure...|
|[Documentary,  Fa...|
|[Animation,  Fami...|
|[Animation,  Family]|
|[Docuseries,  Fam...|
|[Animation,  Fami...|
|[Biographical,  H...|
|[Docuseries,  Sci...|
|[Animation,  Kids...|
|[Animation,  Kids...|
|[Action-Adventure...|
|              [2016]|
+--------------------+
only showing top 20 rows



Natural Language Processing

In [18]:
from pyspark.sql.functions import length

Disney4 = Disney3.withColumn('title_length', length(Disney3['title']))
Disney4.show(3)

+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
|               title|            director|                cast|rating|           listed_in|         description|title_length|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|
|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          22|
|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|          28|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+
only showing top 3 rows



In [19]:
# Tokenize words in title column
tokenizer1 = RegexTokenizer(inputCol= 'title', outputCol='Disney_words', pattern='\\W')
tokenized = tokenizer1.transform(Disney4)
tokenized.show(3)

# Tokenize words in description column
tokenizer2 = RegexTokenizer(inputCol= 'description', outputCol='Disney_tokens', pattern='\\W')
tokenized2 = tokenizer2.transform(tokenized)
tokenized2.show(3, truncate=False)

+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+--------------------+
|               title|            director|                cast|rating|           listed_in|         description|title_length|        Disney_words|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+------------+--------------------+
|Duck the Halls: A...|[Alonso Ramirez R...|[Chris Diamantopo...|  TV-G|[Animation,  Family]|Join Mickey and t...|          48|[duck, the, halls...|
|Ernest Saves Chri...|       [John Cherry]|[Jim Varney,  Noe...|    PG|            [Comedy]|Santa Claus passe...|          22|[ernest, saves, c...|
|Ice Age: A Mammot...|      [Karen Disher]|[Raymond Albert R...|  TV-G|[Animation,  Come...|Sid the Sloth is ...|          28|[ice, age, a, mam...|
+--------------------+--------------------+--------------------+------+--------------------+--------------------

In [43]:
# Remove stopwords in tokenized Disney_words column
Disney_remover = StopWordsRemover(inputCol='Disney_words', outputCol='filtered_title')
removed_Dis = Disney_remover.transform(tokenized2)

# Processing title column
Disney_cv = CountVectorizer(inputCol='filtered_title', outputCol='vec_title')
Disney_idf = IDF(inputCol='vec_title', outputCol='Disney_tfidf')
Disney_numeric = StringIndexer(inputCol='rating', outputCol='label')
Disney_Assembler = VectorAssembler(inputCols=['Disney_tfidf','title_length'], outputCol='features')

In [44]:
from pyspark.ml import Pipeline

# Creaing data pipeline
Disney_pipeline = Pipeline(stages=[Disney_numeric, tokenizer1, Disney_remover, Disney_cv, Disney_idf, Disney_Assembler])
                        #tokenizer2, remover2, Disney_cv2, Disney_idf2, Disney_Assembler2])

In [45]:
Disney_fit = Disney_pipeline.fit(Disney4)
Disney_clean = Disney_fit.transform(Disney4)
Disney_cleaned = Disney_clean.select('label', 'features')

In [28]:
Disney4.printSchema()

root
 |-- title: string (nullable = true)
 |-- director: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- cast: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rating: string (nullable = true)
 |-- listed_in: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- description: string (nullable = true)
 |-- title_length: integer (nullable = true)



In [47]:
Disney_cleaned.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(1857,[8,14,37,55...|
|  3.0|(1857,[14,1574,16...|
|  0.0|(1857,[14,59,96,1...|
|  1.0|(1857,[58,173,590...|
|  6.0|(1857,[370,1579,1...|
|  5.0|(1857,[1035,1856]...|
|  5.0|(1857,[243,607,13...|
|  1.0|(1857,[18,122,139...|
|  2.0|(1857,[14,41,112,...|
|  1.0|(1857,[0,72,98,50...|
|  0.0|(1857,[273,1434,1...|
|  2.0|(1857,[6,153,1856...|
|  1.0|(1857,[77,464,111...|
|  1.0|(1857,[30,670,185...|
|  0.0|(1857,[681,1856],...|
|  1.0|(1857,[764,1710,1...|
|  1.0|(1857,[0,255,534,...|
|  1.0|(1857,[0,1630,185...|
|  3.0|(1857,[385,1856],...|
|  2.0|(1857,[1217,1856]...|
+-----+--------------------+
only showing top 20 rows



Naive Bayes Classifier

In [81]:
# Building the model
NB = NaiveBayes()

In [82]:
# Splitting data into training and testing datasets 
Disney_train, Disney_test = Disney_cleaned.randomSplit([0.7, 0.3])

In [83]:
# Fitting the Naive Bayes model
Disney_model = NB.fit(Disney_train)
Disney_results = Disney_model.transform(Disney_test)
Disney_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(4373,[0,1,2,8,24...|[-300.65705534359...|[0.99990897564269...|       0.0|
|  0.0|(4373,[0,1,2,8,34...|[-358.80728188929...|[0.99999998597869...|       0.0|
|  0.0|(4373,[0,1,2,9,22...|[-242.99858610923...|[0.99999999999619...|       0.0|
|  0.0|(4373,[0,1,2,15,3...|[-385.87033757774...|[1.51270410802234...|       1.0|
|  0.0|(4373,[0,1,2,15,4...|[-247.22642363918...|[0.99999999958346...|       0.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



# Model Evaluation

In [84]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [85]:
# Evaluating NB model
accuracy_eval = MulticlassClassificationEvaluator()



In [86]:
accuracy = accuracy_eval.evaluate(Disney_results)
print(accuracy)

0.3138789630307071


Building model on rating as label and description column as feature

In [87]:
# Remove stopwords in tokenized Disney_tokens column
remover2 = StopWordsRemover(inputCol='Disney_tokens', outputCol='filtered_description')
removed_Disney = remover2.transform(removed_Dis)

# Processing description column
Disney_cv2 = CountVectorizer(inputCol='filtered_description', outputCol='vec_description')
Disney_idf2 = IDF(inputCol='vec_description', outputCol='Disney_tfidf2')
Disney_numeric2 = StringIndexer(inputCol='rating', outputCol='label')
Disney_Assembler2 = VectorAssembler(inputCols=['Disney_tfidf2','title_length'], outputCol='features')

In [88]:
# Creaing data pipeline
Disney_pipeline = Pipeline(stages=[Disney_numeric2, tokenizer2, remover2, Disney_cv2, Disney_idf2, Disney_Assembler2])                        

In [89]:
Disney_fit2 = Disney_pipeline.fit(Disney4)
Disney_clean2 = Disney_fit2.transform(Disney4)
Disney_cleaned2 = Disney_clean2.select('label', 'features')

In [90]:
# Splitting data into training and testing datasets 
Disney_train2, Disney_test2 = Disney_cleaned2.randomSplit([0.7, 0.3])

In [91]:
# Fitting the Naive Bayes model
Disney_model2 = NB.fit(Disney_train2)
Disney_results2 = Disney_model2.transform(Disney_test2)
Disney_results2.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(4373,[0,1,2,8,34...|[-365.62661739659...|[0.99999946297615...|       0.0|
|  0.0|(4373,[0,1,2,9,22...|[-234.29006985565...|[0.99999999999999...|       0.0|
|  0.0|(4373,[0,1,2,10,1...|[-316.19606082521...|[0.79078415137162...|       0.0|
|  0.0|(4373,[0,1,2,10,1...|[-402.56173510099...|[1.35681568698504...|       4.0|
|  0.0|(4373,[0,1,2,15,5...|[-399.64235703781...|[1.52659887630714...|       4.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [92]:
# Evaluating NB model
accuracy2 = accuracy_eval.evaluate(Disney_results2)
print(accuracy2)

0.34431647400515286


Netflix Data

In [93]:
Netflix3 = netflix_clean_nulls.select("title", "director", "cast", "rating", "listed_in", "description")
Netflix3.show()

+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|               title|            director|                cast|rating|           listed_in|         description|
+--------------------+--------------------+--------------------+------+--------------------+--------------------+
|Dick Johnson Is Dead|   [Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|
|       Blood & Water|                null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|
|           Ganglands|   [Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|
|Jailbirds New Orl...|                null|                null| TV-MA|[Docuseries,  Rea...|Feuds, flirtation...|
|        Kota Factory|                null|[Mayur More,  Jit...| TV-MA|[International TV...|In a city of coac...|
|       Midnight Mass|     [Mike Flanagan]|[Kate Siegel,  Za...| TV-MA|[TV Dramas,  TV H

In [94]:
Netflix3.select("rating").distinct().show()

+--------------------+
|              rating|
+--------------------+
|    November 1, 2020|
|    Shavidee Trotter|
|       Adriane Lenox|
|                TV-Y|
|       Maury Chaykin|
|                2019|
|                2017|
|                  UR|
| Keppy Ekpenyong ...|
|      Benn Northover|
|                  PG|
|         Jide Kosoko|
|               TV-MA|
|     Jowharah Jones"|
|            TV-Y7-FV|
|                2006|
|      Itziar Aizpuru|
|                  NR|
|               TV-PG|
|               NC-17|
+--------------------+
only showing top 20 rows



In [95]:
Netflix3.select("listed_in").distinct().show(50)

+--------------------+
|           listed_in|
+--------------------+
|[Kids' TV,  Korea...|
|[Comedies,  Drama...|
|[Kids' TV,  TV Th...|
|[International TV...|
|[TV Dramas,  TV S...|
|[Action & Adventu...|
|[Classic Movies, ...|
|[Classic Movies, ...|
|            [71 min]|
|[ Janeane Garofalo"]|
|[Crime TV Shows, ...|
|[Comedies,  Inter...|
|[Kids' TV,  TV Sc...|
|[Comedies,  Music...|
|     [ Margaret Cho]|
|[Action & Adventure]|
|[Action & Adventu...|
|[Comedies,  Roman...|
|[Action & Adventu...|
|[Independent Movi...|
|[Reality TV,  TV ...|
|[Anime Features, ...|
|[Action & Adventu...|
|     [Documentaries]|
|[British TV Shows...|
|          [TV Shows]|
|[Crime TV Shows, ...|
|[TV Dramas,  TV S...|
|[British TV Shows...|
|[Comedies,  Cult ...|
|[Dramas,  Interna...|
|[Kids' TV,  Reali...|
|          [Comedies]|
|[Comedies,  LGBTQ...|
|[Action & Adventu...|
|[International Mo...|
|[Documentaries,  ...|
|[Children & Famil...|
|[Dramas,  Faith &...|
|[TV Comedies,  TV...|
|[Internati

In [96]:
Netflix4 = Netflix3.withColumn('title_length', length(Netflix3['title']))
Netflix4.show(3)

+--------------------+-----------------+--------------------+------+--------------------+--------------------+------------+
|               title|         director|                cast|rating|           listed_in|         description|title_length|
+--------------------+-----------------+--------------------+------+--------------------+--------------------+------------+
|Dick Johnson Is Dead|[Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|          20|
|       Blood & Water|             null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|          13|
|           Ganglands|[Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|           9|
+--------------------+-----------------+--------------------+------+--------------------+--------------------+------------+
only showing top 3 rows



In [98]:
# Tokenize words in title column
reg_tokenizer3 = RegexTokenizer(inputCol="title", outputCol="Net_words", pattern='\\W')
reg_tokenized3 = reg_tokenizer3.transform(Netflix3)

# Tokenize words in description column
reg_tokenizer4 = RegexTokenizer(inputCol="description", outputCol="Net_tokens", pattern='\\W')
reg_tokenized4 = reg_tokenizer4.transform(reg_tokenized3)

reg_tokenized4.show(3)

+--------------------+-----------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|               title|         director|                cast|rating|           listed_in|         description|           Net_words|          Net_tokens|
+--------------------+-----------------+--------------------+------+--------------------+--------------------+--------------------+--------------------+
|Dick Johnson Is Dead|[Kirsten Johnson]|                null| PG-13|     [Documentaries]|As her father nea...|[dick, johnson, i...|[as, her, father,...|
|       Blood & Water|             null|[Ama Qamata,  Kho...| TV-MA|[International TV...|After crossing pa...|      [blood, water]|[after, crossing,...|
|           Ganglands|[Julien Leclercq]|[Sami Bouajila,  ...| TV-MA|[Crime TV Shows, ...|To protect his fa...|         [ganglands]|[to, protect, his...|
+--------------------+-----------------+--------------------+------+--------------

Running model on Netflix title column

In [100]:
# Remove stopwords in tokenized words column
Netflix_remover = StopWordsRemover(inputCol='Net_words', outputCol='filtered_title')
Netflix_removed = Netflix_remover.transform(reg_tokenized4)

# Processing title column
Netflix_cv = CountVectorizer(inputCol='filtered_title', outputCol='vec_title')
Netflix_idf = IDF(inputCol='vec_title', outputCol='Netflix_tfidf')
Netflix_numeric = StringIndexer(inputCol='rating', outputCol='label')
Netflix_Assembler = VectorAssembler(inputCols=['Netflix_tfidf','title_length'], outputCol='features')

In [102]:
# Creaing data pipeline
Netflix_pipeline = Pipeline(stages=[Netflix_numeric, reg_tokenizer3, Netflix_remover, Netflix_cv, Netflix_idf, Netflix_Assembler])   

In [103]:
Netflix_fit = Netflix_pipeline.fit(Netflix4)
Netflix_clean = Netflix_fit.transform(Netflix4)
Netflix_cleaned = Netflix_clean.select('label', 'features')

In [104]:
# Splitting data into training and testing datasets 
Netflix_train, Netflix_test = Netflix_cleaned.randomSplit([0.7, 0.3])

In [105]:
# Fitting the Naive Bayes model 3
Netflix_model = NB.fit(Netflix_train)
Netflix_results = Netflix_model.transform(Netflix_test)
Netflix_results.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(8786,[0,6,306,87...|[-133.00964010522...|[0.99608505909534...|       0.0|
|  0.0|(8786,[0,14,84,87...|[-119.27701652726...|[0.89907596148904...|       0.0|
|  0.0|(8786,[0,20,647,8...|[-145.66312945080...|[0.08215241394413...|       3.0|
|  0.0|(8786,[0,54,87,38...|[-190.37203731860...|[0.99999999999954...|       0.0|
|  0.0|(8786,[0,66,2204,...|[-180.16328493820...|[4.36707716673711...|       3.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



Model Evaluation

In [106]:
# Evaluating NB model 3
accuracy3 = accuracy_eval.evaluate(Netflix_results)
print(accuracy3)

0.27992713411574566


Running model on description column

In [107]:
# Remove stopwords in tokenized Net_tokens column
Netflix_remover2 = StopWordsRemover(inputCol='Net_tokens', outputCol='filtered_description')
removed_Netflix = Netflix_remover2.transform(Netflix_removed)

# Processing Netflix description column
Netflix_cv2 = CountVectorizer(inputCol='filtered_description', outputCol='vec_description')
Netflix_idf2 = IDF(inputCol='vec_description', outputCol='Netflix_tfidf2')
Netflix_numeric2 = StringIndexer(inputCol='rating', outputCol='label')
Netflix_Assembler2 = VectorAssembler(inputCols=['Netflix_tfidf2','title_length'], outputCol='features')

In [108]:
# Creaing data pipeline
Netflix_pipeline2 = Pipeline(stages=[Netflix_numeric2, reg_tokenizer4, Netflix_remover2, Netflix_cv2, Netflix_idf2, Netflix_Assembler2]) 

In [109]:
Netflix_fit2 = Netflix_pipeline2.fit(Netflix4)
Netflix_clean2 = Netflix_fit2.transform(Netflix4)
Netflix_cleaned2 = Netflix_clean2.select('label', 'features')

In [110]:
# Splitting data into training and testing datasets 
Netflix_train2, Netflix_test2 = Netflix_cleaned2.randomSplit([0.7, 0.3])

In [111]:
# Fitting the Naive Bayes model 4
Netflix_model2 = NB.fit(Netflix_train2)
Netflix_results2 = Netflix_model2.transform(Netflix_test2)
Netflix_results2.show(5)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(18945,[0,1,2,8,1...|[-500.08243614814...|[0.99999999985188...|       0.0|
|  0.0|(18945,[0,1,2,18,...|[-963.59648177771...|[2.16964423154668...|       1.0|
|  0.0|(18945,[0,1,19,23...|[-741.98563489087...|[5.69095791750769...|       1.0|
|  0.0|(18945,[0,2,7,29,...|[-666.41727468452...|[1.0,2.5928826361...|       0.0|
|  0.0|(18945,[0,2,9,81,...|[-919.24359865525...|[8.60055443255628...|       1.0|
+-----+--------------------+--------------------+--------------------+----------+
only showing top 5 rows



In [112]:
# Evaluating NB model 3
accuracy4 = accuracy_eval.evaluate(Netflix_results2)
print(accuracy4)

0.34025981924791665
