In [4]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 35 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 36.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=5b05561dd3a8c3229930dd86dac1b4e69d348cee0833d78faf050af193c6fde2
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [5]:
pip install findspark



In [6]:
# Importing the necessary modules
import findspark
findspark.init()

# Initialize a SparkSession
from pyspark.sql import SparkSession

# Creating SparkSession
spark = SparkSession.builder.appName('TP').getOrCreate()

# Calling the session variable object
spark

In [7]:
netflix_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/netflix_titles.csv"
disney_url = "https://raw.githubusercontent.com/AmandaClinnie/DS625-TeamProject/main/disney_plus_titles.csv"

from pyspark import SparkFiles
spark.sparkContext.addFile(netflix_url)
spark.sparkContext.addFile(disney_url)

netflix_df = spark.read.csv("file:///"+SparkFiles.get("netflix_titles.csv"), header=True, inferSchema= True)
disney_df = spark.read.csv("file:///"+SparkFiles.get("disney_plus_titles.csv"), header=True, inferSchema= True)

In [8]:
netflix_df.printSchema()
disney_df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [9]:
netflix_df.show(3)
disney_df.show(3)

+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|       director|                cast|      country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+---------------+--------------------+-------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|Kirsten Johnson|                null|United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|           null|Ama Qamata, Khosi...| South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglands|Julien Leclercq|Sami Bouajila, Tr...|         null|Septem

In [10]:
disney_df.count()
#netflix_df.count()

1450

In [11]:

netflix_df.filter(netflix_df.director.isNull()).show()

+-------+-------+--------------------+--------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|director|                cast|       country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------+--------------------+--------------+------------------+------------+------+---------+--------------------+--------------------+
|     s2|TV Show|       Blood & Water|    null|Ama Qamata, Khosi...|  South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s4|TV Show|Jailbirds New Orl...|    null|                null|          null|September 24, 2021|        2021| TV-MA| 1 Season|Docuseries, Reali...|Feuds, flirtation...|
|     s5|TV Show|        Kota Factory|    null|Mayur More, Jiten...|         India|September 24, 2021|        2021| TV-MA|2 S

In [12]:
disney_cl01 = disney_df.filter(disney_df.director.isNotNull())

In [13]:
disney_cl02 = disney_cl01.filter(disney_cl01.release_year.isNotNull())

In [14]:
disney_cl03 = disney_cl02.filter(disney_cl02.cast.isNotNull())

In [15]:
disney_nb = disney_cl03.filter(disney_cl01.rating.isNotNull())

In [16]:
disney_nb.count()

879

In [17]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import * 
from pyspark.ml import Pipeline 
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.feature import StringIndexer 
from pyspark.ml.classification import NaiveBayes 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [18]:

indexers = [
StringIndexer(inputCol="rating", outputCol = "rating_index"),  
StringIndexer(inputCol="cast", outputCol = "cast_index"),
StringIndexer(inputCol="release_year", outputCol = "year_index"),
StringIndexer(inputCol="director", outputCol = "label")]

pipeline = Pipeline(stages=indexers) 

In [19]:
indexed_disney = pipeline.fit(disney_nb).transform(disney_nb) 
indexed_disney.show(5,False) 
#We have given False for turn off default truncation

+-------+-----+------------------------------------------------+---------------------------------+--------------------------------------------------------------------------------------------+-------------+-----------------+------------+------+--------+-------------------------+-------------------------------------------------------------------------+------------+----------+----------+-----+
|show_id|type |title                                           |director                         |cast                                                                                        |country      |date_added       |release_year|rating|duration|listed_in                |description                                                              |rating_index|cast_index|year_index|label|
+-------+-----+------------------------------------------------+---------------------------------+--------------------------------------------------------------------------------------------+-------------+-------

In [20]:
vectorAssembler = VectorAssembler(inputCols = ["rating_index", "year_index", "cast_index"],outputCol = "features") 
vindexed_disney = vectorAssembler.transform(indexed_disney) 
vindexed_disney.show(5, False)

+-------+-----+------------------------------------------------+---------------------------------+--------------------------------------------------------------------------------------------+-------------+-----------------+------------+------+--------+-------------------------+-------------------------------------------------------------------------+------------+----------+----------+-----+----------------+
|show_id|type |title                                           |director                         |cast                                                                                        |country      |date_added       |release_year|rating|duration|listed_in                |description                                                              |rating_index|cast_index|year_index|label|features        |
+-------+-----+------------------------------------------------+---------------------------------+--------------------------------------------------------------------------------

In [21]:
splits = vindexed_disney.randomSplit([0.6,0.4], 42) 
# optional value 42 is seed for sampling 
train_df = splits[0] 
test_df = splits[1]

In [22]:
nb = NaiveBayes(modelType="multinomial")

In [23]:
nbmodel = nb.fit(train_df)

In [24]:
predictions_df = nbmodel.transform(test_df)
predictions_df.show(5, True)

+-------+-----+--------------------+--------------------+--------------------+-------------+------------------+------------+------+--------+--------------------+--------------------+------------+----------+----------+-----+----------------+--------------------+--------------------+----------+
|show_id| type|               title|            director|                cast|      country|        date_added|release_year|rating|duration|           listed_in|         description|rating_index|cast_index|year_index|label|        features|       rawPrediction|         probability|prediction|
+-------+-----+--------------------+--------------------+--------------------+-------------+------------------+------------+------+--------+--------------------+--------------------+------------+----------+----------+-----+----------------+--------------------+--------------------+----------+
|     s1|Movie|Duck the Halls: A...|Alonso Ramirez Ra...|Chris Diamantopou...|         null| November 26, 2021|       

In [25]:
predictions_df.select([c for c in predictions_df.columns if c in ['director','rating','label','cast']]).show()

+--------------------+--------------------+------+-----+
|            director|                cast|rating|label|
+--------------------+--------------------+------+-----+
|Alonso Ramirez Ra...|Chris Diamantopou...|  TV-G|153.0|
|          P.J. Hogan|Isla Fisher, Hugh...|    PG|448.0|
|      Michael Hegner|Tom Kane, Anthony...| TV-Y7| 15.0|
|       Salvador Simó|Dove Cameron, Sof...|  TV-G|499.0|
|      Michael Hegner|Tom Kane, Anthony...| TV-Y7| 15.0|
|      Michael Hegner|Tom Kane, Anthony...| TV-Y7| 15.0|
|Roberts Gannaway,...|Daveigh Chase, Ch...|     G|488.0|
|        Charles Haid|Kyle Massey, Kay ...|  TV-G| 62.0|
|        James Lapine|Michael Fox, Nath...|    PG|314.0|
|     Steven Tsuchida|Tyra Banks, Franc...| TV-14|531.0|
|Christopher Sande...|Daveigh Chase, Ch...| TV-PG|219.0|
|           Paul Hoen|Paul Kiernan, Tim...|  TV-G|  0.0|
| Katie Bauer Murdock|       Devin E. Haqq| TV-PG|103.0|
|         Peyton Reed|Paul Rudd, Evange...| PG-13|125.0|
|         Joss Whedon|Robert Do

In [26]:
netflix_df.count()

8809

In [27]:
netflix_cl01 = netflix_df.filter(netflix_df.director.isNotNull())

In [28]:
netflix_cl02 = netflix_cl01.filter(netflix_cl01.release_year.isNotNull())

In [29]:
netflix_cl03 = netflix_cl02.filter(netflix_cl02.cast.isNotNull())

In [30]:
netflix_nb = netflix_cl03.filter(netflix_cl01.rating.isNotNull())

In [31]:
netflix_nb.count()

5700

In [32]:


indexersN = [

StringIndexer(inputCol="rating", outputCol = "rating_index"),  

StringIndexer(inputCol="cast", outputCol = "cast_index"),

StringIndexer(inputCol="release_year", outputCol = "year_index"),

StringIndexer(inputCol="director", outputCol = "label")]



pipelineN = Pipeline(stages=indexersN) 



In [33]:
indexed_netflix = pipelineN.fit(netflix_nb).transform(netflix_nb) 
indexed_netflix.show(5,False) 

+-------+-------+--------------------------------+-----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------+------------------+------------+------+---------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+----------+------+
|show_id|type   |title                           |director                     |cast                                                                                                                                                                                                                                 

In [34]:
vectorAssembler = VectorAssembler(inputCols = ["rating_index", "year_index", "cast_index"],outputCol = "features") 
vindexed_netflix = vectorAssembler.transform(indexed_netflix) 
vindexed_netflix.show(5, False)

+-------+-------+--------------------------------+-----------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------+------------------+------------+------+---------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------+------------+----------+----------+------+-----------------+
|show_id|type   |title                           |director                     |cast                                                                                                                                                                                                               

In [35]:
splits = vindexed_netflix.randomSplit([0.6,0.4], 42) 
# optional value 42 is seed for sampling 
train_df = splits[0] 
test_df = splits[1]

In [36]:
nbN = NaiveBayes(modelType="multinomial")

In [37]:
nbmodelN = nbN.fit(train_df)

In [38]:
predictionsN_df = nbmodelN.transform(test_df)
predictionsN_df.show(5, True)

+----------------+-------------+--------------------+-------------+--------------------+--------------------+--------------+------------+--------------------+--------------------+--------------------+--------------------+------------+----------+----------+------+------------------+--------------------+--------------------+----------+
|         show_id|         type|               title|     director|                cast|             country|    date_added|release_year|              rating|            duration|           listed_in|         description|rating_index|cast_index|year_index| label|          features|       rawPrediction|         probability|prediction|
+----------------+-------------+--------------------+-------------+--------------------+--------------------+--------------+------------+--------------------+--------------------+--------------------+--------------------+------------+----------+----------+------+------------------+--------------------+--------------------+----

In [39]:
predictionsN_df.select([c for c in predictionsN_df.columns if c in ['director','rating','label','cast']]).show()

+--------------------+--------------------+--------------------+------+
|            director|                cast|              rating| label|
+--------------------+--------------------+--------------------+------+
|       United States|      March 31, 2017|Classic Movies, D...|3982.0|
|           Joe Penna|Anna Kendrick, To...|               TV-MA|2253.0|
|       Curtis Graham|Olu Jacobs, Richa...|               TV-14|1460.0|
|         Suhas Kadav|Sourav Chakrabort...|               TV-Y7|   5.0|
|         Suhas Kadav|Sourav Chakrabort...|                TV-Y|   5.0|
|Akshay Sanjeev Ch...|Shailendra Pandey...|               TV-Y7| 893.0|
|  Edward James Olmos|Edward James Olmo...|                   R|1667.0|
|           Kiran Rao|Aamir Khan, Monic...|               TV-MA|2520.0|
|      Abbas Tyrewala|Imran Khan, Genel...|               TV-14| 823.0|
|  Ashutosh Gowariker|Aamir Khan, Gracy...|                  PG|  34.0|
|Anusha Rizvi, Mah...|Omkar Das Manikpu...|               TV-MA|