In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.2.2'
#spark_version = 'spark-3.<enter version>'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Waiting for headers] [1 InRelease 14.2 kB/88.7 kB 16%] [Connected to cloud.                                                                               Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [1 InRelease 88.7 kB/88.7 kB 100%] [Connected to cloud.r-project.org (13.227                                                                               Get:3 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Get:4 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
                                                                               Get:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
                                                                               Hit:6 http://ppa.launchpad.net

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("NaiveBayes").getOrCreate()

In [55]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://moviepickfiles.s3.us-east-2.amazonaws.com/combined_df.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("combined_df.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+---+--------+--------------------+------+------------+--------------------+--------------------+-------+------------+---------+----------+
|_c0|    Mood|      original_title|    id|release_year|               genre|            overview|runtime|vote_average|   budget|   revenue|
+---+--------+--------------------+------+------------+--------------------+--------------------+-------+------------+---------+----------+
|  0|Inspired|         I Am Legend|  6479|        2007|Drama,Horror,Acti...|Robert Neville is...|  101.0|         6.9|150000000| 585349010|
|  1|Thrilled|Straight Outta Co...|277216|        2015|         Drama,Music|In 1987, five you...|  147.0|         7.7| 28000000| 201634991|
|  2|   Happy|    2 Fast 2 Furious|   584|        2003|Action,Crime,Thri...|"It's a major dou...|  107.0|         6.2| 76000000| 236350661|
|  3|   Happy|       Love Actually|   508|        2003|Comedy,Romance,Drama|Follows seemingly...|  135.0|         7.0| 40000000| 244931766|
|  4|   Happy|      

In [56]:
df.columns

['_c0',
 'Mood',
 'original_title',
 'id',
 'release_year',
 'genre',
 'overview',
 'runtime',
 'vote_average',
 'budget',
 'revenue']

In [57]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature 
data_df = df.withColumn('length', length(df['overview']))

data_df = data_df.drop('_c0',
 'id',
 'vote_average',
 'budget',
 'revenue')

data_df = data_df.withColumnRenamed("Mood","mood")

data_df.show()


+--------+--------------------+------------+--------------------+--------------------+-------+------+
|    mood|      original_title|release_year|               genre|            overview|runtime|length|
+--------+--------------------+------------+--------------------+--------------------+-------+------+
|Inspired|         I Am Legend|        2007|Drama,Horror,Acti...|Robert Neville is...|  101.0|   375|
|Thrilled|Straight Outta Co...|        2015|         Drama,Music|In 1987, five you...|  147.0|   648|
|   Happy|    2 Fast 2 Furious|        2003|Action,Crime,Thri...|"It's a major dou...|  107.0|   307|
|   Happy|       Love Actually|        2003|Comedy,Romance,Drama|Follows seemingly...|  135.0|   166|
|   Happy|       Love Actually|        2003|Comedy,Romance,Drama|Follows seemingly...|  135.0|   166|
|Romantic|       Love Actually|        2003|Comedy,Romance,Drama|Follows seemingly...|  135.0|   166|
|     Sad|       Love Actually|        2003|Comedy,Romance,Drama|Follows seemingly

### Feature Transformations


In [58]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='mood',outputCol='label')
tokenizer = Tokenizer(inputCol="overview", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

X = idf
y = pos_neg_to_num

In [59]:
pos_neg_to_num

StringIndexer_8d5739df4a66

In [60]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [61]:
# Create a and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [62]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [63]:
# Show label and resulting features
mood_index = cleaned.select(['mood','label', 'features'])

In [64]:
from pyspark.ml.classification import NaiveBayes
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3])

# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [65]:
# Tranform the model with the testing data
test_results = predictor.transform(testing)
test_results.show(40)

+---------+--------------------+------------+--------------------+--------------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|     mood|      original_title|release_year|               genre|            overview|             runtime|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+---------+--------------------+------------+--------------------+--------------------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    Angry|              8 Mile|        2002|               Drama|"The setting is D...| all he has to do...|   340|  5.0|["the, setting, i...|["the, setting, d.

In [68]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.167767


In [69]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://moviepickfiles.s3.us-east-2.amazonaws.com/movies_cleaned3.csv"
spark.sparkContext.addFile(url)
full_df = spark.read.csv(SparkFiles.get("movies_cleaned3.csv"), sep=",", header=True)

# Show DataFrame
full_df.show()

+------+--------------------+------------+--------------------+--------------------+-------+------------+--------------------+-----------------+--------------------+--------------------+--------------------+---------+----------+----------+--------------------+--------------------+
|    id|      original_title|release_year|               genre|            overview|runtime|vote_average|          keywording|original_language|      prod_companies|      prod_countries|        release_date|   budget|   revenue|popularity|    languages_spoken|             casting|
+------+--------------------+------------+--------------------+--------------------+-------+------------+--------------------+-----------------+--------------------+--------------------+--------------------+---------+----------+----------+--------------------+--------------------+
| 19995|              Avatar|        2009|Action,Adventure,...|In the 22nd centu...|    162|         7.2|culture clash,fut...|               en|Ingenious 

In [70]:
from pyspark.sql.functions import lit

full_df = full_df.withColumn('length', length(full_df['overview']))
full_df = full_df.withColumn('mood',lit(0))

big_df = full_df.select('mood','overview','length','original_title','genre','release_year','runtime')
big_df.show()

+----+--------------------+------+--------------------+--------------------+------------+-------+
|mood|            overview|length|      original_title|               genre|release_year|runtime|
+----+--------------------+------+--------------------+--------------------+------------+-------+
|   0|In the 22nd centu...|   175|              Avatar|Action,Adventure,...|        2009|    162|
|   0|Captain Barbossa,...|   176|Pirates of the Ca...|Adventure,Fantasy...|        2007|    169|
|   0|A cryptic message...|   239|             Spectre|Action,Adventure,...|        2015|    148|
|   0|Following the dea...|   428|The Dark Knight R...|Action,Crime,Dram...|        2012|    165|
|   0|John Carter is a ...|   341|         John Carter|Action,Adventure,...|        2012|    132|
|   0|The seemingly inv...|   281|         SpiderMan 3|Fantasy,Action,Ad...|        2007|    139|
|   0|When the kingdom'...|   506|             Tangled|    Animation,Family|        2010|    100|
|   0|When Tony Star

In [72]:
big_df.count()

3096

In [73]:
from pyspark.ml import Pipeline
data_prep_pipeline1 = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

cleaner1 = data_prep_pipeline1.fit(big_df)
cleaned1 = cleaner1.transform(big_df)

In [74]:
test_results = predictor.transform(cleaned1)
test_results.show(100)

+----+--------------------+------+--------------------+--------------------+------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|mood|            overview|length|      original_title|               genre|release_year|runtime|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+----+--------------------+------+--------------------+--------------------+------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   0|In the 22nd centu...|   175|              Avatar|Action,Adventure,...|        2009|    162|[in, the, 22nd, c...|[22nd, century,, ...|(262144,[19684,53...|(262144,[19684,53...|(262145,[19684,53...|[-1670.0198940992...|[0.711942

In [75]:
test_results = predictor.transform(cleaned1)
test_results.show(20)

+----+--------------------+------+--------------------+--------------------+------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|mood|            overview|length|      original_title|               genre|release_year|runtime|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+----+--------------------+------+--------------------+--------------------+------------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|   0|In the 22nd centu...|   175|              Avatar|Action,Adventure,...|        2009|    162|[in, the, 22nd, c...|[22nd, century,, ...|(262144,[19684,53...|(262144,[19684,53...|(262145,[19684,53...|[-1670.0198940992...|[0.711942

In [76]:
test_results.count()

3096

In [78]:
test_result_DF = test_results.select('original_title','genre','release_year','runtime','prediction').toPandas()


In [79]:
test_result_DF.to_csv('test_df.csv')


cp: cannot create regular file 'drive/My Drive/': No such file or directory


In [81]:
from google.colab import files

test_result_DF.to_csv('results_with_mood.csv')

files.download('results_with_mood.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [25]:
# from google.colab import drive
# drive.mount('drive')

MessageError: ignored

In [None]:
# test_results.select('label').show()
# mood_index.select('mood','label').dropDuplicates().show()

+-----+
|label|
+-----+
|  0.0|
|  0.0|
|  3.0|
|  0.0|
|  1.0|
|  1.0|
|  5.0|
|  0.0|
|  2.0|
|  3.0|
|  6.0|
|  0.0|
|  3.0|
|  5.0|
|  5.0|
|  0.0|
|  1.0|
|  4.0|
|  2.0|
|  0.0|
+-----+
only showing top 20 rows

+----------------+-----+
|            mood|label|
+----------------+-----+
|        Inspired|  1.0|
|        Thrilled|  4.0|
|             Sad|  3.0|
|       Emotional|  6.0|
|        Romantic|  2.0|
|Thrilled/Excited|  7.0|
|           Happy|  0.0|
|           Angry|  5.0|
+----------------+-----+



In [None]:
# test_DF = test_results.select('label').toPandas()
# mood_df = mood_index.select('mood','label').dropDuplicates().toPandas()

In [None]:
# mood_df.to_csv('mood_index.csv')
# !cp mood_index.csv "drive/My Drive/"

In [None]:
# test_DF.to_csv('test_df.csv')
# !cp test_df.csv "drive/My Drive/"