In [2]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu bionic InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (52.85.151.8)] [Conn                                                                               Get:2 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
                                                                               Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Waiting for headers] [Connected to cloud.r-project.org (52.85.151.8)] [Wait                                                                               Get:4 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]
0% [1 InRelease gpgv 242 kB] [4 InRelease 50.4 kB/74.6 kB 68%] [Waiting for hea                                                                               Get:5 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:6 https://developer.download.nvidia.c

In [3]:
# PostgreSQL driver
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-05-15 19:31:34--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-05-15 19:31:34 (5.58 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Twitter_Sentiment_NLP").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [5]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url ="https://ilanp-bucket.s3.us-west-2.amazonaws.com/sentiment_analysis_10k.csv"
spark.sparkContext.addFile(url)
tweet_df = spark.read.csv(SparkFiles.get("sentiment_analysis_10k.csv"), sep=",", header=True, inferSchema=True)

In [6]:
tweet_df.show(5)

+--------+----------+--------------------+--------+--------------+--------------------+--------+
|polarity|        id|                date|   query|          user|                text|new_date|
+--------+----------+--------------------+--------+--------------+--------------------+--------+
|       0|2051199119|Fri Jun 05 21:04:...|NO_QUERY|    alicatpurr|My moonstone pend...|  6/5/09|
|       0|2051199378|Fri Jun 05 21:04:...|NO_QUERY|      joshwehe|Watching baseball...|  6/5/09|
|       0|2051200441|Fri Jun 05 21:05:...|NO_QUERY|    qwerkyqook|RIP cute black ma...|  6/5/09|
|       0|2051201409|Fri Jun 05 21:05:...|NO_QUERY|       Lizfig3|@pandafandanga we...|  6/5/09|
|       0|2051201881|Fri Jun 05 21:05:...|NO_QUERY|sweet_ctstrphe|lost my  voice  w...|  6/5/09|
+--------+----------+--------------------+--------+--------------+--------------------+--------+
only showing top 5 rows



In [7]:
# Import functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [8]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature
data_df = tweet_df.withColumn('length', length(tweet_df['text']))
data_df.show()

+--------+----------+--------------------+--------+---------------+--------------------+--------+------+
|polarity|        id|                date|   query|           user|                text|new_date|length|
+--------+----------+--------------------+--------+---------------+--------------------+--------+------+
|       0|2051199119|Fri Jun 05 21:04:...|NO_QUERY|     alicatpurr|My moonstone pend...|  6/5/09|   126|
|       0|2051199378|Fri Jun 05 21:04:...|NO_QUERY|       joshwehe|Watching baseball...|  6/5/09|    63|
|       0|2051200441|Fri Jun 05 21:05:...|NO_QUERY|     qwerkyqook|RIP cute black ma...|  6/5/09|    53|
|       0|2051201409|Fri Jun 05 21:05:...|NO_QUERY|        Lizfig3|@pandafandanga we...|  6/5/09|    80|
|       0|2051201881|Fri Jun 05 21:05:...|NO_QUERY| sweet_ctstrphe|lost my  voice  w...|  6/5/09|    62|
|       0|2051201994|Fri Jun 05 21:05:...|NO_QUERY|    thBIKINIboy|@yohnnywalker jay...|  6/5/09|   136|
|       0|2051202056|Fri Jun 05 21:05:...|NO_QUERY|   M

In [9]:
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='polarity',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [10]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [11]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [12]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [13]:
cleaned.select(['label','features']).show(truncate=False)

+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                                                                                                                                                     |
+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3], 21)

In [15]:
from pyspark.ml.classification import NaiveBayes
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [16]:
test_results = predictor.transform(testing)
test_results.show(20) 

+--------+----------+--------------------+--------+--------------+--------------------+--------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|polarity|        id|                date|   query|          user|                text|new_date|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------+----------+--------------------+--------+--------------+--------------------+--------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|       0|1468159766|Tue Apr 07 00:02:...|NO_QUERY|    amcpodcast|@Skunkie Sorry, I...|  4/7/09|    68|  0.0|[@skunkie, sorry,...|[@skunkie, sorry,...|(262144,[2437,108...|(262144,[2437,108...|(26

In [17]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

acc_eval = BinaryClassificationEvaluator(labelCol='label', rawPredictionCol='prediction')
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" % acc)


Accuracy of model at predicting reviews was: 0.674267


In [None]:
# Store environmental variable
from getpass import getpass
password = getpass('Provide Password')

# Configure settings for RDS
mode = "overwrite"
jdbc_url="jdbc:postgresql://database-1.c3f2jo4rdylg.us-west-2.rds.amazonaws.com:5432/sentiment_analysis"
config = {"user":"postgres", 
          "password": password, 
          "driver":"org.postgresql.Driver"}

In [None]:
test_results["polarity",'text','new_date',"length","label", "token_text","features", "prediction"].show(truncate=False)

In [None]:
# Write DataFrame to active_user table in RDS
test_results["polarity",'text','new_date',"length","label", "token_text","features", "prediction"].write.jdbc(url=jdbc_url, table='Test_Results2', mode=mode, properties=config)