In [1]:
# Turn on Spark's Arrow setting for PySpark, which Spark uses for
# Spark <-> pandas conversions such as toPandas().
# NOTE(review): `spark` is not created in this notebook before this line, so it
# is presumably injected by the PySpark kernel — confirm.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, to_date
from pyspark.sql.types import FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
# Sanity check that Spark is running: evaluating `sc` as the cell's last
# expression displays it (presumably the kernel-provided SparkContext — confirm).
sc

In [4]:
# Name of the raw tweet dump and its full location in the Hadoop directory
filename = '2016_US_election_tweets.csv'
input_path = '/user1/CA2/' + filename

# Load the CSV into a Spark DataFrame (not pandas), using the first row as the
# column names. No schema or inferSchema is given, so columns are read as strings.
df = spark.read.csv(input_path, header=True)


In [5]:
# Keep only the three columns used by the rest of the notebook
wanted_columns = ['candidate_id', 'created_at', 'tweet_text']
df = df.select(*wanted_columns)

In [6]:
# Preview the leading rows of the three selected columns
df.show()

+-------------------+-------------------+--------------------+
|       candidate_id|         created_at|          tweet_text|
+-------------------+-------------------+--------------------+
|                  1|2016-08-30 14:41:22|@zitto007 @Matthe...|
|                  1|2016-08-30 14:41:22|I think @HumaAbed...|
|                  1|2016-08-30 14:41:24|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|@HillaryClinton @...|
|                  1|2016-08-30 14:41:26|                null|
|                  1|2016-08-30 14:41:26|                null|
|                  3|2016-08-30 14:41:27|                null|
|                  1|2016-08-30 14:41:28|@HillaryClinton @...|
|                  1|2016-08-30 14:41:29|                null|
|                  3|2016-08-30 14:41:31|@BrinckJeff @P

In [7]:
# Discard every row that has a null in any column of df
df_no_nulls = df.na.drop()

# Keep only the rows for candidate 2. The predicate is written with col(...)
# so it is resolved against df_no_nulls itself, rather than borrowing a column
# handle from the parent `df`, which is a different DataFrame object.
# NOTE(review): candidate_id was loaded without a schema, so comparing it to
# the integer 2 presumably relies on Spark's implicit cast — confirm.
tweets = df_no_nulls.filter(col("candidate_id") == 2)

In [8]:
# Preview the filtered tweets, truncating long string cells to 60 characters
tweets.show(truncate=60)

+------------+-------------------+------------------------------------------------------------+
|candidate_id|         created_at|                                                  tweet_text|
+------------+-------------------+------------------------------------------------------------+
|           2|2017-02-17 08:46:52|@realDonaldTrump see even he saying the media and journal...|
|           2|2017-02-17 08:48:31|    @jackschofield @realDonaldTrump  https://t.co/d6xhiIE14B|
|           2|2017-02-17 08:48:50|MT @VoteTrumpPics: Thank you @realDonaldTrump for being a...|
|           2|2017-02-17 08:48:53|Accidentally stumbled upon a 'popping' vid, the stuff com...|
|           2|2017-02-17 08:49:05|@realDonaldTrump I hope your tax plan roll out is a plan ...|
|           2|2017-02-17 11:51:36|@realDonaldTrump where is your psych evaluation?  You nee...|
|           2|2017-02-17 11:51:38|@realDonaldTrump that's the point you're at? Rush Limbaug...|
|           2|2017-02-17 11:51:40|      

In [9]:
# Module-level VADER analyzer used by analyze_sentiment below
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the VADER compound sentiment score for a piece of text.

    Args:
        text: The tweet text to score, or None.

    Returns:
        The "compound" entry of ``analyzer.polarity_scores(text)``, or None
        when ``text`` is None, so a null input yields a null score instead of
        raising inside the UDF.
    """
    if text is None:
        return None
    return analyzer.polarity_scores(text)["compound"]

# Wrap the scorer as a Spark UDF returning FloatType, for use in column expressions
sentiment_udf = udf(analyze_sentiment, FloatType())
# Also register it with the session under the name "sentiment_udf"
spark.udf.register("sentiment_udf", sentiment_udf)

<function __main__.analyze_sentiment(text)>

In [10]:
# Add both derived columns in one chained expression:
#   date      - created_at converted with to_date
#   sentiment - score produced by sentiment_udf from tweet_text
tweets = (
    tweets
    .withColumn("date", to_date(col("created_at")))
    .withColumn("sentiment", sentiment_udf(col("tweet_text")))
)


In [11]:
# Preview the tweets with the new date and sentiment columns,
# truncating long string cells to 40 characters
tweets.show(truncate=40)

[Stage 3:>                                                          (0 + 1) / 1]

+------------+-------------------+----------------------------------------+----------+---------+
|candidate_id|         created_at|                              tweet_text|      date|sentiment|
+------------+-------------------+----------------------------------------+----------+---------+
|           2|2017-02-17 08:46:52|@realDonaldTrump see even he saying t...|2017-02-17|  -0.2023|
|           2|2017-02-17 08:48:31|@jackschofield @realDonaldTrump  http...|2017-02-17|      0.0|
|           2|2017-02-17 08:48:50|MT @VoteTrumpPics: Thank you @realDon...|2017-02-17|   0.3612|
|           2|2017-02-17 08:48:53|Accidentally stumbled upon a 'popping...|2017-02-17|    -0.34|
|           2|2017-02-17 08:49:05|@realDonaldTrump I hope your tax plan...|2017-02-17|   0.2942|
|           2|2017-02-17 11:51:36|@realDonaldTrump where is your psych ...|2017-02-17|     0.34|
|           2|2017-02-17 11:51:38|@realDonaldTrump that's the point you...|2017-02-17|      0.0|
|           2|2017-02-17 11:51

                                                                                

In [12]:
# Display the Python type of `tweets`
type(tweets)

pyspark.sql.dataframe.DataFrame

In [13]:
# Count the rows in `tweets` (an action, so it runs a Spark job)
tweets.count()

                                                                                

41901010

In [14]:
# Drop candidate_id and created_at; keep the text, its date and its score
final_columns = ('tweet_text', 'date', 'sentiment')
tweets = tweets.select(*final_columns)

In [15]:
# Preview the narrowed frame, truncating long string cells to 80 characters
tweets.show(truncate = 80)

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------+----------+---------+
|                                                                      tweet_text|      date|sentiment|
+--------------------------------------------------------------------------------+----------+---------+
|@realDonaldTrump see even he saying the media and journalist lie about shit a...|2017-02-17|  -0.2023|
|                        @jackschofield @realDonaldTrump  https://t.co/d6xhiIE14B|2017-02-17|      0.0|
|MT @VoteTrumpPics: Thank you @realDonaldTrump for being a man of your word. h...|2017-02-17|   0.3612|
|Accidentally stumbled upon a 'popping' vid, the stuff coming out of the pimpl...|2017-02-17|    -0.34|
|@realDonaldTrump I hope your tax plan roll out is a plan to roll out your tax...|2017-02-17|   0.2942|
|@realDonaldTrump where is your psych evaluation?  You need serious profession...|2017-02-17|     0.34|
|         @realDonaldTrump that's the point you're at? Rush Limb

                                                                                

In [16]:
import pyspark.sql.functions as F
import matplotlib.pyplot as plt

In [17]:
# Mean sentiment per date, exposed as "avg_sentiment" and ordered by date
avg_sentiment = F.mean("sentiment").alias("avg_sentiment")
daily_sentiment_df = (
    tweets
    .groupBy("date")
    .agg(avg_sentiment)
    .orderBy("date")
)

In [20]:
# Ask Spark to release any persisted data for both frames.
# NOTE(review): no cache()/persist() call on either frame is visible in this
# notebook, so these calls presumably have nothing to release — confirm whether
# a caching cell was deleted or this cell can go.
tweets.unpersist()
daily_sentiment_df.unpersist()

DataFrame[date: date, avg_sentiment: double]

In [None]:
# Count the rows of the per-date aggregate (an action, so it runs a Spark job)
daily_sentiment_df.count()



In [None]:
# Collect the per-date averages into a pandas DataFrame
daily_sentiment_pd = daily_sentiment_df.toPandas()

In [21]:
# Print the leading rows of the per-date averages.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt.
# Nothing visible caches `tweets` or the aggregate, so each action presumably
# re-runs the Python sentiment UDF over every tweet — confirm and consider
# caching the aggregate before calling count()/toPandas()/show() on it.
daily_sentiment_df.show()

ERROR:root:KeyboardInterrupt while sending command.               (4 + 4) / 106]
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 