In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, to_date
from pyspark.sql.types import FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [6]:
# Sanity check that Spark is available: evaluating `sc` as the last expression
# of the cell displays its repr.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably it is
# the SparkContext injected by the PySpark kernel/shell (TODO confirm). A
# NameError here means no such object was provided.
sc

In [7]:
filename = '2016_US_election_tweets.csv'

# Load the raw tweets CSV into a Spark DataFrame (not a pandas one; nothing is
# converted here). The first line of the file supplies the column names.
# No schema is given and inferSchema is not enabled, so every column is read
# as a string.
# NOTE(review): the path is presumably resolved against the cluster's default
# (HDFS) filesystem; confirm for the target environment.
input_path = '/user1/CA2/' + filename
df = spark.read.option("header", True).csv(input_path)


                                                                                

In [8]:
# Keep only the three columns that the later cells work with.
wanted_columns = ["candidate_id", "created_at", "tweet_text"]
df = df.select(*wanted_columns)

In [9]:
# Preview the first 20 rows, truncating long cell values (show()'s defaults,
# spelled out).
df.show(n=20, truncate=True)

+-------------------+-------------------+--------------------+
|       candidate_id|         created_at|          tweet_text|
+-------------------+-------------------+--------------------+
|                  1|2016-08-30 14:41:22|@zitto007 @Matthe...|
|                  1|2016-08-30 14:41:22|I think @HumaAbed...|
|                  1|2016-08-30 14:41:24|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|@HillaryClinton @...|
|                  1|2016-08-30 14:41:26|                null|
|                  1|2016-08-30 14:41:26|                null|
|                  3|2016-08-30 14:41:27|                null|
|                  1|2016-08-30 14:41:28|@HillaryClinton @...|
|                  1|2016-08-30 14:41:29|                null|
|                  3|2016-08-30 14:41:31|@BrinckJeff @P

In [10]:
# Drop every row that has a null in any of the selected columns.
df_no_nulls = df.na.drop()

# Keep only rows whose candidate_id is below 2.
# The predicate is built with col() so it resolves against the frame being
# filtered instead of borrowing a column reference from the parent frame `df`.
# candidate_id is a string column (the CSV is read without a schema), so the
# cast makes the string-to-integer conversion behind the comparison explicit.
tweets = df_no_nulls.filter(col("candidate_id").cast("int") < 2)

In [12]:
# Preview the first 20 filtered tweets, showing up to 60 characters per cell.
tweets.show(n=20, truncate=60)

+------------+-------------------+------------------------------------------------------------+
|candidate_id|         created_at|                                                  tweet_text|
+------------+-------------------+------------------------------------------------------------+
|           1|2016-08-30 14:41:22|@zitto007 @MatthewHrenak @FoxNews @HillaryClinton And you...|
|           1|2016-08-30 14:41:22|I think @HumaAbedin should be ashamed that she didn't sta...|
|           1|2016-08-30 14:41:25|@HillaryClinton @realDonaldTrump @CNN #trumpPence16 https...|
|           1|2016-08-30 14:41:28|@HillaryClinton @Comeridethwhale so are you! trump &amp; ...|
|           1|2016-08-30 14:41:33|@HillaryClinton https://t.co/pgck0ifrzC atleast @realDona...|
|           1|2016-08-30 14:41:33|Working people: @HillaryClinton, not @realDonaldTrump, is...|
|           1|2016-08-30 14:41:34|@FoxNews no, @JohnKerry the media needs 2 stop covering u...|
|           1|2016-08-30 14:41:36|#NYTim

In [13]:
# A single analyzer instance, created once and used by the UDF below.
analyzer = SentimentIntensityAnalyzer()


def analyze_sentiment(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The tweet text to score, or None for a SQL NULL value.

    Returns:
        The "compound" entry of ``analyzer.polarity_scores(text)``, or None
        when ``text`` is None, so that a null input produces a null score
        rather than handing None to the analyzer.
    """
    if text is None:
        return None
    return analyzer.polarity_scores(text)["compound"]


# Wrap the function as a Spark UDF that yields a float column, and register it
# under the name "sentiment_udf" so it can also be called from Spark SQL.
sentiment_udf = udf(analyze_sentiment, FloatType())
spark.udf.register("sentiment_udf", sentiment_udf)

<function __main__.analyze_sentiment(text)>

In [14]:
# Add two derived columns in a single chained expression:
#   date      - created_at converted with to_date()
#   sentiment - sentiment_udf applied to tweet_text
tweets = (
    tweets
    .withColumn("date", to_date(col("created_at")))
    .withColumn("sentiment", sentiment_udf(col("tweet_text")))
)


In [15]:
# Preview the first 20 scored tweets, showing up to 40 characters per cell.
tweets.show(n=20, truncate=40)

[Stage 3:>                                                          (0 + 1) / 1]

+------------+-------------------+----------------------------------------+----------+---------+
|candidate_id|         created_at|                              tweet_text|      date|sentiment|
+------------+-------------------+----------------------------------------+----------+---------+
|           1|2016-08-30 14:41:22|@zitto007 @MatthewHrenak @FoxNews @Hi...|2016-08-30|  -0.6166|
|           1|2016-08-30 14:41:22|I think @HumaAbedin should be ashamed...|2016-08-30|  -0.5994|
|           1|2016-08-30 14:41:25|@HillaryClinton @realDonaldTrump @CNN...|2016-08-30|      0.0|
|           1|2016-08-30 14:41:28|@HillaryClinton @Comeridethwhale so a...|2016-08-30|  -0.8016|
|           1|2016-08-30 14:41:33|@HillaryClinton https://t.co/pgck0ifr...|2016-08-30|    -0.75|
|           1|2016-08-30 14:41:33|Working people: @HillaryClinton, not ...|2016-08-30|   0.3818|
|           1|2016-08-30 14:41:34|@FoxNews no, @JohnKerry the media nee...|2016-08-30|  -0.7783|
|           1|2016-08-30 14:41

                                                                                

In [20]:
# Average sentiment per candidate per day, sorted by candidate and then date.
group_keys = ["candidate_id", "date"]
time_based_sentiment = (
    tweets
    .groupBy(*group_keys)
    .avg("sentiment")  # produces a column named "avg(sentiment)"
    .withColumnRenamed("avg(sentiment)", "sentiment")
    .orderBy(*group_keys)
)

2023-05-08 18:56:54,331 ERROR executor.Executor: Exception in task 43.0 in stage 8.0 (TID 155)
java.lang.OutOfMemoryError: Java heap space
	at java.util.Arrays.copyOf(Arrays.java:3236)
	at java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:118)
	at java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:93)
	at java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:153)
	at org.apache.spark.util.ByteBufferOutputStream.write(ByteBufferOutputStream.scala:41)
	at java.io.ObjectOutputStream$BlockDataOutputStream.write(ObjectOutputStream.java:1853)
	at java.io.ObjectOutputStream.write(ObjectOutputStream.java:709)
	at org.apache.spark.util.Utils$.writeByteBuffer(Utils.scala:242)
	at org.apache.spark.scheduler.DirectTaskResult.$anonfun$writeExternal$1(TaskResult.scala:53)
	at org.apache.spark.scheduler.DirectTaskResult$$Lambda$2504/1746259086.apply$mcV$sp(Unknown Source)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apach

In [None]:
# Preview the first 20 rows of the daily averages (show()'s defaults, spelled
# out).
time_based_sentiment.show(n=20, truncate=True)

In [None]:
# Collect the aggregated result to the driver as a pandas DataFrame.
# NOTE(review): this rebinds `df`, the name earlier cells use for the Spark
# DataFrame read from the CSV. Re-running those cells after this one, without
# first re-running the load cell, would operate on the pandas object instead.
df = time_based_sentiment.toPandas()