In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, to_date
from pyspark.sql.types import FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Bind `spark` and `sc` explicitly so the notebook does not depend on the
# kernel having pre-injected them. getOrCreate() returns the session that is
# already running when there is one, and starts a new one otherwise.
spark = SparkSession.builder.appName("election_tweet_sentiment").getOrCreate()
sc = spark.sparkContext

# Check if Spark is running: displaying the SparkContext shows its details
sc

In [3]:
# Name of the tweet dump and its full path inside the Hadoop directory
filename = '2016_US_election_tweets.csv'
input_path = '/user1/CA2/' + filename

# Load the CSV into a Spark DataFrame, taking column names from the first row
df = spark.read.csv(input_path, header=True)


In [4]:
# Keep only the three columns the rest of the notebook uses
columns_of_interest = ["candidate_id", "created_at", "tweet_text"]
df = df.select(*columns_of_interest)

In [5]:
# Preview the first 20 rows, with long cell values truncated (the defaults)
df.show(n=20, truncate=True)

+-------------------+-------------------+--------------------+
|       candidate_id|         created_at|          tweet_text|
+-------------------+-------------------+--------------------+
|                  1|2016-08-30 14:41:22|@zitto007 @Matthe...|
|                  1|2016-08-30 14:41:22|I think @HumaAbed...|
|                  1|2016-08-30 14:41:24|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|@HillaryClinton @...|
|                  1|2016-08-30 14:41:26|                null|
|                  1|2016-08-30 14:41:26|                null|
|                  3|2016-08-30 14:41:27|                null|
|                  1|2016-08-30 14:41:28|@HillaryClinton @...|
|                  1|2016-08-30 14:41:29|                null|
|                  3|2016-08-30 14:41:31|@BrinckJeff @P

In [6]:
# Discard every row containing a null in any column, then keep only the
# tweets whose candidate_id is 2
df_no_nulls = df.dropna()
tweets = df_no_nulls.where(col("candidate_id") == 2)

In [7]:
# Preview the first 20 filtered tweets, cutting each cell at 60 characters
tweets.show(n=20, truncate=60)

+------------+-------------------+------------------------------------------------------------+
|candidate_id|         created_at|                                                  tweet_text|
+------------+-------------------+------------------------------------------------------------+
|           2|2017-02-17 08:46:52|@realDonaldTrump see even he saying the media and journal...|
|           2|2017-02-17 08:48:31|    @jackschofield @realDonaldTrump  https://t.co/d6xhiIE14B|
|           2|2017-02-17 08:48:50|MT @VoteTrumpPics: Thank you @realDonaldTrump for being a...|
|           2|2017-02-17 08:48:53|Accidentally stumbled upon a 'popping' vid, the stuff com...|
|           2|2017-02-17 08:49:05|@realDonaldTrump I hope your tax plan roll out is a plan ...|
|           2|2017-02-17 11:51:36|@realDonaldTrump where is your psych evaluation?  You nee...|
|           2|2017-02-17 11:51:38|@realDonaldTrump that's the point you're at? Rush Limbaug...|
|           2|2017-02-17 11:51:40|      

In [8]:
# Single VADER analyzer instance; analyze_sentiment reads it as a global.
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    """Return the VADER "compound" sentiment score for a piece of text.

    Args:
        text: The tweet text to score, or None.

    Returns:
        The "compound" entry of ``analyzer.polarity_scores(text)``, or None
        when ``text`` is None so that a null input yields a null score
        instead of failing the whole Spark task.
    """
    # The raw data contains null tweet_text values; guard here so the UDF is
    # safe even when it is applied to rows that were not null-filtered first.
    if text is None:
        return None
    return analyzer.polarity_scores(text)["compound"]

# Wrap the scorer as a Spark UDF returning a float column, and also register
# it under the name "sentiment_udf" so it is available from Spark SQL
sentiment_udf = udf(f=analyze_sentiment, returnType=FloatType())
spark.udf.register(name="sentiment_udf", f=sentiment_udf)

<function __main__.analyze_sentiment(text)>

In [9]:
# Add two derived columns in one pass:
#   date      - the calendar date parsed from created_at
#   sentiment - the per-tweet score produced by sentiment_udf
tweets = (
    tweets
    .withColumn("date", to_date(col("created_at")))
    .withColumn("sentiment", sentiment_udf(col("tweet_text")))
)


In [10]:
# Preview the first 20 scored tweets, cutting each cell at 40 characters
tweets.show(n=20, truncate=40)

[Stage 3:>                                                          (0 + 1) / 1]

+------------+-------------------+----------------------------------------+----------+---------+
|candidate_id|         created_at|                              tweet_text|      date|sentiment|
+------------+-------------------+----------------------------------------+----------+---------+
|           2|2017-02-17 08:46:52|@realDonaldTrump see even he saying t...|2017-02-17|  -0.2023|
|           2|2017-02-17 08:48:31|@jackschofield @realDonaldTrump  http...|2017-02-17|      0.0|
|           2|2017-02-17 08:48:50|MT @VoteTrumpPics: Thank you @realDon...|2017-02-17|   0.3612|
|           2|2017-02-17 08:48:53|Accidentally stumbled upon a 'popping...|2017-02-17|    -0.34|
|           2|2017-02-17 08:49:05|@realDonaldTrump I hope your tax plan...|2017-02-17|   0.2942|
|           2|2017-02-17 11:51:36|@realDonaldTrump where is your psych ...|2017-02-17|     0.34|
|           2|2017-02-17 11:51:38|@realDonaldTrump that's the point you...|2017-02-17|      0.0|
|           2|2017-02-17 11:51

                                                                                

In [None]:
#num_partitions = tweets.rdd.getNumPartitions()
#print(f"Number of partitions: {num_partitions}")

In [None]:
#num_partitions = 300  # Adjust this value to suit the cluster size and data volume
#tweets = tweets.repartition(num_partitions)

In [None]:
#num_partitions = tweets.rdd.getNumPartitions()
#print(f"Number of partitions: {num_partitions}")

In [11]:
# Average sentiment per candidate per day, ordered chronologically within
# each candidate; the aggregate column is renamed back to "sentiment"
daily_groups = tweets.groupBy("candidate_id", "date")
time_based_sentiment = (
    daily_groups
    .avg("sentiment")
    .withColumnRenamed("avg(sentiment)", "sentiment")
    .sort("candidate_id", "date")
)

In [12]:
from pyspark.sql.functions import col

# Selection parameters: edit these to choose the candidate and the earliest
# date to keep
selected_candidate = 2
given_date = "2016-01-01"

# Build each condition separately, then keep the rows satisfying both
is_selected_candidate = col("candidate_id") == selected_candidate
on_or_after_given_date = col("date") >= given_date
filtered_df = time_based_sentiment.where(
    is_selected_candidate & on_or_after_given_date
)

In [None]:
# Convert the filtered daily averages to a pandas DataFrame for plotting.
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# Spark DataFrame read from the CSV; re-running earlier cells after this one
# will see the pandas object instead.
df = filtered_df.toPandas()



In [None]:
import matplotlib.pyplot as plt

# Plot the daily mean sentiment over time on an explicit figure/axes pair.
# The first row is skipped.
# NOTE(review): the reason for dropping the first row is not documented -- confirm.
fig, ax = plt.subplots()
ax.plot(df['date'].iloc[1:], df['sentiment'].iloc[1:])

# Label the figure so it can be read on its own
ax.set_title("Daily mean tweet sentiment")
ax.set_xlabel("Date")
ax.set_ylabel("Mean sentiment score")
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
plt.show()