In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, to_date
from pyspark.sql.types import FloatType
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Check if Spark is running
sc

In [3]:
filename = '2016_US_election_tweets.csv'

# Read input file from hadoop directory and convert to pandas
df = spark.read.csv('/user1/CA2/'+filename,header=True)


In [4]:
df = df.select('candidate_id', "created_at", "tweet_text")

In [5]:
df.show()

+-------------------+-------------------+--------------------+
|       candidate_id|         created_at|          tweet_text|
+-------------------+-------------------+--------------------+
|                  1|2016-08-30 14:41:22|@zitto007 @Matthe...|
|                  1|2016-08-30 14:41:22|I think @HumaAbed...|
|                  1|2016-08-30 14:41:24|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|                null|
|                  1|2016-08-30 14:41:25|@HillaryClinton @...|
|                  1|2016-08-30 14:41:26|                null|
|                  1|2016-08-30 14:41:26|                null|
|                  3|2016-08-30 14:41:27|                null|
|                  1|2016-08-30 14:41:28|@HillaryClinton @...|
|                  1|2016-08-30 14:41:29|                null|
|                  3|2016-08-30 14:41:31|@BrinckJeff @P

In [6]:
df_no_nulls = df.na.drop()
tweets = df_no_nulls.filter(df.candidate_id < 2)

In [7]:
tweets.show(truncate=60)

+------------+-------------------+------------------------------------------------------------+
|candidate_id|         created_at|                                                  tweet_text|
+------------+-------------------+------------------------------------------------------------+
|           1|2016-08-30 14:41:22|@zitto007 @MatthewHrenak @FoxNews @HillaryClinton And you...|
|           1|2016-08-30 14:41:22|I think @HumaAbedin should be ashamed that she didn't sta...|
|           1|2016-08-30 14:41:25|@HillaryClinton @realDonaldTrump @CNN #trumpPence16 https...|
|           1|2016-08-30 14:41:28|@HillaryClinton @Comeridethwhale so are you! trump &amp; ...|
|           1|2016-08-30 14:41:33|@HillaryClinton https://t.co/pgck0ifrzC atleast @realDona...|
|           1|2016-08-30 14:41:33|Working people: @HillaryClinton, not @realDonaldTrump, is...|
|           1|2016-08-30 14:41:34|@FoxNews no, @JohnKerry the media needs 2 stop covering u...|
|           1|2016-08-30 14:41:36|#NYTim

In [8]:
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment(text):
    return analyzer.polarity_scores(text)["compound"]

sentiment_udf = udf(analyze_sentiment, FloatType())
spark.udf.register("sentiment_udf", sentiment_udf)

<function __main__.analyze_sentiment(text)>

In [9]:
# Convert timestamp to date
tweets = tweets.withColumn("date", to_date(col("created_at")))

# Calculate sentiment score for each tweet
tweets = tweets.withColumn("sentiment", sentiment_udf(col("tweet_text")))


In [10]:
tweets.show(truncate=40)

[Stage 3:>                                                          (0 + 1) / 1]

+------------+-------------------+----------------------------------------+----------+---------+
|candidate_id|         created_at|                              tweet_text|      date|sentiment|
+------------+-------------------+----------------------------------------+----------+---------+
|           1|2016-08-30 14:41:22|@zitto007 @MatthewHrenak @FoxNews @Hi...|2016-08-30|  -0.6166|
|           1|2016-08-30 14:41:22|I think @HumaAbedin should be ashamed...|2016-08-30|  -0.5994|
|           1|2016-08-30 14:41:25|@HillaryClinton @realDonaldTrump @CNN...|2016-08-30|      0.0|
|           1|2016-08-30 14:41:28|@HillaryClinton @Comeridethwhale so a...|2016-08-30|  -0.8016|
|           1|2016-08-30 14:41:33|@HillaryClinton https://t.co/pgck0ifr...|2016-08-30|    -0.75|
|           1|2016-08-30 14:41:33|Working people: @HillaryClinton, not ...|2016-08-30|   0.3818|
|           1|2016-08-30 14:41:34|@FoxNews no, @JohnKerry the media nee...|2016-08-30|  -0.7783|
|           1|2016-08-30 14:41

                                                                                

In [11]:
num_partitions = tweets.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

Number of partitions: 106


In [12]:
num_partitions = 300  # Adjust this value based on the guidelines above
tweets = tweets.repartition(num_partitions)

In [None]:
num_partitions = tweets.rdd.getNumPartitions()
print(f"Number of partitions: {num_partitions}")

In [None]:
df = tweets.toPandas()

In [13]:
time_based_sentiment = tweets.groupBy("candidate_id", "date") \
    .mean("sentiment") \
    .withColumnRenamed("avg(sentiment)", "sentiment") \
    .orderBy("candidate_id", "date")

In [14]:
time_based_sentiment.show()



+------------+----------+--------------------+
|candidate_id|      date|           sentiment|
+------------+----------+--------------------+
|           .|      null|                 0.0|
|         000|      null|                 0.0|
|           1|      null|0.012597482133958006|
|           1|1542-01-01| -0.4521999955177307|
|           1|1922-01-01| 0.10270000249147415|
|           1|1969-01-01|-0.27320000529289246|
|           1|1980-01-01|                 0.0|
|           1|1991-01-01| 0.42149999737739563|
|           1|1996-01-01| -0.3400000035762787|
|           1|2005-01-01|                0.25|
|           1|2008-01-01|-0.27320000529289246|
|           1|2009-01-01|-0.49390000104904175|
|           1|2010-01-01|                 0.0|
|           1|2015-01-01|-0.18076666196187338|
|           1|2016-01-01| 0.04318333168824514|
|           1|2016-08-30|-0.01751336550423...|
|           1|2016-08-31|-0.02701517064363...|
|           1|2016-09-01|-0.03145809520396605|
|           1

                                                                                

In [None]:
df = time_based_sentiment.toPandas()