In [1]:
import json
import sys
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns


df_posts = pd.read_csv("../DATA/cleaned_posts.csv")
df_comments = pd.read_csv("../DATA/cleaned_comments.csv")

In [2]:
#Fixing typing from read_csv
df_posts['text'] = df_posts['text'].astype(str)
df_comments['text'] = df_comments['text'].astype(str)
print(df_posts.columns)

Index(['selftext', 'created_utc', 'ups', 'subreddit', 'link_flair_text',
       'title', 'text'],
      dtype='object')


In [3]:
#Import and download pretrained VADER model
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\brian\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
def add_sentiment_scores(df):
    # Initialize VADER SentimentIntensityAnalyzer
    sia = SentimentIntensityAnalyzer()

    #Ensure typing, removing empty strings
    df['text'] = df['text'].astype(str).str.strip()

    # Calculate sentiment scores and add to dataset
    df['compound_sentiment_score'] = df['text'].apply(lambda x: sia.polarity_scores(x)['compound'])
    df['positive_sentiment_score'] = df['text'].apply(lambda x: sia.polarity_scores(x)['pos'])
    df['negative_sentiment_score'] = df['text'].apply(lambda x: sia.polarity_scores(x)['neg'])
    df['neutral_sentiment_score'] = df['text'].apply(lambda x: sia.polarity_scores(x)['neu'])
    return df

#Calculate scores for both datasets
posts_scored = add_sentiment_scores(df_posts)
comments_scored = add_sentiment_scores(df_comments)


In [5]:
def filter_by_keyword(df, keyword):
    # Filter for posts/comments that contain the specified keyword (case insensitive)
    return df[df['text'].str.contains(keyword, case=False, na=False)]

posts_filtered = filter_by_keyword(posts_scored, 'biden')
comments_filtered = filter_by_keyword(comments_scored, 'biden')

In [6]:
posts_filtered.describe()

Unnamed: 0,created_utc,ups,compound_sentiment_score,positive_sentiment_score,negative_sentiment_score,neutral_sentiment_score
count,989.0,989.0,989.0,989.0,989.0,989.0
mean,1721478000.0,84.67543,0.047356,0.082413,0.07639,0.841185
std,265878.0,325.855018,0.429822,0.117014,0.117899,0.153137
min,1720915000.0,0.0,-0.9976,0.0,0.0,0.244
25%,1721266000.0,1.0,-0.2023,0.0,0.0,0.743
50%,1721521000.0,1.0,0.0,0.0,0.0,0.849
75%,1721660000.0,15.0,0.34,0.151,0.14,1.0
max,1722113000.0,4988.0,0.9995,0.697,0.66,1.0


In [7]:
comments_filtered.describe()

Unnamed: 0,created_utc,ups,compound_sentiment_score,positive_sentiment_score,negative_sentiment_score,neutral_sentiment_score
count,10934.0,10934.0,10934.0,10934.0,10934.0,10934.0
mean,1721476000.0,8.995336,0.079552,0.107372,0.089781,0.802845
std,278825.8,39.755202,0.615735,0.086197,0.079944,0.105864
min,1720915000.0,-175.0,-0.9993,0.0,0.0,0.24
25%,1721269000.0,1.0,-0.4767,0.046,0.024,0.739
50%,1721501000.0,2.0,0.0644,0.098,0.08,0.804
75%,1721662000.0,6.0,0.65285,0.153,0.133,0.87
max,1722124000.0,1567.0,0.9993,0.737,0.76,1.0


In [8]:
num_rows = posts_scored.shape[0]
print(f'Number of rows in posts_scored: {num_rows}')
num_rows = posts_filtered.shape[0]
print(f'Number of rows in posts_filtered: {num_rows}')
num_rows = comments_scored.shape[0]
print(f'Number of rows in comments_scored: {num_rows}')
num_rows = comments_filtered.shape[0]
print(f'Number of rows in comments_filtered: {num_rows}')


Number of rows in posts_scored: 6146
Number of rows in posts_filtered: 989
Number of rows in comments_scored: 80285
Number of rows in comments_filtered: 10934


In [9]:
comments_filtered.describe()

Unnamed: 0,created_utc,ups,compound_sentiment_score,positive_sentiment_score,negative_sentiment_score,neutral_sentiment_score
count,10934.0,10934.0,10934.0,10934.0,10934.0,10934.0
mean,1721476000.0,8.995336,0.079552,0.107372,0.089781,0.802845
std,278825.8,39.755202,0.615735,0.086197,0.079944,0.105864
min,1720915000.0,-175.0,-0.9993,0.0,0.0,0.24
25%,1721269000.0,1.0,-0.4767,0.046,0.024,0.739
50%,1721501000.0,2.0,0.0644,0.098,0.08,0.804
75%,1721662000.0,6.0,0.65285,0.153,0.133,0.87
max,1722124000.0,1567.0,0.9993,0.737,0.76,1.0


In [10]:
#We did some investigation of some of the "True neutral" comments as a score of zero seemed erroneous
#These were just posts that were too short to contain any rich semantic information 
filtered = posts_scored[posts_scored['compound_sentiment_score'] == 0.0]
print(filtered[['compound_sentiment_score', 'positive_sentiment_score', 'negative_sentiment_score', 'neutral_sentiment_score']])

      compound_sentiment_score  positive_sentiment_score  \
3                          0.0                       0.0   
4                          0.0                       0.0   
5                          0.0                       0.0   
6                          0.0                       0.0   
7                          0.0                       0.0   
...                        ...                       ...   
6137                       0.0                       0.0   
6140                       0.0                       0.0   
6143                       0.0                       0.0   
6144                       0.0                       0.0   
6145                       0.0                       0.0   

      negative_sentiment_score  neutral_sentiment_score  
3                          0.0                      1.0  
4                          0.0                      1.0  
5                          0.0                      1.0  
6                          0.0                 

In [11]:
#first, I'll tackle the posts

#so, biden posted his dropping out on twitter at 1:46 pm EST 
#in UTC, this time is: 1:46 PM - 4 hours = 9:46 PM UTC on July 21.

from datetime import datetime
import pytz

# Define the UTC time for 5:46 PM on July 21, 2023
utc_time = datetime(2024, 7, 21, 17, 46, 0, tzinfo=pytz.utc)  # 17:46 is 5:46 PM

#now we can get the unix timestamp which is what our data uses to identify when tweets were made
unix_timestamp = int(utc_time.timestamp())
print(unix_timestamp)
#the timestamp for Biden's twitter post is 1721598360. Any UTC below this indicates a tweet
#before the announcement, and any UTC above indicates a tweet after the announcement


df_before = pd.DataFrame()
df_after = pd.DataFrame()

def before_after_dropout_dataframes(a_dataframe):
    """
    This function creates two new dataframes each respectively containing posts from before and after Biden's dropout announcement
    based on his UTC time of dropout we found already.
    """
    global df_after, df_before

    df_before = a_dataframe[a_dataframe["created_utc"] < 1721583960].copy()
    df_after = a_dataframe[a_dataframe["created_utc"] >= 1721598360].copy()



df_before_dem = pd.DataFrame()
df_after_dem = pd.DataFrame()

df_before_rep = pd.DataFrame()
df_after_rep = pd.DataFrame()

df_before_PD = pd.DataFrame()
df_after_PD = pd.DataFrame()



def seperate_by_party(before_df, after_df):
    """
    This function creates several new dataframes each respectively containing posts from the corresponding subreddits
    """
    global df_before_dem, df_after_dem, df_before_rep, df_after_rep, df_before_PD, df_after_PD

    df_before_dem = before_df[before_df["subreddit"] == "democrats"].copy()
    df_after_dem = after_df[after_df["subreddit"] == "democrats"].copy()

    df_before_rep = before_df[before_df["subreddit"] == "Republican"].copy()
    df_after_rep = after_df[after_df["subreddit"] == "Republican"].copy()

    df_before_PD = before_df[before_df["subreddit"] == "PoliticalDiscussion"].copy()
    df_after_PD = after_df[after_df["subreddit"] == "PoliticalDiscussion"].copy()


before_after_dropout_dataframes(posts_filtered) #run this BEFORE seperate_by_party()
seperate_by_party(df_before, df_after)


def ttest(dataframe1, dataframe2):
    """A t-test using compound sentiment score to see if the change in sentiment is statistically significant
    """
    from scipy.stats import ttest_ind
    
    g1 = dataframe1["compound_sentiment_score"]
    g2 = dataframe2["compound_sentiment_score"]

    # Perform the t-test
    t_statistic, p_value = ttest_ind(g1, g2)

    return t_statistic, p_value


1721583960


In [12]:
#Now let's perform t-tests on the compound sentiments for the POSTS for each political subreddit

dem_post_ttest = ttest(df_before_dem, df_after_dem)

rep_post_ttest = ttest(df_before_rep, df_after_rep)

PD_post_ttest = ttest(df_before_PD, df_after_PD)

print(f"Sentiment t-test statistics for the democrats subreddit: T-statistic: {dem_post_ttest[0]}, P-value: {dem_post_ttest[1]}")
print(f"Sentiment t-test statistics for the Republican subreddit: T-statistic: {rep_post_ttest[0]}, P-value: {rep_post_ttest[1]}")
print(f"Sentiment t-test statistics for the PoliticalDiscussion subreddit: T-statistic: {PD_post_ttest[0]}, P-value: {PD_post_ttest[1]}")

Sentiment t-test statistics for the democrats subreddit: T-statistic: -0.18769776861679455, P-value: 0.8512097164028811
Sentiment t-test statistics for the Republican subreddit: T-statistic: 0.4576751932220733, P-value: 0.6476235706301625
Sentiment t-test statistics for the PoliticalDiscussion subreddit: T-statistic: -1.5855714800727285, P-value: 0.11424095694942944


In [13]:
#Now we will make dataframes for the comments and subsequently find their t-statistics

df_comment_after = pd.DataFrame()
df_comment_before = pd.DataFrame()

def before_after_dropout_comment_dataframes(a_comment_dataframe):
    """
    This function creates two new dataframes each respectively containing comments from before and after Biden's dropout announcement
    based on his UTC time of dropout we found already.
    """
    global df_comment_after, df_comment_before

    df_comment_before = a_comment_dataframe[a_comment_dataframe["created_utc"] < 1721583960].copy()
    df_comment_after = a_comment_dataframe[a_comment_dataframe["created_utc"] >= 1721598360].copy()



before_after_dropout_comment_dataframes(comments_filtered)


df_comment_before_dem = pd.DataFrame()
df_comment_after_dem = pd.DataFrame()

df_comment_before_rep = pd.DataFrame()
df_comment_after_rep = pd.DataFrame()

df_comment_before_PD = pd.DataFrame()
df_comment_after_PD = pd.DataFrame()


def seperate_comments_by_party(before_comment_df, after_comment_df):
    """
    This function creates several new dataframes each respectively containing comments from the corresponding subreddits
    """
    global df_comment_before_dem, df_comment_after_dem, df_comment_before_rep, df_comment_after_rep, df_comment_before_PD, df_comment_after_PD

    df_comment_before_dem = before_comment_df[before_comment_df["subreddit"] == "democrats"].copy()
    df_comment_after_dem = after_comment_df[after_comment_df["subreddit"] == "democrats"].copy()

    df_comment_before_rep = before_comment_df[before_comment_df["subreddit"] == "Republican"].copy()
    df_comment_after_rep = after_comment_df[after_comment_df["subreddit"] == "Republican"].copy()

    df_comment_before_PD = before_comment_df[before_comment_df["subreddit"] == "PoliticalDiscussion"].copy()
    df_comment_after_PD = after_comment_df[after_comment_df["subreddit"] == "PoliticalDiscussion"].copy()

seperate_comments_by_party(df_comment_after, df_comment_before)


In [14]:
#Now we do the t-tests for the comments in each subreddit

dem_comment_ttest = ttest(df_comment_before_dem, df_comment_after_dem)

rep_comment_ttest = ttest(df_comment_before_rep, df_comment_after_rep)

PD_comment_ttest = ttest(df_comment_before_PD, df_comment_after_PD)

print(f"Sentiment t-test statistics for the democrats subreddit: T-statistic: {dem_comment_ttest[0]}, P-value: {dem_comment_ttest[1]}")
print(f"Sentiment t-test statistics for the Republican subreddit: T-statistic: {rep_comment_ttest[0]}, P-value: {rep_comment_ttest[1]}")
print(f"Sentiment t-test statistics for the PoliticalDiscussion subreddit: T-statistic: {PD_comment_ttest[0]}, P-value: {PD_comment_ttest[1]}")

Sentiment t-test statistics for the democrats subreddit: T-statistic: 5.262430303507806, P-value: 1.5580719024135326e-07
Sentiment t-test statistics for the Republican subreddit: T-statistic: -0.49501670298273087, P-value: 0.6207061448726928
Sentiment t-test statistics for the PoliticalDiscussion subreddit: T-statistic: 4.976674121177761, P-value: 6.633847775331306e-07
