In [1]:
import pandas as pd
import os
import datetime as dt

import matplotlib.pyplot as plt
from plotly.offline import plot, iplot, init_notebook_mode
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from plotly_functions import *

In [2]:
init_notebook_mode(connected=True)
cf.go_offline()

# Functions

## Getting data from the csvs

In [3]:
def get_data(policy, timeframe):
    df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/{policy}_valuable_opinions.csv")

    # Standardising datetime format
    dates = []
    for row in df['Comment Datetime']:
        if '/' in row:
            if '/' in row[-4:]:
                date = dt.datetime.strptime(row, "%d/%m/%y")
            else: 
                date = dt.datetime.strptime(row, "%d/%m/%Y")
            dates.append(date.date())
        elif '-' in row:
            date = dt.datetime.strptime(row, "%Y-%m-%d")
            dates.append(date.date())

    df['Comment Datetime'] = dates

    # Truncate according to the policy timeframe
    start, end = pd.to_datetime([timeframe[0], timeframe[1]], format='%d%b%Y')
    df_within_date = df[(df['Comment Datetime'] >= start) & (df['Comment Datetime'] <= end)]
    df_within_date = df_within_date[['Comment Datetime', 'Vader_compound_score']]
    return df_within_date

In [4]:
def sentiment_with_cases_static(policy, timeframe):
    df_within_date = get_data(policy, timeframe)

    # Getting daily confirmed cases
    daily_cases = pd.read_excel(f"{parent_path}/Analysis/Sentiment Analysis_Topic Modelling/Covid-19 SG.xlsx") 
    daily_cases.reset_index(inplace=True)
    daily_confirmed = daily_cases.iloc[:, 1:3]

    # Finding the mean vader sentiment score per day
    grouped_date = df_within_date.groupby(by=["Comment Datetime"]).mean()
    grouped_date.reset_index(inplace=True)

    # Finding the moving average (7-day)
    grouped_date['SMA_7'] = grouped_date.Vader_compound_score.rolling(7, min_periods=1).mean()

    # Setting the number of cases within the policy timeframe 
    daily_confirmed_within_timeframe = daily_confirmed.loc[(daily_confirmed["Date"]>= timeframe[0]) & (daily_confirmed["Date"]<= timeframe[1])]

    # Plotting the figure 
    fig = make_subplots(specs=[[{"secondary_y": True}]]) # NEW
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['SMA_7'], mode='lines', 
                  name="Vader 7-day MA"), secondary_y=False)
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['Vader_compound_score'], mode='lines', 
                  name="Vader Compound Score", opacity=.5), secondary_y=False)

    fig.add_bar(x = daily_confirmed_within_timeframe["Date"], y = daily_confirmed_within_timeframe["Daily Confirmed "],
                name="Cases Count", secondary_y=True)

    #fig.update_layout(width = 1200, height=800)
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
                      title_text=f"Timeseries Sentiment Analysis of {policy} with Daily Number of Cases",
                      yaxis_range=[-2,1], 
                      yaxis2_range=[0,1426*2])
                      
    fig.update_layout(width = 1200, height=800)
    fig.update_xaxes(title_text='Date', ticks="outside", showgrid=False)
    fig.update_yaxes(title_text='Vader Compound Score', ticks="outside", showgrid=False)
    fig.update_yaxes(title_text='Number of Cases', ticks="outside", showgrid=False, secondary_y=True)

    fig.write_image(f"timeseries sentiment plots/{policy}_with_cases.png", engine="kaleido")
    # fig.show()
    print(policy, "(with cases, static) is done!")

In [5]:
def sentiment_with_cases_html(policy, timeframe):
    df_within_date = get_data(policy, timeframe)

    # Getting daily confirmed cases
    daily_cases = pd.read_excel(f"{parent_path}/Analysis/Sentiment Analysis_Topic Modelling/Covid-19 SG.xlsx") 
    daily_cases.reset_index(inplace=True)
    daily_confirmed = daily_cases.iloc[:, 1:3]

    # Finding the mean vader sentiment score per day
    grouped_date = df_within_date.groupby(by=["Comment Datetime"]).mean()
    grouped_date.reset_index(inplace=True)

    # Finding the moving average (7-day)
    grouped_date['SMA_7'] = grouped_date.Vader_compound_score.rolling(7, min_periods=1).mean()

    # Setting the number of cases within the policy timeframe 
    daily_confirmed_within_timeframe = daily_confirmed.loc[(daily_confirmed["Date"]>= timeframe[0]) & (daily_confirmed["Date"]<= timeframe[1])]

    # Plotting the figure 
    fig = make_subplots(specs=[[{"secondary_y": True}]]) # NEW
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['SMA_7'], mode='lines', 
                  name="Vader 7-day MA"), secondary_y=False)
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['Vader_compound_score'], mode='lines', 
                  name="Vader Compound Score", opacity=.5,visible='legendonly'), secondary_y=False)
    fig.add_bar(x = daily_confirmed_within_timeframe["Date"], y = daily_confirmed_within_timeframe["Daily Confirmed "],
                name="Cases Count", secondary_y=True, visible='legendonly')

    #fig.update_layout(width = 1200, height=800)
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
                      title_text=f"Timeseries Sentiment Analysis of {policy} with Daily Number of Cases",
                      yaxis_range=[-2,1],
                      yaxis2_range=[0,1426*2])
                      
    fig.update_layout(width = 1200, height=800)
    fig.update_xaxes(title_text='Date', ticks="outside", showgrid=False)
    fig.update_yaxes(title_text='Vader Compound Score', ticks="outside", showgrid=False)
    fig.update_yaxes(title_text='Number of Cases', ticks="outside", showgrid=False, secondary_y=True)

    fig.write_html(f"timeseries sentiment plots/{policy}_with_cases.html")
    # fig.show()
    print(policy, "(with cases, html) is done!")


In [6]:
def sentiment_with_comments_static(policy, timeframe):
    df_within_date = get_data(policy, timeframe)

    # Finding the mean vader sentiment score per day
    grouped_date = df_within_date.groupby(by=["Comment Datetime"]).mean()
    grouped_date.reset_index(inplace=True)

    # Finding the moving average (7-day)
    grouped_date['SMA_7'] = grouped_date.Vader_compound_score.rolling(7, min_periods=1).mean()

    # Finding the number of comments per day
    grouped_date_counts = df_within_date.groupby(by=["Comment Datetime"]).count()
    grouped_date_counts.reset_index(inplace=True)
    grouped_date['Comment Count'] = grouped_date_counts['Vader_compound_score']

    # Plotting the figure 
    fig = make_subplots(specs=[[{"secondary_y": True}]]) # NEW
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['SMA_7'], mode='lines', 
                  name="Vader 7-day MA"), secondary_y=False)
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['Vader_compound_score'], mode='lines', 
                  name="Vader Compound Score", opacity=.5), secondary_y=False)
    fig.add_bar(x = grouped_date['Comment Datetime'], y = grouped_date['Comment Count'],
                name="Comment Count", secondary_y=True)

    #fig.update_layout(width = 1200, height=800)
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
                      title_text=f"Timeseries Sentiment Analysis of {policy} with Daily Number of Comments",
                      yaxis_range=[-2,1])

    max_num_comments = grouped_date['Comment Count'].max()
    fig.update_layout(yaxis2_range=[0,max_num_comments*2.5])
    fig.update_layout(width = 1200, height=800)
    fig.update_xaxes(title_text='Date')
    fig.update_yaxes(title_text='Vader Compound Score')
    fig.update_yaxes(title_text='Comment Count', secondary_y=True)

    fig.write_image(f"timeseries sentiment plots/{policy}_with_comments.png", engine="kaleido")
    # fig.show()
    print(policy, "(with comments, static) is done!")

In [7]:
def sentiment_with_comments_html(policy, timeframe):
    df_within_date = get_data(policy, timeframe)

    # Finding the mean vader sentiment score per day
    grouped_date = df_within_date.groupby(by=["Comment Datetime"]).mean()
    grouped_date.reset_index(inplace=True)

    # Finding the moving average (7-day)
    grouped_date['SMA_7'] = grouped_date.Vader_compound_score.rolling(7, min_periods=1).mean()

    # Finding the number of comments per day
    grouped_date_counts = df_within_date.groupby(by=["Comment Datetime"]).count()
    grouped_date_counts.reset_index(inplace=True)
    grouped_date['Comment Count'] = grouped_date_counts['Vader_compound_score']

    # Plotting the figure 
    fig = make_subplots(specs=[[{"secondary_y": True}]]) # NEW
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['SMA_7'], mode='lines', 
                  name="Vader 7-day MA"), secondary_y=False)
    fig.add_trace(go.Scatter(x = grouped_date['Comment Datetime'], y = grouped_date['Vader_compound_score'], mode='lines', 
                  name="Vader Compound Score", visible='legendonly', opacity=.5), secondary_y=False)

    fig.add_bar(x = grouped_date['Comment Datetime'], y = grouped_date['Comment Count'],
                name="Comment Count", secondary_y=True, visible='legendonly')

    #fig.update_layout(width = 1200, height=800)
    fig.update_layout(legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
                      title_text=f"Timeseries Sentiment Analysis of {policy} with Daily Number of Comments",
                      yaxis_range=[-1,1])

    max_num_comments = grouped_date['Comment Count'].max()
    fig.update_layout(yaxis2_range=[0,max_num_comments*2.5])
    fig.update_layout(width = 1200, height=800)
    fig.update_xaxes(title_text='Date')
    fig.update_yaxes(title_text='Vader Compound Score')
    fig.update_yaxes(title_text='Comment Count', secondary_y=True)

    fig.write_html(f"timeseries sentiment plots/{policy}_with_comments.html")
    # fig.show()
    print(policy, "(with comments, html) is done!")


In [8]:
def emotion_count(policy):
    anger_df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/{policy}_valuable_anger.csv")
    fear_df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/{policy}_valuable_fear.csv")
    joy_df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/{policy}_valuable_joy.csv")
    neutral_df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/{policy}_valuable_neu.csv")
    sad_df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/{policy}_valuable_sad.csv")

    emotions = ['anger', 'fear','sad','neutral','joy']
    counts = [len(anger_df), len(fear_df), len(sad_df), len(neutral_df), len(joy_df)] 

    fig = go.Figure([go.Bar(x=emotions, y=counts, text=counts)])
    fig.update_layout(title_text=f"Emotion Counts for {policy}")
    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    # fig.show()
    fig.write_image(f"timeseries sentiment plots/Emotions/{policy}_emotions.png", engine="kaleido")

    print(policy, "(emotion count) is done!")

# Plotting out the graphs

In [9]:
# parent_path = "/Users/joshuawong/Documents/GitHub/Covid-19-Singapore-Analysis"
parent_path = "C:/Users/user/Documents/GitHub/Covid-19-Singapore-Analysis"

policy_list = ["circuit breaker", "economic measures", "foreign worker", "mask", "social distancing", "tracetogether", "vaccination"]

policy_timeframe = {"circuit breaker":("1APR2020", "1JUL2020"), 
                    "economic measures":("1FEB2020", "1MAR2021"), 
                    "foreign worker":("1APR2020","1JAN2021"), 
                    "mask":("1FEB2020","1MAR2021"), 
                    "social distancing":("1FEB2020", "1FEB2021"), 
                    "tracetogether":("1MAR2020", "1MAR2021"), 
                    "vaccination":("1AUG2020","1MAR2021")}

for policy, timeframe in policy_timeframe.items():
    sentiment_with_comments_static(policy, timeframe)
    sentiment_with_comments_html(policy, timeframe)
    sentiment_with_cases_static(policy, timeframe)
    sentiment_with_cases_html(policy, timeframe)
    print('\n')
    print('*'*20)
    print('\n')

# for policy in policy_list:
#     emotion_count(policy)

circuit breaker (with comments, static) is done!
circuit breaker (with comments, html) is done!
circuit breaker (with cases, static) is done!
circuit breaker (with cases, html) is done!


********************


economic measures (with comments, static) is done!
economic measures (with comments, html) is done!
economic measures (with cases, static) is done!
economic measures (with cases, html) is done!


********************


foreign worker (with comments, static) is done!
foreign worker (with comments, html) is done!
foreign worker (with cases, static) is done!
foreign worker (with cases, html) is done!


********************


mask (with comments, static) is done!
mask (with comments, html) is done!
mask (with cases, static) is done!
mask (with cases, html) is done!


********************


social distancing (with comments, static) is done!
social distancing (with comments, html) is done!
social distancing (with cases, static) is done!
social distancing (with cases, html) is done!




In [14]:
df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/social_media_sentiments.csv")

# Standardising datetime format
dates = []
for row in df['Comment Datetime']:
    if '/' in row:
        if '/' in row[-4:]:
            date = dt.datetime.strptime(row, "%d/%m/%y")
        else: 
            date = dt.datetime.strptime(row, "%d/%m/%Y")
        dates.append(date.date())
    elif '-' in row:
        date = dt.datetime.strptime(row, "%Y-%m-%d")
        dates.append(date.date())

df['Comment Datetime'] = dates
df = df[['Comment Datetime', 'Emotions', 'Platform']]
df

platforms_list = ['Facebook', 'Instagram', 'Reddit', 'Hardwarezone', 'Twitter']
emotions_list = ['joy', 'sadness', 'neutral', 'fear', 'anger']

platform_dict = {}

for platform in platforms_list:
    for emotion in emotions_list:
        platform_dict[platform] = []

for platform in platforms_list:
    for emotion in emotions_list:
        platform_dict[platform].append(len(df[(df['Platform'] == platform) & (df["Emotions"]==emotion)].index))
  
plot = px.Figure(data=[
    go.Bar(name = emotions_list[0], x = platform_list, y = platform_dict[platform_list[0]]),
    go.Bar(name = emotions_list[1], x = platform_list, y = platform_dict[platform_list[1]]),
    go.Bar(name = emotions_list[2], x = platform_list, y = platform_dict[platform_list[2]]),
    go.Bar(name = emotions_list[3], x = platform_list, y = platform_dict[platform_list[3]]),
    go.Bar(name = emotions_list[4], x = platform_list, y = platform_dict[platform_list[4]]),
])
plot.update_layout(title_text="Number of comments per emotion per policy")
plot.update_xaxes(title_text='Platform', ticks="outside", showgrid=False)
plot.update_yaxes(title_text='Emotion', ticks="outside", showgrid=True)
plot.show()
plot.write_image("Number of comments per emotion per policy.png", engine="kaleido")

# Calculating Moving Average

In [2]:
parent_path = "C:/Users/user/Documents/GitHub/Covid-19-Singapore-Analysis"

def get_sentiment_data(df_list, policy):
    df = pd.read_csv(f"{parent_path}/Data/Sentiment Data/{policy}_valuable_sentiments.csv")

    # Standardising datetime format
    dates = []
    for row in df['Comment Datetime']:
        if '/' in row:
            if '/' in row[-4:]:
                date = dt.datetime.strptime(row, "%d/%m/%y")
            else: 
                date = dt.datetime.strptime(row, "%d/%m/%Y")
            dates.append(date.date())
        elif '-' in row:
            date = dt.datetime.strptime(row, "%Y-%m-%d")
            dates.append(date.date())

    df['Comment Datetime'] = dates
    df_list.append(df)

In [16]:
policy_list = ["circuit breaker", "economic measures", "foreign worker", "mask", "social distancing", "tracetogether", "vaccination"]

df_list = []
for policy in policy_list:
    get_sentiment_data(df_list, policy)
    print(f"{policy} done")

circuit breaker done
economic measures done
foreign worker done
mask done
social distancing done
tracetogether done
vaccination done


In [17]:
combined_sentiment_df = pd.concat(df_list)
combined_sentiment_df.drop(combined_sentiment_df.columns[[0,1]], axis=1, inplace=True)

combined_sentiment_df.head()

Unnamed: 0,Sentences,Comment Datetime,Textblob_polarity_score,Textblob_subjectivity_score,Vader_neg_score,Vader_neu_score,Vader_pos_score,Vader_compound_score,Emotions
0,kuma-mon wrote: Because need to handle issues ...,2021-02-03,0.061806,0.607176,0.046,0.845,0.108,0.7272,fear
1,You are not the only one.,2021-02-03,0.0,1.0,0.0,1.0,0.0,0.0,neutral
2,Some can even ki siao.,2021-02-03,0.0,0.0,0.0,1.0,0.0,0.0,neutral
3,"If can, ask to go office to work.",2021-02-03,0.0,0.0,0.0,1.0,0.0,0.0,joy
4,"If not, then have to make adjustment.",2021-02-03,0.0,0.0,0.0,1.0,0.0,0.0,neutral


In [24]:
average_sentiment_df = combined_sentiment_df.groupby(by=["Comment Datetime"]).mean()
average_sentiment_df.reset_index(inplace=True)
average_sentiment_df

Unnamed: 0,Comment Datetime,Textblob_polarity_score,Textblob_subjectivity_score,Vader_neg_score,Vader_neu_score,Vader_pos_score,Vader_compound_score
0,2018-08-15,0.052009,0.314744,0.081757,0.825675,0.092549,0.029372
1,2018-08-16,0.040755,0.314236,0.051000,0.806437,0.142563,0.174406
2,2020-01-03,0.030556,0.201389,0.007500,0.758000,0.234500,0.245250
3,2020-01-06,0.138889,0.187500,0.000000,0.872500,0.127500,0.067433
4,2020-01-08,0.100447,0.520969,0.155846,0.581615,0.262538,0.046638
...,...,...,...,...,...,...,...
428,2021-10-02,0.080850,0.309515,0.056574,0.861338,0.077201,0.007566
429,2021-10-03,0.094315,0.358138,0.051571,0.853536,0.094911,0.085654
430,2021-11-02,0.077866,0.300661,0.049187,0.857008,0.093813,0.085535
431,2021-11-03,0.110781,0.321823,0.062875,0.893625,0.043500,-0.058013


In [30]:
start, end = pd.to_datetime("2020-01-01", "2021-03-01"], format='%d%b%Y')

average_sentiment_df.drop(average_sentiment_df["Comment Datetime"] >= '2020-01-01')

SyntaxError: invalid syntax (<ipython-input-30-81639bfef80a>, line 1)