In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from datetime import date
from scipy.stats import permutation_test
import plotly.express as px

pd.options.mode.chained_assignment = None
sns.set_style('dark')

# Column groups that each publisher/page CSV stores side by side.
PAGE_COLS = ['page', 'total_page', 'page_prediction']
PUBLISHER_COLS = ['publisher_name', 'total_published', 'publisher_prediction']
# Number of leading rows of the "pre" publisher table that are kept for the dashboard.
TOP_N_PUBLISHERS = 20

# The publisher page and sentiment information are split up because I am using two ProQuest accounts (double the GPU power).
# Rows without a key are dropped before merging: pandas matches NaN keys against
# each other, so leftover key-less rows would be cross-joined into meaningless rows.
# NOTE(review): assumes a missing 'page' / 'publisher_name' never denotes a real entry - confirm.
us_apr_feb2021_publisher_page_df = pd.read_csv('csv/us_apr-feb2021_publisher_page_df.csv', index_col=[0])
pre_page_df = us_apr_feb2021_publisher_page_df[PAGE_COLS].dropna(subset=['page'])
pre_publisher_df = us_apr_feb2021_publisher_page_df[PUBLISHER_COLS].dropna(subset=['publisher_name'])

publisher_page_us_mar_apr2022 = pd.read_csv('csv/publisher_page_us-mar-apr2022.csv', index_col=[0])
post_page_df = publisher_page_us_mar_apr2022[PAGE_COLS].dropna(subset=['page'])
post_publisher_df = publisher_page_us_mar_apr2022[PUBLISHER_COLS].dropna(subset=['publisher_name'])

# Explicit suffixes show which file each merged column came from
# (instead of the default '_x' / '_y').
publisher_df = pd.merge(pre_publisher_df, post_publisher_df, on='publisher_name',
                        how='inner', suffixes=('_pre', '_post'))
page_df = pd.merge(pre_page_df, post_page_df, on='page',
                   how='inner', suffixes=('_pre', '_post'))

# Sentiment = combined prediction columns divided by combined totals of both files.
page_df['sentiment'] = (
    (page_df['page_prediction_pre'] + page_df['page_prediction_post'])
    / (page_df['total_page_pre'] + page_df['total_page_post'])
)
page_df = page_df[['page', 'sentiment']]

publisher_df['sentiment'] = (
    (publisher_df['publisher_prediction_pre'] + publisher_df['publisher_prediction_post'])
    / (publisher_df['total_published_pre'] + publisher_df['total_published_post'])
)
publisher_df = publisher_df[['publisher_name', 'sentiment']]

# Keep only publishers listed in the first TOP_N_PUBLISHERS rows of the "pre" table,
# round for display, and order from lowest to highest sentiment. sort_values
# returns a new frame, so no in-place mutation of a filtered slice is needed.
top_publishers = pre_publisher_df['publisher_name'].iloc[:TOP_N_PUBLISHERS]
publisher_df = (
    publisher_df[publisher_df['publisher_name'].isin(top_publishers)]
    .round(2)
    .sort_values(by='sentiment')
)

In [33]:
# Write the publisher sentiment table into the dashboard/ folder.
# to_csv keeps the DataFrame index as the first CSV column by default.
publisher_df.to_csv('dashboard/publisher_df.csv')

In [34]:
# First ten rows of publisher_df with the sentiment sign flipped for plotting.
# NOTE(review): these are the ten lowest-sentiment publishers only if publisher_df
# is still sorted ascending by 'sentiment' when this cell runs - confirm.
# .assign builds a new frame, so nothing is written into a slice of publisher_df
# (the original assignment only stayed silent because the chained-assignment
# warning is disabled globally).
plot_pub = publisher_df.head(10).assign(
    inverse_sentiment=lambda top_rows: top_rows['sentiment'] * -1
)
fig = px.bar(
    plot_pub,
    x='publisher_name',
    y="inverse_sentiment",
    labels={'publisher_name': 'Publisher', 'inverse_sentiment': 'Negated sentiment score'},
)
fig

In [35]:
# Covid Health Data from OurWorldinData, restricted to the 'United States' rows.
health_data_df = pd.read_csv(
    'csv/cleaned_health_data.csv', parse_dates=['date'], index_col=[0,1]
)
us_health_data = health_data_df.loc['United States']

# Sentiment tables from the two CSV exports, stacked into one frame.
us_sentim_2021_df, us_sentim_2022_df = (
    pd.read_csv(csv_path, index_col=[0], parse_dates=['date'])
    for csv_path in ('csv/us_apr-feb2021.csv', 'csv/us-mar-apr2022.csv')
)
df = pd.concat([us_sentim_2021_df,us_sentim_2022_df])

# Keep only health rows whose date lies inside the sentiment date range (inclusive).
health_data_df = us_health_data[
    us_health_data.date.between(df.date.min(), df.date.max())
]

# Join on date, then add Gaussian-weighted centred rolling means.
# Smoothing trends to help with the inherent noise of sentiment calculations.
df = (
    df.merge(health_data_df, on='date')
    .assign(
        smoothed_prediction=lambda merged: merged.prediction
        .rolling(window=4, center=True, win_type='gaussian')
        .mean(std=2),
        smoothed_articles_per_day=lambda merged: merged.articles_per_day
        .rolling(window=5, center=True, win_type='gaussian')
        .mean(std=3),
    )
    .round(4)
)

In [36]:
# Preview the first five rows of the merged frame, including the new smoothed columns.
df.head()

Unnamed: 0,date,prediction,articles_per_day,stringency_index,positive_rate,new_cases_smoothed,new_deaths_smoothed,weekly_hosp_admissions,new_vaccinations_smoothed,new_people_vaccinated_smoothed,new_cases,smoothed_prediction,smoothed_articles_per_day
0,2020-04-01,-0.2856,1447.0,72.69,0.199,22722.429,804.286,,,,35819.0,,
1,2020-04-02,-0.2604,1488.0,72.69,0.204,24787.429,966.0,,,,32276.0,,
2,2020-04-03,-0.2372,1393.0,72.69,0.207,26766.571,1095.429,,,,32445.0,-0.2701,1263.3641
3,2020-04-04,-0.3091,915.0,72.69,0.209,28132.0,1225.0,,,,31722.0,-0.2638,1177.0858
4,2020-04-05,-0.2433,1064.0,72.69,0.21,29939.0,1377.571,,,,28776.0,-0.2633,1129.2987


In [37]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Add traces. The legend entries name the plotted series; the previous
# placeholder names ("yaxis data" / "yaxis2 data") were also attached to the
# opposite axes from the ones they described.
fig.add_trace(
    go.Scatter(x=df['date'], y=df['smoothed_prediction'], name="Smoothed sentiment score"),
    secondary_y=True,
)

fig.add_trace(
    go.Scatter(x=df['date'], y=df['smoothed_articles_per_day'], name="Smoothed articles per day"),
    secondary_y=False,
)

# Label every axis so the figure can be read on its own.
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="Smoothed articles per day", secondary_y=False)
fig.update_yaxes(title_text="Smoothed sentiment score", secondary_y=True)

# update_layout returns the figure, so as the last expression it also renders it.
fig.update_layout(
    height=750
)

Stationarity means that the statistical properties of a time series (i.e. its mean, variance and autocovariance) do not change over time. Many statistical models require the series to be stationary in order to make effective and precise predictions.

The two tests used here to check for stationarity are the Augmented Dickey-Fuller (“ADF”) test and the Kwiatkowski-Phillips-Schmidt-Shin (“KPSS”) test.

In [38]:
from statsmodels.tsa.stattools import adfuller, kpss, grangercausalitytests

The **ADF test** is used to determine the presence of a unit root in the series, and hence helps in understanding whether the series is stationary or not. The null and alternative hypotheses of this test are:

Null Hypothesis: The series has a unit root.

Alternative Hypothesis: The series has no unit root.

If the test fails to reject the null hypothesis, this may provide evidence that the series is non-stationary.

In [39]:
# List the available column names before picking the series to test below.
df.columns

Index(['date', 'prediction', 'articles_per_day', 'stringency_index',
       'positive_rate', 'new_cases_smoothed', 'new_deaths_smoothed',
       'weekly_hosp_admissions', 'new_vaccinations_smoothed',
       'new_people_vaccinated_smoothed', 'new_cases', 'smoothed_prediction',
       'smoothed_articles_per_day'],
      dtype='object')

In [40]:
def run_adf_test(column):
    """Run the Augmented Dickey-Fuller test and return a labelled summary.

    Parameters
    ----------
    column : array-like
        Observations to test; handed unchanged to ``adfuller`` with
        ``autolag="AIC"``.

    Returns
    -------
    pandas.Series
        The first four entries of the ``adfuller`` result, indexed by
        "Test Statistic", "p-value", "#Lags Used" and
        "Number of Observations Used".
    """
    summary_labels = [
        "Test Statistic",
        "p-value",
        "#Lags Used",
        "Number of Observations Used",
    ]
    outcome = adfuller(column, autolag="AIC")
    # Only the leading four entries are reported; the rest of the result is ignored.
    return pd.Series(outcome[:4], index=summary_labels)

def kpss_test(timeseries):
    """Run the KPSS test and print a labelled report.

    Parameters
    ----------
    timeseries : array-like
        Observations to test; handed unchanged to ``kpss`` with
        ``regression="c"`` and ``nlags="auto"``.

    Returns
    -------
    None
        The report (test statistic, p-value, lags used and one
        "Critical Value (<level>)" entry per critical value) is printed,
        not returned.
    """
    print("Results of KPSS Test:")
    outcome = kpss(timeseries, regression="c", nlags="auto")
    statistic, p_value, lags_used = outcome[:3]
    report = {
        "Test Statistic": statistic,
        "p-value": p_value,
        "Lags Used": lags_used,
    }
    # The fourth entry of the result maps each significance level to its critical value.
    report.update(
        ("Critical Value (%s)" % level, threshold)
        for level, threshold in outcome[3].items()
    )
    print(pd.Series(report))


In [41]:
# Print the ADF p-value (rounded to 3 decimals) for each series, ignoring missing values.
for col_name in ['prediction', 'smoothed_prediction', 'positive_rate', 'new_cases_smoothed',
                 'smoothed_articles_per_day', 'weekly_hosp_admissions']:
    adf_summary = run_adf_test(df[col_name].dropna())
    # Look the p-value up by its label: integer keys on a string-labelled Series
    # only worked through pandas' deprecated positional fallback.
    print(f"{col_name}: {round(adf_summary['p-value'], 3)}")

prediction: 0.109
smoothed_prediction: 0.066
positive_rate: 0.0
new_cases_smoothed: 0.015
smoothed_articles_per_day: 0.013
weekly_hosp_admissions: 0.02


In [42]:
# Print the KPSS p-value (rounded to 3 decimals) for each series, ignoring
# missing values. kpss is called with its default settings here.
kpss_columns = (
    'prediction', 'smoothed_prediction', 'positive_rate', 'new_cases_smoothed',
    'smoothed_articles_per_day', 'weekly_hosp_admissions',
)
for col_name in kpss_columns:
    # The second entry of the kpss result is the p-value.
    kpss_p_value = kpss(df[col_name].dropna())[1]
    print(f"{col_name}: {round(kpss_p_value, 3)}")

prediction: 0.027
smoothed_prediction: 0.028
positive_rate: 0.1
new_cases_smoothed: 0.03
smoothed_articles_per_day: 0.01
weekly_hosp_admissions: 0.1



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is smaller than the p-value returned.



The test statistic is outside of the range of p-values available in the
look-up table. The actual p-value is greater than the p-value returned.




In [43]:
# Pairwise correlations (pandas default method) between the numeric columns.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer
# silently leaves non-numeric columns such as 'date' out of corr().
df.select_dtypes(include="number").corr().round(2)

Unnamed: 0,prediction,articles_per_day,stringency_index,positive_rate,new_cases_smoothed,new_deaths_smoothed,weekly_hosp_admissions,new_vaccinations_smoothed,new_people_vaccinated_smoothed,new_cases,smoothed_prediction,smoothed_articles_per_day
prediction,1.0,-0.25,-0.13,-0.43,-0.22,0.06,-0.29,0.32,0.33,-0.22,0.95,-0.31
articles_per_day,-0.25,1.0,0.64,0.46,-0.01,0.22,0.41,0.27,0.37,0.05,-0.3,0.95
stringency_index,-0.13,0.64,1.0,0.15,-0.14,0.27,0.26,0.37,0.53,-0.12,-0.14,0.7
positive_rate,-0.43,0.46,0.15,1.0,0.79,0.47,0.88,-0.14,-0.16,0.74,-0.46,0.49
new_cases_smoothed,-0.22,-0.01,-0.14,0.79,1.0,0.47,0.86,-0.11,-0.16,0.91,-0.24,-0.01
new_deaths_smoothed,0.06,0.22,0.27,0.47,0.47,1.0,0.74,-0.09,0.01,0.39,0.06,0.25
weekly_hosp_admissions,-0.29,0.41,0.26,0.88,0.86,0.74,1.0,-0.13,-0.07,0.77,-0.32,0.47
new_vaccinations_smoothed,0.32,0.27,0.37,-0.14,-0.11,-0.09,-0.13,1.0,0.9,-0.09,0.35,0.3
new_people_vaccinated_smoothed,0.33,0.37,0.53,-0.16,-0.16,0.01,-0.07,0.9,1.0,-0.14,0.36,0.42
new_cases,-0.22,0.05,-0.12,0.74,0.91,0.39,0.77,-0.09,-0.14,1.0,-0.25,0.03


In [44]:
# Spearman correlations between the row-to-row differences of the numeric columns.
# Numeric columns are selected first so the 'date' column (whose diff is a
# timedelta) stays out of the matrix regardless of the pandas version.
df.select_dtypes(include="number").diff().corr('spearman').round(2)

Unnamed: 0,prediction,articles_per_day,stringency_index,positive_rate,new_cases_smoothed,new_deaths_smoothed,weekly_hosp_admissions,new_vaccinations_smoothed,new_people_vaccinated_smoothed,new_cases,smoothed_prediction,smoothed_articles_per_day
prediction,1.0,0.16,-0.01,-0.08,-0.05,0.02,-0.04,-0.03,-0.04,0.1,0.28,0.21
articles_per_day,0.16,1.0,0.03,0.04,0.09,0.06,0.05,0.04,0.03,0.16,0.27,0.67
stringency_index,-0.01,0.03,1.0,0.02,0.02,-0.05,0.04,0.16,0.16,-0.03,-0.03,-0.0
positive_rate,-0.08,0.04,0.02,1.0,0.54,0.09,0.61,0.09,0.15,0.07,-0.15,0.09
new_cases_smoothed,-0.05,0.09,0.02,0.54,1.0,0.43,0.7,0.23,0.23,0.2,-0.06,0.08
new_deaths_smoothed,0.02,0.06,-0.05,0.09,0.43,1.0,0.37,0.13,0.08,0.12,0.03,0.05
weekly_hosp_admissions,-0.04,0.05,0.04,0.61,0.7,0.37,1.0,0.14,0.17,0.07,-0.07,0.04
new_vaccinations_smoothed,-0.03,0.04,0.16,0.09,0.23,0.13,0.14,1.0,0.81,0.06,-0.06,0.01
new_people_vaccinated_smoothed,-0.04,0.03,0.16,0.15,0.23,0.08,0.17,0.81,1.0,0.07,-0.1,0.01
new_cases,0.1,0.16,-0.03,0.07,0.2,0.12,0.07,0.06,0.07,1.0,0.15,0.23


In [45]:
# Look at the frame once more before its columns are renamed and reduced in the next cell.
df.head()

Unnamed: 0,date,prediction,articles_per_day,stringency_index,positive_rate,new_cases_smoothed,new_deaths_smoothed,weekly_hosp_admissions,new_vaccinations_smoothed,new_people_vaccinated_smoothed,new_cases,smoothed_prediction,smoothed_articles_per_day
0,2020-04-01,-0.2856,1447.0,72.69,0.199,22722.429,804.286,,,,35819.0,,
1,2020-04-02,-0.2604,1488.0,72.69,0.204,24787.429,966.0,,,,32276.0,,
2,2020-04-03,-0.2372,1393.0,72.69,0.207,26766.571,1095.429,,,,32445.0,-0.2701,1263.3641
3,2020-04-04,-0.3091,915.0,72.69,0.209,28132.0,1225.0,,,,31722.0,-0.2638,1177.0858
4,2020-04-05,-0.2433,1064.0,72.69,0.21,29939.0,1377.571,,,,28776.0,-0.2633,1129.2987


In [46]:
# Display names for the dashboard columns.
display_names = {
    'prediction': 'Sentiment Score',
    'positive_rate': 'Covid Positive Rate',
    'smoothed_prediction': 'Smoothed Sentiment Score',
}
relevant_cols = ['date','Sentiment Score','Smoothed Sentiment Score' ,'articles_per_day', 'smoothed_articles_per_day', 
             'Covid Positive Rate', 'new_deaths_smoothed', 'weekly_hosp_admissions', 'new_vaccinations_smoothed', 'stringency_index']
# Rename and subset in one non-mutating chain: the frame that earlier cells
# displayed is left untouched and `df` is simply rebound to the reduced copy.
df = df.rename(columns=display_names)[relevant_cols]

In [47]:
# Check the renamed, reduced column set.
df.head()

Unnamed: 0,date,Sentiment Score,Smoothed Sentiment Score,articles_per_day,smoothed_articles_per_day,Covid Positive Rate,new_deaths_smoothed,weekly_hosp_admissions,new_vaccinations_smoothed,stringency_index
0,2020-04-01,-0.2856,,1447.0,,0.199,804.286,,,72.69
1,2020-04-02,-0.2604,,1488.0,,0.204,966.0,,,72.69
2,2020-04-03,-0.2372,-0.2701,1393.0,1263.3641,0.207,1095.429,,,72.69
3,2020-04-04,-0.3091,-0.2638,915.0,1177.0858,0.209,1225.0,,,72.69
4,2020-04-05,-0.2433,-0.2633,1064.0,1129.2987,0.21,1377.571,,,72.69


In [50]:
# Correlation matrix of the retained numeric columns.
# Numeric columns are selected explicitly because pandas >= 2.0 no longer
# silently leaves the non-numeric 'date' column out of corr().
df.select_dtypes(include="number").corr()

Unnamed: 0,Sentiment Score,Smoothed Sentiment Score,articles_per_day,smoothed_articles_per_day,Covid Positive Rate,new_deaths_smoothed,weekly_hosp_admissions,new_vaccinations_smoothed,stringency_index
Sentiment Score,1.0,0.947424,-0.251708,-0.306682,-0.425347,0.064836,-0.293902,0.324037,-0.129698
Smoothed Sentiment Score,0.947424,1.0,-0.29686,-0.343356,-0.459878,0.057644,-0.323345,0.352507,-0.139172
articles_per_day,-0.251708,-0.29686,1.0,0.954982,0.45721,0.221575,0.407432,0.265371,0.636744
smoothed_articles_per_day,-0.306682,-0.343356,0.954982,1.0,0.48929,0.250262,0.469051,0.302837,0.697909
Covid Positive Rate,-0.425347,-0.459878,0.45721,0.48929,1.0,0.471655,0.877456,-0.143687,0.148501
new_deaths_smoothed,0.064836,0.057644,0.221575,0.250262,0.471655,1.0,0.737059,-0.093359,0.26984
weekly_hosp_admissions,-0.293902,-0.323345,0.407432,0.469051,0.877456,0.737059,1.0,-0.125835,0.256069
new_vaccinations_smoothed,0.324037,0.352507,0.265371,0.302837,-0.143687,-0.093359,-0.125835,1.0,0.373659
stringency_index,-0.129698,-0.139172,0.636744,0.697909,0.148501,0.26984,0.256069,0.373659,1.0


In [48]:
# Save the renamed, reduced frame into the dashboard/ folder.
# to_csv keeps the DataFrame index as the first CSV column by default.
df.to_csv("dashboard/covid_plot_data.csv")