In [None]:
# Install pingouin, which provides the Games-Howell test (pg.pairwise_gameshowell) used later in this notebook
!pip install pingouin

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime, timedelta
import locale
from scipy import stats
import pingouin as pg

import os
import time

In [3]:
# Clone github repo containing the data as csv file
# (any previous checkout is removed first so the clone starts from a clean directory)
! rm -rf covid_data
! git clone https://github.com/PatrickNiccolai2/covid_data

Cloning into 'covid_data'...
remote: Enumerating objects: 2040, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (67/67), done.[K
remote: Total 2040 (delta 24), reused 90 (delta 22), pack-reused 1948[K
Receiving objects: 100% (2040/2040), 334.22 MiB | 35.66 MiB/s, done.
Resolving deltas: 100% (95/95), done.


In [None]:
# Unzip the tweets with sentiment and language data from the cloned repo
# NOTE(review): the absolute /content/... paths assume the repo was cloned with /content as the working directory — confirm
!unzip /content/covid_data/tweets_with_lang -d /content/covid_data/unzip_tweets_with_lang

In [5]:
# Read all of the tweets + other data into an array
TWEETS_DIR = "/content/covid_data/unzip_tweets_with_lang/tweets_with_lang"

# Manufacturer labels and the keywords that identify them, highest priority
# first: when a tweet mentions several manufacturers, the first match wins.
MANUFACTURER_KEYWORDS = [
  ("astra", ("astrazeneca", "vaxzevria")),
  ("janssen", ("janssen",)),
  ("pfizer", ("pfizer", "biontech", "comirnaty")),
  ("moderna", ("moderna", "spikevax")),
  ("sino", ("sinopharm", "sinovac")),
  ("sputnik", ("sputnik",)),
]


def _find_manufacturer(tweet_text):
  """Return the manufacturer label mentioned in tweet_text, or "no_man" if there is none."""
  lowered = tweet_text.lower()
  for label, keywords in MANUFACTURER_KEYWORDS:
    if any(keyword in lowered for keyword in keywords):
      return label
  return "no_man"


def _parse_tweet_file(contents):
  """Split the contents of one tweet file into a row for all_files_text.

  The file is a sequence of fields separated by blank lines. Counting from the
  end, the fields are: language, sentiment, tweet id, place, coords and time;
  everything before those six fields is the tweet text.

  Returns [tweet_text, time, coords, place, id, man, sent, lang], or None when
  the tweet mentions "johnson" (those tweets are left out of the data set).
  Raises IndexError if the file has fewer than six fields.
  """
  parts = contents.split("\n\n")
  lang = parts[-1]
  sent = parts[-2]
  tweet_id = parts[-3]
  # The original read index -5 for both place and coords and never read -4.
  # NOTE(review): -4 is assumed to be the place field, matching the
  # time, coords, place, id order of the row — confirm against the file format.
  place = parts[-4]
  coords = parts[-5]
  tweet_time = parts[-6]

  # Re-join the text paragraphs in their original order, with the separator
  # they were split on (the original concatenated them in reverse order).
  tweet_text = "\n\n".join(parts[:-6])

  if "johnson" in tweet_text.lower():
    return None

  man = _find_manufacturer(tweet_text)
  return [tweet_text, tweet_time, coords, place, tweet_id, man, sent, lang]


all_files_text = []

# Iterate through folder we just unzipped
for filename in os.listdir(TWEETS_DIR):
  # Only the files whose name starts with "l" are read
  if filename.startswith("l"):
    # The context manager closes the file even if reading raises
    with open(os.path.join(TWEETS_DIR, filename), "r") as tweet_file:
      full_tweet = _parse_tweet_file(tweet_file.read())

    # Add all the info from the file to all_files_text
    if full_tweet is not None:
      all_files_text.append(full_tweet)


In [6]:
# Build the tweet dataframe: one row per unique tweet id, English tweets only
tweet_columns = ["tweet_text", "time", "coords", "place", "tweet_id", "man", "sentiment", "language"]
all_tweets_df = (
  pd.DataFrame(all_files_text, columns=tweet_columns)
  .drop_duplicates(subset="tweet_id", keep="first")
  .loc[lambda frame: frame["language"] == "en"]
)

In [7]:
# Convert strings to datetime so they can be used
def str_to_datetime(in_str):
  """Parse the date at the start of a timestamp string.

  Only the text before the first space is used and it must be formatted as
  YYYY-MM-DD; the time-of-day part, if any, is ignored.
  """
  date_part, _, _ = in_str.partition(" ")
  return datetime.strptime(date_part, "%Y-%m-%d")

# Add a datetime column holding the calendar date parsed from each tweet's time string
all_tweets_df["datetime"] = all_tweets_df["time"].apply(str_to_datetime)

In [8]:
# The sentiment is an array containing 3 values, convert it to one value
def sent_arr_to_val(sent_str):
  """Collapse a sentiment string into a single score.

  The first two and last two characters of sent_str are dropped and the rest
  is split on whitespace. The score is the third value minus the first value,
  i.e. probability of positive sentiment minus probability of negative sentiment.
  """
  probabilities = sent_str[2:-2].split()
  negative = float(probabilities[0])
  positive = float(probabilities[2])
  return positive - negative

# Add a sent_avg column with the single sentiment score of each tweet
all_tweets_df["sent_avg"] = all_tweets_df["sentiment"].apply(sent_arr_to_val)

In [9]:
# Split the tweets into one dataframe per brand, using the detected manufacturer
brand_of_tweet = all_tweets_df["man"]
moderna_df = all_tweets_df.loc[brand_of_tweet == "moderna"]
pfizer_df = all_tweets_df.loc[brand_of_tweet == "pfizer"]
astra_df = all_tweets_df.loc[brand_of_tweet == "astra"]

In [10]:
# This cell groups each brand's tweets by date: tweet counts and the average sentiment for that date
aggregator = {'tweet_text' : 'count', 'time' : 'count', 'coords': 'count', 
              'tweet_id' : 'count', 'sentiment': 'count', 'sent_avg': 'mean'}


def _aggregate_per_day(brand_df):
  """Return one row per date for brand_df, with datetime as a regular column."""
  per_day = brand_df.groupby(["datetime"]).agg(aggregator).sort_index().reset_index()
  return per_day.rename(columns = {'index':'datetime'})


moderna_per_day = _aggregate_per_day(moderna_df)
pfizer_per_day = _aggregate_per_day(pfizer_df)
astra_per_day = _aggregate_per_day(astra_df)

In [11]:
# Run kruskal-wallis test comparing the daily avg sentiment across the brands
# (three groups: moderna, pfizer and astrazeneca, one value per day each)
stats.kruskal(moderna_per_day["sent_avg"], pfizer_per_day["sent_avg"], astra_per_day["sent_avg"])

KruskalResult(statistic=12.639948728118789, pvalue=0.001799989650066879)

In [12]:
# Chi squared critical value at alpha = 0.05 with 2 degrees of freedom (3 brands - 1)
stats.chi2.ppf(1-0.05, 2)

5.991464547107979

In [13]:
# So, the chi squared critical value is 5.99. This is less than the Kruskal-Wallis
# statistic of 12.6, meaning that we can reject the null hypothesis and conclude that
# the sentiment distributions of moderna, pfizer, and astrazeneca are not all the same

In [14]:
def startMonth(date):
  """Return a datetime for the first day (at midnight) of the month that date falls in."""
  first_of_month = "{}-{}-1".format(date.year, date.month)
  return datetime.strptime(first_of_month, "%Y-%m-%d")

# Tag every per-day row with the first day of its month
for per_day_df in (moderna_per_day, pfizer_per_day, astra_per_day):
  per_day_df["month"] = per_day_df["datetime"].apply(startMonth)


In [15]:
# Function to get data by month for a given dataframe
def get_data_for_months(dataframe, target_months=((2020, 12), (2021, 1), (2021, 2), (2021, 3)), value_col="sent_val"):
  """Collect the values of value_col for each selected month.

  Parameters:
    dataframe: frame with a "month" column of month-start datetimes and a
      value_col column.
    target_months: iterable of (year, month) pairs to keep. The default is
      December 2020 to March 2021, the months compared to the other paper.
    value_col: name of the column whose values are returned.

  Returns:
    (data_arr, months_arr): data_arr holds one Series of values per selected
    month that occurs in dataframe, in chronological order; months_arr holds
    the matching "YYYY-MM" labels. Months that do not occur in dataframe are
    left out, so both lists can be shorter than target_months.
  """
  wanted = set((int(year), int(month)) for year, month in target_months)

  data_arr = []
  months_arr = []
  # pd.Timestamp accepts both numpy datetime64 and Timestamp elements, and
  # sorting the timestamps gives chronological order.
  for month_start in sorted(pd.Timestamp(value) for value in dataframe["month"].dropna().unique()):
    if (month_start.year, month_start.month) in wanted:
      data_arr.append(dataframe.loc[dataframe["month"] == month_start, value_col])
      months_arr.append(month_start.strftime("%Y-%m"))

  return data_arr, months_arr

In [16]:
# Get the per-tweet sentiment values of each selected month, for the three brands
def _with_sent_val_and_month(brand_df):
  """Return brand_df with sent_avg renamed to sent_val and a month column added."""
  prepared = brand_df.rename(columns={"sent_avg": "sent_val"})
  prepared["month"] = prepared["datetime"].apply(startMonth)
  return prepared


moderna_df = _with_sent_val_and_month(moderna_df)
moderna_data_arr, moderna_months_arr = get_data_for_months(moderna_df)

pfizer_df = _with_sent_val_and_month(pfizer_df)
pfizer_data_arr, pfizer_months_arr = get_data_for_months(pfizer_df)

astra_df = _with_sent_val_and_month(astra_df)
astra_data_arr, astra_months_arr = get_data_for_months(astra_df)

In [17]:
# Chi squared critical value at alpha = 0.05 with 3 degrees of freedom (4 months - 1)
stats.chi2.ppf(1-0.05, 3)

7.814727903251179

In [18]:
# Do KW tests by month for each brand

In [19]:
# Kruskal-Wallis test across the first four monthly sentiment samples for Moderna
stats.kruskal(moderna_data_arr[0],moderna_data_arr[1],moderna_data_arr[2],moderna_data_arr[3])

KruskalResult(statistic=2.953228993243689, pvalue=0.3988927021162145)

In [20]:
# Sanity check: chi squared quantile at 1 - p with 3 degrees of freedom,
# where p is the p-value copied from the Moderna kruskal output
stats.chi2.ppf(1-0.3988927021162145, 3)

2.953228993243689

In [21]:
# Ok, so the p-value returned by kruskal is the chi squared tail probability of its statistic:
# the chi2 quantile at 1 - p (with 3 degrees of freedom) equals the kruskal statistic

In [22]:
# Kruskal-Wallis test across the first four monthly sentiment samples for Pfizer
stats.kruskal(pfizer_data_arr[0],pfizer_data_arr[1],pfizer_data_arr[2],pfizer_data_arr[3])

KruskalResult(statistic=12.966827460574418, pvalue=0.004708894892141813)

In [23]:
# Kruskal-Wallis test across the first four monthly sentiment samples for AstraZeneca
stats.kruskal(astra_data_arr[0],astra_data_arr[1],astra_data_arr[2],astra_data_arr[3])

KruskalResult(statistic=14.325531121846787, pvalue=0.00249392695451474)

In [24]:
# Again, we compare the kruskal value to the chi squared value.
# Interestingly, we can conclude that for pfizer and astrazeneca,
# different months have different distributions of sentiment;
# however, for moderna this is not the case

In [25]:
# Same preparation as for the other brands, for Sputnik and Sinopharm/Sinovac:
# rename sent_avg to sent_val, add the month column, then collect the monthly samples
sputnik_df = (
  all_tweets_df[all_tweets_df["man"] == "sputnik"]
  .rename(columns={"sent_avg": "sent_val"})
  .assign(month=lambda frame: frame["datetime"].apply(startMonth))
)
sputnik_data_arr, sputnik_months_arr = get_data_for_months(sputnik_df)

sino_df = (
  all_tweets_df[all_tweets_df["man"] == "sino"]
  .rename(columns={"sent_avg": "sent_val"})
  .assign(month=lambda frame: frame["datetime"].apply(startMonth))
)
sino_data_arr, sino_months_arr = get_data_for_months(sino_df)

In [26]:
# Kruskal-Wallis test across the first four monthly sentiment samples for Sputnik
stats.kruskal(sputnik_data_arr[0],sputnik_data_arr[1],sputnik_data_arr[2],sputnik_data_arr[3])

KruskalResult(statistic=7.107958499015075, pvalue=0.0685352238431948)

In [27]:
# Kruskal-Wallis test across the first four monthly sentiment samples for Sinopharm/Sinovac
stats.kruskal(sino_data_arr[0],sino_data_arr[1],sino_data_arr[2],sino_data_arr[3])

KruskalResult(statistic=4.137111459679279, pvalue=0.24703412762126067)

# Pairwise Kruskal-Wallis

In [28]:
# Do pairwise KW tests by month for each brand
# I think we don't need this anymore because we use Games-Howell test below

In [29]:
# Kruskal-Wallis test for every pair of months, Moderna
month_labels = ["December", "January", "February", "March"]
print("Moderna")
for first in range(len(month_labels)):
  for second in range(first + 1, len(month_labels)):
    print(month_labels[first] + " vs " + month_labels[second])
    print(stats.kruskal(moderna_data_arr[first], moderna_data_arr[second]))

Moderna
December vs January
KruskalResult(statistic=2.685986821244379, pvalue=0.10123449372951232)
December vs February
KruskalResult(statistic=0.16400946321218926, pvalue=0.6854916106713289)
December vs March
KruskalResult(statistic=0.4585088760316366, pvalue=0.49832167763511914)
January vs February
KruskalResult(statistic=0.9969538996701885, pvalue=0.31804869926965307)
January vs March
KruskalResult(statistic=1.2143355654687866, pvalue=0.27047517096105644)
February vs March
KruskalResult(statistic=0.000775342816950797, pvalue=0.9777857923539504)


In [30]:
# Kruskal-Wallis test for every pair of months, Pfizer
month_labels = ["December", "January", "February", "March"]
print("Pfizer")
for first in range(len(month_labels)):
  for second in range(first + 1, len(month_labels)):
    print(month_labels[first] + " vs " + month_labels[second])
    print(stats.kruskal(pfizer_data_arr[first], pfizer_data_arr[second]))

Pfizer
December vs January
KruskalResult(statistic=3.0845194132268787, pvalue=0.07904061175564364)
December vs February
KruskalResult(statistic=1.7705935031435756, pvalue=0.18330864540701636)
December vs March
KruskalResult(statistic=5.375358907153374, pvalue=0.020423138667920226)
January vs February
KruskalResult(statistic=5.759373582652753, pvalue=0.016400918057897052)
January vs March
KruskalResult(statistic=10.147326345929136, pvalue=0.00144510681125009)
February vs March
KruskalResult(statistic=0.19089911644802385, pvalue=0.6621693419676211)


In [31]:
# Kruskal-Wallis test for every pair of months, AstraZeneca
month_labels = ["December", "January", "February", "March"]
print("AstraZeneca")
for first in range(len(month_labels)):
  for second in range(first + 1, len(month_labels)):
    print(month_labels[first] + " vs " + month_labels[second])
    print(stats.kruskal(astra_data_arr[first], astra_data_arr[second]))

AstraZeneca
December vs January
KruskalResult(statistic=6.371353846079353, pvalue=0.01159770818211136)
December vs February
KruskalResult(statistic=4.489278203354447, pvalue=0.03410807579714784)
December vs March
KruskalResult(statistic=12.784274136736956, pvalue=0.00034954536953429573)
January vs February
KruskalResult(statistic=0.020785471643531287, pvalue=0.8853649079462731)
January vs March
KruskalResult(statistic=1.9009801118611587, pvalue=0.16796865436133868)
February vs March
KruskalResult(statistic=1.748404236045532, pvalue=0.18607746793868557)


In [32]:
# Kruskal-Wallis test for every pair of months, Sputnik V
month_labels = ["December", "January", "February", "March"]
print("Sputnik V")
for first in range(len(month_labels)):
  for second in range(first + 1, len(month_labels)):
    print(month_labels[first] + " vs " + month_labels[second])
    print(stats.kruskal(sputnik_data_arr[first], sputnik_data_arr[second]))

Sputnik V
December vs January
KruskalResult(statistic=3.481578947368412, pvalue=0.0620555024536233)
December vs February
KruskalResult(statistic=0.0752043834342288, pvalue=0.783904665603639)
December vs March
KruskalResult(statistic=1.6505263157894632, pvalue=0.19888731184130973)
January vs February
KruskalResult(statistic=4.894329151213864, pvalue=0.026945040152561488)
January vs March
KruskalResult(statistic=0.8148496240601446, pvalue=0.36669033117057837)
February vs March
KruskalResult(statistic=3.1742766913054505, pvalue=0.07480632674058366)


In [33]:
# Kruskal-Wallis test for every pair of months, Sinopharm/Sinovac
month_labels = ["December", "January", "February", "March"]
print("Sinopharm")
for first in range(len(month_labels)):
  for second in range(first + 1, len(month_labels)):
    print(month_labels[first] + " vs " + month_labels[second])
    print(stats.kruskal(sino_data_arr[first], sino_data_arr[second]))

Sinopharm
December vs January
KruskalResult(statistic=2.120380434782618, pvalue=0.14535043563843503)
December vs February
KruskalResult(statistic=0.18530701754386314, pvalue=0.666851591455902)
December vs March
KruskalResult(statistic=0.3049066516558704, pvalue=0.5808226167842789)
January vs February
KruskalResult(statistic=1.0998350274067548, pvalue=0.29430231173962673)
January vs March
KruskalResult(statistic=3.195836947094525, pvalue=0.07382597228409642)
February vs March
KruskalResult(statistic=0.9570100589387827, pvalue=0.3279413426723332)


# Compare the 3 brands by month

In [34]:
# For each month, compare the three brands pairwise
# I think we also don't need this because of Games-Howell tests below

In [35]:
# Kruskal-Wallis test for every pair of the three brands, first monthly sample (index 0)
month_index = 0
print("December")
brand_samples = [("Moderna", moderna_data_arr), ("Pfizer", pfizer_data_arr), ("AstraZeneca", astra_data_arr)]
for first in range(len(brand_samples)):
  for second in range(first + 1, len(brand_samples)):
    name_a, samples_a = brand_samples[first]
    name_b, samples_b = brand_samples[second]
    print(name_a + " vs " + name_b)
    print(stats.kruskal(samples_a[month_index], samples_b[month_index]))

December
Moderna vs Pfizer
KruskalResult(statistic=16.04571733551754, pvalue=6.183131767930413e-05)
Moderna vs AstraZeneca
KruskalResult(statistic=0.06617149304520231, pvalue=0.7969948943403355)
Pfizer vs AstraZeneca
KruskalResult(statistic=7.6638012934139, pvalue=0.005633967843268963)


In [36]:
# Kruskal-Wallis test for every pair of the three brands, second monthly sample (index 1)
month_index = 1
print("January")
brand_samples = [("Moderna", moderna_data_arr), ("Pfizer", pfizer_data_arr), ("AstraZeneca", astra_data_arr)]
for first in range(len(brand_samples)):
  for second in range(first + 1, len(brand_samples)):
    name_a, samples_a = brand_samples[first]
    name_b, samples_b = brand_samples[second]
    print(name_a + " vs " + name_b)
    print(stats.kruskal(samples_a[month_index], samples_b[month_index]))

January
Moderna vs Pfizer
KruskalResult(statistic=5.315620467128346, pvalue=0.02113506611680189)
Moderna vs AstraZeneca
KruskalResult(statistic=1.866237855564744, pvalue=0.17190658525853525)
Pfizer vs AstraZeneca
KruskalResult(statistic=1.337904124629431, pvalue=0.24740391847824447)


In [37]:
# Kruskal-Wallis test for every pair of the three brands, third monthly sample (index 2)
month_index = 2
print("February")
brand_samples = [("Moderna", moderna_data_arr), ("Pfizer", pfizer_data_arr), ("AstraZeneca", astra_data_arr)]
for first in range(len(brand_samples)):
  for second in range(first + 1, len(brand_samples)):
    name_a, samples_a = brand_samples[first]
    name_b, samples_b = brand_samples[second]
    print(name_a + " vs " + name_b)
    print(stats.kruskal(samples_a[month_index], samples_b[month_index]))

February
Moderna vs Pfizer
KruskalResult(statistic=1.5014964753672757, pvalue=0.22044124825174943)
Moderna vs AstraZeneca
KruskalResult(statistic=2.4356784456069818, pvalue=0.1186025712089243)
Pfizer vs AstraZeneca
KruskalResult(statistic=0.44104930572871126, pvalue=0.5066164345322403)


In [38]:
# Kruskal-Wallis test for every pair of the three brands, fourth monthly sample (index 3)
month_index = 3
print("March")
brand_samples = [("Moderna", moderna_data_arr), ("Pfizer", pfizer_data_arr), ("AstraZeneca", astra_data_arr)]
for first in range(len(brand_samples)):
  for second in range(first + 1, len(brand_samples)):
    name_a, samples_a = brand_samples[first]
    name_b, samples_b = brand_samples[second]
    print(name_a + " vs " + name_b)
    print(stats.kruskal(samples_a[month_index], samples_b[month_index]))

March
Moderna vs Pfizer
KruskalResult(statistic=1.481727758841674, pvalue=0.22350439395321178)
Moderna vs AstraZeneca
KruskalResult(statistic=14.845770960676584, pvalue=0.00011666937159752654)
Pfizer vs AstraZeneca
KruskalResult(statistic=11.116774101358589, pvalue=0.0008555045322654087)


# Games-Howell

In [39]:
# Inspect the first monthly sample for Moderna (per-tweet sent_val values of the earliest selected month)
print(moderna_data_arr[0])

268       0.595511
823      -0.804023
1451      0.018192
2313     -0.028059
2462      0.604340
            ...   
105033    0.227078
105817    0.043568
106658    0.577966
110516    0.178700
110833    0.074501
Name: sent_val, Length: 134, dtype: float64


In [40]:
# Rebuild the tweet dataframe for the Games-Howell tests: unique English tweets
# that mention a brand (janssen excluded), then average the sentiment per day and brand
gh_columns = ["tweet_text", "time", "coords", "place", "tweet_id", "man", "sentiment", "language"]
gh_df = pd.DataFrame(all_files_text, columns=gh_columns)
gh_df = gh_df.drop_duplicates(subset="tweet_id", keep="first")
gh_df = gh_df.loc[(gh_df["language"] == "en") & ~gh_df["man"].isin(["no_man", "janssen"])]
gh_df = gh_df.assign(
  datetime=gh_df["time"].apply(str_to_datetime),
  sent_avg=gh_df["sentiment"].apply(sent_arr_to_val),
)

aggregator = {'sent_avg': 'mean'}

# One row per (date, brand) pair, with both keys as regular columns
gh_per_day_df = gh_df.groupby(["datetime", "man"]).agg(aggregator).sort_index().reset_index()


In [41]:
# Show the per-day, per-brand average sentiment table
print(gh_per_day_df)

       datetime      man  sent_avg
0    2020-01-25  sputnik  0.451401
1    2020-02-26  moderna  0.335324
2    2020-02-27  moderna  0.168734
3    2020-02-29  moderna -0.010340
4    2020-03-07  sputnik -0.969906
...         ...      ...       ...
1311 2021-09-08   pfizer -0.032835
1312 2021-09-08     sino  0.060707
1313 2021-09-09    astra -0.042038
1314 2021-09-09  moderna  0.002224
1315 2021-09-09   pfizer -0.266878

[1316 rows x 3 columns]


In [59]:
# This tests the overall distributions for each brand
# (Games-Howell pairwise comparison of the per-day average sentiment between brands, rounded to 3 decimals)
full_range_all_results_df = pg.pairwise_gameshowell(data=gh_per_day_df, dv="sent_avg", between="man").round(3)
print(full_range_all_results_df)

         A        B  mean(A)  mean(B)   diff     se      T       df   pval  \
0    astra  moderna    0.032    0.110 -0.078  0.032 -2.445  636.881  0.105   
1    astra   pfizer    0.032    0.018  0.014  0.025  0.561  564.234  0.980   
2    astra     sino    0.032   -0.003  0.035  0.044  0.796  246.684  0.932   
3    astra  sputnik    0.032   -0.104  0.136  0.045  3.008  225.339  0.024   
4  moderna   pfizer    0.110    0.018  0.092  0.028  3.275  557.680  0.010   
5  moderna     sino    0.110   -0.003  0.113  0.046  2.472  278.721  0.100   
6  moderna  sputnik    0.110   -0.104  0.214  0.047  4.566  253.779  0.000   
7   pfizer     sino    0.018   -0.003  0.021  0.041  0.503  199.599  0.987   
8   pfizer  sputnik    0.018   -0.104  0.122  0.043  2.854  183.822  0.038   
9     sino  sputnik   -0.003   -0.104  0.101  0.056  1.811  298.552  0.369   

   hedges  
0  -0.193  
1   0.043  
2   0.078  
3   0.302  
4   0.246  
5   0.239  
6   0.451  
7   0.048  
8   0.279  
9   0.208  


In [43]:
# Preview the first ten rows of the Moderna dataframe
print(moderna_df.head(10))

                                            tweet_text  \
268  BBC News - Moderna vaccine safe and effective,...   
335  @glenn_resists @smartalek180 I'm an old lady. ...   
392  @TheRealFTP The 1st one of Moderna had me cryi...   
395  @rongallo Good For You @rongallo I’m Fully Mod...   
524  Just got shot number 2 (Moderna). Sitting in m...   
584  COVID decimated restaurants and bars worldwide...   
621  Got on a #CovidVaccine waste list (vaccines le...   
823  @kaitlancollins @caityweaver Meanwhile out her...   
838  .@sbancel You have a responsibility to make su...   
851  @DianeBernaerts In my particular case, I recei...   

                          time coords place             tweet_id      man  \
268  2020-12-16 02:42:08+00:00   None  None  1339038071023517696  moderna   
335  2021-03-29 15:21:30+00:00   None  None  1376555119710515206  moderna   
392  2021-04-24 13:57:29+00:00   None  None  1385956059777404936  moderna   
395  2021-08-04 17:44:56+00:00   None  None  14229768

In [44]:
# For each brand, get only the data of the months we're testing
FOUR_MONTHS = ["2020-12-01", "2021-01-01", "2021-02-01", "2021-03-01"]


def _keep_months(brand_df, months=FOUR_MONTHS):
  """Return the rows of brand_df whose month column equals one of months.

  The rows are stacked month by month, in the order given by months.
  """
  # DataFrame.append was removed in pandas 2.0; pd.concat stacks the monthly
  # slices in the same order without it.
  return pd.concat([brand_df.loc[brand_df["month"] == month] for month in months])


moderna_4_months_df = _keep_months(moderna_df)
pfizer_4_months_df = _keep_months(pfizer_df)
astra_4_months_df = _keep_months(astra_df)
sputnik_4_months_df = _keep_months(sputnik_df)
sino_4_months_df = _keep_months(sino_df)

In [45]:
# For each brand, pairwise compare months

In [87]:
# Games-Howell pairwise comparison of Moderna tweet sentiment between the four months (rounded to 3 decimals)
moderna_4_months_results_df = pg.pairwise_gameshowell(data=moderna_4_months_df, dv="sent_val", between="month").round(3)
print(moderna_4_months_results_df)

           A          B  mean(A)  mean(B)   diff     se      T       df  \
0 2020-12-01 2021-01-01    0.231    0.093  0.138  0.070  1.975  250.217   
1 2020-12-01 2021-02-01    0.231    0.182  0.049  0.089  0.547  115.349   
2 2020-12-01 2021-03-01    0.231    0.203  0.028  0.069  0.406  263.214   
3 2021-01-01 2021-02-01    0.093    0.182 -0.090  0.093 -0.963  131.572   
4 2021-01-01 2021-03-01    0.093    0.203 -0.111  0.074 -1.491  258.799   
5 2021-02-01 2021-03-01    0.182    0.203 -0.021  0.092 -0.225  128.173   

    pval  hedges  
0  0.200   0.244  
1  0.947   0.081  
2  0.977   0.049  
3  0.771  -0.144  
4  0.444  -0.184  
5  0.996  -0.033  


In [61]:
# Games-Howell pairwise comparison of Pfizer tweet sentiment between the four months (rounded to 3 decimals)
pfizer_4_months_results_df = pg.pairwise_gameshowell(data=pfizer_4_months_df, dv="sent_val", between="month").round(3)
print(pfizer_4_months_results_df)

           A          B  mean(A)  mean(B)   diff     se      T       df  \
0 2020-12-01 2021-01-01    0.019   -0.029  0.048  0.040  1.195  583.267   
1 2020-12-01 2021-02-01    0.019    0.105 -0.086  0.047 -1.829  325.317   
2 2020-12-01 2021-03-01    0.019    0.124 -0.105  0.039 -2.668  609.360   
3 2021-01-01 2021-02-01   -0.029    0.105 -0.134  0.054 -2.487  444.353   
4 2021-01-01 2021-03-01   -0.029    0.124 -0.153  0.047 -3.239  651.404   
5 2021-02-01 2021-03-01    0.105    0.124 -0.019  0.054 -0.355  443.833   

    pval  hedges  
0  0.630   0.082  
1  0.262  -0.145  
2  0.039  -0.181  
3  0.063  -0.220  
4  0.007  -0.253  
5  0.985  -0.031  


In [62]:
# Games-Howell pairwise comparison of AstraZeneca tweet sentiment between the four months (rounded to 3 decimals)
astra_4_months_results_df = pg.pairwise_gameshowell(data=astra_4_months_df, dv="sent_val", between="month").round(3)
print(astra_4_months_results_df)

           A          B  mean(A)  mean(B)   diff     se      T       df  \
0 2020-12-01 2021-01-01    0.228    0.019  0.210  0.076  2.762  123.804   
1 2020-12-01 2021-02-01    0.228    0.071  0.157  0.083  1.890  132.418   
2 2020-12-01 2021-03-01    0.228   -0.023  0.251  0.068  3.685   96.148   
3 2021-01-01 2021-02-01    0.019    0.071 -0.052  0.073 -0.716  151.743   
4 2021-01-01 2021-03-01    0.019   -0.023  0.041  0.055  0.751  186.901   
5 2021-02-01 2021-03-01    0.071   -0.023  0.094  0.065  1.444  123.680   

    pval  hedges  
0  0.033   0.450  
1  0.237   0.322  
2  0.002   0.514  
3  0.891  -0.110  
4  0.876   0.089  
5  0.475   0.185  


In [63]:
# Games-Howell pairwise comparison of Sputnik tweet sentiment between the four months (rounded to 3 decimals)
sputnik_4_months_results_df = pg.pairwise_gameshowell(data=sputnik_4_months_df, dv="sent_val", between="month").round(3)
print(sputnik_4_months_results_df)

           A          B  mean(A)  mean(B)   diff     se      T      df   pval  \
0 2020-12-01 2021-01-01    0.071   -0.380  0.451  0.237  1.905  15.919  0.265   
1 2020-12-01 2021-02-01    0.071    0.102 -0.032  0.220 -0.144  17.498  0.999   
2 2020-12-01 2021-03-01    0.071   -0.208  0.279  0.214  1.301  15.812  0.576   
3 2021-01-01 2021-02-01   -0.380    0.102 -0.482  0.194 -2.487  17.093  0.098   
4 2021-01-01 2021-03-01   -0.380   -0.208 -0.172  0.187 -0.921  15.050  0.794   
5 2021-02-01 2021-03-01    0.102   -0.208  0.310  0.166  1.871  40.000  0.257   

   hedges  
0   0.860  
1  -0.053  
2   0.494  
3  -0.994  
4  -0.377  
5   0.569  


In [138]:
# Games-Howell pairwise comparison of Sinopharm/Sinovac tweet sentiment between the four months (rounded to 3 decimals)
sino_4_months_results_df = pg.pairwise_gameshowell(data=sino_4_months_df, dv="sent_val", between="month").round(3)
print(sino_4_months_results_df)

           A          B  mean(A)  mean(B)   diff     se      T      df   pval  \
0 2020-12-01 2021-01-01    0.052   -0.176  0.228  0.144  1.581  32.078  0.403   
1 2020-12-01 2021-02-01    0.052    0.052 -0.000  0.138 -0.003  28.517  1.000   
2 2020-12-01 2021-03-01    0.052    0.136 -0.084  0.158 -0.534  36.316  0.950   
3 2021-01-01 2021-02-01   -0.176    0.052 -0.228  0.122 -1.866  39.990  0.259   
4 2021-01-01 2021-03-01   -0.176    0.136 -0.312  0.144 -2.166  43.688  0.149   
5 2021-02-01 2021-03-01    0.052    0.136 -0.084  0.138 -0.607  39.666  0.929   

   hedges  
0   0.504  
1  -0.001  
2  -0.169  
3  -0.567  
4  -0.621  
5  -0.183  


In [51]:
# Get a dataframe for each month, containing the tweet-level data of all brands
brand_4_months_dfs = [moderna_4_months_df, pfizer_4_months_df, astra_4_months_df,
                      sputnik_4_months_df, sino_4_months_df]


def _rows_for_month(month):
  """Return one dataframe per brand holding the rows whose month column equals month."""
  return [brand_df.loc[brand_df["month"] == month] for brand_df in brand_4_months_dfs]


all_dec_arr = _rows_for_month("2020-12-01")
all_dec_df = pd.concat(all_dec_arr)

all_jan_arr = _rows_for_month("2021-01-01")
all_jan_df = pd.concat(all_jan_arr)

# The February date literals used to carry a stray trailing space and tab,
# which made the match depend on lenient date-string parsing.
all_feb_arr = _rows_for_month("2021-02-01")
all_feb_df = pd.concat(all_feb_arr)

all_mar_arr = _rows_for_month("2021-03-01")
all_mar_df = pd.concat(all_mar_arr)

In [52]:
# For each month, pairwise compare brands

In [65]:
# Games-Howell pairwise comparison between brands, December 2020 tweets (rounded to 3 decimals)
all_dec_results_df = pg.pairwise_gameshowell(data=all_dec_df, dv="sent_val", between="man").round(3)
print(all_dec_results_df)

         A        B  mean(A)  mean(B)   diff     se      T       df   pval  \
0    astra  moderna    0.228    0.231 -0.003  0.076 -0.034  129.957  1.000   
1    astra   pfizer    0.228    0.019  0.209  0.064  3.261   77.025  0.014   
2    astra     sino    0.228    0.052  0.176  0.127  1.391   24.639  0.639   
3    astra  sputnik    0.228    0.071  0.158  0.193  0.819   11.062  0.919   
4  moderna   pfizer    0.231    0.019  0.212  0.050  4.225  196.472  0.000   
5  moderna     sino    0.231    0.052  0.179  0.120  1.487   20.325  0.582   
6  moderna  sputnik    0.231    0.071  0.160  0.189  0.851   10.140  0.908   
7   pfizer     sino    0.019    0.052 -0.033  0.113 -0.291   16.123  0.998   
8   pfizer  sputnik    0.019    0.071 -0.052  0.184 -0.281    9.247  0.998   
9     sino  sputnik    0.052    0.071 -0.019  0.214 -0.087   15.629  1.000   

   hedges  
0  -0.005  
1   0.434  
2   0.386  
3   0.276  
4   0.402  
5   0.391  
6   0.277  
7  -0.074  
8  -0.089  
9  -0.034  


In [66]:
# Games-Howell pairwise comparison between brands, January 2021 tweets (rounded to 3 decimals)
all_jan_results_df = pg.pairwise_gameshowell(data=all_jan_df, dv="sent_val", between="man").round(3)
print(all_jan_results_df)

         A        B  mean(A)  mean(B)   diff     se      T       df   pval  \
0    astra  moderna    0.019    0.093 -0.074  0.070 -1.050  218.997  0.832   
1    astra   pfizer    0.019   -0.029  0.047  0.057  0.838  203.273  0.918   
2    astra     sino    0.019   -0.176  0.195  0.102  1.907   33.842  0.333   
3    astra  sputnik    0.019   -0.380  0.399  0.157  2.543    8.351  0.169   
4  moderna   pfizer    0.093   -0.029  0.121  0.063  1.923  231.076  0.308   
5  moderna     sino    0.093   -0.176  0.269  0.106  2.538   38.748  0.103   
6  moderna  sputnik    0.093   -0.380  0.473  0.159  2.968    8.874  0.090   
7   pfizer     sino   -0.029   -0.176  0.147  0.097  1.514   28.285  0.562   
8   pfizer  sputnik   -0.029   -0.380  0.351  0.154  2.286    7.717  0.245   
9     sino  sputnik   -0.176   -0.380  0.204  0.176  1.161   12.610  0.772   

   hedges  
0  -0.142  
1   0.098  
2   0.441  
3   0.930  
4   0.201  
5   0.572  
6   1.076  
7   0.326  
8   0.816  
9   0.464  


In [68]:
# Games-Howell pairwise comparison between brands, February 2021 tweets (rounded to 3 decimals)
all_feb_results_df = pg.pairwise_gameshowell(data=all_feb_df, dv="sent_val", between="man").round(3)
print(all_feb_results_df)

         A        B  mean(A)  mean(B)   diff     se      T       df   pval  \
0    astra  moderna    0.071    0.182 -0.111  0.095 -1.166  127.111  0.771   
1    astra   pfizer    0.071    0.105 -0.034  0.071 -0.480  161.684  0.989   
2    astra     sino    0.071    0.052  0.019  0.099  0.189   37.783  1.000   
3    astra  sputnik    0.071    0.102 -0.031  0.135 -0.229   32.049  0.999   
4  moderna   pfizer    0.182    0.105  0.077  0.087  0.884  110.423  0.902   
5  moderna     sino    0.182    0.052  0.130  0.112  1.163   52.708  0.772   
6  moderna  sputnik    0.182    0.102  0.080  0.145  0.553   40.336  0.981   
7   pfizer     sino    0.105    0.052  0.053  0.092  0.576   28.697  0.978   
8   pfizer  sputnik    0.105    0.102  0.003  0.130  0.022   27.410  1.000   
9     sino  sputnik    0.052    0.102 -0.050  0.147 -0.338   36.872  0.997   

   hedges  
0  -0.194  
1  -0.064  
2   0.048  
3  -0.054  
4   0.123  
5   0.299  
6   0.132  
7   0.138  
8   0.005  
9  -0.103  


In [69]:
# Games-Howell pairwise comparison between brands, March 2021 tweets (rounded to 3 decimals)
all_mar_results_df = pg.pairwise_gameshowell(data=all_mar_df, dv="sent_val", between="man").round(3)
print(all_mar_results_df)

         A        B  mean(A)  mean(B)   diff     se      T       df   pval  \
0    astra  moderna   -0.023    0.203 -0.226  0.060 -3.755  235.986  0.002   
1    astra   pfizer   -0.023    0.124 -0.147  0.045 -3.226  625.000  0.011   
2    astra     sino   -0.023    0.136 -0.159  0.116 -1.372   26.722  0.650   
3    astra  sputnik   -0.023   -0.208  0.185  0.116  1.603   20.923  0.512   
4  moderna   pfizer    0.203    0.124  0.079  0.061  1.290  251.124  0.697   
5  moderna     sino    0.203    0.136  0.067  0.123  0.545   33.561  0.982   
6  moderna  sputnik    0.203   -0.208  0.411  0.123  3.353   26.335  0.019   
7   pfizer     sino    0.124    0.136 -0.012  0.116 -0.103   27.234  1.000   
8   pfizer  sputnik    0.124   -0.208  0.332  0.116  2.860   21.326  0.063   
9     sino  sputnik    0.136   -0.208  0.344  0.158  2.184   40.404  0.206   

   hedges  
0  -0.390  
1  -0.258  
2  -0.291  
3   0.379  
4   0.131  
5   0.120  
6   0.817  
7  -0.022  
8   0.673  
9   0.658  


In [132]:
# Print output in latex table form
def latex_print(gh_result_df):
  """Print each row of a Games-Howell result frame as a LaTeX table row.

  Every printed row has the form "A & B & diff & pval", followed by a line
  holding the LaTeX row break and a line holding \\hline. When the A column
  holds datetimes, A and B are shown as abbreviated month name plus year
  (e.g. "Dec 2020"); otherwise they are printed as they are.
  """
  row_end = "\n \\\\ \n \\hline"
  for _, row in gh_result_df.iterrows():
    first, second = row["A"], row["B"]
    if isinstance(first, datetime):
      first = first.month_name()[0:3] + " " + str(first.year)
      second = second.month_name()[0:3] + " " + str(second.year)
    cells = [first, second, str(row["diff"]), str(row["pval"])]
    print(" & ".join(cells) + row_end)

In [126]:
# Print the LaTeX rows (A & B & diff & pval) for full_range_all_results_df.
# NOTE(review): the saved output of this cell shows "hline" without a leading
# backslash, and its execution count (126) is lower than that of the
# latex_print definition cell (132) -- the output looks stale; re-run to refresh.
latex_print(full_range_all_results_df)

astra & moderna & -0.078 & 0.105
 \\ 
 hline
astra & pfizer & 0.014 & 0.98
 \\ 
 hline
astra & sino & 0.035 & 0.932
 \\ 
 hline
astra & sputnik & 0.136 & 0.024
 \\ 
 hline
moderna & pfizer & 0.092 & 0.01
 \\ 
 hline
moderna & sino & 0.113 & 0.1
 \\ 
 hline
moderna & sputnik & 0.214 & 0.0
 \\ 
 hline
pfizer & sino & 0.021 & 0.987
 \\ 
 hline
pfizer & sputnik & 0.122 & 0.038
 \\ 
 hline
sino & sputnik & 0.101 & 0.369
 \\ 
 hline


In [133]:
# Print the LaTeX rows for moderna_4_months_results_df (built outside this
# section); the saved output shows month/year labels for the compared groups.
latex_print(moderna_4_months_results_df)

Dec 2020 & Jan 2021 & 0.138 & 0.2
 \\ 
 \hline
Dec 2020 & Feb 2021 & 0.049 & 0.947
 \\ 
 \hline
Dec 2020 & Mar 2021 & 0.028 & 0.977
 \\ 
 \hline
Jan 2021 & Feb 2021 & -0.09 & 0.771
 \\ 
 \hline
Jan 2021 & Mar 2021 & -0.111 & 0.444
 \\ 
 \hline
Feb 2021 & Mar 2021 & -0.021 & 0.996
 \\ 
 \hline


In [134]:
# Print the LaTeX rows for pfizer_4_months_results_df (built outside this
# section); the saved output shows month/year labels for the compared groups.
latex_print(pfizer_4_months_results_df)

Dec 2020 & Jan 2021 & 0.048 & 0.63
 \\ 
 \hline
Dec 2020 & Feb 2021 & -0.086 & 0.262
 \\ 
 \hline
Dec 2020 & Mar 2021 & -0.105 & 0.039
 \\ 
 \hline
Jan 2021 & Feb 2021 & -0.134 & 0.063
 \\ 
 \hline
Jan 2021 & Mar 2021 & -0.153 & 0.007
 \\ 
 \hline
Feb 2021 & Mar 2021 & -0.019 & 0.985
 \\ 
 \hline


In [135]:
# NOTE(review): this call is identical to the one in the preceding cell
# (In [134]) and prints the same pfizer_4_months_results_df rows again --
# possibly a different results table was intended here; confirm or drop the cell.
latex_print(pfizer_4_months_results_df)

Dec 2020 & Jan 2021 & 0.048 & 0.63
 \\ 
 \hline
Dec 2020 & Feb 2021 & -0.086 & 0.262
 \\ 
 \hline
Dec 2020 & Mar 2021 & -0.105 & 0.039
 \\ 
 \hline
Jan 2021 & Feb 2021 & -0.134 & 0.063
 \\ 
 \hline
Jan 2021 & Mar 2021 & -0.153 & 0.007
 \\ 
 \hline
Feb 2021 & Mar 2021 & -0.019 & 0.985
 \\ 
 \hline


In [136]:
# Print the LaTeX rows for sputnik_4_months_results_df (built outside this
# section); the saved output shows month/year labels for the compared groups.
latex_print(sputnik_4_months_results_df)

Dec 2020 & Jan 2021 & 0.451 & 0.265
 \\ 
 \hline
Dec 2020 & Feb 2021 & -0.032 & 0.999
 \\ 
 \hline
Dec 2020 & Mar 2021 & 0.279 & 0.576
 \\ 
 \hline
Jan 2021 & Feb 2021 & -0.482 & 0.098
 \\ 
 \hline
Jan 2021 & Mar 2021 & -0.172 & 0.794
 \\ 
 \hline
Feb 2021 & Mar 2021 & 0.31 & 0.257
 \\ 
 \hline


In [137]:
# Print the LaTeX rows for sino_4_months_results_df (built outside this
# section); the saved output shows month/year labels for the compared groups.
latex_print(sino_4_months_results_df)

Dec 2020 & Jan 2021 & 0.228 & 0.403
 \\ 
 \hline
Dec 2020 & Feb 2021 & -0.0 & 1.0
 \\ 
 \hline
Dec 2020 & Mar 2021 & -0.084 & 0.95
 \\ 
 \hline
Jan 2021 & Feb 2021 & -0.228 & 0.259
 \\ 
 \hline
Jan 2021 & Mar 2021 & -0.312 & 0.149
 \\ 
 \hline
Feb 2021 & Mar 2021 & -0.084 & 0.929
 \\ 
 \hline


In [139]:
# Print the LaTeX rows for all_dec_results_df (built outside this section);
# the saved output shows manufacturer-vs-manufacturer pairs.
latex_print(all_dec_results_df)

astra & moderna & -0.003 & 1.0
 \\ 
 \hline
astra & pfizer & 0.209 & 0.014
 \\ 
 \hline
astra & sino & 0.176 & 0.639
 \\ 
 \hline
astra & sputnik & 0.158 & 0.919
 \\ 
 \hline
moderna & pfizer & 0.212 & 0.0
 \\ 
 \hline
moderna & sino & 0.179 & 0.582
 \\ 
 \hline
moderna & sputnik & 0.16 & 0.908
 \\ 
 \hline
pfizer & sino & -0.033 & 0.998
 \\ 
 \hline
pfizer & sputnik & -0.052 & 0.998
 \\ 
 \hline
sino & sputnik & -0.019 & 1.0
 \\ 
 \hline


In [140]:
# Print the LaTeX rows for all_jan_results_df (built outside this section);
# the saved output shows manufacturer-vs-manufacturer pairs.
latex_print(all_jan_results_df)

astra & moderna & -0.074 & 0.832
 \\ 
 \hline
astra & pfizer & 0.047 & 0.918
 \\ 
 \hline
astra & sino & 0.195 & 0.333
 \\ 
 \hline
astra & sputnik & 0.399 & 0.169
 \\ 
 \hline
moderna & pfizer & 0.121 & 0.308
 \\ 
 \hline
moderna & sino & 0.269 & 0.103
 \\ 
 \hline
moderna & sputnik & 0.473 & 0.09
 \\ 
 \hline
pfizer & sino & 0.147 & 0.562
 \\ 
 \hline
pfizer & sputnik & 0.351 & 0.245
 \\ 
 \hline
sino & sputnik & 0.204 & 0.772
 \\ 
 \hline


In [141]:
# Print the LaTeX rows for all_feb_results_df (built outside this section);
# the saved output shows manufacturer-vs-manufacturer pairs.
latex_print(all_feb_results_df)

astra & moderna & -0.111 & 0.771
 \\ 
 \hline
astra & pfizer & -0.034 & 0.989
 \\ 
 \hline
astra & sino & 0.019 & 1.0
 \\ 
 \hline
astra & sputnik & -0.031 & 0.999
 \\ 
 \hline
moderna & pfizer & 0.077 & 0.902
 \\ 
 \hline
moderna & sino & 0.13 & 0.772
 \\ 
 \hline
moderna & sputnik & 0.08 & 0.981
 \\ 
 \hline
pfizer & sino & 0.053 & 0.978
 \\ 
 \hline
pfizer & sputnik & 0.003 & 1.0
 \\ 
 \hline
sino & sputnik & -0.05 & 0.997
 \\ 
 \hline


In [142]:
# Print the LaTeX rows for all_mar_results_df, the rounded
# pg.pairwise_gameshowell result computed in cell In [69].
latex_print(all_mar_results_df)

astra & moderna & -0.226 & 0.002
 \\ 
 \hline
astra & pfizer & -0.147 & 0.011
 \\ 
 \hline
astra & sino & -0.159 & 0.65
 \\ 
 \hline
astra & sputnik & 0.185 & 0.512
 \\ 
 \hline
moderna & pfizer & 0.079 & 0.697
 \\ 
 \hline
moderna & sino & 0.067 & 0.982
 \\ 
 \hline
moderna & sputnik & 0.411 & 0.019
 \\ 
 \hline
pfizer & sino & -0.012 & 1.0
 \\ 
 \hline
pfizer & sputnik & 0.332 & 0.063
 \\ 
 \hline
sino & sputnik & 0.344 & 0.206
 \\ 
 \hline
