# Analysis
This notebook creates features from raw tables and visualizes the results.

## 1. Set environment
Import libraries

In [None]:
import numpy as np
import pandas as pd
from math import floor
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from wordcloud import WordCloud, STOPWORDS
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import statsmodels.api as sm
from isodate import parse_duration
from scipy.stats import ttest_ind
from stargazer.stargazer import Stargazer

The following cell parses json files. Avoid running it again.

Counts:
- All videos
    - 1,928
- All videos with at least one comment in first 12 hours
    - 1,846
- All videos with at least one comment in English in first 12 hours
    - 1,814
- All videos with at least one comment in first 12 hours and excluding fuzzy window
    - 1516
- All videos with at least one comment in English in first 12 hours excluding fuzzy window
    - 1504
- All comments
    - 1,197,454

Read datasets

In [None]:
# Video details table
d1 = pd.read_csv('../../dat/videoDetails.csv')

# Classified comments without dropping observations
# d2 = pd.read_csv('../../dat/videoFlagsFuzzy.csv')

# Classified ENGLISH comments without dropping observations
d3 = pd.read_csv('../../dat/videoFlagsFuzzyLangid.csv')

# Create dataframe for analysis
df = pd.merge(d1, d3, on='videoId', how='right')

Create masks for each time window

In [None]:
# Convert publishedAt to timestamp
df['publishedAt'] = pd.to_datetime(df['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ')

# Start of policy rollout
start = pd.Timestamp('2021-11-10 00:00:00')

# Time windows
hours = np.arange(12, 72+1, 12)

# Init dict in which to store masks
donuts = {}

# Iterate over time windows
for h in hours:

    # Set max time of upload before treatment
    lim = start - pd.Timedelta(hours=h)

    # Current mask
    d = {
        h:(
            # Pre-treatment
            df['publishedAt'].le(lim) |
            # Post treatment
            df['publishedAt'].ge('2021-11-11')
        )
    }

    # Append masks to empty dict
    donuts = {**donuts, **d}

# Delete temporary dict
del d

## 2. Feature creation
Add constant term

In [None]:
df['const'] = 1

Turn `definition` to dummy

In [None]:
df['definition'] = df['definition'].replace({'sd':'0','hd':'1'}).astype(int)

Create targets from counters

In [None]:
# Create all targets
for h in hours:

    # NCRs
    df[f'ncr1Post{h}'] = df[f'post{h}CommentsNeg1'] / df[f'post{h}CommentsNum'].replace(0, 1)
    df[f'ncr2Post{h}'] = df[f'post{h}CommentsNeg2'] / df[f'post{h}CommentsNum'].replace(0, 1)
    
    # PCRs
    df[f'pcr1Post{h}'] = df[f'post{h}CommentsPos1'] / df[f'post{h}CommentsNum'].replace(0, 1)
    df[f'pcr2Post{h}'] = df[f'post{h}CommentsPos2'] / df[f'post{h}CommentsNum'].replace(0, 1)
    
    # Relative ratios
    df[f'rel1Post{h}'] = np.where(
        (df[f'post{h}CommentsNeg1'] > 0) & (df[f'post{h}CommentsPos1'] > 0),
        df[f'post{h}CommentsNeg1'] / df[f'post{h}CommentsPos1'],
        np.nan
    )
    df[f'rel2Post{h}'] = np.where(
        (df[f'post{h}CommentsNeg2'] > 0) & (df[f'post{h}CommentsPos2'] > 0),
        df[f'post{h}CommentsNeg2'] / df[f'post{h}CommentsPos2'],
        np.nan
    )

Analyze most important words in video titles to create dummy variables

In [None]:
# All titles to single text
text = ' '.join(df['title'].str.title().tolist())

# Remove annoying strings
for string in ["'s",".","-"]:
    text = text.replace(string, '')

# All words to uppercase
text = text.upper()
# Stopwords
stopwords = set(list(STOPWORDS) + ['SAY','SAYS','S'])

# Plot wordcloud
wordcloud = WordCloud(
    background_color='white',
    max_words=25,
    stopwords=stopwords,
    max_font_size=40, 
    scale=3,
    random_state=42
).generate(text)

# Show wordcloud
fig = plt.figure(figsize=(12, 12))
plt.axis('off')
plt.imshow(wordcloud)
plt.show()

Create dummy variables by topic

In [None]:
# Title to lowercase
df['title'] = df['title'].str.lower()

# Dictionary of keywords
topics = {
    'biden':'biden',
    'trump':'trump',
    'president':'biden|trump',
    'climate':'cop26|cop 26|climate',
    'economy':'inflation|infrastructure|bill|economy',
    'covid':'covid|covid19|covid-19|virus',
    'violence':'kill|murder|assassins| die|dead|shoot|shot'
}

# Create Indicator variables
for topic in topics.keys():
    df[topic] = np.where(df['title'].str.contains(topics[topic]), 1, 0)

Video title sentiment

In [None]:
clf = SentimentIntensityAnalyzer()
df['toneCom'] = df['title'].apply(lambda x: clf.polarity_scores(x)['compound'])
df['tonePos'] = df['title'].apply(lambda x: clf.polarity_scores(x)['pos'])
df['toneNeg'] = df['title'].apply(lambda x: clf.polarity_scores(x)['neg'])

Translate `duration` to seconds.

In [None]:
# YT-duration format to seconds
df['seconds'] = df['duration'].apply(lambda x: int(parse_duration(x).total_seconds()))

# log(seconds)
df['logSeconds'] = np.log(df['seconds'])

Sort data by upload date

In [None]:
df = df.sort_values('publishedAt', ascending=True).reset_index(drop=True)

Treatment indicator

In [None]:
df['treat'] = (df['publishedAt'] >= '2021-11-10').astype(int)

Declare running variable $R_i$ and interaction term $R_i \times T_i$
- Before: Seconds until treatment (control was positive, treatment was negative)
- Update: Seconds since treatment (control is negative, treatment is positive)

In [None]:
# Running variable
df['r'] = (df['publishedAt'] - pd.Timestamp('2021-11-10')).dt.total_seconds()

# Interaction
df['rTreat'] = df['r'].multiply(df['treat'])

## 3. Balance tests
List of windows in format `Post%H`

In [None]:
windows = [f'Post{h}' for h in hours]

### 3.1. Descriptive statistics
Number of available videos as a function of $h$

In [None]:
# All post%HCommentsNum columns
cols = [f'post{str(h)}CommentsNum' for h in hours]

# Merge to get videoId & post{h}CommentsNum (full list of videos!)
t = pd.merge(d1[['videoId','publishedAt']], d3[['videoId'] + cols], on='videoId', how='left')

# publishedAt to Timestamp
t['publishedAt'] = pd.to_datetime(t['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ')

# Create pre & post groups
t['treat'] = t['publishedAt'] >= '2021-11-10'

# Mask each column to avoid overlap with 2021-11-10
for h, col in zip(hours, cols):
    t[col] = t['publishedAt'].le(pd.Timestamp('2021-11-10') - pd.Timedelta(hours=h)) | t['publishedAt'].ge('2021-11-11')

# Group by treatment and get counts
t = t.groupby('treat').agg(dict(zip(cols, ['sum']*7))).transpose()

# Total number of videos column
t['total'] = t.sum(axis=1)

# Format
t.index = ['h = ' + str(h) for h in windows]
t

# To latex
# print(
#    t.to_latex(
#         caption='Number of available videos before and after November 10, 2021 for different values of $h$',
#         label='tab_dat_nobs'
#     )
# )

### 3.2. Balance

Balance table for videos closest to the cutoff: $R_{hours} \in [-36,-12] \cup [24,36]$

1. Difference in means for binary covariates
1. RDD for continuous covariates

$$X_i = \gamma_0 + \gamma_1 r_i + \gamma_2 T_i + \gamma_3 r_i T_i + V_i$$

In [None]:
# Mask to avoid overlap with November 10
mask = (df['r'] <= -12*60**2) | (df['r'] >= 24*60**2)
print(f'{mask.sum()} videos used in balance test.')

# Copy data
d = df[mask].copy()

# Create video length (minutes)
d['durationMins'] = d['seconds'].div(60)

# Order frequent-word variables by frequency
X = list(topics.keys()) + ['definition','durationMins','tonePos','toneNeg','toneCom']

# Regress each variable on r and treat
data = []
for x in X:
    m = sm.OLS(endog=d[x], exog=d[['const','r','treat','rTreat']]).fit()
    data.append((m.params['treat'], m.pvalues['treat']))

# Summary table
t = pd.DataFrame(data=data, index=X, columns=['Estimated Value','p-value'])
t.index.rename('Covariate', inplace=True)
t.reset_index(inplace=True)
t.round(3)

# To latex
# print(t.to_latex(caption='Regression discontinuities on observable characteristics',
#                  label='tab_dat_balance', float_format='%.3f', index=False))

## 4. Regression Analysis
Fit all polynomial models with $d \in \{1, 2\}$.

### 4.1. First-degree
$Y_i = \beta_0 + \beta_1 r_i + \beta_2 T_i + \beta_3 T_i \times r_i + \gamma X_i + U_i$

In [None]:
# Empty list to store results in
d1 = []

# Iterate over targets
for target in ['ncr1', 'ncr2', 'pcr1', 'pcr2', 'rel1', 'rel2']:
    # Iterate over PostHH windows
    for h, post in zip(hours, windows):

        # Masks for `rel1` & `rel2`
        if 'rel' in target:
            mask = donuts[h] & (df[f'{target}{post}'].notna())
        
        # Masks for `ncr` and `pcr`
        else:
            mask = donuts[h] & (df[f'{post.lower()}CommentsNum'] > 0)

        # Formula for model
        formula = f'{target}{post} ~ treat + r + I(r * treat)'

        # Fit model
        m = sm.OLS.from_formula(formula=formula, data=df[mask]).fit(cov_type='HC0')

        # Append to list
        d1.append(m)

$NCR(h)$ summary

In [87]:
# Summary table
d1Res = pd.DataFrame(
    {
        'y':np.repeat(['ncr1', 'ncr2', 'pcr1', 'pcr2', 'rel1', 'rel2'], 6), # Targets
        'h':list(hours)*6, # Time windows
        'T':[d1[i].params['treat'] for i in range(len(d1))], # Betas
        'se':[d1[i].bse['treat'] for i in range(len(d1))], # Standard errors
        'pval':[d1[i].pvalues['treat'] for i in range(len(d1))], # p-values
        'mean':[d1[i].predict({'r':0, 'treat':0}).item() for i in range(len(d1))] # Mean at r=0
    }
)

# Reindex table
d1Res.set_index(['y', 'h'], inplace=True)

# Stars function
def stars(pval):
    if pval <= 0.01:
        return '***'
    elif pval <= 0.05:
        return '**'
    elif pval <= 0.1:
        return '*'
    else:
        return ''

# Vectorize function
d1Res['sig'] = d1Res['pval'].apply(lambda x: stars(x))

# View
d1Res.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,T,se,pval,mean,sig
y,h,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ncr1,12,0.036,0.016,0.022,0.358,**
ncr1,24,0.027,0.015,0.082,0.364,*
ncr1,36,0.02,0.016,0.227,0.373,
ncr1,48,0.021,0.017,0.207,0.373,
ncr1,60,0.024,0.018,0.173,0.369,
ncr1,72,0.029,0.018,0.12,0.365,
ncr2,12,0.048,0.017,0.005,0.505,***
ncr2,24,0.04,0.017,0.018,0.511,**
ncr2,36,0.033,0.018,0.061,0.521,*
ncr2,48,0.037,0.018,0.039,0.52,**


### 4.3. Robustness checks
#### 4.3.1. Control for president

### 4.4. Visualizations
Linear and quadratic RDD for $NCR(12)$

In [None]:
# Linrear and quadratic models
l, q = d1[0], d2[0]

# Data and predictions for lines
t = pd.DataFrame({'r':df.loc[mask, 'r'],
                  'l':d1[0].fittedvalues.values,
                  'q':d2[0].fittedvalues.values})
t['treat'] = (t['r'] > 0).astype(int)

# Data for scatter
bin_length = 4
s = df.loc[mask, ['ncr1Post12','r']].copy()
s['bin'] = (s['r'].div(60 * 60) / bin_length).apply(lambda x: floor(x))
s = s.groupby('bin')['ncr1Post12'].mean().reset_index(name='mean')
s['bin'] = s['bin'].multiply(bin_length * 60 * 60)

# Initialize figure
fig, axs = plt.subplots(nrows=1, ncols=2)
fig.set_figheight(5)
fig.set_figwidth(14)

# Plot model on each axis
for i, ax in enumerate(axs):
    # Plot linear model
    if i == 0:
        series = 'l'
    else:
        series = 'q'
    # Lines
    ax.plot(t.loc[t['treat'].eq(0), 'r'], t.loc[t['treat'].eq(0), series], color='C1')
    ax.plot(t.loc[t['treat'].eq(1), 'r'], t.loc[t['treat'].eq(1), series], color='C1')
    # Scatter
    ax.scatter(s['bin'], s['mean'], color='C0', alpha=0.5)
    # Shaded regions
    ax.axvspan(xmin=0, xmax=24*60*60, color='gray', alpha=0.3)
    ax.axvspan(xmin=-12*60*60, xmax=0, color='C0', alpha=0.3)
    # X&Y axes
    ax.set_xticks(np.arange(-5*24*60*60, 6*24*60*60+1, 24 * 60 * 60))
    ax.set_xticklabels(np.arange(-5*24, 6*24+1, 24), rotation=45)
    ax.set_xlim(-5*24*60*60, 6*24*60*60)
    ax.set_ylim(0, 0.7)
    # Labels
    title = {'l':'First-degree polynomial','q':'Second-degree polynomial'}
    ax.set_title(f'{title[series]}')
    ax.set_xlabel('Hours until policy')
    ax.set_ylabel('Negative Comment Ration (h=12)')
    ax.grid(which='major', axis='x')
# Save and show
if 'google.colab' not in sys.modules:
    plt.savefig('../../fig/fig_d1d2.png', dpi=200, bbox_inches='tight')
plt.show()

All linear models

In [None]:
# Initialize figure
fig, axs = plt.subplots(nrows=6, ncols=2)
fig.set_figheight(20)
fig.set_figwidth(14)

# Plot within each axis
for i, ax_row in enumerate(axs):
    # Left-right plots
    for j, ax in enumerate(ax_row):
        if j == 0:
            outcome = f'ncr1{windows[i]}'
            dotColor = 'blue'
            ax.set_ylabel(f'NCR(h = {hours[i]})')
            model = d1[i]
        else:
            outcome = f'ncr2{windows[i]}'
            dotColor = 'lightblue'
            ax.set_ylabel(f'sNCR(h = {hours[i]})')
            model = d1[6+i]
        # Masks
        mask = df['post12CommentsNum'].gt(0) & df[outcome].notna()
        # Line plots
        t = df.loc[mask, ['treat','r']].assign(pred = model.fittedvalues)
        ax.plot(t.loc[t['treat'].eq(0), 'r'], t.loc[t['treat'].eq(0), 'pred'], color='C1')
        ax.plot(t.loc[t['treat'].eq(1), 'r'], t.loc[t['treat'].eq(1), 'pred'], color='C1')
        # Scatter plots
        s = df[mask].copy()
        s['bin'] = (s['r'].div(4*60*60)).apply(lambda x: floor(x))
        s = s.groupby('bin')[outcome].mean().reset_index(name='mean')
        s['bin'] = s['bin'] * (4*60*60)
        ax.scatter(x=s['bin'], y=s['mean'], color='C0', alpha=0.5)
        # Shades
        ax.axvspan(xmin=0, xmax=24*60*60, color='gray', alpha=0.3)
        ax.axvspan(xmin=-hours[i]*60*60, xmax=0, color='C0', alpha=0.3)
        # Axes
        ax.set_ylim(0, 0.7)
        ax.set_xticks(np.arange(-5*24*60*60, 6*24*60*60+1, 24*60*60))
        ax.set_xticklabels(np.arange(-5*24, 6*24+1, 24))
        ax.grid(which='major', axis='x')
        if i == 0:
            ax.set_title({0:'Negative Comment Ratio',1:'Somewhat Negative Comment Ratio'}[j])
        if i == 5:
            ax.set_xlabel('Hours until policy')
# Show & save
if 'google.colab' not in sys.modules:
    plt.savefig('../../fig/fig_res_d1all.png', dpi=200, bbox_inches='tight')
plt.show()

In [None]:
# Initialize figure
fig, axs = plt.subplots(nrows=6, ncols=2)
fig.set_figheight(20)
fig.set_figwidth(14)

# Plot within each axis
for i, ax_row in enumerate(axs):
    # Left-right plots
    for j, ax in enumerate(ax_row):
        if j == 0:
            outcome = f'ncr1{windows[i]}'
            dotColor = 'blue'
            ax.set_ylabel(f'NCR(h = {hours[i]})')
            model = d2[i]
        else:
            outcome = f'ncr2{windows[i]}'
            dotColor = 'lightblue'
            ax.set_ylabel(f'sNCR(h = {hours[i]})')
            model = d2[6+i]
        # Masks
        mask = df['post12CommentsNum'].gt(0) & df[outcome].notna()
        # Line plots
        t = df.loc[mask, ['treat','r']].assign(pred = model.fittedvalues)
        ax.plot(t.loc[t['treat'].eq(0), 'r'], t.loc[t['treat'].eq(0), 'pred'], color='C1')
        ax.plot(t.loc[t['treat'].eq(1), 'r'], t.loc[t['treat'].eq(1), 'pred'], color='C1')
        # Scatter plots
        s = df[mask].copy()
        s['bin'] = (s['r'].div(4*60*60)).apply(lambda x: floor(x))
        s = s.groupby('bin')[outcome].mean().reset_index(name='mean')
        s['bin'] = s['bin'] * (4*60*60)
        ax.scatter(x=s['bin'], y=s['mean'], color='C0', alpha=0.5)
        # Shades
        ax.axvspan(xmin=0, xmax=24*60*60, color='gray', alpha=0.3)
        ax.axvspan(xmin=-hours[i]*60*60, xmax=0, color='C0', alpha=0.3)
        # Axes
        ax.set_ylim(0, 0.7)
        ax.set_xticks(np.arange(-5*24*60*60, 6*24*60*60+1, 24*60*60))
        ax.set_xticklabels(np.arange(-5*24, 6*24+1, 24))
        ax.grid(which='major', axis='x')
        if i == 0:
            ax.set_title({0:'Negative Comment Ratio',1:'Somewhat Negative Comment Ratio'}[j])
        if i == 5:
            ax.set_xlabel('Hours until policy')
# Show & save
if 'google.colab' not in sys.modules:
    plt.savefig('../../fig/fig_res_d2all.png', dpi=200, bbox_inches='tight')
plt.show()

Comparing linear to quadratic models

In [None]:
# Goodness of fit
t = pd.DataFrame(
    {
        'd1R2a':[m.rsquared_adj for m in d1], 'd2R2a':[m.rsquared_adj for m in d2],
        'd1bic':[m.bic for m in d1], 'd2bic':[m.bic for m in d2],
        'd1aic':[m.aic for m in d1], 'd2aic':[m.aic for m in d2],
    }
)

t.round(4)