## Regression analysis for Bellingcat threads

This notebook estimates a regression model of the effect of different characteristics of Bellingcat's threads on engagement.

Set up environment

In [None]:
%matplotlib inline

# Import libraries
import pandas as pd
import numpy as np
import re
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import tkinter
import datetime
from datetime import datetime as datetime_1

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared VADER analyzer; its polarity_scores() is applied to each thread's processed text further down
analyzer = SentimentIntensityAnalyzer()

from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [None]:
# Seed NumPy's global random number generator
SEED = 42
np.random.seed(SEED)

# Relative path of the directory holding the input spreadsheets
path_data = "../../DataSources/"

In [None]:
# Thread-level data; the first spreadsheet column becomes the index
thread_file = path_data + "bellingcat_grouped_conversation_inclu_warPeriod_Final_lang_mode_thread_mention.xlsx"
df_thread = pd.read_excel(thread_file, index_col=0)

# Follower counts over time
followers_file = path_data + "Followers_final.xlsx"
df_followers = pd.read_excel(followers_file)

In [None]:
# Derive calendar year and month of each thread; these are the keys used
# to attach the monthly follower counts
thread_dates = df_thread["date"].dt
df_thread['year'] = thread_dates.year
df_thread['month'] = thread_dates.month

In [None]:
# Collapse the follower series to one row per calendar month, keeping the
# maximum follower count observed in that month.
# NOTE(review): the "M" frequency alias is deprecated in favour of "ME" in
# recent pandas releases - confirm against the pandas version in use.
monthly_grouper = pd.Grouper(freq="M", key="Date")
df_followers = (
    df_followers[["Date", "followers"]]
    .groupby([monthly_grouper])["followers"]
    .max()
    .reset_index()
)

# Year/month keys matching the ones derived for the threads
follower_dates = df_followers["Date"].dt
df_followers['year'] = follower_dates.year
df_followers['month'] = follower_dates.month

In [None]:
# Attach the monthly follower count to every thread (left join keeps all
# threads; threads in months without follower data get NaN), then discard
# the helper key columns and the follower table's Date column.
threads_with_followers = df_thread.merge(df_followers, how="left", on=["year", "month"])
df_thread = threads_with_followers.drop(columns=['month', 'year', 'Date'])

In [None]:
# Restrict the sample in a single, non-mutating step. The previous version
# called dropna(..., inplace=True) on a frame obtained by boolean indexing,
# i.e. it mutated a slice in place (pandas may emit SettingWithCopyWarning
# for that pattern). The selected rows are the same.
observation_start = datetime.datetime(year=2014, month=7, day=1)

# Only keep english threads
is_english = df_thread['lang'] == "en"
# Drop observations before observation period
in_observation_period = df_thread['date'] >= observation_start

# Drop observations without followers
df_thread = df_thread[is_english & in_observation_period].dropna(subset=['followers'])

In [None]:
# Preprocess text for sentiment analysis

# Stop-word sets already read from disk, keyed by file path, so the file is
# read once instead of once per processed tweet.
_STOP_WORDS_CACHE = {}


def _load_stop_words(path):
    """Return the stop words stored in the file at ``path`` as a frozenset.

    Entries may be separated by commas and/or whitespace. Surrounding double
    quotes and all apostrophes are removed from each entry (the text cleaning
    in ``preprocess_tweets`` strips apostrophes from the tweet as well).
    The parsed set is cached per path.
    """
    if path not in _STOP_WORDS_CACHE:
        # Read-only access and a context manager: the handle is always closed.
        with open(path, "r") as stop_word_file:
            raw_entries = re.split(r"[,\s]+", stop_word_file.read())
        cleaned_entries = (entry.strip('"').replace("'", "") for entry in raw_entries)
        _STOP_WORDS_CACHE[path] = frozenset(entry for entry in cleaned_entries if entry)
    return _STOP_WORDS_CACHE[path]


def preprocess_tweets(text, stop_words_path="Stopwords-en.txt"):
    """Clean a tweet/thread text for sentiment scoring.

    Mentions (``@name``), URLs (``scheme://...``) and every character that is
    not an ASCII letter, digit, space or tab are replaced by a space; the
    remaining whitespace-separated tokens are kept unless they are stop words.

    Parameters
    ----------
    text : str
        Text to clean. Matching against stop words is case-sensitive, so
        callers that want case-insensitive filtering should lower-case first.
    stop_words_path : str, optional
        Path of the stop-word file (default: "Stopwords-en.txt" in the
        current working directory).

    Returns
    -------
    str
        The remaining tokens joined by single spaces (possibly empty).

    Notes
    -----
    Tokens are compared for exact equality with the stop words. The earlier
    implementation tested ``word in str(stop_words)``, a substring test on
    the list's string form, which also removed any token that merely occurs
    inside a longer stop word.
    """
    stop_words = _load_stop_words(stop_words_path)
    # Raw string: avoids invalid-escape warnings for \w, \/ and \S.
    cleaned = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", text)
    return " ".join(word for word in cleaned.split() if word not in stop_words)

def _clean_thread_text(raw_text):
    """Lower-case ``raw_text`` and run it through ``preprocess_tweets``."""
    return preprocess_tweets(raw_text.lower())


df_thread['processed_text'] = df_thread['text'].map(_clean_thread_text)

# Drop rows whose processed tweet is empty (e.g. because the tweet was only a link)
def EmptyRows(dataset):
    """Blank out empty strings and drop rows without processed text, in place.

    Every cell of ``dataset`` equal to "" is replaced with NaN (in all
    columns), then rows whose 'processed_text' is missing are removed.
    ``dataset`` is modified in place and nothing is returned.
    """
    missing = float("NaN")
    dataset.replace(to_replace="", value=missing, inplace=True)
    dataset.dropna(subset=['processed_text'], inplace=True)
    
# Remove threads whose processed text ended up empty (modifies df_thread in place)
EmptyRows(df_thread)

# Compute sentiment of thread: one score mapping per processed text
score_thread = analyzer.polarity_scores
df_thread["polarity"] = df_thread["processed_text"].apply(score_thread)

# Group sentiment (positive vs. negative vs. neutral)
def sentimentPredict(sentiment):
    """Map a polarity-score mapping to a sentiment label.

    ``sentiment`` must provide a 'compound' entry. Returns "positive" when
    the compound score is at least 0.05, "negative" when it is at most
    -0.05, and "neutral" otherwise.
    """
    compound_score = sentiment['compound']
    if compound_score >= 0.05:
        return "positive"
    if compound_score <= -0.05:
        return "negative"
    return "neutral"

# Label each thread from its polarity scores
df_thread["sentiment"] = df_thread["polarity"].apply(sentimentPredict)

# Unstack polarity: expand the per-thread score mappings into one column per
# key. Building the frame once from the list of mappings avoids constructing
# a pandas Series for every row, which `apply(pd.Series)` does.
polarity_columns = pd.DataFrame(df_thread["polarity"].tolist(), index=df_thread.index)
df_thread = pd.concat([df_thread.drop(columns=["polarity"]), polarity_columns], axis=1)

# Threads whose processed text is empty (e.g. only a URL or mention) were
# already dropped by EmptyRows, so no NA masking of the scores is needed here.

# Create numeric sentiment variable (kept as float, as produced by the
# previous label-by-label assignment)
sentiment_codes = {"positive": 1, "negative": -1, "neutral": 0}
df_thread["sentiment_num"] = df_thread["sentiment"].map(sentiment_codes).astype(float)


In [None]:
# Compute engagement: total interactions per thread
interaction_columns = ["likes", "replies", "quotes", "retweets"]
df_thread["engagement"] = df_thread[interaction_columns].sum(axis=1)

# Normalize engagement by follower and tweet count
# NOTE(review): a zero in "followers" or "count" yields inf/NaN here - confirm
# that neither column can be zero.
exposure = df_thread["followers"] * df_thread["count"]
df_thread["norm_engagement"] = df_thread["engagement"] / exposure

In [None]:
# Total normalized engagement per sentiment label
engagement_by_sentiment = df_thread.groupby("sentiment")["norm_engagement"].sum()
engagement_by_sentiment.reset_index()

In [None]:
# Inspect the raw interaction counts next to the derived engagement measures
engagement_columns = [
    "likes", "replies", "quotes", "retweets",
    "engagement", "followers", "count", "norm_engagement",
]
df_thread[engagement_columns]

In [None]:
# List every column name, one per line. The bare `df_thread` expression that
# used to precede the loop displayed nothing (only a cell's last expression
# is rendered), so it has been removed.
for column_name in df_thread.columns:
    print(column_name)

Regression analysis

In [None]:
# Define covariates
covariates = ["urls", "media_photo", "media_videos", "hashtags", "mentions", "compound"]

# Drop rows with missing data in any model variable
all_variables = covariates + ["norm_engagement"]
df_thread = df_thread.dropna(axis=0, subset=all_variables)

# Compute log engagement. log(0) is undefined, so threads with zero
# engagement are assigned the log of the smallest non-zero normalized
# engagement. The zeros are substituted *before* taking the log: calling
# np.log(..., where=mask) without `out=` leaves the masked-out entries
# uninitialized, which is avoided here. Resulting values are unchanged.
has_engagement = df_thread["norm_engagement"] != 0
min_engagement = df_thread.loc[has_engagement, "norm_engagement"].min()
df_thread["log_engagement"] = np.log(
    df_thread["norm_engagement"].where(has_engagement, min_engagement)
)

In [None]:
# Distribution of log engagement
log_engagement = df_thread['log_engagement']

# NOTE(review): these are 8 evenly spaced probabilities between 0.1 and 0.9
# (not deciles), and `quantiles` is not used again in this cell - confirm
# whether it is still needed.
quantile_levels = np.linspace(0.1, 0.9, 8)
quantiles = np.quantile(log_engagement, quantile_levels)

# Histogram with a kernel density estimate overlay
sns.histplot(log_engagement, kde=True, bins=30)

In [None]:
# Standardize independent variables and prepend an intercept column.
# NOTE(review): X is not passed to the OLS models fitted below, which use the
# unstandardized df_thread[covariates] - confirm which design matrix is intended.
scaler = StandardScaler()
standardized_covariates = scaler.fit_transform(df_thread[covariates])
X = sm.add_constant(standardized_covariates)

In [None]:
# Fit linear regression model (log_engagement) on the unstandardized
# covariates plus an intercept, with HC3 robust standard errors
exog_log = sm.add_constant(df_thread[covariates])
model_lm = sm.OLS(endog=df_thread["log_engagement"], exog=exog_log)
results = model_lm.fit(cov_type="HC3")
print(results.summary())

In [None]:
# Fit linear regression model (norm_engagement) on the unstandardized
# covariates plus an intercept, with HC3 robust standard errors
exog_norm = sm.add_constant(df_thread[covariates])
model_lm = sm.OLS(endog=df_thread["norm_engagement"], exog=exog_norm)
results = model_lm.fit(cov_type="HC3")
print(results.summary())