# Twitter Discourse and Emotions Around the Invasion of Ukraine - Companion code
## – A Text Analytics Approach 
### Gabriel Lindelöf


In [None]:
import pandas as pd
from pyarrow import feather
import numpy as np
from bertopic import BERTopic
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
tqdm.pandas() # used for apply with progress bar 


pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)
pd.options.display.float_format = '{:,.6f}'.format

from datetime import datetime

def to_datetime(date):
    '''Convert an ISO 8601 timestamp string to a naive datetime object.

    Parameter: date (str): Timestamp such as '2022-03-01T12:00:00.000Z'.

    Returns: a datetime.datetime without timezone information.

    A trailing 'Z' (UTC designator) is removed before parsing. Strings that do not
    end in 'Z' are parsed unchanged instead of losing their last digit.
    '''
    if date.endswith(('Z', 'z')):
        date = date[:-1] # strip only the UTC designator, never part of the timestamp
    return datetime.fromisoformat(date)

In [None]:
columns = ['author_id', 'text', 'text_clean', 'public_metrics.retweet_count', 'public_metrics.like_count', 'author.public_metrics.followers_count']
df = feather.read_feather('data/ukraine_two_weeks_clean_shuffled_v2.feather', columns = columns) # import DF with relevant columns
len(df)

# Valence and intensity classification

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#nltk.download('vader_lexicon')

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(s):
    '''Score a single tweet with the module-level VADER analyzer.

    Parameter: s (pd.Series): A row exposing the cleaned tweet text as the attribute text_clean.

    Returns: a pd.Series indexed by 'pos', 'neg', 'neu' and 'comp' holding the positive, negative,
    neutral and compound scores returned by the analyzer.
    '''
    scores = analyzer.polarity_scores(s.text_clean)

    # Output label -> key under which the analyzer reports that score.
    key_for = {'pos': 'pos', 'neg': 'neg', 'neu': 'neu', 'comp': 'compound'}

    return pd.Series([scores[key] for key in key_for.values()], index=list(key_for))

In [None]:
df[['pos', 'neg', 'neu', 'comp']] = df.progress_apply(get_sentiment, axis = 1) # Get sentiment of  texts.

In [None]:
def get_senti_categories(df):
    '''Add categorical sentiment variables and the intensity, both derived from the compound score.

    Parameter: df (pd.DataFrame): Must contain the column comp. The frame is modified in place.

    Returns: the same DataFrame with the added columns is_neg, is_pos, is_neu (bool),
    polarity ('negative', 'positive' or 'neutral') and intensity (absolute value of comp).
    The share of each category and the mean compound score are printed as a side effect.
    '''
    comp = df['comp']

    # Look at the compound score to decide category (whole-column comparisons instead of per-row apply).
    df['is_neg'] = comp < 0
    df['is_pos'] = comp > 0
    df['is_neu'] = comp == 0

    # Also add a single summarizing variable for other use cases.
    # Anything that is neither below nor above zero falls through to 'neutral'.
    df['polarity'] = np.select(
        [(comp < 0).to_numpy(), (comp > 0).to_numpy()],
        ['negative', 'positive'],
        default='neutral',
    )

    # Calculate intensity.
    df['intensity'] = comp.abs()

    print('Negative: ', df.is_neg.sum()/len(df))
    print('Positive: ', df.is_pos.sum()/len(df))
    print('Neutral: ', df.is_neu.sum()/len(df))
    print('Mean sentiment: ', df.comp.sum()/len(df))
    return df


In [None]:
# Get categorical variables. get_senti_categories works on the whole DataFrame (it reads the comp column),
# so it is called directly instead of being applied to one row at a time.
df = get_senti_categories(df)

# Sentiment to retweets

In [None]:
columns = ['author_id', 'text', 'text_clean', 'public_metrics.retweet_count', 'public_metrics.like_count', 'author.public_metrics.followers_count']
df = feather.read_feather('data/ukraine_two_weeks_clean_shuffled_v2.feather', columns = columns)

# Add previously calculated sentiment data to DF
x = feather.read_feather('data/vader_sentiment_shuffled_v2.feather') 
df = df.join(x)

In [None]:
df = df.rename(columns = {'public_metrics.retweet_count':'retweets', 'public_metrics.like_count':'likes'}) # easier names

In [None]:
pd.qcut(df['intensity'], q = 4).value_counts() # get quantiles of intensity

### Plot the relationship between intensity and number of retweets

In [None]:
df['bins'] = pd.qcut(df['intensity'], q = 4) # Divide into quantiles for more easily read plot.

# Group by quantile once and reuse the grouping for the mean and its standard error.
retweets_by_bin = df[['bins','retweets']].groupby('bins')
mean_retweets = retweets_by_bin.mean()

# DataFrame.plot opens its own figure, so no separate (and otherwise empty) plt.figure() is created first.
ax = mean_retweets.plot(linestyle='--', marker='o', color='b', yerr = retweets_by_bin.sem())


font = {'size' : 15}

plt.rc('font', **font)

fig = ax.figure
fig.set_size_inches(10, 8)


ax.set_ylabel('Retweets (mean)')
ax.set_xlabel('Intensity (quantile)')
ax.get_legend().remove()

# Put exactly one tick on every quantile instead of relabelling whatever ticks were chosen automatically.
# NOTE(review): assumes pandas draws the non-numeric bins at x = 0, 1, 2, ... - confirm on the rendered figure.
ax.set_xticks(range(len(mean_retweets)))
ax.set_xticklabels(['Q{}'.format(i + 1) for i in range(len(mean_retweets))])

fig.tight_layout() # after all labels are in place so that none is cut off
fig.savefig('plots/intensity_retweets.png', dpi=300)
plt.show()

### Linear regression intensity & retweets

In [None]:
from scipy import stats
X = df.intensity
y = df.retweets


slope, intercept, r, p, std_err = stats.linregress(X, y) # Run linear regression.
print("slope: ", slope)
print("intercept: ", intercept)
print("r: ", r)
print("r2: ", '{:f}'.format(r**2))
print("p: ", p)
print("std_err: ", std_err)


# Verify results using other package.
# The check has to use the same dependent variable (retweets) as the regression above;
# regressing on likes would describe a different model and could not confirm the numbers.
import statsmodels.api as sm
X = df.intensity
y = df.retweets


X = sm.add_constant(X) # statsmodels does not add an intercept on its own
model = sm.OLS(y,X)
result = model.fit()
print(result.summary())

# Sentiment in topics

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic.backend._utils import select_backend
from bertopic import BERTopic
import pickle

model_name = 'model_1mil_v13_14t'
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

topic_model = BERTopic.load('models/{}'.format(model_name), embedding_model=sentence_model) # Load topic model.

# Load topic labels. The with-block closes the file handle again.
# NOTE: unpickling can execute arbitrary code, so only load pickle files produced by this project.
with open('models/{}_all.pickle'.format(model_name), 'rb') as topic_file:
    topics = pickle.load(topic_file)

# add topic info to DF
df['topic'] = topics
# Look up the first word of each topic once per distinct topic instead of once per tweet.
first_word = {topic: topic_model.get_topic(topic)[0][0] for topic in df['topic'].unique()}
df['topic_name'] = df.topic.map(first_word)

In [None]:
# Custom topic names to be used in figures. 
topic_names = {
-1:'The invasion',
0:'NATO',
1:'Foreign\nstudents',
2:'Refugees &\nracism',
3:'Nazism',
4:'China',
5:'Cryptocurrency',
6:'Energy',
7:'Airplanes',
8:'Other\nconflicts',
9:'Attacked\ncities',
10:'Nuclear\nplants',
11:'Biolabs',
12:'COVID-19',
13:'National\nsymbols',
}

### Compare polarity between topics

In [None]:
mean_count = df[['topic', 'is_pos', 'is_neg', 'is_neu']].groupby('topic').sum() # Get number of tweets with each polarity by topic
mean_count['total'] = mean_count.sum(axis=1) # Add total column

# Convert to fractions of each polarity by topic.
# Every fraction gets its own *_perc column, so the raw counts (including is_neu) stay available.
mean_count['is_pos_perc'] = mean_count.is_pos / mean_count.total
mean_count['is_neg_perc'] = mean_count.is_neg / mean_count.total
mean_count['is_neu_perc'] = mean_count.is_neu / mean_count.total

# Add custom topic names. After the groupby the topic id is the index, so it can be looked up directly.
mean_count['topic_name'] = [topic_names[topic] for topic in mean_count.index]

In [None]:
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt
import seaborn as sns

@ticker.FuncFormatter
def major_formatter(x, pos):
    '''Tick formatter that prints the tick value rounded to one decimal without its sign.

    Negative fractions are drawn below the axis, but should still be labelled as positive numbers.
    abs() is applied after rounding so that a value rounding to negative zero prints as '0.0'.
    '''
    return str(abs(round(x, 1)))

palette = sns.color_palette("deep")

sns.set_style("whitegrid")

font = {'size'   : 19}

plt.rc('font', **font)


positive_perc = mean_count.is_pos_perc.tolist()
negative_perc = [x * -1 for x in mean_count.is_neg_perc.tolist()] # negative fractions point downwards

fig, ax = plt.subplots()
fig.set_size_inches(30, 10)
ax.bar(mean_count.topic_name, positive_perc, width=0.9, color=palette[2])
ax.bar(mean_count.topic_name, negative_perc, width=0.9, color=palette[3])

#ax.set_title('Percentage of positive versus negative tweets in for each topic', fontsize = 30, pad = 20)
ax.set_xlabel('Topic', fontsize = 30, labelpad=20)
ax.set_ylabel('Fraction of tweets',fontsize = 30, labelpad=20)

ax.grid(True, axis = 'y')
ax.set_ylim([-0.8, 0.8])
ax.yaxis.set_major_formatter(major_formatter)
fig.tight_layout() # last, so the layout accounts for the final limits and tick labels

# Explicit extension, consistent with the other plots (matplotlib would otherwise append its default format).
fig.savefig('plots/polarity_topics.png', dpi=300)
fig.show()

In [None]:
print('Total % positive: ', df.is_pos.sum()/len(df))
print('Total % negative: ', df.is_neg.sum()/len(df))
print('Total % neutral: ', df.is_neu.sum()/len(df))

In [None]:
mean_count.sort_values(by = 'is_pos_perc', ascending = False).round(3) # Compare most positive, changed to neg or neutral for other comparisons. 


In [None]:
# Compare most/least polarized topics 
mean_count['diff'] = mean_count.is_neg_perc - mean_count.is_pos_perc 
mean_count.sort_values(by = 'diff', ascending = False).round(3) 

### Compare intensity between topics

In [None]:
# Mean intensity by topic; the groupby leaves the topic id as the index.
mean_intensity = df.groupby('topic')[['intensity']].mean()
# Get topic names by looking up every topic id of the index in the custom name table.
mean_intensity['topic_name'] = [topic_names[topic] for topic in mean_intensity.index]

In [None]:
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt

font = {'size'   : 19}

plt.rc('font', **font)

# One bar per topic showing its mean intensity.
# palette and major_formatter are the objects defined for the polarity plot earlier in the notebook.
fig, ax = plt.subplots()
fig.set_size_inches(30, 10)
ax.bar(mean_intensity.topic_name, mean_intensity.intensity, width=0.9, color=palette[3])


ax.set_xlabel('Topic', fontsize = 30, labelpad=20)
ax.set_ylabel('Average Intensity',fontsize = 30, labelpad=20)

ax.grid(True, axis = 'y')

ax.yaxis.set_major_formatter(major_formatter)
fig.tight_layout() # after labels and formatter are set


fig.savefig('plots/intensity_topics.png', dpi=300)
fig.show()

In [None]:
 # Create table of intensity and number of documents in topic. 
intensity_freq = df[['topic', 'intensity']].groupby('topic').mean().join(df.groupby('topic').size().rename('freq'))

In [None]:
# Test difference using linear regression.
import statsmodels.api as sm
X = intensity_freq.intensity
y = intensity_freq.freq

X = sm.add_constant(X)

model = sm.OLS(y, X)
result = model.fit()
print(result.summary())

### Tokenize tweets (used in estimate_primary_emotions.py)

In [None]:
from nltk.tokenize import TweetTokenizer
# Tokenizer made for tweets, configured not to preserve case and to strip handles.
tt = TweetTokenizer(strip_handles=True, preserve_case=False)
# Split every cleaned tweet into tokens to be able to count individual words.
df['tok'] = df['text_clean'].progress_apply(tt.tokenize)
# Persist the tokenized frame.
df.to_feather('data/ukraine_two_weeks_clean_shuffled_v2_tok.feather')

In [None]:
len(stimuli)

## Contagion of emotions - machine learning approach

In [None]:
response = feather.read_feather('data/response_10000_roberta.feather') # Load estimated emotions of tweets calculated in estimate_emotions_roberta.py
stimuli = feather.read_feather('data/roberta_emotions_stimuli_0_1000.feather') # The same for the stimuli tweets
response['created_at'] = response['created_at'].apply(to_datetime)

In [None]:
# Classify tweet as the emotion with dominant score. 
stimuli['emotion'] = stimuli[['anger', 'joy', 'optimism','sadness']].idxmax(axis = 1)
response['emotion'] = response[['anger', 'joy', 'optimism','sadness']].idxmax(axis = 1)

In [None]:
# Merge into stimuli and response into a single DF for calculations. 
stimuli = pd.merge(stimuli, response[['author_id', 'created_at']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp'])

In [None]:
# How long before the response was each stimuli tweet created?
# time_diff is response time minus stimuli time; diff_mins is the same difference in minutes, rounded to two decimals.
stimuli['time_diff'] = stimuli.created_at_resp - stimuli.created_at 
stimuli['diff_mins'] = stimuli.time_diff.apply(lambda x: round(x.total_seconds() / 60, 2)) 

In [None]:
print(len(stimuli))
stimuli = stimuli[stimuli.diff_mins < 60]# Remove tweets made more than an hour before response. 
print(len(stimuli))

In [None]:
# Add count of how many stimuli tweets each response has
stimuli = pd.merge(stimuli, stimuli.groupby('followed_by').size().rename("followed_by_stim_count"), on="followed_by", how="left", suffixes = ['', '_count'])

In [None]:
stimuli = stimuli[stimuli.followed_by_stim_count >= 20] # Remove those with fewer than 20 stimuli 

In [None]:
len(stimuli)

In [None]:
def sample_emotions(n):
    '''Draw n stimuli tweets at random (with replacement) and report the emotion mix of that draw.

    Parameter: n (int): Number of tweets to sample from the module-level stimuli DataFrame.

    Returns: a pd.Series with the fraction of the sample labelled 'anger', 'joy', 'optimism' and
    'sadness' (0 for an emotion that was not drawn) plus 'total', the sum of all fractions.
    '''
    labels = ['anger', 'joy', 'optimism', 'sadness']

    # Fraction of each emotion in a random sample of n.
    shares = stimuli.emotion.sample(n, replace = True).value_counts(normalize=True)

    # An emotion that was never drawn is missing from shares, so it falls back to 0.
    values = [shares[label] if label in shares.keys() else 0 for label in labels]
    values.append(shares.sum())

    return pd.Series(values, index = labels + ['total'])
    


In [None]:
# Samples n random tweets for each user based on the number of tweets they had seen, and gets the fraction of each emotion in that sample.
# Used to create a baseline simulating that users had seen tweets with random emotions, nullifying contagion.
# value_counts() already gives one entry per user holding the number of stimuli tweets, so it is used directly;
# wrapping it in a DataFrame and selecting a column by name only works when that column happens to be called followed_by.
x = stimuli.followed_by.value_counts().apply(sample_emotions)

In [None]:
x = feather.read_feather('data/sample_emotions_1000.feather') # Load previous sampling to have consistent results of analysis. 

In [None]:
print("Baseline mean: ")
baseline = x[['anger', 'joy', 'optimism','sadness']].mean() # Get the mean of all response tweets. 
baseline

In [None]:
print("Baseline standard error: ")
err = x[['anger', 'joy', 'optimism','sadness']].sem() # Get standard error for baseline
err

In [None]:
# Create DF containing stimuli and response tweets
stimuli = pd.merge(stimuli, response[['author_id', 'emotion']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp'])

In [None]:
resp_stimuli

In [None]:
# Create DF containing the response tweets and fractions of tweets in stimuli with each emotion
resp_stimuli = pd.DataFrame(stimuli[['followed_by', 'emotion', 'emotion_resp']].groupby('followed_by').emotion.value_counts(normalize = True)).rename(columns = {'emotion':'em'}).reset_index().pivot(index = 'followed_by'
,columns='emotion', values='em').fillna(0)

# Add info about response tweets emotion to DF.
resp_stimuli = pd.merge(resp_stimuli, response[['author_id', 'emotion']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp']).rename(columns = {'emotion':'emotion_resp'})

# Get mean stimuli for each response emotion
outcome_mean = resp_stimuli.groupby('emotion_resp').mean().drop('author_id', axis = 1)
outcome_error = resp_stimuli.groupby('emotion_resp').sem().drop('author_id', axis = 1)

In [None]:
# Create DF with standard error for all groups.
# outcome_error comes from groupby('emotion_resp').sem(): one row per response emotion and one column per
# stimuli emotion - the same orientation as props. It must therefore NOT be transposed; transposing would
# swap stimuli and response, and the plot would pair each bar with the error of a different group.
# Only the row labels are capitalised to match the labels used in props.
outcome_error.index = outcome_error.index.str.capitalize()

error = pd.DataFrame(err, columns = ['Baseline'])
error.index.name = 'response_emotion_error'

error = error.transpose() # baseline becomes a single row with one column per stimuli emotion
error = pd.concat([error, outcome_error]) # DataFrame.append is deprecated in favour of pd.concat
error


In [None]:
# Create DF with baseline proportions.
props = pd.DataFrame(baseline, columns = ['Baseline'])
props.index.name = 'response_emotion'
props = props.transpose()

In [None]:
# Create DataFrames with the actual proportions for each emotion.
def _response_row(emotion, label):
    '''Return the mean stimuli proportions of one response group as a single-row DataFrame labelled label.'''
    row = outcome_mean.loc[emotion].to_frame().transpose()
    row.index = [label]
    row.index.name = 'response_polarity'
    return row

anger = _response_row('anger', 'Anger')
joy = _response_row('joy', 'Joy')
optimism = _response_row('optimism', 'Optimism') # must read the 'optimism' row, not 'anger'
sadness = _response_row('sadness', 'Sadness')

In [None]:
# Combine all proportions to one DF for visualizations.
# pd.concat stacks the rows in the given order; DataFrame.append is deprecated and removed in pandas 2.
props = pd.concat([props, anger, joy, optimism, sadness])

In [None]:
# Convert to percent.
props = props*100 
error = error*100

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
@ticker.FuncFormatter
def major_formatter(x, pos):
    '''Tick formatter that prints the tick value rounded to one decimal without its sign.'''
    return str(abs(round(x, 1)))

palette = sns.color_palette("deep")

sns.set_style("whitegrid")


width = 0.5 # bar width

fig, ax = plt.subplots()
fig.set_size_inches(12, 8)

# Add bars for each emotion, starting at the height of the previous bar.
ax.bar(props.index, props.anger, width, label = 'Anger', color = palette[3], yerr = error.anger, error_kw=dict(elinewidth=15))
ax.bar(props.index, props.sadness, width, label = 'Sadness', bottom = props.anger, color = palette[0], yerr = error.sadness, error_kw=dict(elinewidth=15))
ax.bar(props.index, props.optimism, width, label = 'Optimism', bottom = props.anger + props.sadness, color = palette[1], yerr = error.optimism, error_kw=dict(elinewidth=15))
ax.bar(props.index, props.joy, width, label = 'Joy', bottom = props.anger + props.sadness + props.optimism, color = palette[2])


def _stack_labels(column):
    '''Return one "value ± standard error" label per row of props for the given stimuli emotion.

    Values and errors are paired by position, which avoids integer lookups on a label-based index.
    '''
    return ['{}\n±\n{}'.format(round(value, 2), round(spread, 2)) for value, spread in zip(props[column], error[column])]


# Create labels with standard error for each group and stimuli emotion.
# The bar containers are taken from positions 1, 3, 5 and 6 of ax.containers
# (presumably each bar drawn with yerr is preceded by its error-bar container).
anglab = _stack_labels('anger')
ax.bar_label(ax.containers[1], labels = anglab, label_type='center')

sadlab = _stack_labels('sadness')
ax.bar_label(ax.containers[3], labels = sadlab, label_type='center')

optlab = _stack_labels('optimism')
ax.bar_label(ax.containers[5], labels = optlab, label_type='center')

joylab = _stack_labels('joy')
ax.bar_label(ax.containers[6], labels = joylab, label_type='center')



plt.ylim((0, 100))
font = {'size' : 13}

plt.rc('font', **font)

ax.set_ylabel('Percent tweets')
ax.set_xlabel('Response emotion') # the x axis holds the response groups; a second set_ylabel would overwrite the y label

# Reverse the legend entries so they appear in the same top-to-bottom order as the stacked segments.
handles, labels = ax.get_legend_handles_labels()
lgd = ax.legend(handles[::-1], labels[::-1],title = 'Stimuli emotion %', bbox_to_anchor=(1.01, 1), frameon=False)


fig.savefig('plots/emotion_contagion.png', dpi=300, bbox_extra_artists=(lgd,), bbox_inches='tight')
plt.show()

In [None]:
resp_stimuli

In [None]:
from pingouin import mwu

# Create DF with the stimuli tweets proportions for each response tweet as well as baseline.
emotion_shares = pd.DataFrame(stimuli[['followed_by', 'emotion', 'emotion_resp']].groupby('followed_by').emotion.value_counts(normalize = True))
resp_stimuli = emotion_shares.rename(columns = {'emotion':'em'}).reset_index().pivot(index = 'followed_by', columns='emotion', values='em').fillna(0)
resp_stimuli = pd.merge(resp_stimuli, response[['author_id', 'emotion']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp']).rename(columns = {'emotion':'emotion_resp'})

bl = x[['anger', 'joy', 'optimism', 'sadness']].copy() # copy, so the new column is not written into a slice of x
bl['emotion_resp'] = 'baseline'
resp_stimuli = pd.concat([resp_stimuli, bl]) # DataFrame.append is deprecated in favour of pd.concat

# Group once and reuse the groups inside the loops.
by_response = resp_stimuli.groupby('emotion_resp')
baseline_group = by_response.get_group('baseline')

sig_results = 0
for group in ['anger', 'joy', 'optimism', 'sadness']:
    group_frame = by_response.get_group(group)
    for em in ['anger', 'joy', 'optimism', 'sadness']:
        a = group_frame[em] # Get distribution for group to be compared
        b = baseline_group[em] # Get corresponding baseline

        mwu_result = mwu(a, b, alternative='two-sided') # Test significance
        # .iloc[0] reads the first (only) row by position, independent of how the result is indexed.
        if mwu_result['p-val'].iloc[0] < 0.05:
            print("\n\n*The {} group has a significantly different distribution of {} than baseline.".format(group, em))
            sig_results += 1
        else:
            print("\n\nThe {} group does NOT have a significantly different distribution of {} than baseline.".format(group, em))
        print(mwu_result)
print("Significant results: ", sig_results)
        
            
        
       
        

## Contagion of emotions - polarity

In [None]:
stimuli[['pos', 'neg', 'neu', 'comp']] = stimuli.progress_apply(get_sentiment, axis = 1) # Get sentiment for all stimuli tweets.

In [None]:
response[['pos', 'neg', 'neu', 'comp']] = response.progress_apply(get_sentiment, axis = 1) # Get sentiment for all response tweets.

In [None]:
# Also get categorical variables
response = get_senti_categories(response) 
stimuli = get_senti_categories(stimuli)

In [None]:
def sample_polarity(n):
    '''Draw n stimuli tweets at random (with replacement) and report the polarity mix of that draw.

    Parameter: n (int): Number of tweets to sample from the module-level stimuli DataFrame.

    Returns: a pd.Series with the fraction of the sample that is 'positive', 'negative' and
    'neutral' (0 for a polarity that was not drawn).
    '''
    labels = ['positive', 'negative', 'neutral']

    shares = stimuli.polarity.sample(n, replace = True).value_counts(normalize=True)

    # A polarity that was never drawn is missing from shares, so it falls back to 0.
    fractions = [shares[label] if label in shares.keys() else 0 for label in labels]

    return pd.Series(fractions, index = labels)
    


In [None]:
#x = pd.DataFrame(stimuli.followed_by.value_counts()).followed_by.apply(sample_polarity) # Run sampling to get baseline

In [None]:
x = feather.read_feather('data/sample_polarity_1000.feather') # Load previously taken sample for consistent results. 

In [None]:
print("Baseline mean: ")
baseline = x[['positive', 'negative', 'neutral']].mean() # Get baseline, mean of all tweets. 
baseline

In [None]:
print("Baseline standard error: ")
err = x[['positive', 'negative', 'neutral']].sem() # Get standard error for baseline
err

In [None]:
# Create DF containing stimuli and response tweets
stimuli = pd.merge(stimuli, response[['author_id', 'polarity']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp'])

In [None]:
# Create DF containing the response tweets and fractions of tweets in stimuli with each polarity. 
resp_stimuli = pd.DataFrame(stimuli[['followed_by', 'polarity', 'polarity_resp']].groupby('followed_by').polarity.value_counts(normalize = True)).rename(columns = {'polarity':'pol'}).reset_index().pivot(index = 'followed_by'
,columns='polarity', values='pol').fillna(0)

# Add info about response tweets own polarity to DF.
resp_stimuli = pd.merge(resp_stimuli, response[['author_id', 'polarity']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp']).rename(columns = {'polarity':'polarity_resp'})

# Get mean stimuli for each response polarity
outcome_mean = resp_stimuli.groupby('polarity_resp').mean().drop('author_id', axis = 1)
outcome_error = resp_stimuli.groupby('polarity_resp').sem().drop('author_id', axis = 1)

In [None]:
# Create DF containing the standard error for baseline and all response groups.
# outcome_error comes from groupby('polarity_resp').sem(): one row per response polarity and one column per
# stimuli polarity - the same orientation as props. It must therefore NOT be transposed; transposing would
# swap stimuli and response, and the plot would pair each bar with the error of a different group.
# Only the row labels are capitalised to match the labels used in props.
outcome_error.index = outcome_error.index.str.capitalize()
error = pd.DataFrame(err, columns = ['Baseline'])
error.index.name = 'response_polarity_error'
error = error.transpose() # baseline becomes a single row with one column per stimuli polarity
error = pd.concat([error, outcome_error]) # DataFrame.append is deprecated in favour of pd.concat
error

In [None]:
# Create DF containing baseline proportions
props = pd.DataFrame(baseline, columns = ['Baseline'])
props.index.name = 'response_polarity'
props = props.transpose()

In [None]:
# Create a DF for each response group containing the proportions of stimuli.
neg = outcome_mean.loc['negative'].to_frame().transpose()
neg.index = ['Negative']
neg.index.name = 'response_polarity'

neu = outcome_mean.loc['neutral'].to_frame().transpose()
neu.index = ['Neutral']
neu.index.name = 'response_polarity'

pos = outcome_mean.loc['positive'].to_frame().transpose()
pos.index = ['Positive']
pos.index.name = 'response_polarity'

In [None]:
# Combine into a single DF for visualization.
# pd.concat stacks the rows in the given order; DataFrame.append is deprecated and removed in pandas 2.
props = pd.concat([props, neg, neu, pos])

In [None]:
# Convert to percent. 
props = props*100
error = error*100

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
palette = sns.color_palette("deep")
@ticker.FuncFormatter
def major_formatter(x, pos):
    '''Tick formatter that prints the tick value rounded to one decimal without its sign.'''
    return str(abs(round(x, 1)))



width = 0.5 # bar width

fig, ax = plt.subplots()
fig.set_size_inches(15, 10)

# Add bars for each group, start on top of previous groups bar.
ax.bar(props.index, props.positive, width, bottom = (props.neutral + props.negative), label = 'Positive', color = palette[2])#, yerr = pos_error
ax.bar(props.index, props.neutral, width, bottom = props.negative, label = 'Neutral', color = palette[7], yerr = error.neutral, error_kw=dict(elinewidth=15))
ax.bar(props.index, props.negative, width, label = 'Negative', color = palette[3], yerr = error.negative, error_kw=dict(elinewidth=15))


ax.grid(True, axis = 'y', which = 'both')
ax.yaxis.set_major_formatter(major_formatter)


def _share_labels(column):
    '''Return one "value ± standard error" label per row of props for the given stimuli polarity.

    Values and errors are paired by position, which avoids integer lookups on a label-based index.
    '''
    return ['{}\n±\n{}'.format(round(value, 2), round(spread, 2)) for value, spread in zip(props[column], error[column])]


# Add labels indicating standard error.
# The bar containers are taken from positions 0, 2 and 4 of ax.containers
# (presumably each bar drawn with yerr is preceded by its error-bar container).
poslab = _share_labels('positive')
ax.bar_label(ax.containers[0], labels = poslab, label_type='center')

neulab = _share_labels('neutral')
ax.bar_label(ax.containers[2], labels = neulab, label_type='center')

neglab = _share_labels('negative')
ax.bar_label(ax.containers[4], labels = neglab, label_type='center')


plt.ylim((0, 100))


font = {'size' : 15}

plt.rc('font', **font)

ax.set_ylabel('Percent tweets')
ax.set_xlabel('Response polarity')
lgd = ax.legend(title = 'Stimuli polarity %', loc = 'upper right', bbox_to_anchor=(1.2, 1), frameon=False)
fig.savefig('plots/polarity_contagion.png', dpi=300, bbox_extra_artists=(lgd,), bbox_inches='tight')
plt.show()

In [None]:
# Create DF containing each response tweet's polarity and the polarity distribution of its stimuli, plus the baseline.
polarity_shares = pd.DataFrame(stimuli[['followed_by', 'polarity', 'polarity_resp']].groupby('followed_by').polarity.value_counts(normalize = True))
resp_stimuli = polarity_shares.rename(columns = {'polarity':'pol'}).reset_index().pivot(index = 'followed_by', columns='polarity', values='pol').fillna(0)
resp_stimuli = pd.merge(resp_stimuli, response[['author_id', 'polarity']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp']).rename(columns = {'polarity':'polarity_resp'})
bl = x[['positive', 'negative', 'neutral']].copy() # copy, so the new column is not written into a slice of x
bl['polarity_resp'] = 'baseline'
resp_stimuli = pd.concat([resp_stimuli, bl]) # DataFrame.append is deprecated in favour of pd.concat

In [None]:
# Compare distributions visually. 
resp_stimuli.groupby('polarity_resp').get_group('negative').negative.plot.hist() 
resp_stimuli.groupby('polarity_resp').get_group('positive').negative.plot.hist()
resp_stimuli.groupby('polarity_resp').get_group('baseline').negative.plot.hist()

In [None]:
from scipy import stats
stats.normaltest(resp_stimuli.groupby('polarity_resp').get_group('baseline').negative) # Test if normal distribution, was done for all groups and polarities.

In [None]:
# Test significance to baseline for each group and polarity.
# NOTE(review): this test is one-sided (alternative='less') although the printed messages speak of a
# "different" distribution and the emotion comparison uses 'two-sided' - confirm which alternative is intended.
by_response = resp_stimuli.groupby('polarity_resp') # group once, reuse inside the loops
baseline_group = by_response.get_group('baseline')

sig_results = 0
for group in ['positive', 'negative', 'neutral']:
    group_frame = by_response.get_group(group)

    for em in ['positive', 'negative', 'neutral']:
        a = group_frame[em] # To be tested.
        b = baseline_group[em] # Baseline for corresponding polarity.

        mwu_result = mwu(a, b, alternative='less')
        # .iloc[0] reads the first (only) row by position, independent of how the result is indexed.
        if mwu_result['p-val'].iloc[0] < 0.05:
            print("\n\n*The {} group has a significantly different distribution of {} than baseline.".format(group, em))
            sig_results += 1
        else:
            print("\n\nThe {} group does NOT have a significantly different distribution of {} than baseline.".format(group, em))
        print(mwu_result)
print("Significant results: ", sig_results)
        
            
        
       
        

In [None]:
# Compare the groups using boxplots, not used in final paper.
import seaborn as sns
sns.set_theme(style="whitegrid")

# One notched boxplot per stimuli polarity, each on its own freshly created figure.
for stimuli_polarity in ['negative', 'positive', 'neutral']:
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 10)
    ax = sns.boxplot(data = resp_stimuli, x = 'polarity_resp', y = stimuli_polarity, notch = True)
    fig.show()

In [None]:
# Rebuild the DF with each response tweet's emotion and the emotion distribution of its stimuli.
emotion_shares = pd.DataFrame(stimuli[['followed_by', 'emotion', 'emotion_resp']].groupby('followed_by').emotion.value_counts(normalize = True))
resp_stimuli = emotion_shares.rename(columns = {'emotion':'em'}).reset_index().pivot(index = 'followed_by', columns='emotion', values='em').fillna(0)

resp_stimuli = pd.merge(resp_stimuli, response[['author_id', 'emotion']], left_on="followed_by", how="left", right_on = 'author_id', suffixes = ['', '_resp']).rename(columns = {'emotion':'emotion_resp'})

# x was reassigned to the polarity sample further up in the notebook and no longer has the emotion columns,
# so the stored emotion baseline is loaded again explicitly.
bl = feather.read_feather('data/sample_emotions_1000.feather')[['anger', 'joy', 'optimism', 'sadness']].copy()
bl['emotion_resp'] = 'baseline'
resp_stimuli = pd.concat([resp_stimuli, bl]) # DataFrame.append is deprecated in favour of pd.concat

In [None]:
import seaborn as sns
sns.set_theme(style="whitegrid")

# One notched boxplot per stimuli emotion, each on its own freshly created figure.
for stimuli_emotion in ['anger', 'sadness', 'optimism', 'joy']:
    fig, ax = plt.subplots()
    fig.set_size_inches(15, 10)
    ax = sns.boxplot(data = resp_stimuli, x = 'emotion_resp', y = stimuli_emotion, notch = True)
    fig.show()

# Emotions in topics - lexical approach

In [None]:
pe = feather.read_feather('data/ukraine_two_weeks_clean_shuffled_v2_8emotions.feather') # Emotions labeled using estimate_primary_emotions.py
df = df.join(pe)

In [None]:
mean_words_topic = df[['topic', 'trust', 'fear', 'sadness', 'anger', 'surprise', 'disgust', 'joy', 'anticipation']].groupby('topic').mean() # Get mean count of words per topic.

In [None]:
# Convert to fractions, then multiply by 100 to get percentages. 
mean_words_topic[['trust', 'fear', 'sadness', 'anger', 'surprise', 'disgust', 'joy', 'anticipation']] = mean_words_topic[[
                'trust', 'fear', 'sadness', 'anger', 'surprise', 'disgust', 'joy', 'anticipation']].apply(lambda x: (x/x.sum()) *100, axis=1)

In [None]:
# Add topic names. 
mean_words_topic.reset_index(inplace = True)
mean_words_topic['topic_name'] = mean_words_topic.topic.apply(lambda x: topic_names[x]) 
mean_words_topic.set_index('topic', inplace = True)

In [None]:
# Get baseline, mean of all topics for each emotion.
# numeric_only=True leaves the text column topic_name out of the mean explicitly instead of relying on
# pandas to drop it silently (newer pandas versions raise an error for it).
em_bl = pd.DataFrame(mean_words_topic.mean(numeric_only=True)).transpose()
em_bl['topic_name'] = 'baseline'
em_bl.index = [-2] # an id below every real topic id, so the baseline sorts first
mean_words_topic = pd.concat([mean_words_topic, em_bl]).sort_index() # DataFrame.append is deprecated in favour of pd.concat

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
palette = sns.color_palette("deep")

 
# Stacked bar chart: one bar per row of mean_words_topic, one stacked segment per emotion.
fig, ax = plt.subplots()
fig.set_size_inches(30, 15)



width = 0.75 # Bar width 
bottom_total = 0 # Bottom bar location; becomes a running per-row total once the first emotion is added


# Plot bars on top of each other with set color and label.
# em_col maps every emotion to its position in the seaborn palette.
em_col = {'trust':1, 'fear':4, 'sadness':0, 'anger':3, 'surprise':9, 'disgust':5, 'joy':2, 'anticipation':8}
for em in ['fear', 'trust', 'anger', 'anticipation', 'sadness', 'joy', 'surprise', 'disgust']:#[::-1]:
    ax.bar(mean_words_topic.topic_name, mean_words_topic[em], width, label = em, color = palette[em_col[em]], bottom = bottom_total)
    bottom_total +=  mean_words_topic[em]
    
# Reverse handles and labels so the legend reads in the opposite order of plotting.
handles, labels = ax.get_legend_handles_labels()
lgd = ax.legend(handles[::-1], labels[::-1],title = 'Emotion', bbox_to_anchor=(1.01, 1), frameon=False, fontsize = 25)
lgd.get_title().set_fontsize(25)

#ylabels = ['{:.0f}'.format(x) + 'k' for x in ax.get_yticks()/1000]

# Collect the patch positions 0, n, 2n, ... with n = len(mean_words_topic): the first bar of each of the
# eight emotion series (presumably the baseline row, which carries index -2 - verify against the sort order).
baseline_hatches = []
pos = 0
for i in range(8): # Prepares hatch locations for baseline. 
    baseline_hatches.append(pos)
    pos += len(mean_words_topic)
    
bars = ax.patches
for bar in baseline_hatches: # Add hatches. 
    bars[bar].set_hatch('/')

# Add difference from baseline as labels. 
# For every emotion the value at index label -2 is subtracted from all rows. Non-negative differences are
# written as '+' above the rounded value; negative ones get a '━' line and the rounded value with its first
# character (the minus sign) cut off. The inner loop walks the index labels -2 ... len-3.
for i, em in enumerate(['fear', 'trust', 'anger', 'anticipation', 'sadness', 'joy', 'surprise', 'disgust']):
    em_diffs = mean_words_topic[em] - mean_words_topic[em][-2] 
    lab = [('+\n{}'.format(round(em_diffs[i], 1))) if em_diffs[i] >= 0 else '━\n{}'.format(str(round(em_diffs[i], 1))[1:]) for i in range(-2, len(em_diffs)-2)]
    ax.bar_label(ax.containers[i], labels = lab, label_type='center', fontsize = 16, weight = 'bold')

ax.set_ylabel('Average %', fontsize = 30, labelpad=20)
ax.set_xlabel('Topics', fontsize = 30, labelpad=20)
sns.set(font_scale = 1.5)
plt.xlim([-1,len(mean_words_topic.topic_name)])
plt.ylim(0,100)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Shrink one specific label by its position in ax.texts.
# NOTE(review): which label sits at position 119 depends on the number of rows - confirm it is still the intended one.
ax.texts[119].set_fontsize(10)

plt.show()

# bbox_extra_artists / bbox_inches='tight' are passed so the legend placed outside the axes is included in the file.
fig.savefig('plots/primary_emotions_topics.png', dpi=300, bbox_extra_artists=(lgd,), bbox_inches='tight')

## Emotions in topics - machine learning approach

In [None]:
# Get tweets labeled in estimate_emotions_roberta.py
topic_sample = feather.read_feather(
    'data/sample_topic_emotions_5000_each_14t.feather'
)
topic_sample.shape[0]  # Number of rows in the sample

In [None]:
# Get mean percentage of each emotion in topics.
# numeric_only=True restricts the average to numeric columns: pandas >= 2.0
# raises on non-numeric columns instead of silently dropping them.
# NOTE(review): assumes topic_sample may carry non-numeric columns — confirm.
mean_em = topic_sample.groupby('topic').mean(numeric_only=True) * 100

In [None]:
# Add topic names: look up each topic id of the index in topic_names.
mean_em['topic_name'] = [topic_names[topic_id] for topic_id in mean_em.index]

In [None]:
# Create a baseline representing mean emotion of all topics.
# numeric_only=True skips the string column `topic_name`, which cannot be
# averaged (pandas >= 2.0 raises instead of silently dropping it).
em_bl = mean_em.mean(numeric_only=True).to_frame().T
em_bl['topic_name'] = 'baseline'
# Label the baseline row -2; presumably chosen so it sorts ahead of every
# topic id — verify against the topic ids.
em_bl.index = [-2]
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
mean_em = pd.concat([mean_em, em_bl]).sort_index()


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
palette = sns.color_palette("deep")

# Stacked bar chart: one bar per topic (plus the baseline), one segment per
# emotion, each segment labeled with its difference from the baseline.
fig, ax = plt.subplots()
fig.set_size_inches(30, 15)

width = 0.75  # Bar width
bottom_total = 0  # Running top of the stack; becomes a Series after the first emotion

# Stacking order (bottom to top); shared by the bars and their labels.
em_order = ['anger', 'sadness', 'optimism', 'joy']
# Index into the seaborn "deep" palette used for each emotion.
em_col = {'anger': 3, 'sadness': 0, 'optimism': 8, 'joy': 2}

# Place bars on top of each other, colored and labeled with emotions.
for em in em_order:
    ax.bar(mean_em.topic_name, mean_em[em], width, label=em,
           color=palette[em_col[em]], bottom=bottom_total)
    # Rebind instead of `+=` so the Series already handed to ax.bar is never mutated.
    bottom_total = bottom_total + mean_em[em]

# Reverse the legend so its order matches the visual stacking (top segment first).
handles, labels = ax.get_legend_handles_labels()
lgd = ax.legend(handles[::-1], labels[::-1], title='Emotion', bbox_to_anchor=(1.01, 1), frameon=False, fontsize=25)
lgd.get_title().set_fontsize(25)

# Hatch every segment of the baseline bar. The baseline is the row labeled -2;
# its position is looked up rather than assumed to be the first bar.
baseline_pos = mean_em.index.get_loc(-2)
for container in ax.containers:
    container[baseline_pos].set_hatch('/')

# Add difference from baseline as labels.
for container, em in zip(ax.containers, em_order):
    em_diffs = mean_em[em] - mean_em[em].loc[-2]
    # Iterate the values in row order (the order the bars were drawn in) so the
    # labels do not depend on the index labels being contiguous.
    lab = [
        '+ {}'.format(round(diff, 1)) if diff >= 0 else '━ {}'.format(abs(round(diff, 1)))
        for diff in em_diffs
    ]
    ax.bar_label(container, labels=lab, label_type='center', fontsize=15, weight='bold')

ax.set_ylabel('Average %', fontsize=30, labelpad=20)
ax.set_xlabel('Topics', fontsize=30, labelpad=20)
sns.set(font_scale=1.5)
plt.xlim([-1, len(mean_em.topic_name)])
plt.ylim(0, 100)
plt.show()
# NOTE(review): the other figures in this notebook are saved under 'plots/';
# this one goes to the working directory — confirm that is intended.
fig.savefig('emotions_topics_roberta.png', dpi=300, bbox_extra_artists=(lgd,), bbox_inches='tight')

    

## Contagion of intensity 

In [None]:
# Add the response valence (comp) and polarity for each stimulus tweet.
# Both columns are fetched in ONE left merge. Merging `response` twice would
# multiply rows whenever an author has several responses (each merge fans out
# again) and would add a second, duplicate `author_id_resp` column.
# Columns that already exist in `stimuli` get the suffix `_resp` on the
# response side.
stimuli = pd.merge(
    stimuli,
    response[['author_id', 'comp', 'polarity']],
    how="left",
    left_on="followed_by",
    right_on='author_id',
    suffixes=('', '_resp'),
)

In [None]:
# Convert valence to intensity: the absolute value of the response compound score.
stimuli['intensity_resp'] = stimuli['comp_resp'].abs()

In [None]:
import matplotlib.pyplot as plt

# Plot intensity stimuli on intensity resp.
# Tiny, semi-transparent markers so dense regions stay readable.
plt.scatter(
    stimuli['intensity'],
    stimuli['intensity_resp'],
    s=0.1,
    alpha=0.5,
    color='red',
)

In [None]:
# Group stimuli intensity into four equal-width bins.
# NOTE(review): pd.cut produces equal-width intervals, not quantiles, although
# the axis labels below say "quantile" / Q1-Q4. pd.qcut would give true
# quartiles — confirm which is intended before changing the analysis.
stimuli['bins'] = pd.cut(stimuli.intensity, bins=4, duplicates='raise')

# observed=False keeps all four bins even if one is empty, and makes the
# grouping behavior explicit for categorical keys.
binned_resp = stimuli[['bins', 'intensity_resp']].groupby('bins', observed=False)

# Plot average intensity response for each bin, with the standard error of
# the mean as error bars. The axes are created up front and handed to pandas,
# so no stray empty figure is left behind.
fig, ax = plt.subplots(figsize=(10, 8))
binned_resp.mean().plot(ax=ax, linestyle='--', marker='o', color='b', yerr=binned_resp.sem())

ax.set_ylabel('Response intensity')
ax.set_xlabel('Stimuli intensity (quantile)')
ax.get_legend().remove()
# The four bins are drawn at x = 0..3; pin the ticks there so the labels do
# not depend on how many ticks the automatic locator happens to produce.
ax.set_xticks([0, 1, 2, 3])
ax.set_xticklabels(['Q1', 'Q2', 'Q3', 'Q4'])

font = {'size' : 11}

plt.rc('font', **font)
plt.show()
fig.savefig('plots/stim_resp_intensity.png', dpi=300, bbox_inches='tight')


In [None]:
import statsmodels.api as sm

# Linear regression between intensity stimuli and intensity response.
# Response variable first, then the predictor with an intercept column added.
y = stimuli['intensity_resp']
X = sm.add_constant(stimuli['intensity'])

model = sm.OLS(y, X)
result = model.fit()
print(result.summary())

In [None]:
# Create variables indicating positivity and negativity separately.
# Each holds the magnitude of comp on its own side of zero and 0 everywhere else.
stim_comp = stimuli['comp']
stimuli['comp_pos'] = stim_comp.where(stim_comp > 0, 0)
stimuli['comp_neg'] = (-stim_comp).where(stim_comp < 0, 0)

# Attach the emotion scores of the responding author; columns that already
# exist in stimuli get the suffix '_resp' on the response side.
resp_emotions = response[['author_id', 'anger', 'sadness', 'joy', 'optimism']]
stimuli = pd.merge(
    stimuli,
    resp_emotions,
    how="left",
    left_on="followed_by",
    right_on='author_id',
    suffixes=['', '_resp'],
)

# Average every stimulus/response measure per value of followed_by.
stim_cols = [
    'followed_by',
    'pos', 'neg', 'neu',
    'intensity', 'intensity_resp',
    'comp', 'comp_resp', 'comp_pos', 'comp_neg',
    'anger', 'sadness', 'joy', 'optimism',
    'anger_resp', 'sadness_resp', 'joy_resp', 'optimism_resp',
]
mean_stim = stimuli[stim_cols].groupby('followed_by').mean()

In [None]:
import statsmodels.api as sm

# Test what valence response positivity and negativity predicts.
# Response variable first, then the two predictors with an intercept column added.
y = mean_stim['comp_resp']
X = sm.add_constant(mean_stim[['comp_pos', 'comp_neg']])

model = sm.OLS(y, X)
result = model.fit()
print(result.summary())