# Twitter Discourse and Emotions Around the Invasion of Ukraine - Companion code
## – A Text Analytics Approach 
### Gabriel Lindelöf


# Dataset cleaning and related plots

In [None]:
from pyarrow import feather
import pandas as pd
import seaborn as sns

# Notebook display settings: show every column, up to 500 rows, up to 500
# characters per cell, and floats with thousands separators and 15 decimals.
display_options = {
    'display.max_columns': None,
    'display.max_rows': 500,
    'display.max_colwidth': 500,
    'display.float_format': '{:,.15f}'.format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Load the collected tweets from the Feather file into a DataFrame.
tweets_path = 'data/ukraine_two_weeks.feather'
df = feather.read_feather(tweets_path)

In [None]:
from datetime import datetime

def to_datetime(date):
    """Parse an ISO-8601 timestamp string into a naive ``datetime``.

    A trailing UTC designator (``Z``) is removed before parsing, so the
    result carries no ``tzinfo``. Strings without the designator are parsed
    unchanged instead of losing their final character.

    Args:
        date: Timestamp string such as ``'2022-02-24T12:34:56.000Z'``.

    Returns:
        The parsed ``datetime`` without timezone information.

    Raises:
        ValueError: If the remaining text is not accepted by
            ``datetime.fromisoformat``.
    """
    # Only strip the suffix when it is really there; slicing blindly would
    # cut a digit off timestamps that have no designator.
    if date.endswith(('Z', 'z')):
        date = date[:-1]
    return datetime.fromisoformat(date)

# Replace the raw timestamp strings with parsed datetime values.
df['created_at'] = df['created_at'].map(to_datetime)

In [None]:
import re
def clean(doc):
    ''' Strip URLs and @mentions from raw text and normalise its whitespace. '''
    # Applied in order; each entry is (regex pattern, replacement).
    substitutions = (
        (r'http\S+', ''),   # drop URLs
        (r'@\S+', ''),      # drop tagged @usernames
        (r'\\n', ' '),      # literal backslash-n sequences become a space
        (r'[\s]+', ' '),    # squeeze whitespace runs (real newlines too) to one space
    )
    for pattern, replacement in substitutions:
        doc = re.sub(pattern, replacement, doc)
    return doc

# Store the cleaned version of every tweet text alongside the raw text.
df['text_clean'] = df['text'].map(clean)

# Create variables to filter by

In [None]:
# True where the tweet has a non-null 'entities.urls' value, i.e. it contains a URL.
df['url'] = df['entities.urls'].notna()

In [None]:
# Display the 99.9% quantile of tweets-per-author (nothing is stored here).
tweets_per_author = df['author.id'].value_counts()
tweets_per_author.quantile(0.999)

In [None]:
# Flag every repeat of an already-seen text; the first occurrence stays unflagged.
df['duplicate'] = df.duplicated(subset=['text'], keep='first')

In [None]:
# Flag repeats of the same text by the same author; the first occurrence stays unflagged.
same_author_keys = ['text', 'author.id']
df['duplicate_same_author'] = df.duplicated(subset=same_author_keys, keep='first')

In [None]:
# Number of tweets per author, indexed by author id.
val_counts = df['author.id'].value_counts()
print("99.9% percentile of tweeters in our dataset: ", val_counts.quantile(q=0.999))
print(val_counts.describe())
val_counts_dict = val_counts.to_dict()  # same counts as a plain dict

# Label each tweet with its author's total number of tweets. Mapping through
# the counts Series is a vectorised lookup, avoiding a Python-level lambda
# call and dict access for every row.
df["n_tweets"] = df['author.id'].map(val_counts)

In [None]:
# Flag every tweet whose author's tweet count is at or above the 99.9% quantile.
threshold_999 = val_counts.quantile(q=0.999)
df['tweets_999'] = df['n_tweets'].ge(threshold_999)

# Filter tweets by variables

In [None]:
# Display how many tweets are flagged (True) or not flagged (False) by the 99.9% rule.
df['tweets_999'].value_counts()

In [None]:
# Show how many tweets fall at or above the 99.9th-percentile author
# threshold (True) and how many do not (False). The bare quantile expression
# that used to open this cell was dropped: its value was discarded because
# only a cell's last expression is displayed.
print('After 99.9 percentile removal:')
df.tweets_999.value_counts()

In [None]:
# Count tweets flagged as repeating a text the same author already posted.
print('Duplicate same author removal:')
df['duplicate_same_author'].value_counts()

In [None]:
# Count tweets flagged as repeating an already-seen text, regardless of author.
print('Duplicate removal:')
df['duplicate'].value_counts()

In [None]:
# A tweet is kept ("clean") only if it is neither a duplicate text nor
# written by an author in the 99.9% most-active group.
df['clean'] = ~(df['duplicate'] | df['tweets_999'])

In [None]:
# Tally of tweets kept (True) versus marked for removal (False).
clean_breakdown = df['clean'].value_counts()
print('Before and after cleaning: ', clean_breakdown)

In [None]:
# Number of tweets remaining after cleaning (count of True flags).
df['clean'].sum()

# Plot before & after

### Import and add data containing total number of tweets made matching query (not our dataset)

In [None]:
# Hourly totals of all tweets matching the query, gathered from the Count endpoint.
total = pd.read_csv('data/counts.csv')

# Parse the interval start into datetimes, then derive hour-of-day and calendar date.
total['start'] = total['start'].map(to_datetime)
start_parts = total['start'].dt
total['hour'] = start_parts.hour
total['date'] = start_parts.date

# Keep only intervals inside the study period (2022-02-24 up to, not including, 2022-03-10).
in_period = (total['start'] >= '2022-02-24') & (total['start'] < '2022-03-10')
total = total[in_period]

In [None]:
# Headline figures: tweets available per the Count endpoint, tweets we
# collected, and tweets left after cleaning.
n_available = total['hour_count'].sum()
n_collected = len(df)
n_clean = df['clean'].sum()

print('total: ', n_available)
print('collected: ', n_collected)
print('clean: ', n_clean)

# One row per category, used as input for the bar plot.
counts = pd.DataFrame({
    'Category': ['Available', 'Collected', 'Clean'],
    'Tweets': [n_available, n_collected, n_clean],
})

## Plot total number of tweets available, collected and after cleaning

In [None]:
# Bar chart of the available / collected / clean tweet totals, saved to plots/counts.png.
from matplotlib.ticker import FuncFormatter

sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set(rc={'figure.figsize':(2,2),"font.size":5,"axes.titlesize":5,"axes.labelsize":5, "xtick.labelsize" :5, "ytick.labelsize" :5})
ax = sns.barplot(data = counts, x = 'Category', y = 'Tweets')
ax.xaxis.labelpad = 5
ax.yaxis.labelpad = 5
ax.set(xlabel = '', ylabel = 'Tweets (millions)')

# Label the y-axis in millions. A formatter derives each label from the tick's
# actual position, whereas overwriting the tick labels with a fixed list can
# leave labels attached to the wrong ticks if the tick positions change.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:.0f}'.format(value / 1000000) + 'm'))

ax.figure.savefig('plots/counts.png', bbox_inches="tight", dpi = 300)

## Plot total number of tweets available, collected and after cleaning by hour of day

In [None]:
# Total tweets per hour of day. Only `hour_count` is summed: `total` also
# holds the datetime `start` and the object-typed `date` column, and summing
# the whole frame raises a TypeError on pandas 2.x, which no longer drops
# such columns silently.
per_hour = (
    total.groupby('hour')[['hour_count']]
    .sum()
    .rename(columns = {'hour_count':'Total tweets'})
)

# add our dataset; the value counts align with per_hour on the hour index
per_hour['Collected'] = df.created_at.dt.hour.value_counts().sort_index()
per_hour['Clean'] = df.created_at[df.clean].dt.hour.value_counts().sort_index()

# change to long format (columns: hour, variable, value) for easier plotting
per_hour_l = pd.melt(per_hour, ignore_index=False)
per_hour_l = per_hour_l.reset_index()

In [None]:
# Line plot of total / collected / clean tweets per hour of day,
# saved to plots/clean_hourofday.png.
from matplotlib.ticker import FuncFormatter

sns.set(rc={'figure.figsize':(15,3),"font.size":15,"axes.titlesize":15,"axes.labelsize":15, "xtick.labelsize" :10, "ytick.labelsize" :10})
ax = sns.lineplot(data = per_hour_l, x = 'hour', y = 'value', hue = 'variable', style = 'variable')

ax.set(xlabel = 'Hour of day', ylabel = 'Tweets (thousands)')
ax.legend(loc = 'upper left', title = None)
ax.xaxis.labelpad = 15
ax.yaxis.labelpad = 15

# Label the y-axis in thousands. A formatter keeps each label tied to its
# tick's real position; replacing the labels with a fixed list does not.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:.0f}'.format(value / 1000) + 'k'))

# one x tick for every hour present in the data
tickvalues = per_hour.index
ax.set(xticks=tickvalues)

ax.figure.savefig('plots/clean_hourofday.png', bbox_inches="tight", dpi = 300) # plot total number of tweets per hour of day

## Plot total number of tweets available, collected and after cleaning by day

In [None]:
# Remove the hour column, no longer needed. errors='ignore' lets this cell be
# re-run after the column is already gone.
total = total.drop(columns = 'hour', errors = 'ignore')

# Total tweets per calendar day. Only `hour_count` is summed: the frame also
# holds the datetime `start` and object-typed `date` columns, and summing the
# whole frame raises a TypeError on pandas 2.x.
per_date = (
    total.groupby(total.date)[['hour_count']]
    .sum()
    .rename(columns = {'hour_count':'Total tweets'})
)

# add our dataset; the value counts align with per_date on the date index
per_date['Collected'] = df.created_at.dt.date.value_counts().sort_index()
per_date['Clean'] = df.created_at[df.clean].dt.date.value_counts().sort_index()

# long format (columns: date, variable, value)
per_date_l = pd.melt(per_date, ignore_index=False)
per_date_l = per_date_l.reset_index()

In [None]:
# Display the total number of matching tweets for each day.
per_date.loc[:, 'Total tweets']

In [None]:
import matplotlib.dates as md

# Line plot of total / collected / clean tweets per day,
# saved to plots/clean_per_day.png.
from matplotlib.ticker import FuncFormatter

# Date labels in a more readable format are created manually:
# Feb 24-28 followed by Mar 1-9, one label per day.
datelab = ['Feb {:2d}'.format(day) for day in range(24, 29)]
datelab += ['Mar {:2d}'.format(day) for day in range(1, 10)]

ax = sns.lineplot(data = per_date_l, x = 'date', y = 'value', hue = 'variable', style = 'variable')

ax.set(xlabel = 'Date', ylabel = 'Tweets (millions)')
ax.legend(loc = 'upper right', title = None)
ax.xaxis.labelpad = 15
ax.yaxis.labelpad = 15

# Label the y-axis in millions. A formatter keeps each label tied to its
# tick's real position; replacing the labels with a fixed list does not.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:.1f}'.format(value / 1000000) + 'm'))

# Pin one x tick per day first, then attach the manual labels to those ticks.
tickvalues = per_date.index
ax.set(xticks=tickvalues)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xticklabels(datelab)

ax.figure.savefig('plots/clean_per_day.png', bbox_inches="tight", dpi = 300) # plot number of tweets per day Total, Collected and clean.


## Plot total number of tweets available, collected and after cleaning with per-hour granularity

In [None]:
from datetime import timedelta
# Add a column holding the hour each tweet / count interval falls in.
def _end_of_hour(moment):
    """Truncate *moment* to the start of its hour, then move one hour forward."""
    return moment.replace(minute=0, second=0, microsecond=0) + timedelta(hours=1)

df['date_hour'] = df['created_at'].map(_end_of_hour)
total['date_hour'] = total['start'].map(_end_of_hour)

In [None]:
# Total tweets per individual hour. Only `hour_count` is summed: the frame
# also holds the datetime `start` and object-typed `date` columns, and
# summing the whole frame raises a TypeError on pandas 2.x.
date_hour = (
    total.groupby('date_hour')[['hour_count']]
    .sum()
    .rename(columns = {'hour_count':'Total tweets'})
)

# group counts per hour for our dataset; aligned on the date_hour index
date_hour['Collected'] = df.date_hour.value_counts().sort_index()
date_hour['Clean'] = df.date_hour[df.clean].value_counts().sort_index()

# long format (columns: date_hour, variable, value)
date_hour_l = pd.melt(date_hour, ignore_index=False)
date_hour_l = date_hour_l.reset_index()

In [None]:
# Line plot of total / collected / clean tweets for every hour of the period,
# saved to plots/clean_per_date_hour.png.
from matplotlib.ticker import FuncFormatter

ax = sns.lineplot(data = date_hour_l, x = 'date_hour', y = 'value', hue = 'variable', style = 'variable')

ax.set(xlabel = 'Date & hour', ylabel = 'Tweets (thousands)')
ax.legend(loc = 'upper right', title = None)
ax.xaxis.labelpad = 15
ax.yaxis.labelpad = 15

# Label the y-axis in thousands. A formatter keeps each label tied to its
# tick's real position; replacing the labels with a fixed list does not.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:.0f}'.format(value / 1000) + 'k'))

# One x tick per day, reusing `per_date` and the manual `datelab` labels
# created in the per-day cells; ticks are pinned before labels are attached.
tickvalues = per_date.index
ax.set(xticks=tickvalues)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xticklabels(datelab)

ax.figure.savefig('plots/clean_per_date_hour.png', bbox_inches="tight", dpi = 300)