# Twitter Discourse and Emotions Around the Invasion of Ukraine - Companion code
## – A Text Analytics Approach 
### Gabriel Lindelöf



# Inspection and visualization of topic model

In [None]:
import pandas as pd
from pyarrow import feather
import numpy as np
from bertopic import BERTopic
import os
import pickle
# Notebook display settings: no column limit, up to 500 rows, up to 500
# characters per cell, and floats with thousands separators and 15 decimals.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_rows', 500),
    ('display.max_colwidth', 500),
    ('display.float_format', '{:,.15f}'.format),
):
    pd.set_option(_option, _value)

### load dataset and pre-process

In [None]:
# load dataset, restricted to the listed columns
df = feather.read_feather(
    'data/ukraine_two_weeks_clean_shuffled_v2.feather',
    columns=['id', 'text_clean', 'clean', 'created_at'],
)
# change dates to supported format (one pandas Timestamp per tweet)
dates = [pd.Timestamp(stamp) for stamp in df['created_at']]
# clean documents, in the same row order as `dates`
docs = df['text_clean'].tolist()

In [None]:
len(df) # number of rows (tweets) in the loaded dataset; shown as the cell output

### Load model

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic.backend._utils import select_backend

# load model with specified sentence transformer
model_name = 'modelname'
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
topic_model = BERTopic.load('models/{}'.format(model_name), embedding_model=sentence_model)
# load file containing topic labels for all documents (predicted in batches and then combined into single file)
# the context manager closes the file again; a bare open() inside pickle.load() left the handle open
with open('models/{}_all.pickle'.format(model_name), 'rb') as f:
    topics = pickle.load(f)

In [None]:
### Code that was used to combine the batches of predicted topic labels

# topics2 =  pickle.load(open('models/modelname_batch_1-3.pickle','rb'))
# topics3 =  pickle.load(open('models/modelname_batch_3-5.pickle','rb'))
# topics4 =  pickle.load(open('models/modelname_batch_5-7.pickle','rb'))
# topics5 =  pickle.load(open('models/modelname_batch_7-.pickle','rb'))

# topics += topics2 + topics3 + topics4 + topics5
# pd.Series(topics).value_counts()
# print(len(topics), len(docs), len(dates))

# print("SAVING TOPICS...")

# filename = 'models/modelname_all.pickle'
# with open(filename, 'wb') as f:
#     pickle.dump(topics, f)
    
# print("DONE TOPICS.")


### Write some information about each topic to a collection of text files to aid analysis

In [None]:
from datetime import datetime

def to_datetime(date):
    '''
    Parse an ISO 8601 timestamp string into a datetime.

    A trailing 'Z' designator is dropped before parsing, so such strings
    come back as naive datetimes.

    Parameters:
        date (str): timestamp, e.g. '2022-03-01T12:34:56.000Z'

    Returns:
        datetime: the parsed timestamp
    '''
    # Strip only an actual 'Z'; cutting the last character unconditionally
    # corrupted timestamps that do not end in one.
    if date.endswith('Z'):
        date = date[:-1]
    return datetime.fromisoformat(date)

# calendar date of each tweet (parsed from created_at, time of day dropped)
df['date'] = [to_datetime(stamp).date() for stamp in df['created_at']]

# dataframe with the text, topic and date of each tweet, indexed by the id column
docs_t = pd.DataFrame(
    zip(docs, topics, df['date']),
    columns=["doc", "topic", 'date'],
    index=df['id'],
)


for i in range(len(topic_model.get_topics())-1): # for each topic in model (one fewer than get_topics() reports, presumably to skip the outlier topic -1 -- TODO confirm)
    representative = topic_model.get_representative_docs(i)[:20] # get representative docs for topic
    keywords = topic_model.get_topic(i) # (word, score) pairs describing the topic
    topic_docs = docs_t[docs_t.topic == i] # all tweets assigned to this topic
    # sample() raises when asked for more rows than exist, so small topics are written out in full
    sample_size = min(1000, len(topic_docs))

    # save in text file, with keywords from topic and ID as name
    # utf-8 is set explicitly: tweets contain non-ASCII text that the platform default encoding may reject
    with open('representative_docs/{:02d}_{}.txt'.format(i, keywords[0][0]), 'w', encoding='utf-8') as f:
        f.write("Summary topic number {}\n\n".format(i))

        f.write("Topic frequency: {}\n\n".format(topic_model.get_topic_freq(i)))

        f.write("Top Words & TF-IDF Scores\n")
        f.writelines(["{}: {}\n".format(word, round(score, 4)) for word, score in keywords])
        f.write("\n\n")
        for number, doc in enumerate(representative, start=1):
            f.write("[Document {}]\n".format(number))
            f.write(doc)
            f.write("\n\n")

        f.write("\n\n")
        # the heading states the real sample size (1000 whenever the topic is large enough)
        f.write("{} random documents from topic:".format(sample_size))
        f.write("\n\n")

        # random docs from the topic, written to the bottom of the text file with id and date
        for tweet_id, tweet in topic_docs.sample(sample_size).iterrows():
            f.write("[{}] [{}]\n".format(tweet_id, tweet.date))
            f.write(tweet.doc)
            f.write("\n____________________________________\n")



    

## Preliminary visualizations of model (final versions were made manually with Matplotlib and Seaborn, further down)

In [None]:
# figures and frequency table produced by the topic model's own helpers
hie = topic_model.visualize_hierarchy()
it = topic_model.visualize_topics()
# topic ids from -1 up to the last topic index
barchart_topics = range(-1, len(topic_model.get_topics()) - 1)
bar = topic_model.visualize_barchart(barchart_topics)
freq = topic_model.get_topic_freq()

In [None]:
# save as webpages to allow sharing of interactive graphs
for figure, page in (
    (hie, "html/dendrogram.html"),
    (it, "html/intertopic_distance.html"),
    (bar, "html/topic_representations.html"),
):
    figure.write_html(page)

In [None]:
# show graphs, and frequency table for topics
for figure in (hie, it, bar):
    figure.show()
# the bare name stays last so the notebook renders the frequency table
freq

### Save a table in Word .docx format containing the topics and their frequency

In [None]:
# topic ids in the order the model reports them
topic_ids = list(topic_model.get_topics().keys())

# get top key-words (first 7) for each topic, joined into one string
key_words = [
    ', '.join(word for word, _ in word_scores[:7])
    for word_scores in topic_model.get_topics().values()
]

# Tweets per topic, looked up by topic id. Attaching the sorted counts by
# position only works when the key order matches and every topic has at
# least one tweet; otherwise counts end up on the wrong row or the
# assignment fails on a length mismatch.
topic_counts = pd.Series(topics).value_counts()

table = pd.DataFrame({
    'Index': topic_ids,
    'Topic name': "name", # placeholder name, to be given by researcher based on key terms in topic
    'Key terms': key_words,
    'Number of tweets': [int(topic_counts.get(topic_id, 0)) for topic_id in topic_ids],
})

In [None]:
import docx
from docx import Document
import pandas as pd


# create a new, empty document (Document() without a path does not open an existing file)
doc = Document()

doc.add_paragraph("Value Counts: ")

n_rows, n_cols = table.shape
t = doc.add_table(n_rows + 1, n_cols) # one extra row for the header
t.style = 'Colorful List Accent 1'

# add the header row
for j, column_name in enumerate(table.columns):
    t.cell(0, j).text = column_name

# add the data from table
# the values are materialised once; asking for table.values inside the loop rebuilt the array for every cell
cell_values = table.values
for i in range(n_rows):
    for j in range(n_cols):
        t.cell(i+1, j).text = str(cell_values[i, j])

filename = "topics_table.docx"
doc.save(filename)

## Inspection of topics with word-clouds and info

In [None]:
# create Dataframe with text, topic and creation date of tweets
docs_t = pd.DataFrame(zip(docs[:len(topics)],topics, df.created_at), columns = ["doc", "topic", 'created_at'], index = df[:len(topics)].id)

# load tokenized version of dataset
tok = feather.read_feather('data/ukraine_two_weeks_clean_shuffled_v2_tok.feather')
df = df.join(tok)

# add topic info about each tweet to main dataset
df['topic'] = topics
# first keyword of each topic, looked up once per topic instead of once per tweet
first_keyword = {
    topic_id: topic_model.get_topic(topic_id)[0][0]
    for topic_id in df['topic'].unique().tolist()
}
df['topic_name'] = df['topic'].map(first_keyword)
# NOTE(review): `df.apply(len)` ran len() on each COLUMN, and the result,
# indexed by column names, aligned to nothing but NaN. The character length
# of the cleaned text is assumed to be what was intended -- confirm.
df['len'] = df['text_clean'].str.len()
# lookup table from topic name (first keyword) to topic index
topic_index_names = df[['topic', 'topic_name']].drop_duplicates().set_index('topic_name')

In [None]:
def inspect_topic(t, s = 0):
    '''
    Function that prints some info about a topic.

    Prints the topic's key-words, up to 20 representative docs (skipped for
    topic -1) and a reproducible sample of the docs assigned to the topic.

    Parameters:
        t (int): index of topic
        s (int): size of sample docs printed; capped at the number of docs in the topic
    '''
    separator = '\n__________________________________________________________________________________\n\n'

    print(*topic_model.get_topic(t), '\n', sep = '\n') # prints top key-words
    print('Representative docs:\n')
    if t != -1:
        print(*topic_model.get_representative_docs(t)[:20], sep = separator)
    print('\n')

    topic_docs = docs_t[docs_t.topic == t] # all docs in topic
    # sample() raises when asked for more rows than the topic holds
    sample_size = min(s, len(topic_docs))
    sample = topic_docs.sample(sample_size, random_state = 42)
    # The lines are built as a plain list: an empty sample then prints nothing
    # extra, whereas unpacking an empty DataFrame printed its column names.
    sample_lines = ['{} |{}| ({})'.format(row.doc, row.created_at, row.topic) for row in sample.itertuples(index = False)]
    print('Sample docs:', *sample_lines, sep = separator)

In [None]:
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk import word_tokenize   
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import re


class LemmaTokenizer:
    '''
    Callable that lemmatizes a sequence of tokens for the word clouds.

    Class is based on https://gist.github.com/4OH4/f727af7dfc0e6bb0f26d2ea41d89ee55
    '''

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        '''Return the lemma of every token in doc, skipping tokens whose string form is empty.'''
        lemmas = []
        for token in doc:
            text = str(token)
            if not text:
                continue
            lemmas.append(self.wnl.lemmatize(text))
        return lemmas


def preprocess_text(text):
    '''
    Remove numbers, we don't want them in the word clouds.

    Only runs of digits that follow a space are removed, together with that
    space; digits at the very start of the text are left in place.

    Parameters:
        text (str): document to clean

    Returns:
        str: the text without the matched numbers
    '''
    # raw string: '\d' inside a normal literal is an invalid escape sequence and warns on recent Python versions
    return re.sub(r" \d+", '', text)

def get_wordcloud(topic_i):
    '''
    Get word cloud for topic with index passed as argument.

    Counts the lemmas of all docs in the topic, draws a word cloud from the
    counts, saves it as a PNG under plots/wordclouds/ and shows it.

    Parameters:
        topic_i (int): index of topic
    '''
    topic_docs = df[(df.topic == topic_i)].tok # get tokenized version of all docs in topic

    tokenizer = LemmaTokenizer()
    # the stop words go through the same lemmatizer as the documents
    token_stop = tokenizer(stopwords.words('english'))
    lem_text = [' '.join(tokenizer(doc)) for doc in topic_docs] # lemmatize documents

    # vectorizer used to count word occurences
    # NOTE(review): according to the scikit-learn docs a custom `preprocessor`
    # overrides the built-in lowercase / strip_accents step, so those two
    # arguments presumably have no effect here -- confirm.
    vectorizer = CountVectorizer(stop_words = token_stop, max_features = 200, lowercase = True, strip_accents = 'unicode', preprocessor=preprocess_text)

    # transform documents with vectorizer
    transformed = vectorizer.fit_transform(lem_text)

    # get names (words) of each feature in vectorizer
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # combine word frequencies with corresponding words
    word_counts = np.asarray(transformed.sum(axis=0)).ravel()
    freq_dict = dict(zip(feature_names, word_counts))

    # initiate word cloud
    wordcloud = WordCloud(max_words = 100, background_color="white", colormap = 'tab10', width=1600, height=400, max_font_size=None, min_font_size = 0)
    wordcloud.generate_from_frequencies(frequencies=freq_dict) # passing frequency dictonary to wordcloud

    plt.figure(figsize = (20,5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    # the topic's name (first keyword) becomes part of the file name
    filename = topic_index_names[topic_index_names.topic == topic_i].index[0]
    plt.savefig("plots/wordclouds/{}_topic_{}.png".format(filename, topic_i), format="png", dpi = 300, bbox_inches='tight')
    plt.show()

In [None]:
topic = 6 # supply with first keyword of topic OR index of topic

# check if passed variable is str (name of topic) or int (index of topic), if str convert to index
if isinstance(topic, str):
    # .iloc[0] picks the first match explicitly; `[0]` on the label-indexed
    # result relied on positional fallback, which pandas has deprecated
    topic_i = int(topic_index_names.loc[[topic], 'topic'].iloc[0])
else:
    topic_i = topic

print("Getting topic with index: ", topic_i)
get_wordcloud(topic_i) # show wordcloud
inspect_topic(topic_i, s = 100) # show info about topics content

In [None]:
from bertopic import BERTopic
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # Reduce problems with cloud computing.


coherence_results = []
# Try every model size, from the current one down until 2 topics.
# NOTE: from the second step on, `topics` is overwritten with the reduced labels.
for step, i in enumerate(reversed(range(2, (len(set(topics)))))):
    if step > 0: # the first pass scores the model as loaded, without reducing
        print("Reducing topics...")
        topics, _ = topic_model.reduce_topics(docs, topics, nr_topics=i) # Reduce to i topics.

    # Dataframe with documents, their ID and topic.
    documents = pd.DataFrame({"Document": docs,
                              "ID": range(len(docs)),
                              "Topic": topics})
    # One long document per topic: all of its tweets joined with spaces.
    documents_per_topic = documents.groupby(['Topic'], as_index=False).agg({'Document': ' '.join})


    print("Preprocessing...")
    cleaned_docs = topic_model._preprocess_text(documents_per_topic.Document.values) # Get preprocessed texts.

    # Get the topic model's vectorizer and its analyzer.
    # The old `words = vectorizer.get_feature_names()` line is gone: its result
    # was never used (the name is rebound in the comprehension below) and the
    # method no longer exists in scikit-learn >= 1.2.
    vectorizer = topic_model.vectorizer_model
    analyzer = vectorizer.build_analyzer()

    # Create a corpora of tokens.
    tokens = [analyzer(doc) for doc in cleaned_docs]
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    topic_words = [[word for word, _ in topic_model.get_topic(topic_id)]
                   for topic_id in range(len(set(topics))-1)]

    print("Calculating coherence...")
    # Calculate coherence for model.
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=tokens,
                                     corpus=corpus,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence = coherence_model.get_coherence()


    coherence_results.append((i, coherence)) # Save number of topics and coherence.
    print(i, coherence)

    # Rewrite the results file after every model size.
    with open('models/coherence_cv.pickle', 'wb') as f: # Write coherence results to file.
        pickle.dump(coherence_results, f)
    

In [None]:
import pickle
# Load previous results; the context manager closes the file handle that a bare open() left open
with open('models/coherence_cv.pickle', 'rb') as f:
    coh = pickle.load(f)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})
sns.set(rc={'figure.figsize':(2,2),"font.size":5,"axes.titlesize":5,"axes.labelsize":5, "xtick.labelsize" :5, "ytick.labelsize" :5})
sns.set_style('whitegrid')

# `coh` holds the (number of topics, coherence) pairs loaded from
# models/coherence_cv.pickle; the name `coh2` used here before was never defined.
n_topics, cv_scores = zip(*reversed(coh))
# x and y are passed by keyword: recent seaborn versions accept only `data` positionally
ax = sns.lineplot(x=list(n_topics), y=list(cv_scores))

ax.set(xlabel = 'Number of topics', ylabel = 'C_V score')
ax.axvline(15, color = 'red',linewidth=0.4, alpha = 0.5) # Add red line at selected number of topics.

ax.figure.savefig('plots/coherence.png', bbox_inches="tight", dpi = 300)

# Topics over time

In [None]:
# Topic representation over time for all documents, with the timestamps in `dates` split into 14 bins.
topics_over_time = topic_model.topics_over_time(docs, topics, dates, nr_bins=14)

In [None]:
filename = 'models/topics_over_time_14t.pickle'
# Load previously saved topics over time; the context manager closes the file handle that a bare open() left open.
with open(filename, 'rb') as f:
    topics_over_time = pickle.load(f)

In [None]:
tot = topic_model.visualize_topics_over_time(topics_over_time, normalize_frequency = True) # Built-in visualization with normalized frequencies, for preliminary inspection.

In [None]:
tot.write_html("html/topics_over_time.html") # Save the figure as an HTML page.

In [None]:
from datetime import datetime

def to_datetime(date):
    '''
    Parse an ISO 8601 timestamp string into a datetime.

    A trailing 'Z' designator is dropped before parsing, so such strings
    come back as naive datetimes.

    Parameters:
        date (str): timestamp, e.g. '2022-03-01T12:34:56.000Z'

    Returns:
        datetime: the parsed timestamp
    '''
    # Strip only an actual 'Z'; cutting the last character unconditionally
    # corrupted timestamps that do not end in one.
    if date.endswith('Z'):
        date = date[:-1]
    return datetime.fromisoformat(date)

In [None]:
   import matplotlib.dates as mdates
import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib.ticker import FuncFormatter

topics_i = [10] # Which topics to visualize (use index).
# Select these topics. The selection gets its own name so that the
# per-document `topics` labels used by other cells are not overwritten.
selected_topics = topics_over_time[topics_over_time.Topic.isin(topics_i)]


# Add default labels: topic index followed by the first three keywords of each topic.
topic_strings = [
    "{}: {}, {}, {}".format(i, *(word for word, _ in topic_model.get_topic(i)[:3]))
    for i in topics_i
]

## Override default names for visualization.
#topic_strings = ['The invasion', 'NATO']
#topic_strings = ['Foreign students', 'Cryptocurrency', 'Attacked cities']
#topic_strings = ['Nazism', 'Energy', 'Planes', 'Biolabs']
#topic_strings = ['China', 'Other conflicts', 'COVID-19', 'Solidarity']
topic_strings =  ['Nuclear plants']
sns.set(rc={"figure.dpi":300, 'savefig.dpi':300})


sns.set(rc={'figure.figsize':(15,3),"font.size":15,"axes.titlesize":15,"axes.labelsize":15, "xtick.labelsize" :8, "ytick.labelsize" :10})

sns.set_style('whitegrid')
# one line per selected topic, told apart by both colour and line style
ax = sns.lineplot(data = selected_topics, x = 'Timestamp', y = 'Frequency', hue = 'Topic', style = 'Topic', palette = sns.color_palette('deep', len(topics_i)))

ax.set(xlabel = 'Date', ylabel = 'Tweets')
ax.legend(prop={'size': 10}, labels = topic_strings, loc = 'upper right')


# one major tick per day on the x-axis
locator = mdates.DayLocator(interval=1)
ax.xaxis.set_major_locator(locator)


ax.set(ylim = (0,30000))
# Reformat y-axis to count of thousands. A formatter keeps each label tied to
# its tick position, unlike fixed tick labels set without fixing the ticks.
ax.yaxis.set_major_formatter(FuncFormatter(lambda value, _pos: '{:.0f}'.format(value / 1000) + 'k'))
ax.xaxis.labelpad = 15
ax.yaxis.labelpad = 15

ax.figure.savefig('plots/nuke.png', bbox_inches="tight")