# Topic modelling in Python using Gensim

Inspired by code from:
- Idil Ismiguzel, available at:
https://github.com/Idilismiguzel/NLP-with-Python
- Christopher S. Corley, available at: https://christop.club/2014/05/06/using-gensim-for-lda/

# Importing libraries

In [1]:
# !pip install pyLDAvis

In [None]:
# Data Handling
import pandas as pd
pd.set_option('display.max_colwidth', 80)
import numpy as np

# Plotting
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")
%matplotlib inline
import matplotlib.patheffects as path_effects
import seaborn as sns

# Gensim
import gensim
from gensim.models import CoherenceModel
import pyLDAvis.gensim_models

# NLTK
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

# Python Core
from collections import Counter
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import locale
locale.getpreferredencoding = lambda: "UTF-8"
import requests
import re


# Download UN tweets data

A little preprocessing is required here to work around a problem in the data file.

In [None]:


url = 'https://raw.githubusercontent.com/world-politics-datalab/un_hum_rights_office_tweets/main/un_office_humrights_tweets_sept4_2017_sept3_2022.csv'
r = requests.get(url, allow_redirects=True)

open('un_office_humrights_tweets_sept4_2017_sept3_2022.csv', 'wb').write(r.content)

# the original file has a problem around row 4037 so we need to import it in two steps to fix it

data1 = pd.read_csv("un_office_humrights_tweets_sept4_2017_sept3_2022.csv", header=0, nrows=4037, encoding='utf-8',  quotechar='"')
data1 = data1.iloc[:,:88]

!tail -n 17130 un_office_humrights_tweets_sept4_2017_sept3_2022.csv > temp.csv

data2 = pd.read_csv("temp.csv", encoding='utf-8',  quotechar='"', header=None)
data2.drop(data2.columns[[14, 15]], axis=1, inplace=True)

data2 = pd.DataFrame(data=data2.values, columns=data1.columns)
# data_all = data.append(data2,ignore_index=True) # version for use with older versions of pandas
data_all = pd.concat((data1, data2), ignore_index=True)

# filter out non-English language texts
data = data_all.loc[data_all['lang'] == "en"].copy()
data

# Text data

In [None]:

# Keep only the raw tweet text, as a single-column DataFrame, and display it.
tweets = data[["text"]]
tweets

# Data Cleaning and Preprocessing

1. Expand contractions
2. Remove links, html tags, numbers, and other unwanted characters
3. Tokenise
4. Lemmatise

In [5]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
# Lower-case contraction -> expansion lookup used when cleaning the tweets.
# Keys must be lower case because the text is lower-cased before the lookup.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot", # can not?
    "can't've": "cannot have", # can not have?
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

In [6]:
def preprocess_text(text):
    '''Clean one raw document and return its lemmatised tokens.

    The text is lower-cased, contractions are expanded, and links, HTML
    remnants, punctuation, digits and ``<u...>`` escapes are stripped. What
    remains is tokenised and each token is lemmatised as a verb.
    Stopwords are NOT removed by this function.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. NaN for a missing tweet)
        yields an empty token list instead of raising.

    Returns
    -------
    list of str
        Lemmatised tokens that are longer than one character.
    '''
    if not isinstance(text, str):
        return []

    # Convert words to lower case and normalise the typographic apostrophe so
    # that forms such as "don\u2019t" are found in the contractions table.
    text = text.lower().replace('\u2019', "'")

    # Expand contractions (note: split/join also collapses newlines to spaces)
    text = " ".join(contractions.get(word, word) for word in text.split())

    # Format words and remove unwanted characters.
    # Only the link itself is removed: matching up to the end of the line
    # would discard every word that follows the first link.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    # Must run before the punctuation pass, which would strip the "/".
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'&amp;', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # remove numbers
    text = re.sub(r'[0-9]+', ' ', text)

    # remove <U+ >
    text = re.sub(r'<u.*?>', " ", text)

    text = re.sub(r'<', ' ', text)
    text = re.sub(r'>', ' ', text)

    # Split documents into tokens
    tokens = nltk.WordPunctTokenizer().tokenize(text)

    # Lemmatize each word: that is, convert inflected forms of words into their
    # reference form, e.g. 'had' becomes 'have'. One lemmatizer is shared by
    # all tokens instead of building a new one per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, pos='v') for token in tokens if len(token) > 1]

def to_string(text):
    """Return the items of *text* joined into one space-separated string."""
    return ' '.join(str(item) for item in text)

In [7]:
# Clean every tweet; each cell of the new column holds that tweet's list of tokens.
data['text_Clean_List'] = data['text'].apply(preprocess_text)

In [None]:
# Display the DataFrame to inspect the newly added column.
data

In [9]:
# Join each token list back into a single space-separated string.
data['text_Clean'] = data['text_Clean_List'].apply(to_string)

In [None]:
# Show one raw tweet, looked up by its index value 5474.
# NOTE(review): this presumably fails if row 5474 was dropped by the language filter - confirm.
data['text'][5474]

In [None]:
# Widen the column display so whole tweets are shown, then compare one
# randomly sampled tweet with its cleaned version.
pd.set_option('display.max_colwidth', 15000)
data[["text", "text_Clean"]].sample()

In [None]:
# Join all cleaned tweets into one string. The separator must be whitespace:
# the string is split on whitespace below, so joining with ',' would fuse the
# last word of each tweet with the first word of the next one.
tweet_words = ' '.join(data['text_Clean'].values)

# Count each word
word_counter = Counter(tweet_words.split())
most_frequent = word_counter.most_common(30)

# Bar plot of the 30 most frequent words
fig = plt.figure(1, figsize=(20, 10))
# A descriptive name is used instead of `_`, which IPython reserves for the last output.
most_frequent_df = pd.DataFrame(most_frequent, columns=("words", "count"))
sns.barplot(x='words', y='count', data=most_frequent_df, palette='winter', hue='words', legend=False)
plt.xticks(rotation=45);

In [13]:
# NLTK's English stopword list; the commented-out line below shows how to
# extend it with additional corpus-specific words.
stopwords_list = stopwords.words('english')
#stopwords_list.extend(["say", "go", "look", "come", "see", "think", "get", "one", "would", "like", "could"])

In [14]:
# Remove stopwords from every token list. Membership is tested against a set
# (O(1) per token) rather than the list (a linear scan per token); the
# resulting column is identical.
stopword_set = set(stopwords_list)
data['text_Clean_SW_List'] = [
    [word for word in line if word not in stopword_set]
    for line in data['text_Clean_List']
]

In [15]:
# Join each stopword-free token list back into a single space-separated string.
data['text_Clean_SW'] = data['text_Clean_SW_List'].apply(to_string)

In [None]:
# Join all stopword-free tweets into one string. The separator must be
# whitespace: the string is split on whitespace below, so joining with ','
# would fuse the last word of each tweet with the first word of the next one.
tweet_words_sw = ' '.join(data['text_Clean_SW'].values)

# Count each word
word_counter = Counter(tweet_words_sw.split())
most_frequent = word_counter.most_common(30)

# Bar plot of frequent words
fig = plt.figure(1, figsize=(20, 10))
# A descriptive name is used instead of `_`, which IPython reserves for the last output.
most_frequent_df = pd.DataFrame(most_frequent, columns=("words", "count"))
sns.barplot(x='words', y='count', data=most_frequent_df, palette='winter', hue='words', legend=False)
plt.xticks(rotation=45);

# Create dictionary and corpus (Bag-of-Words)

In [None]:
# Create Dictionary: maps every distinct token to an integer id
id2word = gensim.corpora.Dictionary(data['text_Clean_SW_List'])
# Vocabulary size before filtering
print(len(id2word))
# Per the gensim documentation: keep tokens that occur in at least 15
# documents and in no more than 50% of all documents.
id2word.filter_extremes(no_below=15, no_above=0.5)
# Vocabulary size after filtering
print(len(id2word))
# Create Corpus: Term Document Frequency
# (one bag-of-words per tweet, built with the filtered dictionary)
corpus = [id2word.doc2bow(text) for text in data['text_Clean_SW_List']]

# Determining the Number of Topics using coherence score

In [None]:

# Compute Coherence Score
# Train one LDA model for every topic count in [t_min, t_max] and record its
# c_v coherence, so the scores can be compared across topic counts.
number_of_topics = []
coherence_score = []
t_min = 51
t_max = 100

import datetime

# Start of the whole sweep, used to report the cumulative running time.
sweep_start = datetime.datetime.now()

for i in range(t_min, t_max + 1):
    # Start of this model, used to report the time spent on it alone.
    a = datetime.datetime.now()
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        random_state=100,
        num_topics=i,
        passes=10,
        alpha='auto',
        eta='auto',
        iterations=50,
        per_word_topics=True
    )

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data['text_Clean_SW_List'], dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    number_of_topics.append(i)
    coherence_score.append(coherence_lda)
    b = datetime.datetime.now()
    delta = b - a
    # Report both figures: the time for this model and the time since the sweep began.
    print("Calculated coherence score for solution with", i, "topics. Model time:", delta, "Total time:", b - sweep_start)



In [19]:
# Collect the sweep results into a table: one row per number of topics tried.
topic_coherence = pd.DataFrame({'number_of_topics':number_of_topics, 'coherence_score':coherence_score})

In [None]:
# Show the rows up to and including index label 20 (.loc slices are inclusive).
topic_coherence.loc[:20]

In [None]:
# Coherence score as a function of the number of topics
g = sns.lineplot(data=topic_coherence, x='number_of_topics', y='coherence_score')
# Aim for roughly ten ticks; never let the step fall to 0, which would make
# range() raise ValueError when fewer than ten topic counts were tried.
tick_step = max(1, (t_max - t_min) // 10)
# t_max + 1 so that t_max itself can receive a tick (models were trained up to t_max inclusive).
g.set_xticks(range(t_min, t_max + 1, tick_step))
g

# Topic Modelling with LDA

In [22]:
# Number of topics for the final model
n_topics = 14

# Training settings for the final model, gathered in one place.
# (update_every=1 and chunksize=10 were tried earlier and are left disabled.)
lda_settings = dict(
    num_topics=n_topics,
    random_state=100,
    passes=20,
    iterations=600,
    alpha='auto',
    eta='auto',
    per_word_topics=True,
)

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_settings)

In [None]:
# Print every topic with its weighted top words; the end= argument reproduces
# the blank lines that separate consecutive topics.
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic), end="\n\n\n")

# Visualising with pyLDAvis

In [None]:
# Render the interactive pyLDAvis view inline in the notebook.
pyLDAvis.enable_notebook()
# sort_topics=False is passed so that pyLDAvis does not re-order the topics.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word, sort_topics=False)
vis

# Predict the topics in a new document

In [None]:
doc = "children education"
# Clean the new document exactly like the training tweets, then map it to a bag-of-words.
vec_bow = id2word.doc2bow(preprocess_text(doc))

# Convert the query to LDA topic space. The model is indexed only once and the
# result reused; element 0 of the result is the list of (topic_id, probability) pairs.
vec_topic = lda_model[vec_bow]

# Sort ascending by probability, then walk the list backwards so the most
# probable topic is printed first.
ranked_topics = sorted(vec_topic[0], key=lambda pair: pair[1])

for topic_id, probability in reversed(ranked_topics):
    print(topic_id, probability, lda_model.print_topic(topic_id))
    print()

# Generate a Document-Topic Matrix

In [None]:
# Build one row of topic probabilities per document.
# Each probability is written into the slot of its own topic id. Relying on the
# order and number of the returned pairs is unsafe: gensim may omit topics with
# a near-zero probability even with minimum_probability=0, which would shorten
# the row and shift the remaining values under the wrong columns.
topic_count = lda_model.num_topics
document_topic_rows = []
for bow in corpus:
    row = [0.0] * topic_count
    for topic_id, probability in lda_model.get_document_topics(bow, minimum_probability=0):
        row[topic_id] = probability
    document_topic_rows.append(row)

# Columns are labelled with the (topic_id, top-words) pairs returned by print_topics.
document_topic_matrix = pd.DataFrame(document_topic_rows, columns=lda_model.print_topics(-1))
# Attach the original tweet text; rows are in the same order as `corpus`.
document_topic_matrix["text"] = list(tweets["text"])
document_topic_matrix