In [21]:
import pandas as pd
import re
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from nltk.stem import WordNetLemmatizer
import nltk
from utils import show_topics

In [8]:
# reading in the text:
# Loads the five book files ./data/001ssb.txt ... ./data/005ssb.txt into `got_books`,
# one string per book, in file order.
got_books = []

for i in range(1, 6):
    book_path = f"./data/00{i}ssb.txt"
    try:
        # Try UTF-8 explicitly so the result does not depend on the platform's
        # default encoding.
        with open(book_path, encoding="utf-8") as f:
            book_text = f.read()
    except UnicodeDecodeError:
        # Only a decoding failure triggers the cp1252 fallback; any other problem
        # (missing file, permissions, keyboard interrupt) now surfaces immediately
        # instead of being swallowed by a bare `except:`.
        with open(book_path, encoding="cp1252") as f:
            book_text = f.read()
    got_books.append(book_text)

In [None]:
# Task 2
# Strip layout artefacts from the raw book texts before tokenization.

# removing pages: drop every "Page <number>" marker.
got_books = [re.sub(r"Page \d+", "", book) for book in got_books]

# removing chapters: drop lines made up solely of upper-case letters and spaces.
# The pattern is anchored to the start of a line (`^` with re.MULTILINE); without the
# anchor it also matched upper-case/space tails of ordinary lines (e.g. " I\n") and
# deleted them together with their line break.
got_books = [
    re.sub(r"^[A-Z ]{2,}\n", "", book, flags=re.MULTILINE) for book in got_books
]

In [44]:
# Task 3
lm = WordNetLemmatizer()


def preprocessing(text) -> list:
    """Turn a raw text into a list of lower-cased, lemmatized tokens.

    Steps:
      1. Replace every whitespace character (newline, tab, ...) with a plain space.
      2. Delete every remaining character that is not an ASCII letter or a space.
      3. Lower-case the text and split it on whitespace.
      4. Drop tokens contained in NLTK's English stopword list.
      5. Lemmatize the remaining tokens with the module-level ``lm``.

    Args:
        text: The raw text to process.

    Returns:
        The processed tokens, in their original order.
    """
    # Whitespace must become a space *before* non-letters are deleted; deleting
    # newlines outright would glue the last word of a line to the first word of
    # the next line.
    spaced = re.sub(r"\s", " ", text)
    # remove special characters and unwanted control characters and make lower case
    tokens = re.sub(r"[^A-Za-z ]", "", spaced).lower().split()
    # remove stopwords: load the list once per call (not once per token) and use a
    # set so each membership test is a constant-time lookup.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    # lemmatize
    return [lm.lemmatize(token) for token in tokens if token not in stop_words]


In [45]:
# Tokenize every book: one list of processed tokens per book, in book order.
processed_got_books = list(map(preprocessing, got_books))

In [46]:
# Turn the processed text into a corpus for the LDA model:
# `word_dict` is the gensim Dictionary built from all books, and `corpus` holds
# one `doc2bow` result per book, in book order.
word_dict = Dictionary(documents=processed_got_books)
corpus = list(map(word_dict.doc2bow, processed_got_books))

In [47]:
# Task 4
# Fit the same 10-topic LDA model five times on the first book only and print the
# first entry of `show_topics()` for each run, so the runs can be compared.
first_book_only = [corpus[0]]
ldas_f_b = []  # kept around: the fitted models are reused in task 5
for _ in range(5):
    lda = LdaModel(
        corpus=first_book_only,
        num_topics=10,
        id2word=word_dict,
        iterations=50,  # stated explicitly, although 50 is already the default
    )
    print(lda.show_topics()[0])
    ldas_f_b.append(lda)

(0, '0.012*"said" + 0.009*"would" + 0.008*"lord" + 0.006*"ser" + 0.005*"one" + 0.005*"could" + 0.005*"hand" + 0.005*"ned" + 0.005*"king" + 0.004*"jon"')
(0, '0.010*"said" + 0.008*"lord" + 0.007*"would" + 0.006*"ser" + 0.005*"hand" + 0.004*"could" + 0.004*"king" + 0.004*"ned" + 0.004*"like" + 0.004*"one"')
(0, '0.011*"said" + 0.008*"lord" + 0.007*"would" + 0.006*"ser" + 0.005*"one" + 0.005*"king" + 0.005*"hand" + 0.005*"could" + 0.005*"eye" + 0.005*"jon"')
(0, '0.013*"said" + 0.009*"lord" + 0.007*"ser" + 0.006*"would" + 0.005*"one" + 0.005*"could" + 0.005*"king" + 0.004*"jon" + 0.004*"hand" + 0.004*"man"')
(0, '0.012*"said" + 0.008*"lord" + 0.007*"would" + 0.005*"ned" + 0.005*"ser" + 0.004*"king" + 0.004*"jon" + 0.004*"could" + 0.004*"one" + 0.004*"man"')


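- They are not equal, but similar. This is because the estimation is stochastic: gensim's `LdaModel` (online variational Bayes rather than MCMC) starts from a random initialization, so we are incorporating randomness into the estimation and every run converges to a slightly different solution.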
- If we used the same seed for every run (the `random_state` argument of `LdaModel`), the results would be the same.

In [48]:
# Task 5
# Fit one fresh 10-topic model per book and print the first entry of its
# `show_topics()` output, then print the same entry for each task 4 model so the
# two groups can be compared side by side.
for i, book_bow in enumerate(corpus):
    print(f"Topic model on book {i}:")
    book_lda = LdaModel(corpus=[book_bow], num_topics=10, id2word=word_dict)
    print(book_lda.show_topics()[0])

print("------------\nTopic models from task 4:")
for lda in ldas_f_b:
    first_topic = lda.show_topics()[0]
    print(first_topic)

Topic model on book 0:
(0, '0.012*"said" + 0.009*"lord" + 0.007*"would" + 0.006*"one" + 0.006*"ned" + 0.004*"king" + 0.004*"jon" + 0.004*"ser" + 0.004*"eye" + 0.004*"like"')
Topic model on book 1:
(0, '0.008*"lord" + 0.007*"one" + 0.007*"would" + 0.006*"man" + 0.006*"said" + 0.006*"ser" + 0.005*"could" + 0.004*"back" + 0.003*"king" + 0.003*"tyrion"')
Topic model on book 2:
(0, '0.008*"said" + 0.007*"lord" + 0.006*"one" + 0.005*"would" + 0.005*"well" + 0.005*"ser" + 0.005*"back" + 0.004*"could" + 0.004*"man" + 0.004*"like"')
Topic model on book 3:
(0, '0.008*"would" + 0.008*"one" + 0.008*"said" + 0.007*"lord" + 0.005*"ser" + 0.004*"could" + 0.004*"man" + 0.004*"jaime" + 0.004*"king" + 0.003*"men"')
Topic model on book 4:
(0, '0.008*"would" + 0.008*"one" + 0.007*"lord" + 0.006*"said" + 0.006*"could" + 0.006*"men" + 0.004*"back" + 0.004*"man" + 0.004*"jon" + 0.004*"eye"')
------------
Topic models from task 4:
(0, '0.012*"said" + 0.009*"would" + 0.008*"lord" + 0.006*"ser" + 0.005*"one" + 