# Topic Modeling

In [1]:
import pandas as pd
import re

# Reading the data
sb_data = pd.read_csv("spongebob_data.csv")
sb_transcripts = sb_data["transcript"]

# Removes the speaker
def remove_speakers(text):
    output = []
    lines = text.split("\n")
    for line in lines:
        segments = line.split(": ")
        if len(segments) > 1:
            new_text = "".join(segments[1:]) 
        else:
            new_text = "".join(segments)
        output.append(new_text)
    return " ".join(output)

# Removes the speaker, then removes the queues in the brackets
sb_transcripts = sb_transcripts.apply(lambda x: re.sub("[\[].*?[\]]", "", remove_speakers(x)))

sb_transcripts.head()

0     Ah, the sea... so fascinating. So wonderful. ...
1     Ahh, what a wonderful day. The sun is out, th...
2     Ah, ze early morning in Bikini Bottom, when t...
3    Wow. Squidward, this is the best soufflé you h...
4     Hurry up with those chairs, SpongeBob. It's a...
Name: transcript, dtype: object

Formatting and Tokenization

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string

# Run the below line of code to download the list of stopwords:
# nltk.download("stopwords")

# Removes punctuation and turns the word into lowercase
def remove_punc_and_lowercase(text):
    return "".join(s for s in text if not s in string.punctuation).lower()


# Removes punctuation and stop words, puts the text into lowercase, and performs tokenization
def format_and_tokenize(text):
    text = remove_punc_and_lowercase(text)
    stop_words = [remove_punc_and_lowercase(word) for word in stopwords.words("english")]
    no_stop_words = [x for x in word_tokenize(text) if not x in stop_words]
    return no_stop_words

sb_transcripts = sb_transcripts.apply(format_and_tokenize)

sb_transcripts.head()

0    [ah, sea, fascinating, wonderful, see, bikini,...
1    [ahh, wonderful, day, sun, water, shimmering, ...
2    [ah, ze, early, morning, bikini, bottom, jelly...
3    [wow, squidward, best, soufflé, ever, created,...
4    [hurry, chairs, spongebob, closing, id, like, ...
Name: transcript, dtype: object

Lemmatization and POS Filtering

In [3]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Returns true only if the part of speech is a noun or adjective or verb
def noun_or_adjective_or_verb(pos):
    return pos[:2] in ["NN", "JJ", "VB"]

# Lemmatizes the list and filters to get only nouns and adjectives
def lem_and_filter(text):
    pos_tags = pos_tag(text)
    return [lemmatizer.lemmatize(word) for (word, pos) in pos_tags if noun_or_adjective_or_verb(pos)]

sb_transcripts = sb_transcripts.apply(lem_and_filter)

sb_transcripts.head()

0    [sea, fascinating, wonderful, see, bikini, bot...
1    [ahh, wonderful, day, sun, water, shimmering, ...
2    [ah, early, morning, bikini, bottom, jellyfish...
3    [best, soufflé, created, congratulation, chef,...
4    [hurry, chair, spongebob, closing, id, go, hom...
Name: transcript, dtype: object

Model

In [38]:
import gensim.corpora as corpora
from gensim.models import CoherenceModel

# Adding all of the words into one list
all_words = sb_transcripts.to_list()

id2word = corpora.Dictionary(all_words)

corpus = [id2word.doc2bow(words) for words in all_words]
    
lda_model = gensim.models.ldamodel.LdaModel(corpus = corpus, id2word = id2word, num_topics = 4, 
                                            random_state = 100, update_every = 1, chunksize = 100,
                                            passes = 10, alpha = "auto")

Model Visualization

In [39]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  default_term_info = default_term_info.sort_values(
