# LDA with Gensim 
Using the Gensim library to perform LDA topic extraction from a sample of the IMdb TV series descriptions and reassigning the topics to the descriptions

In [33]:
import pandas as pd
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim 

from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

In [146]:
# Reading the IMDb dataset 
df = pd.read_csv('tv_series_2000_2020.csv', dtype={'Release_year': "object", 'IMDB': 'object'})
df.drop('Unnamed: 0',axis=1, inplace=True)

In [158]:
sample_df = df.sample(200)

In [159]:
# Removing noise text leftover from the scrape
sample_df['Description'] = sample_df['Description'].str.replace("See full summary", "").str.replace("»","")

## Preprocessing steps
* Removing stopwords
* Tokenizing strings
* Removing punctuation
* Lemmatizing tokens
* Gensim dictionary
* Doc2bow document term matrix

In [160]:
# Preparing some preprocessing steps
stops = set(stopwords.words('english'))
punc = set(string.punctuation)
lemme = WordNetLemmatizer()

In [161]:
# Clean function
def cleantxt(txt):
    stop_free = ' '.join([x for x in txt.lower().split() if x not in stops])
    punc_free = ''.join([x for x in stop_free if x not in punc])
    noralized = ' '.join([lemme.lemmatize(x) for x in punc_free.split()])
    return noralized.split()

In [162]:
#Clean column
sample_df['clean_desc'] = sample_df['Description'].apply(cleantxt)
sample_df.head()

Unnamed: 0,IMDB,Name,Runtime (mins),Release_year,End_year,Genre,Rating,Votes,Description,Stars,clean_desc
2116,386993,Xiaolin Showdown,30,2015.0,2006.0,"Animation, Action, Adventure",7.5,6529,A young Xiaolin monk named Omi with a gian...,"Jeff Bennett, Danny Cooksey, Grey Griffin, Tom...","[young, xiaolin, monk, named, omi, giant, yell..."
1610,10437218,A House Divided,32,2016.0,,Drama,5.0,100,The wealthy Sanders family deals with the ...,"Dominique DuVernay, Lawrence Hilton-Jacobs, Br...","[wealthy, sander, family, deal, loss, family, ..."
5331,7737438,"Eat, Sleep, BBQ",30,,,"Family, Reality-TV",7.4,168,"A lover of true barbecue, Rashad Jones is ...","9, Rashad Jones","[lover, true, barbecue, rashad, jones, mission..."
3198,7708956,Wrong Man,60,,,"Documentary, Crime",6.8,210,A six-part docu-series investigates the cr...,"8, 9, X","[sixpart, docuseries, investigates, criminal, ..."
751,206511,Even Stevens,30,2003.0,2003.0,"Comedy, Family",7.6,13595,"The Stevens family live in Sacramento, Cal...","Shia LaBeouf, Christy Carlson Romano, A.J. Tra...","[stevens, family, live, sacramento, california..."


In [163]:
# Create a dictionart and count unique words .nnz
dictionary = corpora.Dictionary(sample_df['clean_desc'])
print(dictionary.num_nnz)

2809


In [164]:
# Create bag of words - doc2bow
doc_term_matrix = [dictionary.doc2bow(doc) for doc in sample_df['clean_desc']]
print(len(doc_term_matrix))

200


## Modelling steps
* Instantiate Gensism LDA model
* Fit the doc term, dict, and num_topics
* Review topic keywords 
* Visualize topics - intertopic distance, saliency, releveance, frequency etc..
* Modify model fit if needed

In [165]:
# Instantiate the LDA model
lda = gensim.models.ldamodel.LdaModel

In [171]:
# Fit the LDA model 
# IMDb has 27 genres for tv so lets use this as anumber of topics and see how the match up to the decrptions
# Reduced by 2 due to overlaping in LDAvis 
num_topics = 40
%time ldamodel = lda(doc_term_matrix, num_topics=num_topics, \
                     id2word = dictionary, passes=100, minimum_probability =0)



CPU times: user 8 s, sys: 29 ms, total: 8.03 s
Wall time: 8.12 s


In [172]:
# Print topics 
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.033*"di" + 0.025*"uk" + 0.016*"series" + 0.016*"police" + 0.016*"another" + 0.016*"clip" + 0.016*"show" + 0.016*"host" + 0.008*"island" + 0.008*"murder"'),
 (1,
  '0.013*"brunson" + 0.013*"buy" + 0.013*"tournament" + 0.013*"top" + 0.013*"televised" + 0.013*"playing" + 0.013*"player" + 0.013*"compete" + 0.013*"farha" + 0.013*"johnny"'),
 (2,
  '0.021*"summer" + 0.021*"turn" + 0.021*"center" + 0.021*"sister" + 0.021*"victoria" + 0.010*"dark" + 0.010*"ancient" + 0.010*"mythology" + 0.010*"scare" + 0.010*"awakens"'),
 (3,
  '0.023*"day" + 0.023*"90" + 0.023*"fiancé" + 0.023*"family" + 0.023*"catch" + 0.012*"couple" + 0.012*"after" + 0.012*"face" + 0.012*"chapter" + 0.012*"next"'),
 (4,
  '0.031*"deaf" + 0.021*"show" + 0.021*"game" + 0.021*"college" + 0.010*"attending" + 0.010*"group" + 0.010*"life" + 0.010*"private" + 0.010*"follows" + 0.010*"human"'),
 (5,
  '0.017*"sophia" + 0.017*"collins" + 0.017*"begin" + 0.017*"new" + 0.017*"planet" + 0.009*"gal" + 0.009*"life" + 0.009*"fou

In [173]:
# Display the topics / inter topic distance
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)