# **Homework 8 - Topic Modeling**
# KDD Tuesdays 12:30 PM - 2:45 PM
## Jake Brulato

In [1]:
# Import the packages needed to conduct the topic modeling analysis
import os
import pandas as pd
import numpy as np
from pandas import DataFrame
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): presumably intended to make the later LDA training repeatable — confirm.
np.random.seed(2023)

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

## **Defined Functions**

In [2]:
from nltk.stem import*
# Module-level text-normalization helpers, instantiated once for the notebook.
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer instance
stemmer = PorterStemmer()         # Porter stemmer used by lemmatize_stemming()

def lemmatize_stemming(text):
    """Reduce a single token to the stem of its verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) with the shared
    module-level ``lemmatizer``; the resulting lemma is then stemmed with the
    module-level Porter ``stemmer``.

    Args:
        text: The token to normalize.

    Returns:
        The stemmed form of the token's verb lemma.
    """
    # Reuse the shared lemmatizer instead of constructing a new
    # WordNetLemmatizer for every token that is processed.
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize a document and normalize the tokens that are kept.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stop words, or that are three characters or shorter, are dropped;
    every remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document text.

    Returns:
        A list with the normalized tokens, in document order.
    """
    # Use the names imported at the top of the notebook rather than the
    # `gensim.` module path, so this function does not depend on a later
    # cell having executed `import gensim`.
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) > 3
    ]

## **Read all the text files in the directory**

In [3]:
from nltk.corpus import PlaintextCorpusReader
# Relative folder name, so it is resolved against the notebook's working directory.
corpus_root = 'MovieReviews' # Folder that holds the review text files
# NOTE: despite its name, `file_names` is bound to a PlaintextCorpusReader here,
# not to a list of names; the next code cell rebinds it to an os.listdir() result.
file_names = PlaintextCorpusReader(corpus_root, '.*',encoding='latin-1')  # '.*' matches every file in the folder; files are decoded as latin-1
file_names.fileids()  # Last expression of the cell: displays the file ids found by the reader

['16748.txt',
 '17108.txt',
 '17109.txt',
 '17110.txt',
 '17111.txt',
 '17116.txt',
 '17117.txt',
 '17118.txt',
 '17119.txt',
 '17139.txt',
 '17144.txt',
 '17145.txt',
 '17146.txt',
 '17147.txt',
 '17150.txt',
 '17185.txt',
 '17192.txt',
 '17219.txt',
 '17239.txt',
 '17243.txt',
 '17254.txt',
 '17255.txt',
 '17280.txt',
 '17300.txt',
 '17303.txt',
 '17341.txt',
 '17384.txt',
 '17391.txt',
 '17398.txt',
 '17399.txt',
 '17430.txt',
 '17431.txt',
 '17447.txt',
 '17457.txt',
 '17460.txt',
 '17501.txt',
 '17518.txt',
 '17532.txt',
 '17534.txt',
 '17578.txt',
 '17609.txt',
 '17610.txt',
 '17655.txt',
 '17662.txt',
 '17663.txt',
 '17695.txt',
 '17711.txt',
 '17713.txt',
 '17753.txt',
 '17757.txt',
 '17758.txt',
 '17761.txt',
 '17803.txt',
 '17811.txt',
 '17874.txt',
 '17879.txt',
 '17886.txt',
 '17896.txt',
 '17898.txt',
 '17902.txt',
 '17912.txt',
 '17933.txt',
 '17934.txt',
 '17945.txt',
 '17963.txt',
 '17971.txt',
 '17992.txt',
 '18004.txt',
 '18016.txt',
 '18032.txt',
 '18067.txt',
 '1806

## **Read the contents of every text file in the directory**

In [4]:
# Read the reviews from the same relative folder the corpus reader cell uses
# (`corpus_root`) instead of a machine-specific absolute path, so the notebook
# runs unchanged on any machine.
path: str = corpus_root

# os.listdir() returns entries in arbitrary order; sort them so the row numbers
# assigned below are reproducible, and skip anything that is not a regular
# file (e.g. a sub-directory) because it cannot be opened as text.
file_names = sorted(
    name for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name))
)

# One (row number, file name, file text) record per review file.
# NOTE(review): the corpus reader cell decodes these files as latin-1, while
# this cell decodes them as UTF-8 and silently drops undecodable bytes
# (errors='ignore') — confirm which encoding the files really use.
doc_contents: list = []
for i, file_name in enumerate(file_names):
    with open(os.path.join(path, file_name), encoding="utf8", errors='ignore') as file:
        doc_contents.append((i, file_name, file.read()))

## **Load document contents into a dataframe**

In [5]:
# Assemble the (row number, file name, text) records into a table and discard
# any row whose text is missing.
data = (
    pd.DataFrame(doc_contents, columns=['RowNum', 'FileName', 'FileContent'])
    .dropna(subset=['FileContent'])
)
# Preview the first five documents.
print(data.head(5))

   RowNum   FileName                                        FileContent
0       0  16748.txt  DENNIS SCHWARTZ "Movie Reviews and Poetry"\nUN...
1       1  17108.txt  A brilliant, witty mock documentary of Jean Se...
2       2  17109.txt  NOSTALGHIA (director: Andrei Tarkovsky; cast: ...
3       3  17110.txt  PAYBACK (director: Brian Helgeland; cast:(Port...
4       4  17111.txt  WAKING NED DEVINE (director: Kirk Jones (III);...


## **Preprocess the documents and build the dictionary**

In [7]:
import gensim
import nltk
# Download the WordNet data through nltk (it reports "already up-to-date" when present).
nltk.download('wordnet')

# Run preprocess() over every document's text: one list of normalized tokens per document.
processed_docs = data['FileContent'].map(preprocess)
# Preview the token lists of the first five documents.
print(processed_docs[:5])

# Build the gensim Dictionary (token <-> integer id mapping) from the token lists.
dictionary = Dictionary(processed_docs)
# Prune the vocabulary: keep tokens that appear in at least 15 documents
# (no_below) and in no more than 50% of all documents (no_above), then keep
# at most the 100000 most frequent of those (keep_n).
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rbrul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    [denni, schwartz, movi, review, poetri, unmak,...
1    [brilliant, witti, mock, documentari, jean, se...
2    [nostalghia, director, andrei, tarkovski, cast...
3    [payback, director, brian, helgeland, cast, po...
4    [wake, devin, director, kirk, jone, cast, bann...
Name: FileContent, dtype: object


## **Build BagOfWords corpus from the document content**

In [9]:
# Turn each tokenized document into its bag-of-words form via the dictionary,
# then report how many documents were converted.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
print(len(bow_corpus))

180


## **Bag of Words test**

In [10]:
# Inspect the bag-of-words vector of the first document.
test_corpus = bow_corpus[0]
# Slice the first five (word id, count) pairs instead of iterating over the
# whole document and filtering on a counter.
for word_id, count in test_corpus[:5]:
    print("Word {} (\"{}\") appears {} times.".format(word_id, dictionary[word_id], count))

Word 0 ("accomplish") appears 1 times.
Word 1 ("actor") appears 1 times.
Word 2 ("america") appears 1 times.
Word 3 ("apart") appears 1 times.
Word 4 ("attract") appears 3 times.


## **Build LDA model With 10 Topics**

In [12]:
# Train a 10-topic LDA model on the bag-of-words corpus (2 passes, 2 workers).
# random_state pins the model's own random generator so re-running this cell
# does not depend on how much of NumPy's global random stream earlier cells
# consumed. NOTE(review): multi-worker training may still introduce small
# run-to-run differences — confirm if exact reproducibility is required.
lda_model = LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=2,
    random_state=2023,
)

# Print every topic (-1 = all topics) with its highest-weighted words.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} -> Words: {}'.format(idx, topic))

Topic: 0 -> Words: 0.014*"money" + 0.010*"school" + 0.009*"problem" + 0.008*"expect" + 0.007*"friend" + 0.007*"origin" + 0.007*"tri" + 0.007*"place" + 0.006*"role" + 0.006*"start"
Topic: 1 -> Words: 0.009*"paul" + 0.009*"artist" + 0.008*"see" + 0.008*"feel" + 0.007*"perform" + 0.007*"show" + 0.006*"mark" + 0.006*"need" + 0.005*"friend" + 0.005*"tri"
Topic: 2 -> Words: 0.011*"littl" + 0.009*"book" + 0.009*"screen" + 0.008*"long" + 0.008*"chang" + 0.008*"perform" + 0.007*"novel" + 0.007*"plot" + 0.006*"action" + 0.006*"john"
Topic: 3 -> Words: 0.010*"releas" + 0.010*"littl" + 0.009*"music" + 0.008*"audienc" + 0.007*"action" + 0.007*"produc" + 0.007*"money" + 0.006*"michael" + 0.006*"turn" + 0.005*"reason"
Topic: 4 -> Words: 0.007*"lover" + 0.007*"polit" + 0.007*"show" + 0.007*"romanc" + 0.007*"littl" + 0.006*"person" + 0.006*"singl" + 0.006*"home" + 0.006*"have" + 0.006*"murder"
Topic: 5 -> Words: 0.009*"world" + 0.008*"famili" + 0.008*"littl" + 0.007*"father" + 0.007*"best" + 0.006*"pro

## **Top 10 words for each topic, ordered by coherence score**

In [13]:
# top_topics() returns (top words, coherence) pairs sorted by coherence score,
# best first. The position in this list is therefore a coherence rank — it is
# NOT the topic id that print_topics() reports, so it is labelled as a rank.
top_topics = lda_model.top_topics(corpus=bow_corpus, topn=10)

for rank, (words, coherence) in enumerate(top_topics):
    print('Coherence rank: {} (coherence: {:.4f}) -> Top Words: {}'.format(rank, coherence, words))

Topic: 0 -> Top Words: [(0.009110279, 'jack'), (0.008815186, 'need'), (0.008435835, 'question'), (0.0076961187, 'tri'), (0.007628205, 'see'), (0.006812147, 'believ'), (0.006757227, 'real'), (0.006111999, 'action'), (0.0060606995, 'feel'), (0.006007181, 'turn')]
Topic: 1 -> Top Words: [(0.011611248, 'world'), (0.011028524, 'feel'), (0.008025016, 'talk'), (0.007964426, 'women'), (0.0076935138, 'role'), (0.0072711306, 'great'), (0.007047095, 'tri'), (0.0070455396, 'see'), (0.0065199705, 'music'), (0.0063639693, 'relationship')]
Topic: 2 -> Top Words: [(0.009827383, 'releas'), (0.009562484, 'littl'), (0.009049662, 'music'), (0.0076763663, 'audienc'), (0.0071927644, 'action'), (0.006645522, 'produc'), (0.0065885372, 'money'), (0.006281782, 'michael'), (0.0059787543, 'turn'), (0.0053524706, 'reason')]
Topic: 3 -> Top Words: [(0.011665505, 'action'), (0.0082615595, 'compani'), (0.007038567, 'screen'), (0.0066807275, 'better'), (0.0065927957, 'question'), (0.0065694875, 'begin'), (0.0061623203

## **Document -> Topic probabilities for all the documents in the corpus**

In [15]:
# Walk the dataframe columns and the corpus in lock-step. bow_corpus was built
# from the rows of `data` in order, so pairing by position is always aligned,
# whereas looking rows up by index label (data[...][i]) breaks as soon as a row
# has been dropped from `data` without resetting its index.
for row_num, file_name, corpus_item in zip(data['RowNum'], data['FileName'], bow_corpus):
    # lda_model[bow] gives the (topic id, probability) pairs for one document.
    print(row_num, '-', file_name, '->', lda_model[corpus_item])

0 - 16748.txt -> [(4, 0.6654528), (5, 0.26274422), (8, 0.066289455)]
1 - 17108.txt -> [(5, 0.17996767), (8, 0.19980069), (9, 0.61419517)]
2 - 17109.txt -> [(5, 0.9409828), (8, 0.055379685)]
3 - 17110.txt -> [(3, 0.61053795), (8, 0.018494712), (9, 0.36647862)]
4 - 17111.txt -> [(3, 0.21550938), (5, 0.7786917)]
5 - 17116.txt -> [(5, 0.34349066), (6, 0.48683968), (8, 0.12873161), (9, 0.03813343)]
6 - 17117.txt -> [(5, 0.1629894), (8, 0.61665076), (9, 0.21399423)]
7 - 17118.txt -> [(1, 0.514556), (8, 0.47719425)]
8 - 17119.txt -> [(9, 0.99552095)]
9 - 17139.txt -> [(5, 0.9939169)]
10 - 17144.txt -> [(5, 0.2878855), (6, 0.25209427), (8, 0.4563541)]
11 - 17145.txt -> [(1, 0.9903196)]
12 - 17146.txt -> [(5, 0.7485997), (8, 0.2471436)]
13 - 17147.txt -> [(1, 0.988747)]
14 - 17150.txt -> [(1, 0.76285326), (8, 0.18677412), (9, 0.04579607)]
15 - 17185.txt -> [(5, 0.025522908), (6, 0.11788307), (9, 0.8516276)]
16 - 17192.txt -> [(8, 0.99125963)]
17 - 17219.txt -> [(0, 0.35930455), (1, 0.42927274),