In [10]:
#import relvant packages for conduct topic modeling analysis
import os
import pandas as pd
import numpy as np
from pandas import DataFrame
np.random.seed(2023)

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

In [11]:
from nltk.stem import*
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [12]:
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'MovieReviews' # Folder Name
file_names = PlaintextCorpusReader(corpus_root, '.*',encoding='latin-1')  # wildcard is read all files in the folder
file_names.fileids()  # Get the filenames

['16748.txt',
 '17108.txt',
 '17109.txt',
 '17110.txt',
 '17111.txt',
 '17116.txt',
 '17117.txt',
 '17118.txt',
 '17119.txt',
 '17139.txt',
 '17144.txt',
 '17145.txt',
 '17146.txt',
 '17147.txt',
 '17150.txt',
 '17185.txt',
 '17192.txt',
 '17219.txt',
 '17239.txt',
 '17243.txt',
 '17254.txt',
 '17255.txt',
 '17280.txt',
 '17300.txt',
 '17303.txt',
 '17341.txt',
 '17384.txt',
 '17391.txt',
 '17398.txt',
 '17399.txt',
 '17430.txt',
 '17431.txt',
 '17447.txt',
 '17457.txt',
 '17460.txt',
 '17501.txt',
 '17518.txt',
 '17532.txt',
 '17534.txt',
 '17578.txt',
 '17609.txt',
 '17610.txt',
 '17655.txt',
 '17662.txt',
 '17663.txt',
 '17695.txt',
 '17711.txt',
 '17713.txt',
 '17753.txt',
 '17757.txt',
 '17758.txt',
 '17761.txt',
 '17803.txt',
 '17811.txt',
 '17874.txt',
 '17879.txt',
 '17886.txt',
 '17896.txt',
 '17898.txt',
 '17902.txt',
 '17912.txt',
 '17933.txt',
 '17934.txt',
 '17945.txt',
 '17963.txt',
 '17971.txt',
 '17992.txt',
 '18004.txt',
 '18016.txt',
 '18032.txt',
 '18067.txt',
 '1806

In [13]:
path: str = 'C:/Users/rbrul/Documents/GitHub/KDD/Homework_8/MovieReviews'
# path: str = '/Users/jakebrulato/Documents/GitHub/KDD/Homework_8/MovieReviews'
file_names = os.listdir(path)

doc_contents: list = []
for i, file_name in zip(range(len(file_names)), file_names):
    with open(path + '/' + file_name, encoding="utf8", errors='ignore') as file:
        doc_contents.append((i, file_name, file.read()))

In [14]:
data: DataFrame = pd.DataFrame(doc_contents, columns=['RowNum', 'FileName', 'FileContent'])
data.dropna(subset=['FileContent'], inplace= True)
print(data.head(5))

   RowNum   FileName                                        FileContent
0       0  16748.txt  DENNIS SCHWARTZ "Movie Reviews and Poetry"\nUN...
1       1  17108.txt  A brilliant, witty mock documentary of Jean Se...
2       2  17109.txt  NOSTALGHIA (director: Andrei Tarkovsky; cast: ...
3       3  17110.txt  PAYBACK (director: Brian Helgeland; cast:(Port...
4       4  17111.txt  WAKING NED DEVINE (director: Kirk Jones (III);...


In [17]:
import gensim
import nltk
nltk.download('wordnet')

processed_docs = data['FileContent'].map(preprocess)
print(processed_docs[:5])

dictionary: Dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rbrul\AppData\Roaming\nltk_data...


0    [denni, schwartz, movi, review, poetri, unmak,...
1    [brilliant, witti, mock, documentari, jean, se...
2    [nostalghia, director, andrei, tarkovski, cast...
3    [payback, director, brian, helgeland, cast, po...
4    [wake, devin, director, kirk, jone, cast, bann...
Name: FileContent, dtype: object


In [18]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print(len(bow_corpus))

180


In [19]:
test_corpus = bow_corpus[0]
for i, word_stat in zip(range(len(test_corpus)), test_corpus):
    # print only the first 5 words
    if i < 5:
        print("Word {} (\"{}\") appears {} times.".format(word_stat[0], dictionary[word_stat[0]], word_stat[1]))

Word 0 ("accomplish") appears 1 times.
Word 1 ("actor") appears 1 times.
Word 2 ("america") appears 1 times.
Word 3 ("apart") appears 1 times.
Word 4 ("attract") appears 3 times.


In [20]:
lda_model = LdaMulticore(bow_corpus, num_topics=2, id2word=dictionary, passes=2, workers=2)

for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} -> Words: {}'.format(idx, topic))

Topic: 0 -> Words: 0.007*"littl" + 0.005*"tri" + 0.005*"best" + 0.005*"money" + 0.005*"music" + 0.005*"perform" + 0.005*"show" + 0.005*"plot" + 0.005*"believ" + 0.005*"need"
Topic: 1 -> Words: 0.007*"feel" + 0.007*"world" + 0.007*"littl" + 0.006*"see" + 0.006*"audienc" + 0.006*"father" + 0.005*"real" + 0.005*"role" + 0.005*"turn" + 0.005*"tri"


In [21]:
top_topics = lda_model.top_topics(corpus=bow_corpus, topn=10)

i = 0
for words, coherence in top_topics:
    print('Topic: {} -> Top Words: {}'.format(i, words))
    i += 1

Topic: 0 -> Top Words: [(0.007473582, 'feel'), (0.0072176526, 'world'), (0.006504633, 'littl'), (0.0060972637, 'see'), (0.0058726715, 'audienc'), (0.0055169645, 'father'), (0.005495724, 'real'), (0.0052900286, 'role'), (0.004981498, 'turn'), (0.0048618377, 'tri')]
Topic: 1 -> Top Words: [(0.0066210236, 'littl'), (0.005331485, 'tri'), (0.0053301197, 'best'), (0.005158095, 'money'), (0.00510272, 'music'), (0.0050485325, 'perform'), (0.0049529034, 'show'), (0.0049239616, 'plot'), (0.004871073, 'believ'), (0.0048304666, 'need')]


In [22]:
for i, corpus_item in zip(range(len(bow_corpus)), bow_corpus):
    print(data['RowNum'][i], '-', data['FileName'][i], '->', lda_model[corpus_item])

0 - 16748.txt -> [(0, 0.3905401), (1, 0.6094599)]
1 - 17108.txt -> [(0, 0.02214602), (1, 0.977854)]
2 - 17109.txt -> [(0, 0.040818673), (1, 0.95918137)]
3 - 17110.txt -> [(0, 0.3949776), (1, 0.6050224)]
4 - 17111.txt -> [(0, 0.02834988), (1, 0.9716501)]
5 - 17116.txt -> [(0, 0.022452053), (1, 0.97754794)]
6 - 17117.txt -> [(0, 0.2222657), (1, 0.7777343)]
7 - 17118.txt -> [(0, 0.89955723), (1, 0.10044281)]
8 - 17119.txt -> [(1, 0.9915718)]
9 - 17139.txt -> [(0, 0.022548487), (1, 0.97745156)]
10 - 17144.txt -> [(0, 0.23618843), (1, 0.7638116)]
11 - 17145.txt -> [(0, 0.988023), (1, 0.011976957)]
12 - 17146.txt -> [(0, 0.8724876), (1, 0.12751245)]
13 - 17147.txt -> [(0, 0.7837367), (1, 0.21626325)]
14 - 17150.txt -> [(0, 0.9675193), (1, 0.032480646)]
15 - 17185.txt -> [(0, 0.8499528), (1, 0.15004721)]
16 - 17192.txt -> [(1, 0.9904974)]
17 - 17219.txt -> [(0, 0.87924606), (1, 0.12075392)]
18 - 17239.txt -> [(0, 0.07096078), (1, 0.9290392)]
19 - 17243.txt -> [(0, 0.97024906), (1, 0.029750975