In [1]:
import pandas as pd
import os
 
 
# Load the scraped reviews and dump one review per line into a plain-text file
# that the later cells read back as the corpus.
data = pd.read_csv('Negative imdb reviews.csv', encoding='utf-8')

# Open with 'w' (truncate) rather than 'a+' (append): in append mode every
# re-run of this cell added another full copy of the reviews to the file,
# silently duplicating documents in the corpus.
with open('Negative imdb reviews.txt', 'w', encoding='utf-8') as f:
    for line in data.values:
        # line[1] is the second CSV column -- presumably the review text;
        # TODO confirm against the CSV header.
        f.write(str(line[1]) + '\n')


In [2]:
# import and download necessary packages
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

# start reading the text file
# open 'Negative imdb reviews.txt' with the utf-8 encoding it was written with
# (the platform default encoding may differ), and let the context manager
# close the handle as soon as the lines have been read
with open('Negative imdb reviews.txt', 'r', encoding='utf-8') as infile:
    # read all lines in the file
    lines = infile.readlines()

# remove \n (and any surrounding whitespace) from each line
lines = [l.strip() for l in lines]

# remove empty lines
lines = [l for l in lines if l != ""]

# display the number of lines in the text file
# note: each line is treated as a sole document
len(lines)

[nltk_data] Downloading package punkt to /Users/kawanwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


601

In [3]:
# import necessary library
from sklearn.feature_extraction.text import CountVectorizer

# convert our text data to a DTM (document-term matrix of raw word counts):
# the vectorizer learns its vocabulary from `lines`, dropping scikit-learn's
# built-in English stop words, and returns one row of counts per line
tf_vectorizer = CountVectorizer(stop_words='english')
dtm_epl = tf_vectorizer.fit_transform(lines)

In [4]:
# convert our DTM to a dense numpy array and display it
# (the notebook prints only a truncated preview of the array)
dtm_epl.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
# import necessary library
from sklearn.decomposition import LatentDirichletAllocation

# train a latent dirichlet allocation model with number of topics = 10
# (random_state is fixed so repeated runs give the same topics)
lda = LatentDirichletAllocation(n_components=10, random_state=0)

# fit the dtm into the lda object
lda.fit(dtm_epl)

LatentDirichletAllocation(random_state=0)

In [6]:
# get the topic-word(term) association for the LDA object
# (iterated below as one row of term weights per topic)
topic_word_matrix = lda.components_

# number of highest-weighted words to list for each topic
no_top_words = 10

# vocabulary terms, indexed like the entries of each topic row.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use the new accessor when it exists and
# fall back to the old one on older installations.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

# never list more words than the vocabulary holds, so that every row and the
# column headers built below always have matching lengths
n_words_shown = min(no_top_words, len(feature_names))

# create a dataframe for displaying the results
rows = []

for topic_id, topic in enumerate(topic_word_matrix):
    # total weight of the topic, computed once per topic; dividing by it turns
    # each word's weight into its share of the topic
    topic_total = np.sum(topic)
    row = ["Topic #" + str(topic_id) + ":"]
    row += [
        feature_names[i] + "*" + str(np.round(topic[i] / topic_total, 4))
        # argsort is ascending, so reverse it to walk the heaviest words first
        for i in topic.argsort()[::-1][:n_words_shown]
    ]
    rows.append(row)

# headers are derived from the number of words actually listed, so changing
# no_top_words no longer requires editing a hand-written column list
topic_word_columns = ['Topic'] + [
    'Top %d Word*Prob' % rank for rank in range(1, n_words_shown + 1)
]
topic_word_df = pd.DataFrame(rows, columns=topic_word_columns)

topic_word_df

Unnamed: 0,Topic,Top 1 Word*Prob,Top 2 Word*Prob,Top 3 Word*Prob,Top 4 Word*Prob,Top 5 Word*Prob,Top 6 Word*Prob,Top 7 Word*Prob,Top 8 Word*Prob,Top 9 Word*Prob,Top 10 Word*Prob
0,Topic #0:,watch*0.0188,actors*0.0131,time*0.0123,like*0.0101,english*0.0095,series*0.0073,movie*0.0071,good*0.007,dont*0.0067,story*0.0067
1,Topic #1:,watching*0.0147,watch*0.0143,good*0.0123,series*0.0102,story*0.0095,acting*0.0094,episode*0.0078,tv*0.0067,games*0.0066,hype*0.0066
2,Topic #2:,series*0.0354,dont*0.0135,like*0.0125,watched*0.0124,watch*0.0119,episode*0.0088,korean*0.008,bad*0.0078,really*0.0077,im*0.0072
3,Topic #3:,acting*0.0203,like*0.014,better*0.0122,characters*0.0116,dont*0.0111,good*0.0099,vips*0.0095,time*0.0095,episode*0.0091,english*0.0086
4,Topic #4:,good*0.0154,bad*0.0149,like*0.0122,series*0.0122,characters*0.012,people*0.0112,watch*0.0105,end*0.0087,acting*0.0084,plot*0.0082
5,Topic #5:,acting*0.0119,didnt*0.0117,really*0.0108,vips*0.0106,like*0.0094,dont*0.0094,watch*0.0082,time*0.0066,people*0.0065,story*0.0058
6,Topic #6:,acting*0.0166,dont*0.0136,plot*0.0102,like*0.0099,series*0.0098,people*0.0087,watch*0.0084,better*0.0082,great*0.0081,games*0.0075
7,Topic #7:,good*0.0193,series*0.0188,really*0.0167,acting*0.0137,watch*0.0132,plot*0.0127,dont*0.0112,vips*0.0093,korean*0.0089,season*0.0081
8,Topic #8:,watching*0.0155,really*0.0135,episode*0.0131,episodes*0.0117,dont*0.0097,acting*0.0092,series*0.0091,good*0.0083,didnt*0.0076,like*0.0072
9,Topic #9:,watch*0.0117,series*0.0102,people*0.0088,movie*0.008,dont*0.008,episode*0.0074,good*0.0071,new*0.0065,like*0.0064,story*0.0062


In [7]:
# document-topic matrix: one row of topic weights per document
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2

# never list more topics than the model has, so that every row and the
# column headers built below always have matching lengths
n_topics_shown = min(no_top_topic, doc_topic_matrix.shape[1])

rows = []

# process each document
for doc_id, doc in enumerate(doc_topic_matrix):
    row = ['# %d' % doc_id]
    # get the top n topics, formatted as "Topic<id>*<weight rounded to 2 dp>"
    row += [
        'Topic' + str(i) + "*" + str(np.round(doc[i], 2))
        # argsort is ascending, so reverse it to walk the strongest topics first
        for i in doc.argsort()[::-1][:n_topics_shown]
    ]
    rows.append(row)

# headers are derived from the number of topics actually listed, so changing
# no_top_topic no longer requires editing a hand-written column list
doc_topic_columns = ['Document'] + [
    'Top%d Topic*Prob' % rank for rank in range(1, n_topics_shown + 1)
]
doc_topic_df = pd.DataFrame(rows, columns=doc_topic_columns)

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic2*0.95,Topic8*0.01
1,# 1,Topic2*0.96,Topic3*0.0
2,# 2,Topic2*0.99,Topic4*0.0
3,# 3,Topic8*0.98,Topic3*0.0
4,# 4,Topic8*0.96,Topic6*0.0
...,...,...,...
596,# 596,Topic2*0.98,Topic3*0.0
597,# 597,Topic1*0.95,Topic3*0.01
598,# 598,Topic3*0.76,Topic7*0.17
599,# 599,Topic4*0.7,Topic8*0.25


In [8]:
# Project every document onto the fitted topics (one row of weights per document).
doc_topic_matrix = lda.transform(dtm_epl)

# How many of the strongest topics to list for each document.
no_top_topic = 2

# One table row per document: its label followed by the names of its
# strongest topics (names only, no weights), strongest first.
rows = []
for doc_id, doc in enumerate(doc_topic_matrix):
    # argsort is ascending; reverse it and keep the leading entries
    strongest_first = doc.argsort()[::-1][:no_top_topic]
    row = ['# %d' % doc_id] + ['Topic%d' % i for i in strongest_first]
    rows.append(row)

# Column headers are kept exactly as in the probability table.
doc_topic_df = pd.DataFrame(
    rows,
    columns=['Document', 'Top1 Topic*Prob', 'Top2 Topic*Prob'],
)

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic2,Topic8
1,# 1,Topic2,Topic3
2,# 2,Topic2,Topic4
3,# 3,Topic8,Topic3
4,# 4,Topic8,Topic6
...,...,...,...
596,# 596,Topic2,Topic3
597,# 597,Topic1,Topic3
598,# 598,Topic3,Topic7
599,# 599,Topic4,Topic8


In [9]:
import pyLDAvis

# pyLDAvis 3.4 renamed its scikit-learn adapter module from `pyLDAvis.sklearn`
# to `pyLDAvis.lda_model`. Try the original name first so existing
# environments behave exactly as before, then fall back to the new name.
try:
    import pyLDAvis.sklearn as pyldavis_sklearn_adapter
except ImportError:
    import pyLDAvis.lda_model as pyldavis_sklearn_adapter

import pyLDAvis.gensim_models

# Build the visualisation payload from the fitted LDA model, the document-term
# matrix and the vectorizer that produced it; 'mmds' selects the scaling
# method used to lay the topics out.
# NOTE: this rebinds `data`, which earlier held the reviews DataFrame.
data = pyldavis_sklearn_adapter.prepare(lda, dtm_epl, tf_vectorizer, mds='mmds')

In [11]:
# show the prepared pyLDAvis visualisation as this cell's output
pyLDAvis.display(data)

In [10]:
# write the prepared pyLDAvis visualisation to 'imdb neg reviews.html'
pyLDAvis.save_html(data, 'imdb neg reviews.html')

  and should_run_async(code)
