In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import os
 
 
# Export column index 1 of every CSV row to a plain-text file, one row per line.
data = pd.read_csv('Negative douban reviews.csv', encoding='utf-8')

# Open with 'w' (not 'a+') so that re-running this cell rewrites the file
# instead of appending another copy of every row to a previous run's output.
with open('Negative douban reviews.txt', 'w', encoding='utf-8') as f:
    # NOTE(review): column 1 is presumably the review text -- confirm against the CSV header.
    f.writelines(str(row[1]) + '\n' for row in data.values)


In [3]:
# import and download necessary packages
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

# Read 'Negative douban reviews.txt' back in; each non-empty line becomes one document.
# The cell that writes this file uses encoding='utf-8', so decode it the same way
# rather than relying on the platform default, and use `with` so the handle is closed.
with open('Negative douban reviews.txt', 'r', encoding='utf-8') as infile:
    # strip surrounding whitespace (including the trailing \n) from every line
    lines = [l.strip() for l in infile]

# remove empty lines
lines = [l for l in lines if l != ""]

# display the number of lines kept
# note: each line is treated as a sole document
len(lines)

[nltk_data] Downloading package punkt to /Users/kawanwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


114

In [4]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


In [8]:
# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words vectorizer that drops scikit-learn's built-in English stop words,
# then learn the vocabulary from `lines` and encode them as a document-term matrix (DTM).
tf_vectorizer = CountVectorizer(stop_words="english")
dtm_epl = tf_vectorizer.fit_transform(raw_documents=lines)

In [9]:
# Convert the document-term matrix to a dense NumPy array and display it
# (rows are documents, columns are vocabulary terms; the notebook shows a truncated view).
dtm_epl.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# import LatentDirichletAllocation
from sklearn.decomposition import LatentDirichletAllocation

# Configure an LDA model with 10 topics; the fixed random_state keeps reruns repeatable.
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=0,
)

# Fit the model on the document-term matrix (the fitted estimator is the cell's output).
lda.fit(X=dtm_epl)

LatentDirichletAllocation(random_state=0)

In [12]:
# Topic-word association of the fitted LDA model: one row per topic, one column per term.
topic_word_matrix = lda.components_

# Number of highest-weighted words to list for each topic.
no_top_words = 10

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

# Never ask for more words than the vocabulary holds, so rows and headers stay aligned.
n_shown = min(no_top_words, len(feature_names))

# Build one display row per topic: a label followed by "word*share" strings,
# where share is the word's weight divided by the topic's total weight.
rows = []
for topic_id, topic in enumerate(topic_word_matrix):
    topic_total = np.sum(topic)                # hoisted: identical for every word in the topic
    top_idx = topic.argsort()[::-1][:n_shown]  # term indices, largest weight first
    row = ["Topic #" + str(topic_id) + ":"]
    row += [
        feature_names[i] + "*" + str(np.round(topic[i] / topic_total, 4))
        for i in top_idx
    ]
    rows.append(row)

# Derive the headers from n_shown so changing no_top_words cannot break the DataFrame.
columns = ['Topic'] + ['Top %d Word*Prob' % rank for rank in range(1, n_shown + 1)]
topic_word_df = pd.DataFrame(rows, columns=columns)

topic_word_df



Unnamed: 0,Topic,Top 1 Word*Prob,Top 2 Word*Prob,Top 3 Word*Prob,Top 4 Word*Prob,Top 5 Word*Prob,Top 6 Word*Prob,Top 7 Word*Prob,Top 8 Word*Prob,Top 9 Word*Prob,Top 10 Word*Prob
0,Topic #0:,episode*0.015,theme*0.0101,like*0.0076,drama*0.0076,marketing*0.0076,say*0.0076,hot*0.0076,management*0.0076,dont*0.0076,people*0.0076
1,Topic #1:,plot*0.0143,old*0.0096,dont*0.0096,know*0.0073,time*0.0073,li*0.0073,bit*0.0073,drama*0.0073,good*0.0073,beginning*0.0073
2,Topic #2:,words*0.0201,number*0.0186,baby*0.0109,people*0.0109,games*0.0094,let*0.0063,say*0.0063,police*0.0048,li*0.0048,net*0.0048
3,Topic #3:,money*0.0132,people*0.0132,male*0.0116,dont*0.0116,seen*0.0099,time*0.0083,good*0.0083,poor*0.0067,little*0.0067,really*0.0067
4,Topic #4:,korea*0.0286,south*0.0236,dog*0.0219,legs*0.0186,oldamerican*0.0169,old*0.0119,people*0.0069,story*0.0069,let*0.0069,vip*0.0069
5,Topic #5:,people*0.0248,__*0.0143,good*0.0125,___*0.0125,film*0.0124,drama*0.0108,big*0.0108,dont*0.009,little*0.0072,story*0.0072
6,Topic #6:,people*0.0238,xun*0.016,black*0.0107,say*0.0107,hahahahahahahaha*0.0107,like*0.0081,man*0.0081,rope*0.0081,map*0.0081,korean*0.0055
7,Topic #7:,__*0.0172,brother*0.0123,dont*0.0123,people*0.0123,mother*0.0099,end*0.0075,person*0.0075,jiang*0.0075,drama*0.0075,male*0.0075
8,Topic #8:,people*0.0175,good*0.0155,man*0.0136,say*0.0136,dont*0.0136,male*0.0117,really*0.0117,want*0.0098,money*0.0098,16*0.0098
9,Topic #9:,dont*0.043,people*0.0246,believe*0.0185,think*0.0124,man*0.0093,say*0.0093,day*0.0078,life*0.0078,gambling*0.0078,worth*0.0078


In [13]:
# Document-topic matrix: topic distribution of every document under the fitted model.
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2

# Never ask for more topics than the model has, so rows and headers stay aligned.
n_shown = min(no_top_topic, doc_topic_matrix.shape[1])

# Build one display row per document: its id followed by "Topic<k>*<prob>" strings
# for its strongest topics, highest probability first.
rows = []
for doc_id, doc in enumerate(doc_topic_matrix):
    top_topics = doc.argsort()[::-1][:n_shown]
    row = ['# %d' % doc_id]
    row += ['Topic' + str(i) + "*" + str(np.round(doc[i], 2)) for i in top_topics]
    rows.append(row)

# Derive the headers from n_shown so changing no_top_topic cannot break the DataFrame.
columns = ['Document'] + ['Top%d Topic*Prob' % rank for rank in range(1, n_shown + 1)]
doc_topic_df = pd.DataFrame(rows, columns=columns)

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic3*0.98,Topic4*0.0
1,# 1,Topic5*0.97,Topic8*0.0
2,# 2,Topic2*0.95,Topic3*0.01
3,# 3,Topic4*0.97,Topic9*0.0
4,# 4,Topic9*0.97,Topic8*0.0
...,...,...,...
109,# 109,Topic3*0.98,Topic8*0.0
110,# 110,Topic9*0.97,Topic5*0.0
111,# 111,Topic4*0.97,Topic9*0.0
112,# 112,Topic4*0.97,Topic1*0.0


In [14]:
# Document-topic matrix: topic distribution of every document under the fitted model.
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2

# Never ask for more topics than the model has, so rows and headers stay aligned.
n_shown = min(no_top_topic, doc_topic_matrix.shape[1])

# Build one display row per document: its id followed by the labels of its
# strongest topics (labels only, no probabilities), highest probability first.
rows = []
for doc_id, doc in enumerate(doc_topic_matrix):
    top_topics = doc.argsort()[::-1][:n_shown]
    rows.append(['# %d' % doc_id] + ['Topic' + str(i) for i in top_topics])

# Derive the headers from n_shown so changing no_top_topic cannot break the DataFrame.
# NOTE(review): the "*Prob" suffix is kept so existing column lookups keep working,
# even though these columns hold topic labels only -- consider renaming.
columns = ['Document'] + ['Top%d Topic*Prob' % rank for rank in range(1, n_shown + 1)]
doc_topic_df = pd.DataFrame(rows, columns=columns)

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic3,Topic4
1,# 1,Topic5,Topic8
2,# 2,Topic2,Topic3
3,# 3,Topic4,Topic9
4,# 4,Topic9,Topic8
...,...,...,...
109,# 109,Topic3,Topic8
110,# 110,Topic9,Topic5
111,# 111,Topic4,Topic9
112,# 112,Topic4,Topic1


In [15]:
pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install pyldavis

Note: you may need to restart the kernel to use updated packages.


In [17]:
import pyLDAvis

# pyLDAvis 3.4.0 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new name first and fall back for older releases.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

import pyLDAvis.gensim_models

# Prepare the interactive visualisation from the fitted LDA model, the document-term
# matrix and the vectorizer; mds='mmds' selects metric MDS for the 2-D topic layout.
# NOTE(review): `data` is also the name of the DataFrame read from the CSV in an
# earlier cell and is overwritten here; later cells read the pyLDAvis object from it.
data = pyldavis_sklearn.prepare(lda, dtm_epl, tf_vectorizer, mds='mmds')

  default_term_info = default_term_info.sort_values(


In [18]:
# Render the prepared pyLDAvis visualisation inline as this cell's output.
pyLDAvis.display(data)

In [19]:
# Write the same visualisation to a standalone HTML file (path is relative to the
# current working directory).
pyLDAvis.save_html(data, 'neg douban review.html')