In [2]:
import pandas as pd
import os
 
 
data = pd.read_csv('Positive douban reviews.csv', encoding='utf-8')
with open('Positive douban reviews.txt','a+', encoding='utf-8') as f:
    for line in data.values:
        f.write((str(line[1])+'\n'))


In [3]:
# import and download necessary packages
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

# start reading the text file
# establish a connection to the external text file 'epl.txt'
infile = open('Positive douban reviews.txt', 'r') 

# read all lines in epl.txt
lines = infile.readlines()

# remove \n at the end of each line 
lines = [l.strip() for l in lines]

# remove str 'nan' at each line 
lines = [l.strip('nan') for l in lines]

# remove empty lines
lines = [l for l in lines if l != ""]

# display the number of lines in the text file
# note: each line is treated as a sole document
len(lines)

[nltk_data] Downloading package punkt to /Users/kawanwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


550

In [4]:
# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# convert our text data to a DTM
tf_vectorizer = CountVectorizer(stop_words='english')
dtm_epl = tf_vectorizer.fit_transform(lines)

In [5]:
# convert our DTM to a numpy array and display it
dtm_epl.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [6]:
# import LatentDirichletAllocation
from sklearn.decomposition import LatentDirichletAllocation

# train a latent dirichlet allocation model with number of topics = 10
lda = LatentDirichletAllocation(n_components=10, random_state=0)

# fit the dtm into the lda object
lda.fit(dtm_epl)

LatentDirichletAllocation(random_state=0)

In [7]:
# get the topic-word(term) association for the LDA object
topic_word_matrix = lda.components_

# retrieve top 10 n_top_words words for each topic
no_top_words = 10
feature_names = tf_vectorizer.get_feature_names()

# create a dataframe for displaying the results
rows = []

for topic_id, topic in enumerate(topic_word_matrix):
    row = ["Topic #" + str(topic_id) + ":"]
    row += [
        feature_names[i] + "*" + str(np.round(topic[i] / np.sum(topic), 4))
        for i in topic.argsort()[:-no_top_words - 1:-1]
    ]
    rows.append(row)

topic_word_df = pd.DataFrame(rows, columns=['Topic', 'Top 1 Word*Prob', 'Top 2 Word*Prob', \
                                            'Top 3 Word*Prob', 'Top 4 Word*Prob', 'Top 5 Word*Prob', 'Top 6 Word*Prob', 'Top 7 Word*Prob', 'Top 8 Word*Prob', 'Top 9 Word*Prob', 'Top 10 Word*Prob'])

topic_word_df



Unnamed: 0,Topic,Top 1 Word*Prob,Top 2 Word*Prob,Top 3 Word*Prob,Top 4 Word*Prob,Top 5 Word*Prob,Top 6 Word*Prob,Top 7 Word*Prob,Top 8 Word*Prob,Top 9 Word*Prob,Top 10 Word*Prob
0,Topic #0:,people*0.0145,rules*0.0098,male*0.0077,man*0.0074,film*0.0066,good*0.0059,life*0.0056,drama*0.0054,know*0.0053,dont*0.0053
1,Topic #1:,drama*0.0264,people*0.0185,story*0.0064,korean*0.0063,human*0.0057,good*0.0057,say*0.0053,said*0.0051,different*0.0051,wood*0.005
2,Topic #2:,people*0.0268,drama*0.0151,games*0.0142,korean*0.0069,time*0.0067,really*0.0066,want*0.0063,big*0.0055,group*0.0055,good*0.0054
3,Topic #3:,drama*0.0125,episode*0.0116,say*0.0076,people*0.0064,feel*0.0061,human*0.0056,old*0.0055,better*0.0054,korean*0.0048,man*0.0046
4,Topic #4:,potato*0.0172,drama*0.0153,pudding*0.0152,people*0.0096,korean*0.0092,really*0.0083,episode*0.0065,like*0.0062,say*0.0055,movie*0.0053
5,Topic #5:,people*0.0255,good*0.0095,games*0.0084,person*0.0079,wood*0.0072,drama*0.0065,want*0.0063,life*0.0063,money*0.0058,male*0.0057
6,Topic #6:,male*0.0133,good*0.0112,people*0.0095,big*0.009,characters*0.0089,man*0.008,drama*0.007,dont*0.0069,really*0.0064,like*0.0057
7,Topic #7:,drama*0.0212,people*0.0148,human*0.0088,simple*0.0073,like*0.0069,plot*0.0068,dont*0.0065,really*0.0065,games*0.0065,good*0.0063
8,Topic #8:,people*0.0168,life*0.0135,money*0.01,male*0.0095,drama*0.0087,death*0.0073,dont*0.0072,think*0.0064,play*0.0061,world*0.0057
9,Topic #9:,people*0.0176,drama*0.0132,old*0.0116,man*0.0104,male*0.0093,korean*0.0083,lot*0.0065,games*0.0062,good*0.0059,big*0.0057


In [8]:
# document-topic matrix
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2
rows = []

# process each document
for doc_id, doc in enumerate(doc_topic_matrix):
    row = ['# %d' % doc_id]
    # get the top n topics
    row += [
        'Topic' + str(i) + "*" + str(np.round(doc[i], 2))
        for i in doc.argsort()[:-no_top_topic - 1:-1]
    ]
    rows.append(row)

doc_topic_df = pd.DataFrame(
    rows, columns=['Document', 'Top1 Topic*Prob', 'Top2 Topic*Prob'])

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic5*0.97,Topic2*0.0
1,# 1,Topic2*0.97,Topic5*0.0
2,# 2,Topic8*0.98,Topic7*0.0
3,# 3,Topic7*0.97,Topic2*0.0
4,# 4,Topic1*0.97,Topic2*0.0
...,...,...,...
545,# 545,Topic7*0.98,Topic9*0.0
546,# 546,Topic7*0.97,Topic6*0.0
547,# 547,Topic1*0.97,Topic7*0.0
548,# 548,Topic0*0.97,Topic5*0.0


In [9]:
# document-topic matrix
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2
rows = []

# process each document
for doc_id, doc in enumerate(doc_topic_matrix):
    row = ['# %d' % doc_id]
    # get the top n topics
    row += [
        'Topic' + str(i)
        for i in doc.argsort()[:-no_top_topic - 1:-1]
    ]
    rows.append(row)

doc_topic_df = pd.DataFrame(
    rows, columns=['Document', 'Top1 Topic*Prob', 'Top2 Topic*Prob'])

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic5,Topic2
1,# 1,Topic2,Topic5
2,# 2,Topic8,Topic7
3,# 3,Topic7,Topic2
4,# 4,Topic1,Topic2
...,...,...,...
545,# 545,Topic7,Topic9
546,# 546,Topic7,Topic6
547,# 547,Topic1,Topic7
548,# 548,Topic0,Topic5


In [13]:
import pyLDAvis
import pyLDAvis.sklearn

import pyLDAvis.gensim_models

data = pyLDAvis.sklearn.prepare(lda,dtm_epl,tf_vectorizer,mds='mmds')

  default_term_info = default_term_info.sort_values(


In [14]:
pyLDAvis.display(data)

In [15]:
pyLDAvis.save_html(data, 'postive douban review.html')