In [1]:
import pandas as pd
import os
 
 
# Export the second CSV column (positional index 1) to a plain-text file,
# one row per line.
data = pd.read_csv('Negative douban title.csv', encoding='utf-8')

# Open with 'w' rather than 'a+': appending meant that every re-run of this
# cell added the whole column to the file again, silently duplicating documents.
with open('Negative douban title.txt', 'w', encoding='utf-8') as f:
    for line in data.values:
        f.write(str(line[1]) + '\n')


In [2]:
# import and download necessary packages
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

# Read the text file 'Negative douban title.txt'.
# The file is written as UTF-8 elsewhere in this notebook, so decode it
# explicitly instead of relying on the platform default encoding; the `with`
# block also guarantees the handle is closed once the lines are read.
with open('Negative douban title.txt', 'r', encoding='utf-8') as infile:
    # strip surrounding whitespace (including the trailing \n) from each line
    stripped_lines = [raw_line.strip() for raw_line in infile]

# drop empty lines
lines = [text for text in stripped_lines if text != ""]

# display the number of lines kept
# note: each line is treated as a sole document
len(lines)

[nltk_data] Downloading package punkt to /Users/kawanwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


375

In [3]:
# bag-of-words vectorizer from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from `lines` (English stop words removed) and build
# the document-term matrix (DTM) in a single pass.
tf_vectorizer = CountVectorizer(stop_words="english")
dtm_epl = tf_vectorizer.fit_transform(raw_documents=lines)

In [4]:
# Materialise the DTM as a dense numpy array and show it as the cell output.
dense_dtm = dtm_epl.toarray()
dense_dtm

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [5]:
# LDA topic model from scikit-learn
from sklearn.decomposition import LatentDirichletAllocation

# Build a 10-topic LDA model with a fixed seed and fit it on the DTM.
# `fit` returns the estimator itself, so construction and fitting can be chained.
lda = LatentDirichletAllocation(n_components=10, random_state=0).fit(dtm_epl)

# show the fitted estimator as the cell output
lda

LatentDirichletAllocation(random_state=0)

In [6]:
# get the topic-word(term) association for the LDA object
topic_word_matrix = lda.components_

# number of top words to list for each topic
no_top_words = 10

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and only fall back to the
# old method on releases that do not have the new one.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = tf_vectorizer.get_feature_names_out()
else:
    feature_names = tf_vectorizer.get_feature_names()

# never ask for more words than the vocabulary holds, so the rows and the
# column labels below always have matching lengths
n_words_shown = min(no_top_words, len(feature_names))

# create a dataframe for displaying the results
rows = []

for topic_id, topic in enumerate(topic_word_matrix):
    # normalising constant for this topic, computed once instead of per word
    topic_total = np.sum(topic)
    # indices of the highest-weighted words, largest first
    top_word_ids = topic.argsort()[::-1][:n_words_shown]

    row = ["Topic #" + str(topic_id) + ":"]
    row += [
        feature_names[i] + "*" + str(np.round(topic[i] / topic_total, 4))
        for i in top_word_ids
    ]
    rows.append(row)

# derive the column labels from the number of words shown instead of
# hard-coding ten of them, so changing no_top_words keeps the table valid
topic_word_columns = ['Topic'] + [
    'Top %d Word*Prob' % rank for rank in range(1, n_words_shown + 1)
]
topic_word_df = pd.DataFrame(rows, columns=topic_word_columns)

topic_word_df



Unnamed: 0,Topic,Top 1 Word*Prob,Top 2 Word*Prob,Top 3 Word*Prob,Top 4 Word*Prob,Top 5 Word*Prob,Top 6 Word*Prob,Top 7 Word*Prob,Top 8 Word*Prob,Top 9 Word*Prob,Top 10 Word*Prob
0,Topic #0:,people*0.0247,play*0.0218,set*0.0218,high*0.0218,boring*0.0189,point*0.0165,second*0.0165,man*0.0165,plot*0.0112,cool*0.0112
1,Topic #1:,dont*0.0529,bad*0.049,man*0.0221,good*0.0174,spoiled*0.0167,looking*0.0167,person*0.0113,mens*0.0113,___*0.0113,episode*0.0113
2,Topic #2:,people*0.0314,play*0.0108,super*0.0108,terrible*0.0108,write*0.0108,long*0.0108,number*0.0108,sorry*0.0108,screenwriters*0.0108,comments*0.0108
3,Topic #3:,people*0.0281,korea*0.0189,day*0.0143,wood*0.0143,problem*0.0143,boss*0.0097,money*0.0097,fan*0.0097,south*0.0097,behindthescenes*0.0097
4,Topic #4:,bug*0.0199,dont*0.0199,male*0.015,drama*0.015,ending*0.015,capital*0.015,liu*0.015,big*0.0102,evil*0.0102,story*0.0102
5,Topic #5:,think*0.0389,end*0.0115,disappointment*0.0115,mens*0.0115,really*0.0115,ugly*0.0115,thats*0.0115,gambling*0.0115,bigger*0.0115,worth*0.0115
6,Topic #6:,survival*0.0299,games*0.025,think*0.0201,kind*0.0152,money*0.0103,story*0.0103,make*0.0103,built*0.0103,dog*0.0103,man*0.0103
7,Topic #7:,dont*0.0286,talk*0.0239,man*0.0192,old*0.0192,humanity*0.0192,like*0.0192,korean*0.0099,looks*0.0099,vip*0.0098,spoiler*0.0098
8,Topic #8:,life*0.045,participate*0.0138,poor*0.0138,expected*0.0138,stay*0.0138,high*0.0138,boring*0.0118,boss*0.0094,let*0.0094,opening*0.0094
9,Topic #9:,good*0.0515,want*0.0351,dont*0.0265,write*0.0265,film*0.0135,review*0.0135,look*0.0135,rich*0.0135,people*0.0103,poor*0.0091


In [7]:
# Per-document topic weights: one row per document, one column per topic.
doc_topic_matrix = lda.transform(dtm_epl)

# how many of the strongest topics to report for each document
no_top_topic = 2
rows = []

# build one table row per document
for doc_id, doc in enumerate(doc_topic_matrix):
    # topic indices ordered from highest to lowest weight, truncated to the top n
    strongest_topics = np.argsort(doc)[::-1][:no_top_topic]
    row = ['# %d' % doc_id] + [
        'Topic%d*%s' % (topic_idx, np.round(doc[topic_idx], 2))
        for topic_idx in strongest_topics
    ]
    rows.append(row)

doc_topic_df = pd.DataFrame(
    rows,
    columns=['Document', 'Top1 Topic*Prob', 'Top2 Topic*Prob'],
)

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic2*0.77,Topic5*0.03
1,# 1,Topic7*0.9,Topic9*0.01
2,# 2,Topic0*0.85,Topic1*0.02
3,# 3,Topic3*0.92,Topic4*0.01
4,# 4,Topic5*0.55,Topic1*0.05
...,...,...,...
370,# 370,Topic2*0.89,Topic7*0.01
371,# 371,Topic5*0.7,Topic1*0.03
372,# 372,Topic2*0.55,Topic5*0.05
373,# 373,Topic5*0.85,Topic1*0.02


In [8]:
# document-topic matrix
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2

# never ask for more topics than the model has, so the rows and the column
# labels below always have matching lengths
n_topics_shown = min(no_top_topic, doc_topic_matrix.shape[1])
rows = []

# process each document
for doc_id, doc in enumerate(doc_topic_matrix):
    row = ['# %d' % doc_id]
    # get the top n topics (ids only, strongest first)
    row += [
        'Topic' + str(i)
        for i in doc.argsort()[::-1][:n_topics_shown]
    ]
    rows.append(row)

# This table lists topic ids without their probabilities, so the headers are
# 'TopN Topic' (the previous 'TopN Topic*Prob' labels described values that
# are not shown). Labels are generated so changing no_top_topic keeps working.
doc_topic_columns = ['Document'] + [
    'Top%d Topic' % rank for rank in range(1, n_topics_shown + 1)
]
doc_topic_df = pd.DataFrame(rows, columns=doc_topic_columns)

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic2,Topic5
1,# 1,Topic7,Topic9
2,# 2,Topic0,Topic1
3,# 3,Topic3,Topic4
4,# 4,Topic5,Topic1
...,...,...,...
370,# 370,Topic2,Topic7
371,# 371,Topic5,Topic1
372,# 372,Topic2,Topic5
373,# 373,Topic5,Topic1


In [9]:
import pyLDAvis

# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new module first and fall back to the old
# name so the cell runs on both old and new releases.
try:
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    import pyLDAvis.sklearn as pyldavis_sklearn

import pyLDAvis.gensim_models

# Build the visualization data from the fitted LDA model, the DTM and the
# vectorizer, using metric MDS ('mmds') for the inter-topic distance map.
data = pyldavis_sklearn.prepare(lda, dtm_epl, tf_vectorizer, mds='mmds')

  default_term_info = default_term_info.sort_values(


In [10]:
# Show the prepared pyLDAvis data (`data`) as this cell's output.
pyLDAvis.display(data)

In [12]:
# Write the prepared pyLDAvis data to an HTML file.
html_output_path = 'neg douban title.html'
pyLDAvis.save_html(data, html_output_path)