In [1]:
import pandas as pd
import os
 
 
data = pd.read_csv('Positive douban title.csv', encoding='utf-8')
with open('Positive douban title.txt','a+', encoding='utf-8') as f:
    for line in data.values:
        f.write((str(line[1])+'\n'))


In [7]:
# import and download necessary packages
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')

# start reading the text file
# establish a connection to the external text file 'epl.txt'
infile = open('Positive douban title.txt', 'r') 

# read all lines in epl.txt
lines = infile.readlines()

# remove \n at the end of each line 
lines = [l.strip() for l in lines]

# remove 'nan' at each line 
lines = [l.strip('nan') for l in lines]

# remove empty lines
lines = [l for l in lines if l != ""]

# display the number of lines in the text file
# note: each line is treated as a sole document
len(lines)

[nltk_data] Downloading package punkt to /Users/kawanwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


688

In [8]:
# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# convert our text data to a DTM
tf_vectorizer = CountVectorizer(stop_words='english')
dtm_epl = tf_vectorizer.fit_transform(lines)

In [9]:
# convert our DTM to a numpy array and display it
dtm_epl.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
# import LatentDirichletAllocation
from sklearn.decomposition import LatentDirichletAllocation

# train a latent dirichlet allocation model with number of topics = 10
lda = LatentDirichletAllocation(n_components=10, random_state=0)

# fit the dtm into the lda object
lda.fit(dtm_epl)

LatentDirichletAllocation(random_state=0)

In [11]:
# get the topic-word(term) association for the LDA object
topic_word_matrix = lda.components_

# retrieve top 10 n_top_words words for each topic
no_top_words = 10
feature_names = tf_vectorizer.get_feature_names()

# create a dataframe for displaying the results
rows = []

for topic_id, topic in enumerate(topic_word_matrix):
    row = ["Topic #" + str(topic_id) + ":"]
    row += [
        feature_names[i] + "*" + str(np.round(topic[i] / np.sum(topic), 4))
        for i in topic.argsort()[:-no_top_words - 1:-1]
    ]
    rows.append(row)

topic_word_df = pd.DataFrame(rows, columns=['Topic', 'Top 1 Word*Prob', 'Top 2 Word*Prob', \
                                            'Top 3 Word*Prob', 'Top 4 Word*Prob', 'Top 5 Word*Prob', 'Top 6 Word*Prob', 'Top 7 Word*Prob', 'Top 8 Word*Prob', 'Top 9 Word*Prob', 'Top 10 Word*Prob'])

topic_word_df



Unnamed: 0,Topic,Top 1 Word*Prob,Top 2 Word*Prob,Top 3 Word*Prob,Top 4 Word*Prob,Top 5 Word*Prob,Top 6 Word*Prob,Top 7 Word*Prob,Top 8 Word*Prob,Top 9 Word*Prob,Top 10 Word*Prob
0,Topic #0:,world*0.0233,police*0.0176,good*0.0147,big*0.0118,look*0.0089,456*0.0089,feelings*0.0089,escape*0.0089,plot*0.0089,brother*0.0089
1,Topic #1:,games*0.0468,people*0.0205,rules*0.0171,cruel*0.0171,chinese*0.0138,versio*0.0104,survival*0.0104,truth*0.0104,wards*0.0104,human*0.0073
2,Topic #2:,human*0.0739,nature*0.0499,humanity*0.0449,test*0.0176,drama*0.0127,world*0.0077,high*0.0077,evil*0.0077,film*0.0066,male*0.0057
3,Topic #3:,life*0.0731,good*0.0244,evil*0.0236,humanity*0.0235,dram*0.0207,like*0.0167,childlike*0.0148,long*0.0148,class*0.009,desire*0.009
4,Topic #4:,male*0.0226,say*0.0203,want*0.0197,good*0.0175,story*0.0175,korean*0.0146,people*0.0117,little*0.0117,evil*0.0117,world*0.0117
5,Topic #5:,episode*0.0178,episodes*0.0178,humanity*0.0178,people*0.0172,old*0.0149,man*0.0149,really*0.0144,hum*0.012,human*0.0098,points*0.0091
6,Topic #6:,drama*0.0209,worth*0.0163,spoiler*0.0136,want*0.011,seeing*0.011,praise*0.011,understand*0.011,dram*0.011,dont*0.0083,second*0.0083
7,Topic #7:,film*0.0265,man*0.0121,spit*0.0121,people*0.0092,hell*0.0092,far*0.0092,winner*0.0092,plugin*0.0092,life*0.0092,feel*0.0062
8,Topic #8:,good*0.0273,drama*0.0215,money*0.0187,best*0.0187,male*0.0125,thing*0.0125,comment*0.0125,lthough*0.0114,look*0.0111,death*0.0095
9,Topic #9:,korean*0.0346,drama*0.0259,escape*0.0186,view*0.0186,people*0.014,dram*0.014,reading*0.014,big*0.014,feeling*0.0117,end*0.0094


In [12]:
# document-topic matrix
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2
rows = []

# process each document
for doc_id, doc in enumerate(doc_topic_matrix):
    row = ['# %d' % doc_id]
    # get the top n topics
    row += [
        'Topic' + str(i) + "*" + str(np.round(doc[i], 2))
        for i in doc.argsort()[:-no_top_topic - 1:-1]
    ]
    rows.append(row)

doc_topic_df = pd.DataFrame(
    rows, columns=['Document', 'Top1 Topic*Prob', 'Top2 Topic*Prob'])

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic0*0.7,Topic2*0.22
1,# 1,Topic5*0.85,Topic3*0.02
2,# 2,Topic7*0.82,Topic3*0.02
3,# 3,Topic0*0.7,Topic8*0.03
4,# 4,Topic6*0.77,Topic3*0.03
...,...,...,...
683,# 683,Topic9*0.53,Topic8*0.27
684,# 684,Topic2*0.82,Topic4*0.02
685,# 685,Topic9*0.7,Topic8*0.03
686,# 686,Topic9*0.77,Topic5*0.03


In [13]:
# document-topic matrix
doc_topic_matrix = lda.transform(dtm_epl)

# specify the number of topics we're interested in
no_top_topic = 2
rows = []

# process each document
for doc_id, doc in enumerate(doc_topic_matrix):
    row = ['# %d' % doc_id]
    # get the top n topics
    row += [
        'Topic' + str(i)
        for i in doc.argsort()[:-no_top_topic - 1:-1]
    ]
    rows.append(row)

doc_topic_df = pd.DataFrame(
    rows, columns=['Document', 'Top1 Topic*Prob', 'Top2 Topic*Prob'])

doc_topic_df

Unnamed: 0,Document,Top1 Topic*Prob,Top2 Topic*Prob
0,# 0,Topic0,Topic2
1,# 1,Topic5,Topic3
2,# 2,Topic7,Topic3
3,# 3,Topic0,Topic8
4,# 4,Topic6,Topic3
...,...,...,...
683,# 683,Topic9,Topic8
684,# 684,Topic2,Topic4
685,# 685,Topic9,Topic8
686,# 686,Topic9,Topic5


In [14]:
import pyLDAvis
import pyLDAvis.sklearn

import pyLDAvis.gensim_models

data = pyLDAvis.sklearn.prepare(lda,dtm_epl,tf_vectorizer,mds='mmds')

  default_term_info = default_term_info.sort_values(


In [15]:
pyLDAvis.display(data)

In [19]:
pyLDAvis.save_html(data, 'postive douban title.html')