In [2]:
import gensim
import nltk
import pandas as pd
from nltk.stem import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim import corpora, models

In [4]:
# Read the whole input document into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the file's encoding and pass it explicitly if it is known.
with open("text_english.txt", "r") as text_file:
    text = text_file.read()

In [5]:
# Split the raw document into sentences and hold them in a one-column frame.
sent = pd.DataFrame({"Sentence": sent_tokenize(text)})

# English Snowball stemmer shared by the token preprocessing step.
stemmer = SnowballStemmer(language="english")

In [6]:
def preprocessing(text):
    """Tokenize ``text`` and return the stemmed tokens worth keeping.

    The text is tokenized with ``gensim.utils.simple_preprocess``. A token is
    kept only when it is not in gensim's ``STOPWORDS`` collection and is
    longer than three characters; every kept token is passed through the
    module-level ``stemmer``.

    Returns:
        A list of stemmed tokens, in the order the tokenizer produced them.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    tokens = gensim.utils.simple_preprocess(text)
    return [
        stemmer.stem(word)
        for word in tokens
        if not (word in stopwords or len(word) <= 3)
    ]

# Run the preprocessing function over every sentence: one token list per row.
sentence_column = sent["Sentence"]
processed_sent = sentence_column.map(preprocessing)

In [7]:
# Display the preprocessed sentences (one list of stemmed tokens per row).
processed_sent

0     [propos, unifi, neural, network, architectur, ...
1     [versatil, achiev, tri, avoid, task, specif, e...
2     [instead, exploit, input, featur, care, optim,...
3     [work, basi, build, freeli, avail, tag, good, ...
4                              [keyword, neuralnetwork]
                            ...                        
89    [later, ando, zhang, reach, semi, supervis, ap...
90    [train, joint, linear, model, linear, model, a...
91                [perform, viterbi, decod, test, time]
92               [unlabel, corpus, word, taken, reuter]
93    [featur, includ, word, tag, suffix, prefix, ch...
Name: Sentence, Length: 94, dtype: object

In [8]:
# Build the token <-> integer-id vocabulary from the preprocessed sentences.
d = corpora.Dictionary(processed_sent)
# Prune the vocabulary in place using document-frequency thresholds
# (see the gensim ``Dictionary.filter_extremes`` documentation for the exact
# meaning of ``no_below`` / ``no_above`` / ``keep_n``).
d.filter_extremes(keep_n=100000, no_above=0.5, no_below=3)

# Convert every sentence to its bag-of-words form using the pruned vocabulary.
corpus = list(map(d.doc2bow, processed_sent))

In [10]:
# Display the whole bag-of-words corpus: one list of (id, count) pairs per
# sentence; sentences with no surviving tokens show up as empty lists.
# NOTE(review): this prints every document; a slice such as corpus[:5] would
# keep the rendered output short.
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)],
 [(6, 1),
  (14, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1)],
 [(13, 1), (27, 1)],
 [],
 [(5, 1), (8, 1), (20, 1), (28, 1), (29, 1), (30, 2)],
 [(20, 1), (29, 1)],
 [(24, 1), (31, 1)],
 [(5, 1), (8, 1), (19, 1), (24, 1), (31, 1), (32, 1)],
 [(1, 1),
  (2, 1),
  (4, 1),
  (7, 1),
  (10, 1),
  (11, 2),
  (12, 1),
  (13, 1),
  (31, 2),
  (32, 1),
  (33, 1)],
 [(20, 1), (27, 1), (29, 1), (30, 1), (34, 1)],
 [(5, 1), (8, 1), (35, 1), (36, 1), (37, 1)],
 [(36, 1), (38, 1)],
 [],
 [(39, 1)],
 [],
 [(40, 1)],
 [],
 [(41, 1)],
 [],
 [(42, 1)],
 [(38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1)],
 [(14, 1), (34, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1)],
 [(34, 1)],
 [(14, 1), (21, 1), (34, 1), (35, 1), (44, 1), (45, 1)],
 [(14, 1

In [13]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus=corpus)
# Indexing the fitted model with a corpus yields its TF-IDF weighted version.
corpus_tfidf = tfidf[corpus]

In [14]:
# Train a 10-topic LDA model on the raw bag-of-words corpus.
# LDA training is stochastic; random_state fixes the seed so that a
# Restart & Run All does not produce a different set of topics each time.
# NOTE(review): with workers > 1, small run-to-run differences may remain --
# confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=d, passes=5,
                                      workers=3, random_state=42)
# Print every topic (-1 = all topics) with its highest-weighted words.
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}\n')

Topic: 0 
Words: 0.088*"label" + 0.073*"chunk" + 0.068*"semant" + 0.068*"role" + 0.068*"speech" + 0.067*"inform" + 0.058*"tag" + 0.055*"benchmark" + 0.055*"entiti" + 0.055*"name"

Topic: 1 
Words: 0.082*"text" + 0.082*"test" + 0.082*"time" + 0.081*"perform" + 0.079*"structur" + 0.079*"data" + 0.056*"decod" + 0.056*"viterbi" + 0.030*"describ" + 0.030*"languag"

Topic: 2 
Words: 0.212*"section" + 0.114*"task" + 0.071*"train" + 0.066*"benchmark" + 0.048*"data" + 0.048*"test" + 0.047*"recognit" + 0.040*"chunk" + 0.025*"conll" + 0.025*"valid"

Topic: 3 
Words: 0.062*"perform" + 0.060*"bidirect" + 0.058*"approach" + 0.043*"word" + 0.041*"larg" + 0.041*"learn" + 0.041*"window" + 0.040*"infer" + 0.039*"knowledg" + 0.037*"featur"

Topic: 4 
Words: 0.080*"tag" + 0.067*"challeng" + 0.063*"includ" + 0.060*"chunk" + 0.053*"classifi" + 0.050*"conll" + 0.048*"word" + 0.046*"featur" + 0.034*"train" + 0.034*"prefix"

Topic: 5 
Words: 0.075*"report" + 0.071*"collobert" + 0.071*"kavukcuoglu" + 0.071*"wes

In [15]:
# Train a 10-topic LDA model on the TF-IDF weighted corpus.
# This rebinds `lda_model`, so the model trained on the plain bag-of-words
# corpus is no longer reachable under that name after this cell runs.
# LDA training is stochastic; random_state fixes the seed so that a
# Restart & Run All does not produce a different set of topics each time.
# NOTE(review): with workers > 1, small run-to-run differences may remain --
# confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=d, passes=5,
                                      workers=3, random_state=42)
# Print every topic (-1 = all topics) with its highest-weighted words.
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}\n')

Topic: 0 
Words: 0.081*"learn" + 0.050*"unlabel" + 0.049*"discov" + 0.046*"reach" + 0.042*"represent" + 0.041*"instead" + 0.040*"intern" + 0.038*"intermedi" + 0.032*"word" + 0.029*"bidirect"

Topic: 1 
Words: 0.097*"chunk" + 0.068*"conll" + 0.067*"word" + 0.060*"featur" + 0.058*"challeng" + 0.048*"best" + 0.047*"prefix" + 0.047*"score" + 0.043*"hand" + 0.032*"perform"

Topic: 2 
Words: 0.085*"kuksa" + 0.063*"research" + 0.063*"collobert" + 0.036*"infer" + 0.036*"window" + 0.036*"viterbi" + 0.034*"achiev" + 0.034*"bidirect" + 0.034*"decod" + 0.034*"text"

Topic: 3 
Words: 0.079*"natur" + 0.074*"languag" + 0.070*"perform" + 0.068*"benchmark" + 0.065*"improv" + 0.065*"specif" + 0.042*"standard" + 0.033*"represent" + 0.033*"hand" + 0.028*"inform"

Topic: 4 
Words: 0.077*"avoid" + 0.068*"knowledg" + 0.067*"specif" + 0.062*"engin" + 0.054*"inform" + 0.052*"task" + 0.046*"research" + 0.045*"larg" + 0.034*"represent" + 0.031*"report"

Topic: 5 
Words: 0.083*"time" + 0.074*"data" + 0.072*"base"