In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
import os, glob
import pandas as pd
from collections import defaultdict
from pathlib import Path
import pandas as df
import pyLDAvis
import pyLDAvis.gensim

nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/raulbag/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/raulbag/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
files_folder = f'./full_text/cleaned/cleaned_text/'

results = defaultdict(list)
for file in Path(files_folder).glob('**/*.txt'):
    with open(file, "r") as file_open:
        results["file_name"].append(file.name)
        results["text"].append(file_open.read())
df = pd.DataFrame(results)

data_text = df[['text']]
data_text['index'] = data_text.index
documents = data_text

In [5]:
stemmer = SnowballStemmer(language='english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
doc_sample = documents[documents['index'] == 10].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['', 'testing', 'continuous', 'time', 'models', 'financial', 'markets', 'a', 'zur', 'erlangung', 'des', 'akademischen', 'grades', 'doctor', 'rerum', 'politicarum', 'rer', 'pol', 'fach', 'statistik', 'und', 'okonometrie', 'eingereicht', 'der', 'wirtschaftswissenschaftlichen', 'fakult', 'humboldt', 'universit', 'berlin', 'von', 'herrn', 'dipl', 'math', 'torsten', 'kleinow', 'geborem', 'potsdam', 'asident', 'der', 'humboldt', 'universit', 'berlin', 'prof', 'urgen', 'mlynek', 'dekan', 'der', 'wirtschaftswissenschaftlichen', 'fakult', 'prof', 'burda', 'gutachter', 'prof', 'wolfgang', 'ardle', 'priv', 'doz', 'helmut', 'herwartz', 'eingereicht', 'mai', 'tag', 'der', 'undlichen', 'ufung', 'juli', 'abstract', 'the', 'aim', 'the', 'thesis', 'provide', 'wide', 'range', 'statistical', 'methods', 'designed', 'test', 'parametric', 'assumptions', 'about', 'the', 'evolution', 'continuous', 'time', 'processes', 'nancial', 'markets', 'the', 'main', 'focus', 'the', 'statistical', 'met

['test', 'continu', 'time', 'model', 'financi', 'market', 'erlangung', 'akademischen', 'grade', 'doctor', 'rerum', 'politicarum', 'fach', 'statistik', 'okonometri', 'eingereicht', 'fakult', 'humboldt', 'universit', 'berlin', 'herrn', 'dipl', 'math', 'torsten', 'kleinow', 'geborem', 'potsdam', 'asid', 'humboldt', 'universit', 'berlin', 'prof', 'urgen', 'mlynek', 'dekan', 'fakult', 'prof', 'burda', 'gutacht', 'prof', 'wolfgang', 'ardl', 'priv', 'helmut', 'herwartz', 'eingereicht', 'undlichen', 'ufung', 'juli', 'abstract', 'thesi', 'provid', 'wide', 'rang', 'statist', 'method', 'design', 'test', 'parametr', 'assumpt', 'evolut', 'continu', 'time', 'process', 'nancial', 'market', 'main', 'focus', 'statist', 'methodolog', 'investig', 'properti', 'propos', 'method', 'appli', 'nite', 'sampl', 'aspect', 'particular', 'import', 'empir', 'applic', 'chapter', 'includ', 'empir', 'analysi', 'nancial', 'data', 'develop', 'method', 'keyword', 'mathemat', 'financ', 'statist', 'test', 'usion', 'process'

In [7]:
processed_docs = documents['text'].map(preprocess)
processed_docs[:10]

0    [adapt, method, risk, calibr, dissert, erlangu...
1    [dynam, cluster, visual, smart, data, applic, ...
2    [essay, learn, statist, implement, statist, so...
3    [model, financi, social, network, erlangung, a...
4    [rtat, erlangung, akademischen, grade, doctor,...
5    [statist, digit, financ, erlangung, akademisch...
6    [function, data, analysi, applic, financ, diss...
7    [weather, risk, manag, bond, weather, deriv, e...
8    [valuat, properti, econom, model, real, estat,...
9    [tail, event, drive, financi, risk, model, erl...
Name: text, dtype: object

In [8]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abbrevi
1 abil
2 abl
3 absolut
4 abus
5 accept
6 access
7 accord
8 account
9 accumu
10 accumul


In [9]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(1, 2),
 (2, 6),
 (3, 1),
 (7, 9),
 (8, 2),
 (10, 1),
 (11, 1),
 (12, 3),
 (13, 4),
 (14, 2),
 (23, 9),
 (24, 1),
 (26, 12),
 (27, 2),
 (29, 7),
 (32, 1),
 (34, 4),
 (37, 2),
 (45, 1),
 (48, 1),
 (49, 8),
 (52, 3),
 (53, 22),
 (58, 2),
 (59, 64),
 (61, 1),
 (62, 2),
 (65, 1),
 (66, 1),
 (67, 5),
 (70, 11),
 (71, 17),
 (72, 82),
 (73, 26),
 (76, 23),
 (77, 4),
 (78, 54),
 (80, 13),
 (81, 8),
 (85, 23),
 (86, 2),
 (88, 2),
 (90, 5),
 (91, 7),
 (92, 4),
 (96, 1),
 (99, 1),
 (100, 3),
 (101, 22),
 (103, 4),
 (104, 35),
 (105, 43),
 (110, 72),
 (112, 1),
 (123, 20),
 (124, 7),
 (125, 8),
 (126, 15),
 (127, 3),
 (132, 2),
 (134, 28),
 (137, 45),
 (138, 5),
 (140, 60),
 (141, 2),
 (142, 6),
 (145, 1),
 (146, 1),
 (147, 5),
 (153, 1),
 (155, 1),
 (157, 10),
 (158, 2),
 (160, 4),
 (161, 5),
 (162, 1),
 (163, 13),
 (164, 1),
 (168, 3),
 (170, 5),
 (171, 3),
 (173, 7),
 (174, 1),
 (176, 8),
 (182, 56),
 (183, 16),
 (184, 5),
 (186, 1),
 (189, 2),
 (190, 31),
 (192, 6),
 (195, 2),
 (198, 17),
 (2

In [10]:
bow_doc_10 = bow_corpus[10]
for i in range(len(bow_doc_10)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_10[i][0], 
                                               dictionary[bow_doc_10[i][0]], 
bow_doc_10[i][1]))

Word 1 ("abil") appears 2 time.
Word 2 ("abl") appears 6 time.
Word 3 ("absolut") appears 1 time.
Word 7 ("accord") appears 9 time.
Word 8 ("account") appears 2 time.
Word 10 ("accumul") appears 1 time.
Word 11 ("accur") appears 1 time.
Word 12 ("accuraci") appears 3 time.
Word 13 ("achiev") appears 4 time.
Word 14 ("acknowledg") appears 2 time.
Word 23 ("adapt") appears 9 time.
Word 24 ("add") appears 1 time.
Word 26 ("addit") appears 12 time.
Word 27 ("address") appears 2 time.
Word 29 ("adjust") appears 7 time.
Word 32 ("adopt") appears 1 time.
Word 34 ("advantag") appears 4 time.
Word 37 ("advisor") appears 2 time.
Word 45 ("akademischen") appears 1 time.
Word 48 ("algebra") appears 1 time.
Word 49 ("algorithm") appears 8 time.
Word 52 ("allow") appears 3 time.
Word 53 ("altern") appears 22 time.
Word 58 ("analys") appears 2 time.
Word 59 ("analysi") appears 64 time.
Word 61 ("analyt") appears 1 time.
Word 62 ("analyz") appears 2 time.
Word 65 ("andigkeitserkl") appears 1 time.
Wor

In [11]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.003248349864707201),
 (1, 0.008316246912457506),
 (2, 0.0011769728615134876),
 (3, 0.00041193825800108413),
 (4, 0.004212435475679623),
 (5, 0.006490338899663914),
 (6, 0.0009985136768713714),
 (8, 0.0002496515575077928),
 (9, 0.008300045172605349),
 (10, 0.010035812419483505),
 (11, 0.00016813898021621253),
 (12, 0.0031936688315450357),
 (13, 0.0016813898021621253),
 (14, 0.00030370962049230824),
 (15, 0.0068440092204679615),
 (16, 0.0022284844527887928),
 (17, 0.0002496515575077928),
 (18, 0.005992282788765764),
 (19, 0.00021236410522117952),
 (20, 0.008300045172605349),
 (21, 0.008424870951359246),
 (22, 0.008424870951359246),
 (23, 0.029617143783722603),
 (24, 0.0006043095204351917),
 (25, 0.007369040809852362),
 (27, 0.0001248257787538964),
 (28, 0.016600090345210698),
 (29, 0.0002471629548006505),
 (30, 0.1826009937973177),
 (31, 0.0018069524097712188),
 (32, 0.0042301666430463415),
 (33, 0.13280072276168559),
 (34, 0.00016477530320043365),
 (35, 0.0021149099909579107),
 (

In [15]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [16]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.014*"model" + 0.011*"estim" + 0.008*"data" + 0.007*"price" + 0.006*"method" + 0.006*"function" + 0.006*"market" + 0.005*"time" + 0.005*"figur" + 0.005*"client"
Topic: 1 
Words: 0.011*"risk" + 0.009*"model" + 0.009*"estim" + 0.007*"data" + 0.007*"market" + 0.007*"return" + 0.006*"result" + 0.006*"portfolio" + 0.005*"time" + 0.005*"valu"
Topic: 2 
Words: 0.015*"model" + 0.011*"data" + 0.009*"estim" + 0.008*"time" + 0.007*"risk" + 0.006*"price" + 0.005*"figur" + 0.005*"function" + 0.005*"valu" + 0.005*"tabl"
Topic: 3 
Words: 0.011*"price" + 0.010*"model" + 0.009*"market" + 0.008*"data" + 0.007*"valu" + 0.006*"estim" + 0.006*"time" + 0.005*"research" + 0.005*"function" + 0.005*"figur"
Topic: 4 
Words: 0.015*"model" + 0.013*"estim" + 0.010*"time" + 0.008*"risk" + 0.007*"data" + 0.007*"function" + 0.007*"market" + 0.006*"valu" + 0.006*"figur" + 0.005*"factor"


In [19]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=3, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.000*"mortal" + 0.000*"prune" + 0.000*"wind" + 0.000*"temperatur" + 0.000*"lemma" + 0.000*"logn" + 0.000*"diag" + 0.000*"usion" + 0.000*"copula" + 0.000*"uencer"
Topic: 1 Word: 0.000*"crix" + 0.000*"bitcoin" + 0.000*"expectil" + 0.000*"crypto" + 0.000*"uplift" + 0.000*"cryptocurr" + 0.000*"differ" + 0.000*"lpxbhr" + 0.000*"phil" + 0.000*"effect"
Topic: 2 Word: 0.000*"expectil" + 0.000*"teda" + 0.000*"yaml" + 0.000*"temperatur" + 0.000*"vali" + 0.000*"surrog" + 0.000*"band" + 0.000*"gvsm" + 0.000*"client" + 0.000*"copula"


In [68]:
processed_docs[10]

['test',
 'continu',
 'time',
 'model',
 'financi',
 'market',
 'erlangung',
 'akademischen',
 'grade',
 'doctor',
 'rerum',
 'politicarum',
 'fach',
 'statistik',
 'okonometri',
 'eingereicht',
 'fakult',
 'humboldt',
 'universit',
 'berlin',
 'herrn',
 'dipl',
 'math',
 'torsten',
 'kleinow',
 'geborem',
 'potsdam',
 'asid',
 'humboldt',
 'universit',
 'berlin',
 'prof',
 'urgen',
 'mlynek',
 'dekan',
 'fakult',
 'prof',
 'burda',
 'gutacht',
 'prof',
 'wolfgang',
 'ardl',
 'priv',
 'helmut',
 'herwartz',
 'eingereicht',
 'undlichen',
 'ufung',
 'juli',
 'abstract',
 'thesi',
 'provid',
 'wide',
 'rang',
 'statist',
 'method',
 'design',
 'test',
 'parametr',
 'assumpt',
 'evolut',
 'continu',
 'time',
 'process',
 'nancial',
 'market',
 'main',
 'focus',
 'statist',
 'methodolog',
 'investig',
 'properti',
 'propos',
 'method',
 'appli',
 'nite',
 'sampl',
 'aspect',
 'particular',
 'import',
 'empir',
 'applic',
 'chapter',
 'includ',
 'empir',
 'analysi',
 'nancial',
 'data',
 'de

In [17]:
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
p

In [20]:
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim.prepare(lda_model_tfidf, corpus_tfidf, dictionary)
p