<h3>Data</h3>

https://www.kaggle.com/nzalake52/new-york-times-articles

<h3>Dependencies and dictionaries</h3>

In [24]:
# Standard library
import random
import pickle

# Third-party
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora

# Blank English pipeline; the helper functions only call it to split text
# into tokens. No trained model (e.g. 'en_core_web_sm') is loaded because
# nothing in the visible cells uses one.
parser = English()

# Fetch the NLTK resources used below: WordNet (for wn.morphy) and the
# English stop-word list (for en_stop).
nltk.download('wordnet')
nltk.download('stopwords')
# A set gives constant-time membership tests when filtering tokens.
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<h3>Helper functions</h3>

In [37]:
# Tokenizer
def tokenize(text):
    """Split ``text`` into normalised tokens using the module-level ``parser``.

    Whitespace-only tokens are dropped. URL-like tokens are replaced by the
    placeholder ``'URL'``, tokens starting with ``'@'`` by ``'SCREEN_NAME'``,
    and every other token is returned in lower case.

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def normalise(tok):
        # The URL check takes precedence over the '@' check.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

# Lemma lookup
def lemmatizer(word):
    """Return the base form of ``word`` as reported by ``wn.morphy``.

    When ``wn.morphy`` returns ``None`` the word is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

# Preprocessing for LDA
def pre_lda(text):
    """Turn raw ``text`` into the list of tokens fed to the LDA pipeline.

    The text is tokenized with ``tokenize``; tokens of four characters or
    fewer and tokens present in ``en_stop`` are discarded; each surviving
    token is passed through ``lemmatizer``.

    Note: the ``'URL'`` placeholder emitted by ``tokenize`` is three
    characters long, so it never survives the length filter.

    Returns:
        list of str.
    """
    return [
        lemmatizer(tok)
        for tok in tokenize(text)
        if len(tok) > 4 and tok not in en_stop
    ]

<h3>Creating the dictionary and corpus for LDA</h3>

In [58]:
# Sample roughly 1% of the lines in NYT.txt and preprocess only those.
# NOTE: no random seed is set in the visible cells, so the sample (and every
# result derived from it) differs from run to run.
text_data = []
with open('NYT.txt', encoding='utf-8') as f:
    for line in f:
        # Draw first, preprocess second: still exactly one random draw per
        # line, but pre_lda is skipped for the ~99% of lines that are dropped.
        if random.random() > .99:
            text_data.append(pre_lda(line))

# Build the token <-> integer id mapping from the sampled documents
dictionary = corpora.Dictionary(text_data)
# Convert each sampled document to bag-of-words form: (token_id, count) pairs
corpus = [dictionary.doc2bow(text) for text in text_data]
# Persist both artefacts; the context manager guarantees corpus.pkl is
# flushed and closed even if pickling fails.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

<h3>Topic modeling with LDA in Gensim</h3>

In [59]:
# Fit a 10-topic LDA model on the bag-of-words corpus (passes=15), save it
# to disk, then print each topic described by its 4 top words.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
)
ldamodel.save('model5.gensim')

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.004*"state" + 0.004*"trump" + 0.004*"people" + 0.004*"company"')
(1, '0.006*"school" + 0.005*"include" + 0.004*"company" + 0.004*"million"')
(2, '0.010*"would" + 0.004*"still" + 0.004*"first" + 0.004*"report"')
(3, '0.006*"trump" + 0.006*"state" + 0.005*"million" + 0.004*"would"')
(4, '0.007*"state" + 0.006*"school" + 0.005*"would" + 0.004*"american"')
(5, '0.004*"first" + 0.003*"country" + 0.003*"building" + 0.003*"include"')
(6, '0.005*"years" + 0.004*"include" + 0.004*"state" + 0.004*"could"')
(7, '0.006*"could" + 0.004*"first" + 0.004*"still" + 0.003*"going"')
(8, '0.006*"would" + 0.006*"trump" + 0.005*"state" + 0.004*"woman"')
(9, '0.006*"would" + 0.004*"could" + 0.004*"percent" + 0.004*"state"')


In [61]:
# Push an unseen text through the same preprocessing as the training data,
# map it onto the existing dictionary, and print the bag-of-words vector
# followed by the topic distribution the trained model assigns to it.
test_text = pre_lda("""
            Emmanuel Macron suggests building its own army to protect Europe against the U.S., China and Russia. 
            But it was Germany in World Wars One & Two - How did that work out for France?
            They were starting to learn German in Paris before the U.S. came along. Pay for NATO or not!
            """)
test_bow = dictionary.doc2bow(test_text)
print(test_bow)
print(ldamodel.get_document_topics(test_bow))

[(162, 1), (361, 1), (425, 1), (644, 1), (657, 1), (692, 1), (729, 1), (819, 1), (853, 1), (990, 1), (1417, 1), (1815, 1), (3116, 1), (3687, 1), (3828, 1)]
[(0, 0.2585358), (5, 0.5261629), (7, 0.17154017)]
