In [1]:
import sys

# lib
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [7]:
import pickle

# Load the pickled news dataset from a path relative to the notebook's working
# directory and bind it to `df`; a later cell reads its 'Content' column.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
# NOTE(review): `df` is presumably a pandas DataFrame -- confirm against the
# notebook that wrote the pickle.
with open('Pickles/News_dataset.pickle', 'rb') as data:
    df = pickle.load(data)
    

In [15]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
# Seed NumPy's global random generator so stochastic steps that draw from it
# start from the same state on every run.
np.random.seed(2018)

import nltk
# Fetch the WordNet corpus (skipped by NLTK when it is already up to date);
# presumably required by the WordNetLemmatizer used in lemmatize_stemming.
nltk.download('wordnet')
# Shared English Snowball stemmer, read as a global by lemmatize_stemming.
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the lemma.

    Args:
        text: The token to normalise.

    Returns:
        The result of passing the verb lemma of ``text`` through the
        module-level Snowball ``stemmer``.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Turn a raw document into a list of normalised tokens.

    The text is tokenised with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or have three characters or fewer are dropped, and each
    remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The document text to tokenise.

    Returns:
        A list of lemmatized and stemmed tokens, in document order.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if not (word in stopwords or len(word) <= 3)
    ]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Tokenise every article. Missing values are replaced with an empty string on
# the Series itself: the `.str` accessor has no `fillna` method, so the former
# `df['Content'].str.fillna('')` raised AttributeError before any document was
# processed.
processed_docs = df['Content'].fillna('').astype(str).map(preprocess)

# Build the token <-> id mapping from the tokenised documents.
dictionary = gensim.corpora.Dictionary(processed_docs)

# Prune the vocabulary: drop tokens that appear in fewer than 15 documents or in
# more than half of them, then keep at most the 100000 most frequent survivors.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words representation: one list of (token_id, count) pairs per document.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Fit a 10-topic LDA model with two passes over the corpus on two worker processes.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

# Print every topic (-1 requests all of them) with its highest-weighted words.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.087*"trump" + 0.046*"obama" + 0.038*"romney" + 0.027*"debat" + 0.027*"biden" + 0.018*"republican" + 0.017*"rubio" + 0.017*"poll" + 0.015*"cruz" + 0.015*"donald"
Topic: 1 
Words: 0.070*"trump" + 0.025*"coronavirus" + 0.022*"clinton" + 0.021*"lose" + 0.019*"elect" + 0.017*"debat" + 0.017*"democrat" + 0.013*"republican" + 0.012*"hillari" + 0.012*"polit"
Topic: 2 
Words: 0.081*"clinton" + 0.065*"trump" + 0.046*"hillari" + 0.038*"sander" + 0.026*"berni" + 0.021*"donald" + 0.019*"email" + 0.011*"democrat" + 0.011*"claim" + 0.010*"china"
Topic: 3 
Words: 0.048*"democrat" + 0.037*"opinion" + 0.032*"elect" + 0.029*"trump" + 0.029*"voter" + 0.022*"clinton" + 0.019*"hillari" + 0.019*"republican" + 0.017*"berni" + 0.016*"sander"
Topic: 4 
Words: 0.048*"coronavirus" + 0.033*"trump" + 0.015*"romney" + 0.015*"senat" + 0.014*"cruz" + 0.013*"democrat" + 0.012*"say" + 0.011*"time" + 0.010*"race" + 0.010*"china"
Topic: 5 
Words: 0.045*"obama" + 0.033*"campaign" + 0.032*"trump" + 0.022*

In [None]:
# NOTE(review): `X` is not assigned in any of the visible cells before this
# line, so on a fresh kernel this raises NameError. Presumably `X` should hold
# the raw document texts fed to the vectorizers below -- confirm and define it
# explicitly before converting it to an array.
X = np.array(X)

In [None]:
# Upper bound on the vocabulary size kept by both vectorizers.
no_features = 1000


def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's vocabulary terms as a list.

    `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2
    in favour of `get_feature_names_out()`. The newer method is preferred when
    it exists, and the old one is used as a fallback on older installations.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# TF-IDF features for NMF. Terms found in more than 95% of the documents or in
# fewer than 2 documents are ignored, as are English stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(X)
tfidf_feature_names = _vocabulary_terms(tfidf_vectorizer)

# LDA is a probabilistic graphical model over word counts, so it is fitted on
# raw term counts rather than on TF-IDF weights.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(X)
tf_feature_names = _vocabulary_terms(tf_vectorizer)

# Number of topics extracted by both models.
no_topics = 10

# Run NMF
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`. Those scale the penalty
# differently, so the value is left untouched here -- confirm the target
# scikit-learn version and re-tune before migrating.
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,
                                random_state=0).fit(tf)

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many top terms to print for each topic.

    For each topic a ``Topic <n>:`` header is printed, followed by one line of
    space-separated terms ordered from the largest weight downwards.
    """
    for number, weights in enumerate(model.components_):
        print("Topic %d:" % number)
        ascending = weights.argsort()
        # Walk backwards from the end to take the largest weights first.
        top_indices = ascending[:-no_top_words - 1:-1]
        top_terms = [feature_names[idx] for idx in top_indices]
        print(" ".join(top_terms))

# Number of terms printed per topic for both models.
no_top_words = 10
# NMF topics first, described with the TF-IDF vocabulary.
display_topics(nmf, tfidf_feature_names, no_top_words)
# Blank line separating the two listings.
print()
# Then the scikit-learn LDA topics, described with the raw-count vocabulary.
display_topics(lda, tf_feature_names, no_top_words)