In [1]:
import pandas as pd
import re
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel


In [2]:
newsgroups_train = fetch_20newsgroups(subset="train")

df = pd.DataFrame({'post':newsgroups_train['data'],'target': newsgroups_train['target']})
df['target_names'] = df['target'].apply(lambda t: newsgroups_train)
df.head()

Unnamed: 0,post,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,{'data': ['From: lerxst@wam.umd.edu (where's m...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,{'data': ['From: lerxst@wam.umd.edu (where's m...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,{'data': ['From: lerxst@wam.umd.edu (where's m...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,{'data': ['From: lerxst@wam.umd.edu (where's m...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,{'data': ['From: lerxst@wam.umd.edu (where's m...


In [3]:
def remove_urls(text):
  url_pattern = re.compile(r'https?://\s+|www\.\s+')
  return url_pattern.sub(r'',text)

def remove_html(text):
  html_pattern = re.compile('')
  return html_pattern.sub(r'',text)

def remove_emails(text):
  email_pattern = re.compile(r'\S+@\S+')
  return email_pattern.sub(r'', text)


def remove_non_alpha(text):
  return re.sub("[^A-Z a-z]+", " ", str(text))

def preprocess_text(text):
  t= remove_urls(text)
  t = remove_html(t)
  t = remove_emails(t)
  t = remove_non_alpha(t)
  return t

def lemmatize_words(text,lemmatizer):
  return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def remove_stopwords(text,stopwords):
  return " ".join([word for word in str(text).split() if word not in stopwords.words('english')])

df['post_preprocessed'] = df['post'].apply(preprocess_text).str.lower()

print("lemming.....")





lemming.....


In [4]:
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
df['post_final'] = df['post_preprocessed'].apply(lambda post: lemmatize_words)
df['post_final'] = df['post_final'].apply(lambda post: remove_stopwords(post, stopwords))


print('remove stopwords......')
df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


remove stopwords......


Unnamed: 0,post,target,target_names,post_preprocessed,post_final
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,{'data': ['From: lerxst@wam.umd.edu (where's m...,from where s my thing subject what car is ...,<function lemmatize_words 0x7f2630d31cf0>
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,{'data': ['From: lerxst@wam.umd.edu (where's m...,from guy kuo subject si clock poll final...,<function lemmatize_words 0x7f2630d31cf0>
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,{'data': ['From: lerxst@wam.umd.edu (where's m...,from thomas e willis subject pb questions ...,<function lemmatize_words 0x7f2630d31cf0>
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,{'data': ['From: lerxst@wam.umd.edu (where's m...,from joe green subject re weitek p orga...,<function lemmatize_words 0x7f2630d31cf0>
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,{'data': ['From: lerxst@wam.umd.edu (where's m...,from jonathan mcdowell subject re shuttle...,<function lemmatize_words 0x7f2630d31cf0>


In [6]:
#BOW
posts = [x.split(' ') for x in df['post_final']]
id2word = corpora.Dictionary(posts)
corpus_tf = [id2word.doc2bow(text) for text in posts]
print(corpus_tf[0])

[(0, 1), (1, 1), (2, 1)]


In [7]:
from gensim.models.ldamulticore import LdaMulticore

model = LdaMulticore(corpus=corpus_tf,id2word = id2word, num_topics = 20, alpha = 1, eta = 0.1, random_state = 10)

coherence = CoherenceModel(model = model, texts = posts, dictionary= id2word, coherence = 'u_mass')

print(coherence.get_coherence())
print(model.show_topics())



1.0000889005818408e-12
[(19, '0.378*"lemmatize_words" + 0.317*"<function" + 0.305*"0x7f2630d31cf0>"'), (18, '0.363*"0x7f2630d31cf0>" + 0.363*"lemmatize_words" + 0.273*"<function"'), (4, '0.403*"<function" + 0.300*"lemmatize_words" + 0.297*"0x7f2630d31cf0>"'), (12, '0.340*"<function" + 0.340*"lemmatize_words" + 0.320*"0x7f2630d31cf0>"'), (17, '0.362*"lemmatize_words" + 0.329*"<function" + 0.309*"0x7f2630d31cf0>"'), (3, '0.392*"<function" + 0.332*"0x7f2630d31cf0>" + 0.276*"lemmatize_words"'), (9, '0.373*"lemmatize_words" + 0.319*"0x7f2630d31cf0>" + 0.308*"<function"'), (13, '0.371*"0x7f2630d31cf0>" + 0.366*"lemmatize_words" + 0.264*"<function"'), (16, '0.367*"0x7f2630d31cf0>" + 0.365*"lemmatize_words" + 0.268*"<function"'), (10, '0.389*"0x7f2630d31cf0>" + 0.329*"<function" + 0.283*"lemmatize_words"')]


In [8]:
!pip install pyLDavis



In [10]:

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

vis_data = gensimvis.prepare(model, corpus_tf, id2word)
pyLDAvis.display(vis_data)

  and should_run_async(code)
