In [23]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Fetch the 20 Newsgroups posts and wrap the raw strings in a one-column frame
data = fetch_20newsgroups()
df = pd.DataFrame({'text': data.data})

# Keep the post at position 1 as the running example for the cells below
text = df['text'].iloc[1]

print(text)

From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.washington.edu

A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>



In [15]:
# Strip first block corresponding to email details

def stripheaders(text):
    """Drop the leading header block of a post.

    The header block is everything before the first blank line, i.e. before
    the first pair of consecutive newline characters.

    Args:
        text: The raw post as a single string.

    Returns:
        The post from the first blank line onward. The two separating
        newlines are kept, so the result starts with them. If the text
        contains no blank line, the whole text counts as headers and an
        empty string is returned.
    """
    # Header block: everything before the first double newline.
    dirt = text.split("\n\n", 1)[0]
    # Slice the prefix off instead of calling text.replace(dirt, ''):
    # replace() removes *every* occurrence, so a header string that shows up
    # again inside the body would be deleted from the body as well.
    return text[len(dirt):]

# Start the step-by-step walkthrough: remove the header block from the sample post
stepbystep = stripheaders(text)

print(stepbystep)



A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.

Guy Kuo <guykuo@u.washington.edu>



In [16]:
# Strip lines containing "@" (usually emails)

def stripcontains(text):
    """Return text without the lines that contain an "@" character.

    The text is cut into lines with str.splitlines() and the surviving lines
    are glued back together with a single newline between them, so a trailing
    line break of the input is not carried over.
    """
    kept = []
    for row in text.splitlines():
        if "@" in row:
            continue  # usually a line carrying an e-mail address
        kept.append(row)
    return "\n".join(kept)

# Feed the previous step's result through the "@"-line filter.
# stepbystep is rebound to its own transformed value, so re-running this cell
# applies the step to the already-filtered text.
stepbystep = stripcontains(stepbystep)

print(stepbystep)




A fair number of brave souls who upgraded their SI clock oscillator have
shared their experiences for this poll. Please send a brief message detailing
your experiences with the procedure. Top speed attained, CPU rated speed,
add on cards and adapters, heat sinks, hour of usage per day, floppy disk
functionality with 800 and 1.4 m floppies are especially requested.

I will be summarizing in the next two days, so please add to the network
knowledge base if you have done the clock upgrade and haven't answered this
poll. Thanks.



In [17]:
import string

# Lowercase and remove punctuation 

# Characters to delete: everything listed in string.punctuation
exclude = set(string.punctuation)

def low_punc(text):
    """Lowercase text and delete every character found in `exclude`.

    Whitespace, including line breaks, is left as it is.
    """
    kept = []
    for ch in text.lower():
        if ch in exclude:
            continue  # punctuation character: drop it
        kept.append(ch)
    return ''.join(kept)

# Next walkthrough step: lowercase the text and drop punctuation.
# stepbystep is rebound to its own transformed value.
stepbystep = low_punc(stepbystep)

print(stepbystep)



a fair number of brave souls who upgraded their si clock oscillator have
shared their experiences for this poll please send a brief message detailing
your experiences with the procedure top speed attained cpu rated speed
add on cards and adapters heat sinks hour of usage per day floppy disk
functionality with 800 and 14 m floppies are especially requested

i will be summarizing in the next two days so please add to the network
knowledge base if you have done the clock upgrade and havent answered this
poll thanks



In [18]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords 

# Remove Stopwords and Lemmatize

# Shared resources: the English stop-word set and a WordNet lemmatizer
stop = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

def stops_lemma(text):
    """Drop stop words from text and lemmatize the remaining tokens.

    The text is tokenised on whitespace; tokens found in `stop` are discarded
    and every other token is passed through `lemma.lemmatize`. The result is
    the processed tokens joined by single spaces.
    """
    kept = [token for token in text.split() if token not in stop]
    return " ".join(lemma.lemmatize(token) for token in kept)

# Last walkthrough step: remove stop words and lemmatize.
# The result is a single space-separated string (line breaks are gone because
# the tokens are re-joined with spaces).
stepbystep = stops_lemma(stepbystep)

print(stepbystep)

fair number brave soul upgraded si clock oscillator shared experience poll please send brief message detailing experience procedure top speed attained cpu rated speed add card adapter heat sink hour usage per day floppy disk functionality 800 14 floppy especially requested summarizing next two day please add network knowledge base done clock upgrade havent answered poll thanks


In [20]:
# Data Preprocessing all in one

# Module-level resources that clean() reads: stop-word set, punctuation set
# and lemmatizer. The same three assignments also appear in earlier cells.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

def clean(text):
    """Run the whole preprocessing pipeline on one raw post.

    Steps, in order: drop the header block (everything before the first blank
    line), drop every line containing "@", delete the characters in
    `exclude`, lowercase, remove the tokens found in `stop`, and pass each
    remaining token through `lemma.lemmatize`.

    Reads the module-level `exclude`, `stop` and `lemma` objects.

    Args:
        text: The raw post as a single string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Header block: everything before the first double newline.
    header = text.split("\n\n", 1)[0]
    # Slice the prefix off rather than calling text.replace(header, ''):
    # replace() deletes every occurrence, so a header string repeated in the
    # body would be removed from the body too.
    body = text[len(header):]
    # Remove lines containing e-mail addresses (any line with an "@").
    body = "\n".join(line for line in body.splitlines() if "@" not in line)
    # Remove punctuation.
    no_punct = ''.join(ch for ch in body if ch not in exclude)
    # NOTE(review): punctuation is removed before the stop-word filter, so a
    # contraction reaches the filter without its apostrophe (e.g. "havent")
    # and is kept unless `stop` happens to contain that spelling.
    # Local names deliberately differ from the imported `stopwords` module and
    # the sibling `stripheaders` function so neither is shadowed in here.
    tokens = [tok for tok in no_punct.lower().split() if tok not in stop]
    # Lemmatize.
    return " ".join(lemma.lemmatize(tok) for tok in tokens)

# Apply to all texts
# Each post is cleaned and then split on whitespace, giving one list of
# tokens per document.
clean_doc = [clean(doc).split() for doc in df.text]   

# Show the tokens of the sample post used in the walkthrough above
print(clean_doc[1])

['fair', 'number', 'brave', 'soul', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experience', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experience', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'card', 'adapter', 'heat', 'sink', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', '800', '14', 'floppy', 'especially', 'requested', 'summarizing', 'next', 'two', 'day', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks']


In [26]:
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
import pyLDAvis.gensim

# Create dictionary and corpus
# (token -> id mapping, then one bag-of-words vector per document)
dictionary = corpora.Dictionary(clean_doc)
corpus = [dictionary.doc2bow(doc) for doc in clean_doc]

# Train model
# LDA training is randomised; random_state is pinned so that a fresh
# "Restart & Run All" reproduces the same topics.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
    random_state=42,
)

# Original dataset topics
print(data.target_names)

# Graph topics via Multidimensional Scaling (Relevance metric optimal around λ = 0.3 for topic interpretability)
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(lda, corpus, dictionary)


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
