# Drill

Take the well-known 20 newsgroups dataset and use each of the methods on it. Your goal is to determine which method, if any, best reproduces the topics represented by the newsgroups. Write up a report where you evaluate each method in light of the 'ground truth'- the known source of each newsgroup post. Which works best, and why do you think this is the case?

### Import Statements

In [0]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### 20 Newsgroups Dataset

In [4]:
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups()

from pprint import pprint
pprint(list(newsgroups.target_names))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


### Generating the tf-idf Matrix

In [6]:
# Processing the data.

from sklearn.feature_extraction.text import TfidfVectorizer
import re

#reading in the data, this time in the form of paragraphs
#from sklearn.datasets import fetch_20newsgroups
#newsgroups = fetch_20newsgroups()

#processing
newsgroups_paras=[]
for paragraph in list(newsgroups.target_names):
   para=paragraph#[0]
   print ('para', para)
   #removing the double-dash from all words
   para=[re.sub(r'--','',word) for word in para]
   #Forming each paragraph into a string and adding it to the list of strings.
   newsgroups_paras.append(''.join(para))
#print newsgroups_paras
vectorizer = TfidfVectorizer(stop_words='english')
newsgroups_paras_tfidf=vectorizer.fit_transform(newsgroups_paras)
#print newsgroups_paras_tfidf


# Creating the tf-idf matrix.
vectorizer = TfidfVectorizer(stop_words='english')
newsgroups_paras_tfidf=vectorizer.fit_transform(newsgroups_paras)

# Getting the word list.
terms = vectorizer.get_feature_names()

# Number of topics.
ntopics=5

# Linking words to topics
def word_topic(tfidf, solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic=tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components=pd.DataFrame(words_by_topic,index=wordlist)
    
    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    index= np.repeat(n_topics, n_top_words, axis=0)
    topwords=pd.Series(index=index)
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords=components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen=sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist=chosen.index +" "+round(chosen,2).map(str) 
        topwords.loc[column]=chosenlist
    return(topwords)

# Number of words to look at for each topic.
n_top_words = 10

para alt.atheism
para comp.graphics
para comp.os.ms-windows.misc
para comp.sys.ibm.pc.hardware
para comp.sys.mac.hardware
para comp.windows.x
para misc.forsale
para rec.autos
para rec.motorcycles
para rec.sport.baseball
para rec.sport.hockey
para sci.crypt
para sci.electronics
para sci.med
para sci.space
para soc.religion.christian
para talk.politics.guns
para talk.politics.mideast
para talk.politics.misc
para talk.religion.misc


### Fitting the Three Topic Extraction Models

In [0]:
# LSA

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd= TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
newsgroups_paras_lsa = lsa.fit_transform(newsgroups_paras_tfidf)

components_lsa = word_topic(newsgroups_paras_tfidf, newsgroups_paras_lsa, terms)

topwords=pd.DataFrame()
topwords['LSA']=top_words(components_lsa, n_top_words)  

In [18]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_topics=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )

newsgroups_paras_lda = lda.fit_transform(newsgroups_paras_tfidf) 

components_lda = word_topic(newsgroups_paras_tfidf, newsgroups_paras_lda, terms)

topwords['LDA']=top_words(components_lda, n_top_words)

TypeError: ignored

In [0]:
# NNMF

from sklearn.decomposition import NMF

nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to stop even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
newsgroups_paras_nmf = nmf.fit_transform(newsgroups_paras_tfidf) 

components_nmf = word_topic(newsgroups_paras_tfidf, newsgroups_paras_nmf, terms)

topwords['NNMF']=top_words(components_nmf, n_top_words)

### Inspecting the Topics

In [21]:
for topic in range(ntopics):
    print('Topic {}:'.format(topic))
    print(topwords.loc[topic])

Topic 0:
   LSA NNMF
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
0  NaN  NaN
Topic 1:
   LSA NNMF
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
1  NaN  NaN
Topic 2:
   LSA NNMF
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
2  NaN  NaN
Topic 3:
   LSA NNMF
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
3  NaN  NaN
Topic 4:
   LSA NNMF
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
4  NaN  NaN
