# Topic Modeling Assignment

In [115]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [116]:
# Load the article table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
data = pd.read_pickle('data/articles.pkl')

In [117]:
# Inspect the column names of the loaded table.
data.columns

Index(['document_type', 'web_url', 'lead_paragraph', 'abstract', 'snippet',
       'news_desk', 'word_count', 'source', 'section_name', 'subsection_name',
       '_id', 'pub_date', 'print_page', 'headline', 'content'],
      dtype='object')

In [118]:
# Preview the first two rows.
data.head(2)

Unnamed: 0,document_type,web_url,lead_paragraph,abstract,snippet,news_desk,word_count,source,section_name,subsection_name,_id,pub_date,print_page,headline,content
0,article,http://www.nytimes.com/2013/10/03/sports/footb...,You would think that in a symmetric zero-sum s...,,You would think that in a symmetric zero-sum s...,Sports,347,The New York Times,Sports,Pro Football,524d4e3a38f0d8198974001f,2013-10-03T00:00:00Z,,Week 5 Probabilities: Why Offense Is More Impo...,the original goal building model football fore...
1,article,http://www.nytimes.com/2013/10/03/us/new-immig...,House Democrats on Wednesday unveiled an immig...,House Democrats unveil immigration bill that p...,House Democrats on Wednesday unveiled an immig...,National,83,The New York Times,U.S.,,524cf71338f0d8198973ff7b,2013-10-03T00:00:00Z,21.0,New Immigration Bill Put Forward,house unveiled immigration bill provides path ...


In [119]:
# Use the 'content' column as the corpus for topic modelling.
# NOTE(review): the head() preview suggests this text is already lower-cased
# and cleaned -- confirm against whatever produced the pickle.
docs = data['content']

In [161]:
# Bag-of-words document-term counts: English stop words removed, vocabulary
# capped at 5000 terms.
doc_vect = CountVectorizer(stop_words='english', max_features=5000)

# Densify in the same step so `doc_counts` is only ever bound to one kind of
# object (a 2-D NumPy array of counts).
doc_counts = doc_vect.fit_transform(docs).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
if hasattr(doc_vect, 'get_feature_names_out'):
    doc_features = np.asarray(doc_vect.get_feature_names_out())
else:
    doc_features = np.array(doc_vect.get_feature_names())

In [173]:
import numpy as np

class NMF(object):
    """Non-negative matrix factorisation by clipped alternating least squares.

    Approximates ``V`` by the product ``W . H`` where ``W`` has shape
    ``(V.shape[0], k)`` and ``H`` has shape ``(k, V.shape[1])``. Each
    iteration solves an ordinary least-squares problem for one factor while
    the other is held fixed, then sets any negative entries to zero.

    Parameters
    ----------
    V : array-like, 2-D
        Matrix to factorise.
    k : int
        Number of components (inner dimension of the factorisation).
    max_iterations : int
        Upper bound on the number of alternating updates performed by ``fit``.
    tol : float, optional
        ``fit`` stops early once the reconstruction error changes by less
        than this amount between consecutive iterations. Defaults to 0.001.
    random_state : int or None, optional
        Seed for the random initialisation of ``W`` and ``H``. When ``None``
        (the default) NumPy's global random state is used, so a prior
        ``np.random.seed(...)`` call still controls the initialisation.

    Raises
    ------
    ValueError
        If ``V`` is not two-dimensional or ``k`` is smaller than 1.
    """

    def __init__(self, V, k, max_iterations, tol=0.001, random_state=None):
        V = np.asarray(V)
        if V.ndim != 2:
            raise ValueError(
                'V must be a 2-D array, got %d dimension(s)' % V.ndim)
        if k < 1:
            raise ValueError('k must be at least 1, got %r' % (k,))

        self.V = V
        self.k = k
        self.max_iterations = max_iterations
        self.tol = tol

        # Fall back to the global RNG when no seed is given so existing
        # callers that rely on np.random.seed() keep working.
        if random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(random_state)
        self.W = rng.rand(self.V.shape[0], k)
        self.H = rng.rand(k, self.V.shape[1])

    def fit(self):
        """Run the alternating updates and return the factors.

        Returns
        -------
        (W, H) : tuple of numpy.ndarray
            The fitted factors, also stored on the instance.
        """
        # Starting from 0 means a first-iteration error below `tol` already
        # counts as converged.
        previous_cost = 0.0

        for _ in range(self.max_iterations):
            # H-step: least-squares solve of W . H ~= V with W fixed.
            # rcond=None selects NumPy's current default cutoff and avoids
            # the FutureWarning raised when the argument is omitted.
            self.H = np.linalg.lstsq(self.W, self.V, rcond=None)[0]
            self.H[self.H < 0] = 0

            # W-step: same solve on the transposed system H.T . W.T ~= V.T.
            W_transposed = np.linalg.lstsq(self.H.T, self.V.T, rcond=None)[0]
            W_transposed[W_transposed < 0] = 0
            self.W = W_transposed.T

            cost = self.score()
            if abs(cost - previous_cost) < self.tol:
                break
            previous_cost = cost

        return self.W, self.H

    def score(self):
        """Return the norm of ``V - W . H`` (``np.linalg.norm`` default)."""
        return np.linalg.norm(self.V - np.dot(self.W, self.H))

In [171]:
# Seed NumPy's global RNG so the random initialisation of W and H -- and
# therefore the topics printed afterwards -- is repeatable across runs.
np.random.seed(42)

# Factorise the document-term counts into 5 components, with at most 1000
# alternating updates.
t = NMF(doc_counts, 5, 1000)

# W has one row per document, H one row per topic (columns follow doc_features).
W, H = t.fit()



In [172]:
# argsort is ascending, so the last ten indices are the ten highest-weighted
# terms of each row of H.
for i, topic in enumerate(H):
    print('Topic ', i, ': ', ','.join(map(str, doc_features[np.argsort(topic)[-10:]])))

Topic  0 :  obama,senate,shutdown,party,care,law,health,government,house,republican
Topic  1 :  accidental,time,state,law,said,death,firearm,year,child,gun
Topic  2 :  state,netanyahu,obama,president,united,nuclear,rouhani,iran,said,mr
Topic  3 :  american,state,percent,like,work,people,company,year,new,said
Topic  4 :  league,play,yankee,time,player,year,team,said,season,game


## Built-in NMF (scikit-learn)

In [122]:
from sklearn.decomposition import NMF

In [138]:
# Import under an alias: the bare name `NMF` is also used by the hand-written
# class in this notebook, so which one `NMF` refers to would otherwise depend
# on the order the cells were last run.
from sklearn.decomposition import NMF as SklearnNMF

skl_NMF = SklearnNMF(n_components=5)

# fit_transform returns the per-document weights; components_ holds the
# per-topic term weights.
W = skl_NMF.fit_transform(doc_counts)
H = skl_NMF.components_

In [166]:
# Reconstruction error reported by the fitted scikit-learn model.
# NOTE(review): per the scikit-learn docs this is the Frobenius norm of
# X - WH under the default loss -- confirm for the installed version.
skl_NMF.reconstruction_err_

2060.017949528096

In [164]:
# argsort is ascending, so the last ten indices are the ten highest-weighted
# terms of each row of H.
for i, topic in enumerate(H):
    print('Topic ', i, ': ', ','.join(map(str, doc_features[np.argsort(topic)[-10:]])))

Topic  0 :  american,state,percent,like,work,people,company,year,new,said
Topic  1 :  league,yankee,play,time,player,year,team,said,season,game
Topic  2 :  president,senate,shutdown,party,care,law,health,government,house,republican
Topic  3 :  netanyahu,state,obama,president,nuclear,united,rouhani,iran,said,mr
Topic  4 :  accidental,time,state,law,said,death,firearm,year,child,gun
