# Quick Topic Modeling


* Just one column of text
* Standard English stopwords


In [8]:
import pandas as pd
import numpy as np

from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from bs4 import BeautifulSoup

I found a great list of startup ideas on Hacker News. The entries are really short, interesting, and easy to read, so the vocabulary should be perfect for a topic model. [Here is the original link](https://unawaz.github.io/stochastic-hill-climbing/tasks/).

This list is such an interesting read that I might go through the rest of it in the future.
For example:
**_Assist in setting up or optimizing analytics tools for tracking visitors' behaviors._**

In [26]:
# Parse the saved HTML page and collect the text of every <li> found under its
# first <ol>; each list item becomes one document for the topic models.
# The file is opened in a `with` block so the handle is closed once parsing is
# done instead of being left open for the rest of the session.
with open('data/businessIdeas.html') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
documents = [item.text for item in soup.ol.find_all('li')]

In [20]:
def _vocabulary(vectorizer):
    """Return a fitted vectorizer's feature names on old and new scikit-learn.

    `get_feature_names` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out`, so the newer method is preferred and the older
    one is used only when the newer one is missing.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# NMF is fitted on TF-IDF weights. Terms that occur in more than 95% of the
# documents or in fewer than 2 documents are dropped, as are English stop words.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
feature_names = _vocabulary(tfidf_vectorizer)

# LDA is a probabilistic graphical model, so it is fitted on raw term counts
# rather than TF-IDF weights. The same document-frequency cut-offs are used.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = _vocabulary(tf_vectorizer)

# Both models are asked for the same number of topics.
n_topics = 20

# Run NMF
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W`/`alpha_H`, and the regularization scaling may differ; confirm the
# equivalent setting against the installed version before changing it.
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

In [25]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic's index followed by its highest-weighted terms.

    Args:
        model: Object exposing ``components_``, an iterable holding one weight
            vector per topic; each vector must support ``argsort()``.
        feature_names: Sequence mapping a feature index to its term.
        no_top_words: How many terms to print for each topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        print("Topic : %d" % topic_idx)
        # argsort() is ascending, so the reversed slice walks back from the
        # end and yields the indices of the largest weights first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join(feature_names[i] for i in top_indices))

# Print the top terms of every topic, first for the NMF model and then for the
# LDA model, each paired with the vocabulary of the vectorizer it was fitted on.
no_top_words = 10
for fitted_model, vocabulary in ((nmf, feature_names), (lda, tf_feature_names)):
    display_topics(fitted_model, vocabulary, no_top_words)

Topic : 0
design software computer engineering aided hardware specifications create cad use
Topic : 1
using tools hand power instruments install assemble measuring cut remove
Topic : 2
equipment test clean machinery use maintain operate operation repairs repair
Topic : 3
maintain records prepare reports required documentation inventory review production files
Topic : 4
work areas clean orders schedules performed assignments assign completed schedule
Topic : 5
activities supervise staff direct coordinate train workers personnel members plan
Topic : 6
ensure monitor compliance regulations standards safety inspect specifications conformance quality
Topic : 7
information provide customers services technical assistance obtain support regarding clients
Topic : 8
materials products select finished used production weigh processing raw measure
Topic : 9
systems electrical control test components energy install water electronic mechanical
Topic : 10
develop programs plans implement new strategie