# Topic Model Machine
## William Harding <a href="mailto:william.jeffrey.harding@gmail.com"> Email  </a>
### A systematic way to quickly build and look at topical models. 
Topic modeling with real datasets requires a lot of cleaning and tweaking of the data to get valuable insights. Because the process is very iterative,  I built a process to move through models quickly and compare them to each other. 

In [166]:
import matplotlib 
get_ipython().magic(u'matplotlib inline')

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
#import pyLDAvis
##import pyLDAvis.sklearn

from __future__ import print_function
from time import time
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [203]:
n_samples = 2000
n_features = 10000
n_topics = 20
n_top_words = 100


Stopwords. You may need to run nltk.download() to get the corpus needed, including the defauilt stoplist. 

In [204]:
from nltk.corpus import stopwords
mystopwords = stopwords.words("english")   #default list from NTLK

#More stop words - once you run the document, you may find words that aren't helping your model
#go back to this step to add more words and run again.
more_stop_words = ['like','linkedin','twitter','facebook','would','think','google',
                   'nursing','patient','care','patients','financial']  
[mystopwords.append(x) for x in more_stop_words]


[None, None, None, None, None, None, None, None, None, None, None, None]

Load the documents into dataframe, they must be loaded in DICT like format. This step requires the most customized work, as the format of your input data will be  different. 
* End result is that your dataset is a dictionary with keys that you can match up later.  
* You will also need a dataframe that you can match the scores back up to.
* Data is cleaned at this point, I like to leave the origional text in tact, but that's up to you

In [218]:
#Using a list of documents from 'trusted news sources'
finance = pd.read_excel(r'C:\Users\v-wihar\OneDrive - METIA LTD\LinkedIn\LinkedIn_TopicModeling_SourceContent_111616.xlsx',
                       sheetname="Finance")
healthcare = pd.read_excel(r'C:\Users\v-wihar\OneDrive - METIA LTD\LinkedIn\LinkedIn_TopicModeling_SourceContent_111616.xlsx',
                          sheetname="Healthcare")

df = pd.concat([finance,healthcare])
docs = df['Content'].str.lower().replace({r'\n': ' ','[^&a-zA-Z]':' ','\s+': ' '}, regex=True).tolist()

corpus = {} 
for i in range(len(docs)):
    corpus[i] = docs[i]
#print(df.index[4], '\n',corpus[4])   #just printing out one for an example


Default functions and globals that you will need for your model. 

In [219]:
data_samples = corpus.values()
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                lowercase= True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b',
                                max_features=n_features,
                                stop_words=mystopwords,
                                max_df=0.5,
                                min_df=10)

tf = tf_vectorizer.fit_transform(data_samples)

lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0
                                )
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()

#bunch of functions that I wrote to build the tables for the model:
def get_single_topic(lda, tf_feature_names, n_top_words, topic):
    words = [tf_feature_names[i] for i in lda.components_[topic].argsort()[:-n_top_words - 1:-1]]
    scores = lda.components_[topic][lda.components_[topic].argsort()[:-n_top_words - 1:-1]]
    df = pd.DataFrame(index=words,columns=['topic_{}'.format(topic)],data=scores)
    return df

def get_all_topics(lda, tf_feature_names, n_top_words,n_topics):
    df = pd.DataFrame()
    for topic in range(n_topics):
        tmpdf = get_single_topic(lda, tf_feature_names, n_top_words, topic)
        for item in tmpdf.index:
            df.loc[item,'topic_{}'.format(topic)] = tmpdf.loc[item,'topic_{}'.format(topic)]
    return df

def get_topic_names(lda, tf_feature_names, n_top_words, n_topics):
    themes = pd.Series()
    for topic in range(n_topics):
        theme = " ".join([tf_feature_names[i] for i in lda.components_[topic].argsort()[:-n_top_words - 1:-1]])
        themes.loc['topic_{}'.format(topic)] = theme
    return themes


In [220]:
def score_document(doc_dic,df,lda, tf_feature_names, n_top_words, n_topics,
                    returnDF=True,confidence=.01):
    '''
    gives scores to the origional document, assigning a category to each one. 

    returnDF : By default returns a DataFrame, set to false to return a dict.
    confidence : this is the threashold that the model must meet to match the document to a topic.
    set to .01 to include practically everything, set to .99 to include almost nothing.

    document_scores = score_document(doc_dic,df,lda, tf_feature_names, n_top_words, n_topics)
    '''
    results_dict = {}
    for key, item in enumerate(doc_dic.keys()):
        document = doc_dic[item]
        words = [tf_feature_names[i] for i in tf.getrow(list(doc_dic).index(item)).indices]
        scores = df[[word in words for word in df.index]]
        TM_Score = pd.DataFrame()
        TM_Score['docScore'] = scores.sum() 
        TM_Score['theme'] = get_topic_names(lda, tf_feature_names, n_top_words, n_topics)
        results = TM_Score['docScore'].fillna(0).to_dict()
        results['document'] = document
        results['key'] = item
        results['top_score'] = TM_Score['docScore'].max()

        if TM_Score['docScore'].max() >= confidence:
            results['top_theme'] = TM_Score['theme'][TM_Score['docScore'].tolist().index(TM_Score['docScore'].max())]
            results['top_topic'] = TM_Score.index[TM_Score['docScore'].tolist().index(TM_Score['docScore'].max())]
        else: 
            results['top_theme'] = 'unassigned'
            results['top_topic'] = 'unassigned'
        results_dict[key] = results
    if returnDF:
        return pd.DataFrame(results_dict).T
    else:
        return results_dict

In [221]:
Scored_words = get_all_topics(lda, tf_feature_names, n_top_words,n_topics)
Scored_corpus = score_document(corpus,Scored_words,lda, tf_feature_names, n_top_words, n_topics)
get_topic_names(lda, tf_feature_names, n_top_words, n_topics)

topic_0     bond stress billion year market trump investor...
topic_1     medicine university department medical center ...
topic_2     cfp center library building main band part ser...
topic_3     plus series launch apple value new managers on...
topic_4     funds cfp service tax one bond board term clin...
topic_5     cfp planning board center professionals profes...
topic_6     montana society cohen gas area oil manager cla...
topic_7     value index stock price investors stocks marke...
topic_8     cme education continuing credit activity accre...
topic_9     fbi coalition mental agents rule investors age...
topic_10    tax irs identity returns taxpayers partners se...
topic_11    information implant tax tissue tooth form soft...
topic_12    japan japanese said security information forei...
topic_13    transition team wall street interests water sp...
topic_14    democrats leader said democratic senate west t...
topic_15    transition trump team president two one elect ...
topic_16

Now we have to get the output into some formats that we can digest. I'm using Excel for visualization in this case so I'll just want to make some various tables:

In [264]:
#big table with all of the documents, scored by topic and with choice.
output = pd.merge(df, Scored_corpus, left_index=True, right_index=True).drop_duplicates()

In [275]:
output.head(20)


Unnamed: 0,Type,Source,Count,Link,Title,Content,document,key,top_score,top_theme,...,topic_18,topic_19,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9
0,Finance,Fortune,1,http://fortune.com/2016/11/16/krispy-kreme-law...,Krispy Kreme Is Being Sued For Not Using Real ...,The lawsuit claims the company is robbing cust...,the lawsuit claims the company is robbing cust...,0,2324.49,health dental risk medical treatment use may s...,...,2324.49,1011.68,2.86048,5.26626,2.342,28.9592,1.12666,68.9757,91.5283,4.32308
0,Healthcare,PubMed,1,https://www.ncbi.nlm.nih.gov/pubmed/27834777/,The Road Less Traveled: Alternative Pathways f...,1Laboratory of Experimental Psychology and Neu...,the lawsuit claims the company is robbing cust...,0,2324.49,health dental risk medical treatment use may s...,...,2324.49,1011.68,2.86048,5.26626,2.342,28.9592,1.12666,68.9757,91.5283,4.32308
1,Finance,Fortune,2,http://fortune.com/2016/11/16/elizabeth-warren...,"Elizabeth Warren Threatens Trump: Don't Let ""W...",As she calls for an overhaul.\n\nPresident-ele...,as she calls for an overhaul president elect ...,1,4319.56,one new said time people also trump take may y...,...,4313.0,4319.56,29.3861,36.3831,17.4505,302.224,72.9748,465.879,595.489,29.3121
1,Healthcare,PubMed,2,https://www.ncbi.nlm.nih.gov/pubmed/27841876/,Cardioprotection and lifespan extension by the...,"1Institute of Molecular Biosciences, NAWI Graz...",as she calls for an overhaul president elect ...,1,4319.56,one new said time people also trump take may y...,...,4313.0,4319.56,29.3861,36.3831,17.4505,302.224,72.9748,465.879,595.489,29.3121
2,Finance,Fortune,3,http://fortune.com/2016/11/16/koch-industries-...,Koch Industries Pours $2.5 Billion into Busine...,Infor brings digital expertise to Koch Industr...,infor brings digital expertise to koch industr...,2,2602.02,health dental risk medical treatment use may s...,...,2602.02,2290.01,11.1016,19.8794,6.99004,146.686,17.4642,276.552,269.615,10.4219
2,Healthcare,PubMed,3,https://www.ncbi.nlm.nih.gov/pubmed/27402560/,Intermuscular adipose tissue and thigh muscle ...,"1Ben-Gurion University of the Negev, Beer-Shev...",infor brings digital expertise to koch industr...,2,2602.02,health dental risk medical treatment use may s...,...,2602.02,2290.01,11.1016,19.8794,6.99004,146.686,17.4642,276.552,269.615,10.4219
3,Finance,Fortune,4,http://fortune.com/2016/11/15/snapchat-ipo-sec/,Snapchat Said to Have Confidentially Filed for...,"According to a new report, it was before the U...",according to a new report it was before the u...,3,1321.11,one new said time people also trump take may y...,...,643.554,1321.11,9.72075,11.5683,4.14919,78.2556,8.93235,204.783,202.212,5.08413
3,Healthcare,PubMed,4,https://www.ncbi.nlm.nih.gov/pubmed/27841873/,DNMT3A mutations promote anthracycline resista...,"1Human Oncology and Pathogenesis Program, Memo...",according to a new report it was before the u...,3,1321.11,one new said time people also trump take may y...,...,643.554,1321.11,9.72075,11.5683,4.14919,78.2556,8.93235,204.783,202.212,5.08413
4,Finance,Fortune,5,http://fortune.com/2016/11/16/disney-ceo-bob-i...,Prediction: Disney’s Next CEO Will Be Someone ...,Bob Iger’s contract ends in 2018.\n\nThe staff...,bob iger s contract ends in the staff of...,4,1383.56,one new said time people also trump take may y...,...,674.045,1383.56,5.53401,8.3419,3.88135,72.3225,6.89944,70.1144,65.3012,7.76767
4,Healthcare,PubMed,5,https://www.ncbi.nlm.nih.gov/pubmed/27832072/,Current Incentives for Scientists Lead to Unde...,Author information\n1Centre for Research in An...,bob iger s contract ends in the staff of...,4,1383.56,one new said time people also trump take may y...,...,674.045,1383.56,5.53401,8.3419,3.88135,72.3225,6.89944,70.1144,65.3012,7.76767


In [267]:
#To compare topic scores by group, I'm going to use the average score by group
def compare_faction(df1,col):
    tmp = pd.DataFrame()
    tmp['All score'] = df1.ix[:,['topic_{}'.format(i) for i in range(n_topics)]].mean()
    for item in np.unique(df1[col].tolist()):
        tmp[item+' score'] = output.ix[output[col]==item,['topic_{}'.format(i) for i in range(n_topics)]].mean()
        tmp[item] = tmp[item+' score']-tmp['All score']  # the goal is to create a score that shows the difference between groupings
    tmp = tmp[[n for n in tmp.columns if ' score' not in n]]
    return tmp

by_source = compare_faction(output,'Source')
by_type = compare_faction(output,'Type')

In [268]:
by_type

Unnamed: 0,Finance,Healthcare
topic_0,28.341477,-14.575617
topic_1,-25.060389,12.8882
topic_2,2.24334,-1.153718
topic_3,2.936572,-1.510237
topic_4,1.251672,-0.643717
topic_5,12.706221,-6.534628
topic_6,1.953652,-1.004735
topic_7,37.956543,-19.520508
topic_8,-56.50677,29.060624
topic_9,1.873215,-0.963368


## Output to Excel:

In [274]:
writer = pd.ExcelWriter(r'C:\Users\v-wihar\OneDrive - METIA LTD\LinkedIn\models\trusted_sources.xlsx')
by_type.to_excel(writer,'by type')
by_source.to_excel(writer,'by source')
Scored_words.to_excel(writer,'Scored words')
output.to_excel(writer,'Scored Documents')
pd.DataFrame(get_topic_names(lda, tf_feature_names, n_top_words, n_topics)).to_excel(writer,'Autogen topic names')
writer.save()