In [35]:
# packages to store and manipulate data
import pandas as pd
import numpy as np

# plotting packages
import matplotlib.pyplot as plt
import seaborn as sns

# model building package
import sklearn

# package to clean text
import re

In [36]:
# Read the message texts from the local CSV export and show the resulting frame.
messages_csv = '/Users/AF/Downloads/messages_meta.csv'
df = pd.read_csv(messages_csv)
df

Unnamed: 0,text
0,???
1,. pa order plss
2,]
3,3g6k9q
4,3jhqnk
...,...
396,👍
397,😊
398,😍
399,🙄🙄


In [37]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

In [38]:
# Stop-word list: NLTK's English list plus extra short filler words
# (they look like mostly Tagalog particles -- confirm with the data owner).
my_stopwords = nltk.corpus.stopwords.words('english')
my_stopwords.extend(['ng', 'nang', 'po', 'na', 'ba', 'sa', 'ko', 'hi', 'lang'])
print(my_stopwords)

# Callable that maps a single word to its stem.
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem

# Characters that clean_text replaces with a space. '#' is not listed, so
# tokens containing it survive; clean_text also leaves such tokens unstemmed.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_text(text, bigrams=False):
    """Normalise one message into a space-separated string of stemmed tokens.

    Processing steps, in order:

    1. lower-case the text;
    2. replace every run of characters from ``my_punctuation`` with a space;
    3. delete all digits;
    4. split on whitespace and drop tokens listed in ``my_stopwords``;
    5. stem every token with ``word_rooter`` unless it contains ``#``;
    6. if ``bigrams`` is true, append ``left_right`` pairs of adjacent tokens.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and ``NaN`` are treated as an empty message;
        any other non-string value is converted with ``str()``.
    bigrams : bool, default False
        Whether to append underscore-joined bigrams after the unigrams.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        # pandas hands missing cells over as NaN (a float), which has no
        # .lower(); NaN is the only value that is not equal to itself.
        if text is None or text != text:
            return ''
        text = str(text)

    text = text.lower()  # lower case

    # Escape the punctuation before embedding it in a character class.
    # Unescaped, the "\]" inside my_punctuation is read as an escaped bracket,
    # so literal backslashes were never stripped.
    text = re.sub('[' + re.escape(my_punctuation) + ']+', ' ', text)  # strip punctuation
    text = re.sub(r'[0-9]+', '', text)  # remove numbers

    # split() without an argument collapses every whitespace run and ignores
    # leading/trailing blanks, so no empty tokens reach the stemmer or the
    # bigram builder (digit removal can otherwise leave gaps such as "a  b").
    stopword_lookup = set(my_stopwords)
    tokens = [word for word in text.split()
              if word not in stopword_lookup]  # remove stopwords

    tokens = [word if '#' in word else word_rooter(word)
              for word in tokens]  # apply word rooter

    if bigrams:
        tokens = tokens + [left + '_' + right
                           for left, right in zip(tokens, tokens[1:])]
    return ' '.join(tokens)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [39]:
# Add a column holding the cleaned version of every message.
df['clean_text'] = df['text'].apply(clean_text)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Vocabulary-size cap kept for experimentation; it is not currently passed to
# either vectorizer below.
no_features = 1000

# Tokenisation rule shared by both vectorizers: a run of word characters, a
# dollar amount such as $12.50, or any other run of non-space characters.
# Written as a raw string so the backslashes reach the regex engine unchanged.
token_pattern = r'\w+|\$[\d\.]+|\S+'


def _vocabulary(fitted_vectorizer):
    """Return the fitted vectorizer's terms as a list, in column order.

    Uses ``get_feature_names_out`` when the installed scikit-learn provides it
    and falls back to the older ``get_feature_names`` otherwise.
    """
    getter = getattr(fitted_vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = fitted_vectorizer.get_feature_names
    return list(getter())


# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=10, token_pattern=token_pattern)
tfidf = tfidf_vectorizer.fit_transform(df['clean_text']).toarray()
tfidf_feature_names = _vocabulary(tfidf_vectorizer)

# LDA can only use raw term counts because it is a probabilistic graphical
# model, so it gets its own count-based vectorizer.
vectorizer = CountVectorizer(max_df=0.9, min_df=1, token_pattern=token_pattern)

# apply transformation
tf = vectorizer.fit_transform(df['clean_text']).toarray()

# tf_feature_names tells us what word each column in the matrix represents
tf_feature_names = _vocabulary(vectorizer)

In [41]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Both models extract the same number of topics; `number_of_topics` is kept as
# an alias so either name can be used.
no_topics = 10
number_of_topics = no_topics

# Regularisation strength for NMF (mixed L1/L2 via l1_ratio below).
nmf_alpha = .1

# Run NMF
try:
    # Legacy signature: used unchanged wherever the installed scikit-learn
    # still accepts `alpha`.
    nmf_estimator = NMF(n_components=no_topics, random_state=0,
                        alpha=nmf_alpha, l1_ratio=.5)
except TypeError:
    # scikit-learn >= 1.2 no longer accepts `alpha`. Its replacements scale the
    # W penalty by n_features and the H penalty by n_samples, so divide those
    # factors back out to keep the same objective as the legacy call.
    # NOTE(review): other defaults (e.g. the initialisation) may differ between
    # releases, so weights need not match old runs digit for digit.
    n_samples, n_features = tfidf.shape
    nmf_estimator = NMF(n_components=no_topics, random_state=0,
                        alpha_W=nmf_alpha / n_features,
                        alpha_H=nmf_alpha / n_samples,
                        l1_ratio=.5)
nmf_model = nmf_estimator.fit(tfidf)

# Run LDA on the raw term counts
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0).fit(tf)

In [42]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``; iterating it yields one
        weight vector per topic, each supporting ``argsort`` and integer
        indexing.
    feature_names : sequence
        Maps a column index of ``components_`` to the term it represents.
    no_top_words : int
        How many terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic <k> words"`` and
        ``"Topic <k> weights"``, with rows ordered from the largest weight
        down. Weights are strings formatted to one decimal place.
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed slice picks the last
        # `no_top_words` indices, largest weight first. Rank once and reuse it
        # for both columns instead of sorting the topic twice.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        topic_dict["Topic %d words" % (topic_idx)] = [
            '{}'.format(feature_names[i]) for i in top_indices]
        topic_dict["Topic %d weights" % (topic_idx)] = [
            '{:.1f}'.format(topic[i]) for i in top_indices]
    return pd.DataFrame(topic_dict)

In [43]:
# Number of highest-weighted terms listed per topic.
no_top_words = 6

# One summary table per model, each paired with the vocabulary of the
# vectorizer that produced its input matrix.
nmf = display_topics(model=nmf_model,
                     feature_names=tfidf_feature_names,
                     no_top_words=no_top_words)
lda = display_topics(model=model,
                     feature_names=tf_feature_names,
                     no_top_words=no_top_words)

In [44]:
import pandas as pd
from IPython.display import display

# Remove the column cap so every topic column is rendered, then show the LDA
# table followed by the NMF table.
pd.set_option('display.max_columns', None)
for topic_table in (lda, nmf):
    display(topic_table)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,regular,5.1,chicken,23.0,ask,4.1,time,10.1,agent,7.1,magbayad,11.1,burger,12.1,cart,12.1,menu,12.1,order,8.4
1,fri,4.1,order,21.8,much,4.1,deliveri,6.1,live,6.1,thank,10.1,chees,9.1,la,11.1,good,5.1,follow,5.1
2,code,4.1,pc,12.1,would,4.1,nag,5.1,talk,4.1,paano,9.1,drive,6.1,breakfast,10.1,may,4.1,test,4.1
3,shake,4.1,fri,10.1,ok,4.1,concern,5.1,want,3.1,time,4.1,use,6.1,,7.1,choos,4.1,price,3.1
4,run,3.1,want,7.1,adult,3.1,order,5.1,hello,3.1,ano,3.0,heyyyyy,6.1,sandwich,7.1,teriyaki,4.1,cancel,3.1
5,cebu,3.1,deliv,6.1,someon,3.1,pwede,4.1,,3.1,run,2.1,order,6.1,chicken,7.1,ang,3.1,nag,3.1


Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,order,3.3,chicken,2.6,,2.4,fri,2.2,menu,2.2,thank,2.2,magbayad,2.2,want,2.2,cart,1.5,ask,1.9
1,burger,0.1,burger,0.2,good,0.2,much,0.6,breakfast,0.1,time,0.4,la,0.0,la,0.0,la,1.4,good,0.8
2,breakfast,0.1,,0.1,want,0.0,burger,0.6,want,0.0,good,0.1,breakfast,0.0,breakfast,0.0,breakfast,0.5,,0.7
3,time,0.1,la,0.0,la,0.0,want,0.0,la,0.0,,0.1,burger,0.0,burger,0.0,want,0.0,much,0.6
4,want,0.0,want,0.0,breakfast,0.0,la,0.0,burger,0.0,much,0.0,cart,0.0,cart,0.0,burger,0.0,time,0.0
5,good,0.0,breakfast,0.0,burger,0.0,breakfast,0.0,cart,0.0,want,0.0,chicken,0.0,chicken,0.0,chicken,0.0,breakfast,0.0


In [45]:
nmf

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,order,3.3,chicken,2.6,,2.4,fri,2.2,menu,2.2,thank,2.2,magbayad,2.2,want,2.2,cart,1.5,ask,1.9
1,burger,0.1,burger,0.2,good,0.2,much,0.6,breakfast,0.1,time,0.4,la,0.0,la,0.0,la,1.4,good,0.8
2,breakfast,0.1,,0.1,want,0.0,burger,0.6,want,0.0,good,0.1,breakfast,0.0,breakfast,0.0,breakfast,0.5,,0.7
3,time,0.1,la,0.0,la,0.0,want,0.0,la,0.0,,0.1,burger,0.0,burger,0.0,want,0.0,much,0.6
4,want,0.0,want,0.0,breakfast,0.0,la,0.0,burger,0.0,much,0.0,cart,0.0,cart,0.0,burger,0.0,time,0.0
5,good,0.0,breakfast,0.0,burger,0.0,breakfast,0.0,cart,0.0,want,0.0,chicken,0.0,chicken,0.0,chicken,0.0,breakfast,0.0


In [46]:
# Write both topic tables to a single workbook, one worksheet each, using
# XlsxWriter as the engine. The context manager saves and closes the file on
# exit (also when a write fails); ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('/Users/AF/Downloads/topic_modeling.xlsx', engine='xlsxwriter') as writer:
    nmf.to_excel(writer, sheet_name='nmf_topics')
    lda.to_excel(writer, sheet_name='lda_topics')