# Testing the Pipeline from Class

In [6]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer
from nltk.stem import PorterStemmer
from nltk import SnowballStemmer
import nltk

from pipeline import NLPPipe, tweet_clean1

from helper_functions import txt_to_df

import pickle
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
# Load the previously saved tweet data from a local pickle file
# (used below as a table with a 'long_text' column).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you created or trust.
all_tweets = pd.read_pickle("all_tweets.pkl")

In [9]:
# Pull the tweet text column out as a plain Python list, since the
# pipeline below is fitted on (and transforms) a list of documents.
corpus_list = all_tweets['long_text'].to_list()

In [None]:
# Baseline pipeline: raw term counts, Treebank tokenisation and Porter stemming.
# No cleaning_function is passed, so NLPPipe falls back to its own default.
nlp = NLPPipe(
    stemmer=PorterStemmer(),
    tokenizer=TreebankWordTokenizer().tokenize,
    vectorizer=CountVectorizer(),
)

In [None]:
# Fit the baseline pipeline on the corpus, then run a transform pass over the same corpus.
# The trailing semicolon suppresses the cell output; the transformed result is not stored here.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [None]:
# Preview the first rows of the document-term matrix, with one column per vocabulary term.
# Observation: this is a basic document-term matrix, but most of the terms shown look wrong.
# Dropping tokens that contain digits is a good first step; also check for anything else that seems off.
pd.DataFrame(
    data=nlp.transform(corpus_list).toarray(),
    columns=nlp.vectorizer.get_feature_names(),
).head()

In [None]:
# Look up 'the' in the fitted vocabulary: if the lookup succeeds, this stop word was kept as a feature.
nlp.vectorizer.vocabulary_['the']
# No English stop words were passed to the CountVectorizer, so adding them is a sensible next step.

In [None]:
# Second pass: remove sklearn's built-in English stop words, ignore terms that appear in
# more than 80% of the tweets or in fewer than 10 tweets, and clean text with tweet_clean1.
nlp = NLPPipe(
    cleaning_function=tweet_clean1,
    stemmer=PorterStemmer(),
    tokenizer=TreebankWordTokenizer().tokenize,
    vectorizer=CountVectorizer(min_df=10, max_df=0.80, stop_words='english'),
)

In [None]:
# Refit the reconfigured pipeline on the corpus, then run a transform pass over the same corpus.
# The trailing semicolon suppresses the cell output; the transformed result is not stored here.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [None]:
# Densify the refitted document-term matrix into a DataFrame with one labelled column per term.
dtm = pd.DataFrame(
    data=nlp.transform(corpus_list).toarray(),
    columns=nlp.vectorizer.get_feature_names(),
)

In [None]:
# Total count of every term across the whole corpus, most frequent first.
dtm.sum(axis=0).sort_values(ascending=False)
# 'vegan', 'http' and 'plantbas' can safely be removed: every tweet in this corpus
# is about these topics, so the terms do not help tell tweets apart.

In [2]:
# Start from NLTK's English stop-word list so corpus-specific terms can be added to it.
stopwords = nltk.corpus.stopwords.words('english')

In [3]:
# Add the corpus-wide terms (in their stemmed form) to the stop-word list.
stopwords.extend(['vegan', 'http', 'plantbas'])

In [120]:
# Same count-based pipeline as before, but using the custom stop-word list
# (NLTK English words plus the corpus-specific terms) instead of sklearn's built-in one.
nlp = NLPPipe(
    cleaning_function=tweet_clean1,
    stemmer=PorterStemmer(),
    tokenizer=TreebankWordTokenizer().tokenize,
    vectorizer=CountVectorizer(min_df=10, max_df=0.80, stop_words=stopwords),
)

In [9]:
# Fit the count-based pipeline (custom stop words) and keep its document-term matrix for topic modeling.
# NOTE(review): this cell needs `nlp` from the previous cell; the NameError recorded under it comes from
# running cells out of order -- re-run the notebook top to bottom on a fresh kernel.
nlp.fit(corpus_list)
dtm_tf = nlp.transform(corpus_list)

NameError: name 'nlp' is not defined

In [10]:
# Alternative pipeline: TF-IDF weights, the tweet-aware tokenizer and the English Snowball
# stemmer, with the same custom stop words, document-frequency cut-offs and cleaning function.
nlp2 = NLPPipe(
    cleaning_function=tweet_clean1,
    stemmer=SnowballStemmer("english"),
    tokenizer=TweetTokenizer().tokenize,
    vectorizer=TfidfVectorizer(min_df=10, max_df=0.80, stop_words=stopwords),
)

In [11]:
# Fit the TF-IDF pipeline on the corpus and keep the transformed matrix for topic modeling.
nlp2.fit(corpus_list)
dtm_tfidf = nlp2.transform(corpus_list)

Let's do some basic topic modeling

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

In [14]:
# Fit a 5-topic LDA model (fixed seed) on the count-based document-term matrix.
# The pyLDAvis visualisation of this model is built in the next cell.
lda_tf = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tf.fit(dtm_tf)

  and should_run_async(code)


LatentDirichletAllocation(n_components=5, random_state=0)

In [15]:
# Turn on inline notebook rendering for pyLDAvis, then build the interactive
# view of the count-based LDA model from the model, its matrix and its vectorizer.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, nlp.vectorizer)

  and should_run_async(code)


In [None]:
# Fit a 9-topic LDA model (fixed seed) on the TF-IDF matrix, to compare against the count-based model.
# NOTE(review): LDA is usually described as a model over term counts -- confirm TF-IDF input is intended.
lda_tfidf = LatentDirichletAllocation(n_components=9, random_state=0)
lda_tfidf.fit(dtm_tfidf)

  and should_run_async(code)


In [None]:
# Turn on inline notebook rendering for pyLDAvis (already called in an earlier cell as well),
# then build the interactive view of the TF-IDF LDA model.
pyLDAvis.enable_notebook()

pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, nlp2.vectorizer)

After further research, it seems that NMF tends to work better on short documents and smaller datasets, so it may find more coherent topics for tweets and for this specific corpus.

In [12]:
# Factorise the TF-IDF matrix into 9 topics with NMF.
# random_state is pinned (matching the LDA cells) so that a fresh
# "Restart & Run All" reproduces the same topics, and n_components is
# passed by keyword so the meaning of the 9 is explicit.
nmf_model = NMF(n_components=9, random_state=0)
# doc_topic holds one row per tweet and one column per topic (the tweet's weight on that topic).
doc_topic = nmf_model.fit_transform(dtm_tfidf)

In [13]:

def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Each row of ``components_`` is treated as one topic's weights over
        the features.
    feature_names : sequence of str
        Term for each feature index, aligned with the columns of
        ``model.components_``.
    no_top_words : int
        How many of the highest-weighted terms to print per topic.
    topic_names : sequence of str, optional
        Human-readable topic labels. When omitted, or when the label for a
        topic is empty, the topic's numeric index is printed instead.

    Returns
    -------
    None
        The topics are written to standard output.
    """
    for ix, weights in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)

        # Feature indices ordered from largest to smallest weight, cut to the top N.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = []
        for term_ix in ranked:
            top_terms.append(feature_names[term_ix])
        print(", ".join(top_terms))

In [14]:
# Show the 15 highest-weighted terms for each NMF topic.
display_topics(
    model=nmf_model,
    feature_names=nlp2.vectorizer.get_feature_names(),
    no_top_words=15,
)


Topic  0
new, love, day, tri, good, get, today, veganfood, like, us, look, one, thank, make, crueltyfre

Topic  1
nut, seed, natur, pinterest, pathway, linkedln, countless, superfood, heal, explor, facebook, twitter, flour, youtub, info

Topic  2
ur, juic, favorit, love, fruit, veget, nutrit, eat, recip, moist, cleanser, facial, soft, care, keep

Topic  3
soap, skincar, sk, toxin, free, herb, creat, use, natur, oil, glutenfre, organ, skin, gift, crueltyfre

Topic  4
recip, delici, easi, soup, via, vegetarian, veganrecip, cook, curri, make, dinner, bean, rt, potato, roast

Topic  5
food, vegetarian, healthi, organ, eat, raw, health, govegan, diet, foodi, healthyfood, high, foodporn, gmofre, fiber

Topic  6
glutenfre, chocol, cake, dairyfre, cooki, pumpkin, paleo, appl, dessert, breakfast, butter, chip, coconut, sweet, bake

Topic  7
anim, go, eat, meat, govegan, peopl, dont, human, think, animalright, live, whi, want, stop, pleas

Topic  8
shampoo, condition, bmrtg, travel, luxuri, sul

In [None]:
# Using 9 topics gives very good-looking results

In [22]:
# Document-topic table: one row per tweet (indexed by its text) and one column per NMF topic,
# with weights rounded to 5 decimals. Column labels are the topic indices as strings and are
# derived from doc_topic's width, so they stay in sync if the number of NMF components changes.
H = pd.DataFrame(doc_topic.round(5),
                 index=corpus_list,
                 columns=[str(topic_ix) for topic_ix in range(doc_topic.shape[1])])
H

Unnamed: 0,0,1,2,3,4,5,6,7,8
how can you possibly organize for a political revolution if you can’t even change your dietary habits? how can you sacrifice for social change if you can’t sacrifice your taste buds? going vegan is a first step for many people to realize the power their individual actions hold.,0.00552,0.0,0.00000,0.00106,0.00000,0.00414,0.00109,0.02736,0.00310
Agreeing this too 😣I'm so confused wit what stand should i take w.r.t. veganism though,0.00449,0.0,0.00053,0.00092,0.00000,0.00000,0.00000,0.00916,0.00000
Question to #AnimalActivists &amp; #Vegan saying #Trump2020 :\n\nSo you’re ok w #More of this ⬇️\n\n#Cuts to the #EndangeredSpeciesAct\n\n#Drilling in #Alaska’s #Wildlife Refuge...?\n\nA VOTE for TRUMP is a VOTE AGAINST ANIMALS\n\n#NotMyPresident\n\n#VoteHimOut\n\n https://t.co/YZJeUOrqES,0.00160,0.0,0.00000,0.00000,0.00000,0.00000,0.00000,0.03900,0.00000
mainstream veganism is very misconstrued. but the fact of the matter is: the future is vegan. there is no future without veganism-life on this planet cannot sustain itself without humans switching to mostly plantbased eating. we can’t improve society if we go extinct first.,0.00134,0.0,0.00577,0.00000,0.00000,0.00473,0.00000,0.03897,0.00312
me:\nmy brain: lets just try n do low fat vegan keto! its not relapse its for the animals,0.00761,0.0,0.00000,0.00000,0.00123,0.00000,0.00310,0.04177,0.00000
...,...,...,...,...,...,...,...,...,...
This was at vegan street fair and I’m wishing more than anything I could eat these fried avocados right now. I need to eat a breakfast. #avocado #vegan #veganstreetfair #avocados #veganfood #veganlife #breakfast #food #foodie #foodiesofinstagram #fried #friedavocado https://t.co/rZB4YhXTCw,0.01545,0.0,0.01135,0.00000,0.00000,0.03890,0.00870,0.01322,0.00000
"Do no harm, be kind!\n\n#vegan #crueltyfree https://t.co/PCxjdfEPAE",0.01007,0.0,0.00000,0.00368,0.00000,0.00000,0.00000,0.00521,0.00145
Cork Fabric from Portugal now available at https://t.co/0talKzrTDv\n#corkfabric #organic #vegan #sustainable #bags #wallets #purses #crafts #sewing @alabaonajin @ayo_uk @smallbizshoutuk https://t.co/LHL0IEF2H9,0.00369,0.0,0.00000,0.00340,0.00000,0.01235,0.00394,0.00000,0.00004
Don’t steal my milk!\n\n#vegan #DairyFree https://t.co/3Jz7b2ntjE,0.00106,0.0,0.00000,0.00000,0.00000,0.00000,0.03545,0.00367,0.00000


In [27]:
# For every tweet, take the label of the topic column holding its largest weight.
H_topic = H.idxmax(axis="columns")

In [33]:
# Display the dominant topic label assigned to each tweet.
H_topic

how can you possibly organize for a political revolution if you can’t even change your dietary habits? how can you sacrifice for social change if you can’t sacrifice your taste buds? going vegan is a first step for many people to realize the power their individual actions hold.                7
Agreeing this too 😣I'm so confused wit what stand should i take w.r.t. veganism though                                                                                                                                                                                                                7
Question to #AnimalActivists &amp; #Vegan saying #Trump2020 :\n\nSo you’re ok w #More of this ⬇️\n\n#Cuts to the #EndangeredSpeciesAct\n\n#Drilling in #Alaska’s #Wildlife Refuge...?\n\nA VOTE for TRUMP is a VOTE AGAINST ANIMALS\n\n#NotMyPresident\n\n#VoteHimOut\n\n https://t.co/YZJeUOrqES     7
mainstream veganism is very misconstrued. but the fact of the matter is: the future is vegan. there is no future