# Testing the Pipeline from Class

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer
import nltk

from pipeline import NLPPipe, tweet_clean1

from helper_functions import txt_to_df

import pickle
%load_ext autoreload
%autoreload 2

In [2]:
# Load the previously pickled tweets (used below as a table with a 'long_text' column).
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
all_tweets = pd.read_pickle("all_tweets.pkl")

In [3]:
# Pull the tweet text out as a plain Python list, which is the form
# handed to the pipeline's fit/transform calls in the cells below.
corpus_list = all_tweets['long_text'].to_list()

In [4]:
# Baseline pipeline: default CountVectorizer, Treebank tokenization and
# Porter stemming. No cleaning function is supplied at this stage.
nlp = NLPPipe(
    vectorizer=CountVectorizer(),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
)

In [6]:
# Fit the baseline pipeline on the corpus, then run a transform over the same corpus.
# NOTE(review): the transform result is not stored (the trailing ';' only
# suppresses its display) and the next cell calls transform again, so this
# second line looks redundant. Confirm NLPPipe.transform has no needed side
# effects before removing it.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [7]:
# Preview the document-term matrix: one row per tweet, one column per term.
#
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it and fall
# back to the old name otherwise, so the cell runs on either version.
if hasattr(nlp.vectorizer, "get_feature_names_out"):
    feature_names = nlp.vectorizer.get_feature_names_out()
else:
    feature_names = nlp.vectorizer.get_feature_names()

# Slice the first five rows *before* densifying. Calling .toarray() on the
# whole matrix only to show .head() allocates a full documents-by-vocabulary
# dense array for nothing. (Assumes transform returns a row-sliceable scipy
# sparse matrix, as scikit-learn vectorizers do.)
pd.DataFrame(nlp.transform(corpus_list)[:5].toarray(), columns=feature_names)
# This is a basic document-term matrix, but most of the visible terms look wrong
# (numeric strings, URL fragments, stylised Unicode text).
# A good first step would be to drop strings containing numbers, but let's see
# whether anything else seems off first.

Unnamed: 0,00,000,00hsyobokk,00k9t0ock9,00pm,00uejqfvnl,01,0102,01nyvgqvhi,02,...,𝐰𝐨𝐫𝐭𝐡,𝕃𝕄𝔽𝔸𝕆𝕆𝕆𝕆𝕆𝕆𝕆𝕆𝕆𝕆𝕆𝕆𝕆,𝗚𝗿𝗼𝘄𝘁𝗵𝗠𝗶𝗻𝗱𝘀𝗲𝘁,𝗮𝗰𝗰𝗼𝘂𝗻𝘁𝗮𝗯𝗶𝗹𝗶𝘁𝘆,𝗺𝗼𝗻𝗱𝗮𝘆𝘀,𝗻𝗼𝘃𝗲𝗺𝗯𝗲𝗿,𝘛𝘩𝘦𝘙𝘦𝘢𝘭𝘛𝘳𝘶𝘵𝘩𝘈𝘣𝘰𝘶𝘵𝘏𝘦𝘢𝘭𝘵𝘩,𝘤𝘰𝘮,𝙜𝙤𝙖𝙡𝙨,𝙧𝙪𝙡𝙚𝙨
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Check whether the common word 'the' made it into the fitted vocabulary.
# The value displayed is the term's entry in vocabulary_; for a scikit-learn
# vectorizer that is its column index in the matrix, not a frequency
# (assuming nlp.vectorizer is the CountVectorizer passed to NLPPipe above).
nlp.vectorizer.vocabulary_['the']
# No English stop words were passed to the CountVectorizer, so filtering them out would be a good next step.

13747

In [5]:
# Second attempt: filter scikit-learn's built-in English stop words, ignore
# terms found in more than 80% of tweets (max_df) or in fewer than 10 tweets
# (min_df), and use tweet_clean1 as the pipeline's cleaning function.
nlp = NLPPipe(
    vectorizer=CountVectorizer(stop_words='english', max_df=0.80, min_df=10),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
    cleaning_function=tweet_clean1,
)

In [6]:
# Re-fit the reconfigured pipeline on the corpus, then run a transform over it.
# NOTE(review): the transform result is discarded here and computed again when
# `dtm` is built in the next cell, so this second line looks redundant.
# Confirm NLPPipe.transform has no needed side effects before removing it.
nlp.fit(corpus_list)
nlp.transform(corpus_list);

In [35]:
# Dense document-term matrix as a DataFrame (rows: tweets, columns: terms),
# used below to total term frequencies.
#
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it and fall
# back to the old name otherwise, so the cell runs on either version.
if hasattr(nlp.vectorizer, "get_feature_names_out"):
    feature_names = nlp.vectorizer.get_feature_names_out()
else:
    feature_names = nlp.vectorizer.get_feature_names()

dtm = pd.DataFrame(nlp.transform(corpus_list).toarray(), columns=feature_names)

  and should_run_async(code)


In [48]:
# Total occurrences of each term across the whole corpus, most frequent first.
# 'vegan', 'http' and 'plantbas' dominate. Since every tweet is about these
# topics, they carry little signal and it seems fine to remove them.
dtm.sum(axis=0).sort_values(ascending=False)

  and should_run_async(code)


vegan       4411
http        3044
plantbas    1093
thi          752
amp          536
            ... 
brown         10
limit         10
later         10
late          10
nazi          10
Length: 1081, dtype: int64

In [7]:
# Start from NLTK's English stop-word list so corpus-specific terms can be added to it.
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available locally
# (e.g. via nltk.download('stopwords')) — confirm for a fresh environment.
stopwords = nltk.corpus.stopwords.words('english')

In [8]:
# Add the corpus-specific terms that dominated the term-frequency table:
# 'vegan', 'http' and 'plantbas' (the stemmed form seen in the vocabulary).
# The original cell appended 'vegan' twice and never added 'plantbas'.
# The membership check keeps the cell idempotent, so re-running it does not
# pile up duplicate entries.
stopwords.extend(
    term for term in ('vegan', 'http', 'plantbas') if term not in stopwords
)

In [9]:
# Rebuild the count-based pipeline, this time passing the customised
# `stopwords` list in place of scikit-learn's built-in 'english' list.
# The document-frequency cut-offs and cleaning function are unchanged.
nlp = NLPPipe(
    vectorizer=CountVectorizer(stop_words=stopwords, max_df=0.80, min_df=10),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
    cleaning_function=tweet_clean1,
)

In [10]:
# Fit the count-based pipeline and keep the transformed corpus; `dtm_tf` is
# the matrix the first LDA model is trained on further down.
nlp.fit(corpus_list)
dtm_tf = nlp.transform(corpus_list)

In [11]:
# Same configuration as the count-based pipeline, but with a TfidfVectorizer
# so terms are TF-IDF weighted rather than counted. It is kept in a separate
# variable so both representations stay available for comparison.
nlp2 = NLPPipe(
    vectorizer=TfidfVectorizer(stop_words=stopwords, max_df=0.80, min_df=10),
    tokenizer=TreebankWordTokenizer().tokenize,
    stemmer=PorterStemmer(),
    cleaning_function=tweet_clean1,
)

In [12]:
# Fit the TF-IDF pipeline on the corpus, then run a transform over it.
# NOTE(review): the transform result is discarded here and computed again when
# `dtm_tfidf` is assigned in the next cell, so this second line looks
# redundant. Confirm NLPPipe.transform has no needed side effects before
# removing it.
nlp2.fit(corpus_list)
nlp2.transform(corpus_list);

In [13]:
# Keep the TF-IDF weighted document-term matrix; it is the input to the
# second LDA model below.
dtm_tfidf = nlp2.transform(corpus_list)

Let's do some basic topic modeling with Latent Dirichlet Allocation (LDA), first on the count matrix and then on the TF-IDF matrix.

In [14]:
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.sklearn

In [15]:
# Five-topic LDA on the raw term counts. random_state is pinned so the
# topics are reproducible between runs.
lda_tf = LatentDirichletAllocation(random_state=0, n_components=5)
lda_tf.fit(dtm_tf)

  and should_run_async(code)


LatentDirichletAllocation(n_components=5, random_state=0)

In [67]:
# Standalone CountVectorizer with the built-in English stop words and the same
# document-frequency cut-offs used for the pipelines above.
# NOTE(review): `tf_vectorizer` is never fitted or referenced anywhere else in
# the visible notebook, so it looks like a leftover from an earlier experiment.
# Confirm and remove if unused.
tf_vectorizer = CountVectorizer(stop_words='english', max_df=0.80, min_df=10)

  and should_run_async(code)


In [16]:
# Enable inline pyLDAvis rendering, then build the interactive topic view for
# the count-based model from the fitted LDA, its training matrix and the
# pipeline's fitted vectorizer.
# NOTE(review): newer pyLDAvis releases appear to have renamed the
# `pyLDAvis.sklearn` module to `pyLDAvis.lda_model` — confirm against the
# installed version.
pyLDAvis.enable_notebook()
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, nlp.vectorizer)

  and should_run_async(code)


In [19]:
# Ten-topic LDA on the TF-IDF weighted matrix, seeded for reproducibility.
# NOTE(review): LDA is formulated over term counts, so fitting it on TF-IDF
# weights is unusual — confirm this comparison is intended.
lda_tfidf = LatentDirichletAllocation(random_state=0, n_components=10)
lda_tfidf.fit(dtm_tfidf)

  and should_run_async(code)


LatentDirichletAllocation(random_state=0)

In [20]:
# Interactive topic view for the TF-IDF based model, built from the fitted
# LDA, its training matrix and the TF-IDF pipeline's fitted vectorizer.
# Inline rendering relies on pyLDAvis.enable_notebook() having been called in
# an earlier cell.
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, nlp2.vectorizer)

  and should_run_async(code)
