In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
# Keep wide frames on a single line instead of wrapping columns.
pd.set_option('display.expand_frame_repr', False)
# Never truncate cell text. None means "no limit"; the old -1 sentinel was
# deprecated in pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import spacy
import re
import string
from textblob import TextBlob
import seaborn as sns
# Global matplotlib styling: ggplot theme with a bold 14pt font.
# (A stray, indented, body-less `if __name__ == '__main__':` used to follow
# these lines; it was a syntax error and has been removed.)
plt.style.use('ggplot')
font = {'weight': 'bold',
        'size': 14}
plt.rc('font', **font)


In [5]:
from nltk.corpus import stopwords
# Start from NLTK's English stop words, then append corpus-specific noise tokens.
# Keep extending this list whenever unnecessary words show up while checking the text.
stop_words = stopwords.words('english')
stop_words += ['br', 'ive', 'gery', 'one', 'product']

In [3]:
# The train.pkl file was created by the "Filter_data_final" notebook in data_preprocessing.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
df = pd.read_pickle('/Users/ellykyles/capstone2/data/train.pkl')

In [14]:
# Inspect which columns the loaded frame provides.
df.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text',
       'clean_text', 'token_words', 'join_token', 'Date', 'year',
       'words_count', 'unique_words', 'new_token_words', 'data_lemmatized'],
      dtype='object')

In [4]:
# Pull the cleaned review texts out of the frame as a plain Python list.
data = df['clean_text'].tolist()

# Peek at the first document.
pprint(data[:1])

['i was pleasantly surprised at these little snacks  thought sure they wouldnt '
 'fill me up but they did  had just the right amount of crunch and a hint of '
 'cinnamon flavor that was not overpowering or bitter']


In [6]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is coerced to ``str`` first and ``deacc=True`` is passed
    through to ``simple_preprocess``. Yields one token list per input item.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

# Materialize the token generator so later cells can reuse the tokenized documents.
data_words = [tokens for tokens in sent_to_words(data)]

# Peek at the first tokenized document.
print(data_words[:1])

[['was', 'pleasantly', 'surprised', 'at', 'these', 'little', 'snacks', 'thought', 'sure', 'they', 'wouldnt', 'fill', 'me', 'up', 'but', 'they', 'did', 'had', 'just', 'the', 'right', 'amount', 'of', 'crunch', 'and', 'hint', 'of', 'cinnamon', 'flavor', 'that', 'was', 'not', 'overpowering', 'or', 'bitter']]


In [7]:
# Learn collocations: bigrams on the raw tokens, then trigrams on the
# bigram-merged corpus. A higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(sentences=data_words, min_count=5, threshold=100)
# Phraser is the lighter, faster wrapper used to apply a trained Phrases model.
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(sentences=bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: apply bigram then trigram merging to the first document.
print(trigram_mod[bigram_mod[data_words[0]]])

['was', 'pleasantly_surprised', 'at', 'these', 'little', 'snacks', 'thought', 'sure', 'they', 'wouldnt', 'fill', 'me', 'up', 'but', 'they', 'did', 'had', 'just', 'the', 'right', 'amount', 'of', 'crunch', 'and', 'hint', 'of', 'cinnamon', 'flavor', 'that', 'was', 'not', 'overpowering', 'or', 'bitter']


In [9]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Drop stop words from every document.

    Each document is passed through ``simple_preprocess(str(doc))`` first, so
    both raw strings and token lists are accepted (a token list is
    re-tokenized from its string form).

    Args:
        texts: iterable of documents (strings or token lists).

    Returns:
        list of token lists, one per document, without any token found in the
        module-level ``stop_words``.
    """
    # Snapshot the stop words into a set once per call: O(1) membership tests
    # instead of scanning the list for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` (both module-level) to every document."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only tokens with selected POS tags.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: container of coarse POS tags (compared against
            ``token.pos_``). The default is an immutable tuple so it cannot be
            mutated between calls; lists are still accepted.

    Returns:
        list of lemma lists, one per input document, in input order.

    Note:
        Uses the module-level spaCy pipeline ``nlp``, which must be loaded
        before this function is called.
    """
    # Re-join tokens into text lazily and let spaCy process the stream in
    # batches via nlp.pipe instead of one nlp(...) call per document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [10]:
# Load the spaCy 'en' pipeline with the parser and NER components disabled
# (they are not needed for lemmatization, so skipping them saves time).
# NOTE(review): the 'en' shortcut name presumably only resolves on spaCy 2.x installs — confirm the environment.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Strip stop words from the tokenized documents ...
data_words_nostops = remove_stopwords(data_words)

# ... then merge the detected bigrams.
data_words_bigrams = make_bigrams(data_words_nostops)

In [11]:
# Do lemmatization keeping only nouns (allowed_postags=['NOUN'] overrides the function's wider default)
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN'])

# Peek at the first lemmatized document.
print(data_lemmatized[:1])

[['snack', 'amount', 'crunch', 'hint', 'cinnamon', 'flavor']]


In [12]:
# Attach one lemma list per review. A Series aligned on df's index stores each
# list as a single object cell; np.array() on ragged lists raises on
# NumPy >= 1.24 and silently builds a 2-D array when all lists share a length.
df['data_lemmatized'] = pd.Series(data_lemmatized, index=df.index)

In [13]:
# Show the first ten lemmatized documents.
df['data_lemmatized'].head(10)

491450    [snack, amount, crunch, hint, cinnamon, flavor]                                                                                                                                                                                                                                                                                                                                                                                      
254868    [coffee, product, cup, price]                                                                                                                                                                                                                                                                                                                                                                                                        
111001    [item, bulldog, stuff, price, shipping, recommend, dog, lover, dog, treat]                                                    

In [32]:
# Dump the lemmatized documents to a plain-text file, formatting each entry with "%s".
np.savetxt(fname='clean_text.txt', X=df['data_lemmatized'].values, fmt="%s")

In [38]:
# Build the gensim dictionary from the text dump, re-tokenizing each line.
# The context manager guarantees the file handle is closed; a bare open()
# inside the generator expression would leave it open.
with open('clean_text.txt') as clean_text_file:
    dct = corpora.Dictionary(
        simple_preprocess(line, deacc=True) for line in clean_text_file
    )

In [41]:
# Convert each lemmatized document to bag-of-words. Iterate the Python list
# directly: wrapping the ragged list of lists in np.array() adds nothing and
# fails on NumPy >= 1.24.
corpus = [dct.doc2bow(doc_tokens) for doc_tokens in data_lemmatized]

In [44]:
from gensim.models import LdaModel, LdaMulticore

In [45]:
# Fit the LDA topic model on the bag-of-words corpus.
lda_model = LdaMulticore(
    # data and vocabulary
    corpus=corpus,
    id2word=dct,
    # model size and priors
    num_topics=7,
    alpha='asymmetric',
    eta=None,
    # training schedule
    passes=10,
    iterations=100,
    chunksize=1000,
    batch=False,
    decay=0.5,
    offset=64,
    gamma_threshold=0.001,
    eval_every=0,
    # reproducibility and output shape
    random_state=100,
    per_word_topics=True,
)
# Persist the fitted model to the relative path 'lda_model.model'.
lda_model.save('lda_model.model')

In [46]:
# Display the top words of the model's topics (-1 requests all of them).
lda_model.print_topics(num_topics=-1)

[(0,
  '0.027*"amazon" + 0.023*"price" + 0.022*"chocolate" + 0.021*"store" + 0.020*"box" + 0.015*"love" + 0.015*"time" + 0.014*"flavor" + 0.013*"bag" + 0.013*"cookie"'),
 (1,
  '0.030*"flavor" + 0.019*"taste" + 0.019*"sauce" + 0.019*"chip" + 0.018*"salt" + 0.011*"bag" + 0.011*"oil" + 0.010*"time" + 0.010*"rice" + 0.009*"cheese"'),
 (2,
  '0.069*"food" + 0.058*"dog" + 0.031*"cat" + 0.026*"treat" + 0.012*"day" + 0.012*"time" + 0.010*"year" + 0.009*"baby" + 0.009*"month" + 0.008*"love"'),
 (3,
  '0.098*"water" + 0.044*"bottle" + 0.032*"coconut" + 0.025*"taste" + 0.017*"flavor" + 0.017*"powder" + 0.016*"use" + 0.015*"drink" + 0.014*"juice" + 0.012*"mix"'),
 (4,
  '0.044*"sugar" + 0.032*"taste" + 0.030*"bar" + 0.027*"flavor" + 0.022*"calorie" + 0.019*"cereal" + 0.018*"protein" + 0.016*"milk" + 0.015*"ingredient" + 0.015*"snack"'),
 (5,
  '0.149*"coffee" + 0.043*"cup" + 0.028*"flavor" + 0.021*"taste" + 0.019*"kcup" + 0.012*"pod" + 0.012*"blend" + 0.010*"love" + 0.010*"bean" + 0.010*"roast"')

In [47]:
# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
import matplotlib.colors as mcolors

In [48]:
# Build a dense (n_docs x n_topics) matrix of topic weights.
# lda_model[corpus] only reports topics above the model's minimum probability,
# so each weight must be written into the column of its own topic id.
# Appending the weights positionally (and padding with zeros afterwards) would
# shift them into the wrong topic columns whenever a topic is omitted.
topic_weights = np.zeros((len(corpus), lda_model.num_topics))
for doc_idx, row_list in enumerate(lda_model[corpus]):
    # The model was built with per_word_topics=True, so the first element of
    # each row holds the (topic_id, weight) pairs for the document.
    for topic_id, weight in row_list[0]:
        topic_weights[doc_idx, topic_id] = weight

# Keep the well separated points (optional)
arr = topic_weights[np.amax(topic_weights, axis=1) > 0.35]
# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

In [66]:
# tSNE Dimension Reduction: project the per-document topic weights to 2-D.
tsne_model = TSNE(n_components=2, verbose=0, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)
# Plot the Topic Clusters using Bokeh (render output inline in the notebook).
output_notebook()
# Read the topic count from the fitted model rather than repeating the literal
# 7, so the plot title cannot drift from the model's num_topics.
n_topics = lda_model.num_topics

In [67]:
# Colour palette taken from matplotlib's Tableau colours, indexed by each
# document's dominant topic number.
mycolors = np.array(list(mcolors.TABLEAU_COLORS.values()))
plot = figure(
    title="t-SNE Clustering of {} LDA Topics".format(n_topics),
    plot_width=900,
    plot_height=700,
)
plot.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], color=mycolors[topic_num])
show(plot)

In [55]:
import pyLDAvis.gensim

# Enable pyLDAvis notebook display, then prepare the visualisation from the
# fitted LDA model, the bag-of-words corpus and the model's own id2word mapping.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
# Bare last expression so the notebook renders the result.
vis

Finding an appropriate number of topics is essential -> get a sense of the number by studying k-clustering.

# Doc2Vec 

In [23]:
# Tag every lemmatized document with its positional index for Doc2Vec training.
documents = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(df['data_lemmatized'])
]

In [25]:
# Train a Doc2Vec model on the tagged documents.
model = Doc2Vec(documents, vector_size=10, window=7, min_count=100, workers=4)

# Infer one vector per document and assemble them in a single DataFrame aligned
# on df's index. Building the frame once avoids the slow per-row
# `.apply(pd.Series)` expansion.
# NOTE(review): training used df['data_lemmatized'] but inference reads
# df["token_words"] — confirm the different preprocessing is intended.
doc2vec_df = pd.DataFrame(
    [model.infer_vector(tokens) for tokens in df["token_words"]],
    index=df.index,
)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]

In [56]:
# Persist the trained Doc2Vec model to the relative path 'model/doc2vec.model'.
# NOTE(review): presumably the 'model/' directory must already exist — confirm before a fresh run.
model.save('model/doc2vec.model')

In [29]:
# Append the Doc2Vec feature columns to the review frame, aligned on the index.
new_df = pd.concat(objs=[df, doc2vec_df], axis='columns')
# Show the first combined row.
new_df.head(n=1)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,...,doc2vec_vector_0,doc2vec_vector_1,doc2vec_vector_2,doc2vec_vector_3,doc2vec_vector_4,doc2vec_vector_5,doc2vec_vector_6,doc2vec_vector_7,doc2vec_vector_8,doc2vec_vector_9
491450,491451,B005A1LH0Q,A2Z3EDGYBA75F5,Jeanne R. Michon,2,2,4,1322611200,Great snack,"I was pleasantly surprised at these little snacks. Thought sure they wouldn't fill me up, but they did. Had just the right amount of crunch and a hint of cinnamon flavor that was not overpowering or bitter.",...,-0.093459,0.010824,0.044514,0.019727,-0.061364,0.016364,-0.032123,0.031593,0.025646,0.013055


Doc2Vec will be used for K-clustering

# Word2Vec

In [57]:
from gensim.models.word2vec import Word2Vec

In [61]:
# Collect the lemmatized documents as a plain list of token lists.
words = list(df['data_lemmatized'])

In [62]:
# Train word embeddings on the lemmatized documents.
model_w = Word2Vec(sentences=words, window=7, min_count=100, workers=4)

In [None]:
from gensim.models