In [3]:
# download nlp tools
# !python -m spacy download en_core_web_md

In [1]:
# install visualization tools
# !pip install PyLDAvis==3.3.1         

In [2]:
# for utilization of core processing power
# !pip install pandarallel==1.4.8

In [None]:
# imports
import bs4
import requests
from google.colab import files
import io
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy
spacy.util.fix_random_seed(0)

import pyLDAvis
import pyLDAvis.gensim_models 

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from pandarallel import pandarallel

%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Empty placeholder frame; `df` is rebound to the scraped-review DataFrame
# once the column lists have been built.
df = pd.DataFrame()

In [None]:
# Scratch dict; later rebound to the column-name -> list mapping that is
# passed to pd.DataFrame.
k = {}

In [None]:
# Scrape the star rating of every review, page by page.
# list e will hold the target values: one digit string per review, in the
# order the reviews appear on the site.
e = []

star_pattern = re.compile(r'Rated with (\d) star')
missing_rating = False

for b in range(28):
    # page 0 is the bare URL; every later page adds ?page=<n>
    page_query = f"?page={b}" if b else ""
    req = requests.get(
        f'https://www.consumeraffairs.com/food/nabisco.html{page_query}#scroll_to_reviews=true',
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # fail loudly on an HTTP error instead of silently parsing an error page
    req.raise_for_status()
    soup = bs4.BeautifulSoup(req.text, 'lxml')

    for x in soup.select('.rvw.js-rvw'):
        match = star_pattern.search(str(x).replace('\n', ''))
        if match is None:
            # Stop at the first review block without a rating: later cells
            # pair e with the other scraped lists by position, so skipping
            # an entry here would shift every following rating.
            print(f'no star rating found on page {b}; stopping after {len(e)} ratings')
            missing_rating = True
            break
        e.append(match.group(1))

    if missing_rating:
        break

list index out of range


In [None]:
print(len(e))

735


In [None]:
# Accumulators for the per-review fields filled by the scraping loop in the
# next cell. That loop only appends, so re-run this cell before re-running
# the loop or the lists will contain every review twice.
dates, states, review_texts, num_helpfuls = [], [], [], []

In [None]:
# This nested for loop scrapes all pages of reviews listed on ConsumerAffairs'
# Nabisco page, placing each review's date, state of origin, text, and the
# "found this review helpful" field in their respective lists for compilation
# into the dataframe.

for b in range(28):
    # page 0 is the bare URL; every later page adds ?page=<n>
    page_query = f"?page={b}" if b else ""
    req = requests.get(
        f'https://www.consumeraffairs.com/food/nabisco.html{page_query}#scroll_to_reviews=true',
        timeout=30,  # never hang the notebook on a stalled connection
    )
    # A failed page would otherwise contribute zero reviews without any
    # warning and leave these lists out of step with the star ratings.
    req.raise_for_status()
    soup = bs4.BeautifulSoup(req.text, 'lxml')

    for x in soup.select('.rvw.js-rvw'):
        new = x.text.replace('\n', ' ')

        # two characters following the first ", " in the review header
        # (presumably the state abbreviation of "City, ST" -- verify against the page)
        comma_at = new.index(',')
        state = new[comma_at + 2:comma_at + 4]

        # the text after the "Original review:" label starts with the date;
        # 16 == len('Original review:')
        new = new[new.index('Original review:') + 16:]
        date_end = new.index(', 20') + 6  # through the end of ", 20YY"
        date = new[:date_end]

        # what remains is the review body followed by the footer fields,
        # separated by runs of spaces
        new = new[date_end:]
        fin = ' '.join(new.split('          ')).split('   ')
        num_helpful = fin[-2]
        review_text = fin[0]

        dates.append(date)
        states.append(state)
        review_texts.append(review_text)
        num_helpfuls.append(num_helpful)

In [None]:
# check lengths of each list

In [None]:
# Trim the detail lists to the number of star ratings collected so every
# column has the same length when the DataFrame is built. Using len(e)
# instead of a hard-coded count keeps this correct if the site is re-scraped.
n_rated = len(e)
dates, states, review_texts, num_helpfuls = [
    column[:n_rated] for column in (dates, states, review_texts, num_helpfuls)
]

In [None]:
# column name -> scraped values, in the column order wanted for the DataFrame
k = dict(
    Date=dates,
    State=states,
    Num_Helpful=num_helpfuls,
    Review=review_texts,
)

In [None]:
df = pd.DataFrame(k)

In [None]:
# assign the list of target variables (star ratings) to the Stars column;
# e is matched to the rows by position, so it needs one entry per row of df
df['Stars'] = e

In [None]:
# check for null value count
df.isnull().sum(), len(df)

(Date           0
 State          0
 Num_Helpful    0
 Review         0
 Stars          0
 dtype: int64, 735)

In [None]:
df.to_csv('nabisco_dataframe.csv', index=False)

In [None]:
files.download('nabisco_dataframe.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Re-load the scraped reviews through Colab's upload widget.
# The upload can be stored under a different name than the one chosen
# (e.g. "nabisco_dataframe (1).csv" when the file already exists), so take
# the single uploaded file instead of looking it up by a fixed key.
uploaded = files.upload()
if len(uploaded) != 1:
    raise ValueError(f'expected exactly one uploaded CSV, got {len(uploaded)}')
csv_bytes = next(iter(uploaded.values()))
part_2 = pd.read_csv(io.BytesIO(csv_bytes), parse_dates=['Date'])

Saving nabisco_dataframe.csv to nabisco_dataframe (1).csv


In [None]:
# Fill nulls per column instead of writing the string '0' into every column:
# a blank Num_Helpful means no helpful votes, while blank text fields stay
# empty strings so they do not turn into a fake '0' token or state code.
# Date is left untouched so the parsed datetime column keeps its dtype.
part_2 = part_2.fillna({'Num_Helpful': '0', 'State': '', 'Review': ''})

In [None]:
# remove unnecessary words from Num_Helpful, keeping only the digits.
# Values without any digit become 0 instead of crashing on int(''), and the
# astype(str) makes the cell safe to re-run once the column is already numeric.
part_2['Num_Helpful'] = (
    pd.to_numeric(
        part_2['Num_Helpful'].astype(str).str.replace(r'\D', '', regex=True),
        errors='coerce',
    )
    .fillna(0)
    .astype(int)
)

In [None]:
# distribution of target variable
part_2.Stars.value_counts(normalize=True)

1    0.703401
2    0.253061
3    0.023129
5    0.012245
4    0.008163
Name: Stars, dtype: float64

In [None]:
# Binary target: the ratings are overwhelmingly negative, so any review
# above one star counts as "good" (1) and one-star reviews as "bad" (0).
part_2['Good_Bad'] = (part_2['Stars'] > 1).astype(int)
                                          

In [None]:
def cleaner(txt):
  '''
  Normalise a review string before tokenization.

  Collapses every run of whitespace (spaces, new lines, tabs) into a single
  space, removes every character that is not an ASCII letter, digit or space,
  and returns the lowercased, stripped result.

  txt : str, raw review text
  returns : str, cleaned text
  '''

  # Replace whitespace runs with ONE space rather than deleting new lines,
  # otherwise the last word of a line is glued to the first word of the next.
  txt = re.sub(r'\s+', ' ', txt)

  # drop punctuation and any other special characters
  txt = re.sub('[^a-zA-Z 0-9]', '', txt)

  # removing characters can leave doubled spaces behind ("a - b" -> "a  b")
  txt = re.sub('[ ]{2,}', ' ', txt)

  return txt.lower().strip()

In [None]:
# load in the NLP tools and create a tokenizer function
# (the en_core_web_md model must be installed first; see the commented-out
# download command in the first cell)

nlp = spacy.load('en_core_web_md')
def tokenizer(txt):
  '''
  Run txt through the spaCy pipeline and return its lemmas as a list,
  leaving out punctuation, stop words and whitespace tokens.
  '''
  return [
      token.lemma_
      for token in nlp(txt)
      if not (token.is_punct or token.is_stop or token.is_space)
  ]





In [None]:
# apply text-cleaning function

part_2['cleaned'] = part_2['Review'].apply(cleaner)

In [None]:
# n_processors is how many CPU cores your machine has
# NOTE(review): hard-coded -- confirm it matches the machine running this notebook

n_processors = 8

# one fewer worker than cores
nb_workers = n_processors - 1

# set up pandarallel (needed before parallel_apply() can be used)
pandarallel.initialize(progress_bar=True, nb_workers=nb_workers)

INFO: Pandarallel will run on 7 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Tokenize every cleaned review into a list of lemmas.
# This uses the plain, serial .apply(); optionally swap in pandarallel's
# .parallel_apply(tokenizer) to spread the work over the machine's cores.
part_2['lemmas'] = part_2['cleaned'].apply(tokenizer)

In [None]:
# create the bag-of-words dictionary (a gensim Dictionary built from every
# review's lemma list); it is used below to vectorize the reviews and is
# passed to the LDA models and pyLDAvis

id2word = corpora.Dictionary(part_2['lemmas'])

In [None]:
# create corpus: one bag-of-words vector per review, in row order

corpus = list(map(id2word.doc2bow, part_2['lemmas']))

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Train one LDA model per candidate number of topics and compute its c_v coherence.

    Parameters:
    ----------
    dictionary : Gensim dictionary ex. corpora.Dictionary(df['lemmas']) i.e. id2word
    corpus : Gensim corpus i.e. [dictionary.doc2bow(doc) for doc in df['lemmas']]
    texts : List of tokenized input texts  ex. df['lemmas']
    limit : Max num of topics (exclusive upper bound of the range)
    start : Smallest number of topics to try
    step : Increment between consecutive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models, one per number of topics tried, in ascending order
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamulticore.LdaMulticore(corpus=corpus,
                                                        # use the dictionary that was passed in, not
                                                        # whatever `id2word` happens to exist globally
                                                        id2word=dictionary,
                                                        num_topics=num_topics,
                                                        chunksize=100,   # number of docs used in each training chunk
                                                        passes=10,
                                                        random_state=1234,   # fixed seed so runs are repeatable
                                                        per_word_topics=True,
                                                        workers=2)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
model_list, coherence_vals = compute_coherence_values(id2word, corpus, part_2['lemmas'], limit=16, step=2)

In [None]:
coherence_vals

[0.25390204692471363,
 0.3405380167810327,
 0.32762302265115645,
 0.35297994733169546,
 0.3459528236522583,
 0.3870424985728458,
 0.3726643752713762]

In [None]:
# find index of best model associated with coherence values:
# the position in coherence_vals (and therefore in model_list) of the highest
# coherence; np.argmax returns the first position when there is a tie

best_model_index = np.argmax(coherence_vals)
best_model_index

5

In [None]:
lda_trained_model = model_list[best_model_index]

In [None]:
lda_trained_model

<gensim.models.ldamulticore.LdaMulticore at 0x7fce81b881d0>

In [None]:
# initialize topic visualization
# `vis` is kept because later cells read vis.topic_coordinates to map the
# topic numbers shown by pyLDAvis back to the model's topic ids
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_trained_model, corpus, id2word)
vis

In [None]:
# names I have chosen to give each topic, listed in the order of the
# (1-based) keys they are stored under

vis_topic_name_dict = dict(enumerate(
    [
        'Did/Did not like',
        'Quality of product',
        'Heavily processed',
        'Older products',
        'Crackers',
        'Packaging',
        'Preservatives',
        'Cookies',
        'Company/Employees',
        'Dessert',
        'Branding',
        'Ingredients',
    ],
    start=1,
))

In [None]:
vis.topic_coordinates # one row per topic (len 12)

#       vis.topic_coordinates.topics.to_dict()
#       turns the index and the `topics` column into {index: topics} pairs;
#       the index is the topic id according to lda_trained_model, the column
#       is the topic's number according to the pyLDAvis tool

Unnamed: 0_level_0,x,y,topics,cluster,Freq
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,-0.15052,-0.049367,1,1,40.446306
2,-0.119175,0.055918,2,1,11.490827
3,-0.18748,0.139002,3,1,8.822418
6,-0.035165,-0.172766,4,1,8.073364
10,-0.031934,-0.258134,5,1,6.518507
7,-0.037021,0.025238,6,1,5.767866
0,-0.100999,0.103385,7,1,5.08352
8,0.055097,0.079048,8,1,5.026493
11,0.100322,-0.042028,9,1,3.926152
4,0.130742,0.02023,10,1,2.27352


In [None]:
def get_topic_id_lookup_dict(vis, vis_topic_name_dict):
    """
    Build a {LDA model topic id: topic name} dictionary.

    vis : prepared pyLDAvis object; its topic_coordinates.topics column is read
    vis_topic_name_dict : topic names keyed by the topic ids shown in the pyLDAvis tool

    The starting index and the ordering of topic ids between the trained LDA model
    and the viz tool are different, so the names (keyed by viz-tool id) have to be
    re-keyed by the id the model uses for the same topic.
    """
    # index -> column value: LDA model topic id -> pyLDAvis topic id   (e.g. 9 -> 1)
    model_to_vis = vis.topic_coordinates.topics.to_dict()

    # flipped: pyLDAvis topic id -> LDA model topic id                 (e.g. 1 -> 9)
    vis_to_model = dict(zip(model_to_vis.values(), model_to_vis.keys()))

    # look each name up by its pyLDAvis id, store it under the model id (e.g. 9: 'Did/Did not like')
    names_by_model_id = {}
    for vis_id, model_id in vis_to_model.items():
        names_by_model_id[model_id] = vis_topic_name_dict[vis_id]
    return names_by_model_id

In [None]:
topic_name_dict = get_topic_id_lookup_dict(vis, vis_topic_name_dict)

In [None]:
topic_name_dict

{0: 'Preservatives',
 1: 'Ingredients',
 2: 'Quality of product',
 3: 'Heavily processed',
 4: 'Dessert',
 5: 'Branding',
 6: 'Older products',
 7: 'Packaging',
 8: 'Cookies',
 9: 'Did/Did not like',
 10: 'Crackers',
 11: 'Company/Employees'}

In [None]:
def get_topic_ids_for_docs(lda_model, corpus):

    """
    Passes each Bag-of-Words vector into a trained LDA model and returns the id of
    the most probable topic for that document.

    Parameters
    ----------
    lda_model: Gensim object
        Must be a trained model exposing get_document_topics(bow)

    corpus: nested lists of tuples,
        i.e. [[(),(), ..., ()], [(),(), ..., ()], ..., [(),(), ..., ()]]

    Returns
    -------
    topic_id_list: list
        Contains one topic id per document vector in corpus, in corpus order

    Raises
    ------
    ValueError
        If the model returns no topics at all for a document
    """

    # store the most probable topic id for each document
    doc_topic_ids = []

    # iterate through bow vectors for each doc
    for doc_index, doc_bow in enumerate(corpus):

        # list of tuples
        # each tuple has a topic id and the prob that the doc belongs to that topic
        # (query the model that was passed in, not a global one)
        topic_id_prob_tuples = lda_model.get_document_topics(doc_bow)

        if not topic_id_prob_tuples:
            raise ValueError(f'model returned no topics for document {doc_index}')

        # pick the pair with the highest probability; on a tie the first pair wins
        most_probable = max(topic_id_prob_tuples, key=lambda topic_id_prob: topic_id_prob[1])
        doc_topic_ids.append(most_probable[0])

    return doc_topic_ids

In [None]:
# get the most probable topic id for each doc in the corpus
topic_id_list = get_topic_ids_for_docs(lda_trained_model, corpus)

# create a feature for document's topic id
# (corpus was built from part_2['lemmas'] in row order, so positions line up)
part_2["topic_id"] = topic_id_list

In [None]:
# assign topic names to rows by looking up each row's model topic id in
# topic_name_dict (a topic id without a name raises KeyError)
part_2['topic'] = [topic_name_dict[item] for item in part_2['topic_id']]