In [1]:
## NLP with Machine Learning

### 1. Sentiment Analysis

In [2]:
import pandas as pd

# create a list of sentences
data = [
    "When life gives you lemons, make lemonade! ðŸ™‚",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon â€” there's a great sale today.",
    "iced tea is my favorite",
    "I didn't like the taste of that lemonade at all.",
    "My lemons went bad before I could use them, unfortunately.",
]

# show the full sentences instead of truncating long cell values
pd.set_option('display.max_colwidth', None)

# turn the list of sentences into a one-column dataframe
data_df = pd.DataFrame(data, columns=["sentence"])

# make a copy of the dataframe so the original stays untouched
# (only the last expression of a cell is rendered, so we display just the copy)
df = data_df.copy()
df.head()

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! ðŸ™‚"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon â€” there's a great sale today.


In [3]:
#now lets start the sentiment analysis, import the vader library
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
test = df.sentence[0]
test

'When life gives you lemons, make lemonade! ðŸ™‚'

In [5]:
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(test)
#the output gives the proportion of the text that is negative, neutral and positive, plus an overall compound score.

{'neg': 0.0, 'neu': 0.75, 'pos': 0.25, 'compound': 0.4587}

In [6]:
#if we just want the compound score instead of all the output
analyzer.polarity_scores(test)['compound']

0.4587

In [7]:
#above we just apply to single text string, lets apply it to all the above sentences, for that lets define a function

In [8]:
def get_sentiment(text, analyzer=None):
    """Return the VADER compound sentiment score for ``text``.

    Parameters
    ----------
    text : str
        The sentence (or document) to score.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a dict with
        a ``'compound'`` key. When omitted, one ``SentimentIntensityAnalyzer``
        is created on first use and reused on later calls.

    Returns
    -------
    float
        The ``'compound'`` entry of the polarity scores.
    """
    if analyzer is None:
        analyzer = getattr(get_sentiment, "_analyzer", None)
        if analyzer is None:
            # build the analyzer once and cache it on the function, so that
            # df.sentence.apply(get_sentiment) does not recreate it for every row
            analyzer = get_sentiment._analyzer = SentimentIntensityAnalyzer()
    return analyzer.polarity_scores(text)['compound']

In [9]:
df.sentence.apply(get_sentiment)

0    0.4587
1    0.0000
2    0.0000
3    0.0000
4    0.6249
5    0.4588
6   -0.2755
7   -0.7096
Name: sentence, dtype: float64

In [10]:
#just creating a column to compare side by side
df['sentiment'] = df.sentence.apply(get_sentiment)
df

Unnamed: 0,sentence,sentiment
0,"When life gives you lemons, make lemonade! ðŸ™‚",0.4587
1,She bought 2 lemons for $1 at Maven Market.,0.0
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],0.0
3,"lemon, lemon, lemons, lemon, lemon, lemons",0.0
4,He's running to the market to get a lemon â€” there's a great sale today.,0.6249
5,iced tea is my favorite,0.4588
6,I didn't like the taste of that lemonade at all.,-0.2755
7,"My lemons went bad before I could use them, unfortunately.",-0.7096


### 2. Text Classification

In [11]:
#GOAL: Predict which reviews are high priority (vs low priority) that we need to address right away

In [12]:
# import libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [13]:
reviews = pd.read_excel('Natural+Language+Processing+in+Python/Course Materials/Data/Popchip_Reviews.xlsx')
reviews.head()

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more."
2,23691,A30NYUHEDLWI0Y,5,Low,Great Alternative to Potato Chips,"I just love these chips! I was always a big fan of potato chips, but haven't had one since I discovered popchips. They are great for dipping or all alone. I am constantly re-ordering them. One note however-if you are on a low salt diet these chips are probably not for you. They are high in sodium. We go through a case every two months. If you love them it pays to join the subscribe and save program through Amazon. You save money and stay supplied!"
3,23692,A2NU55U9LKTB5J,3,High,Not somthing I would crave,"These tasted like potatoe stix, that we got in grade school with our lunches usually on pizza day. They were the bomb then, not so much now. Won't buy again unless I get them for cheap or free."
4,23693,A225F7QFP5LIW2,5,Low,healthy and delicious,"These chips are great! They look almost like a flattened rice cake, but taste so much better, more like a potato chip. The bbq flavor is delicious. They are very low in fat and full of flavor. It is easy to eat an entire bag of these!"


In [14]:
#now we want to create a model that will feed in the above reviews, and tell us which review is high priority and which is low priority.
#before doing that, lets just do some EDA, and clean the TEXT column.

In [15]:
reviews.shape

(564, 6)

In [16]:
reviews.Priority.value_counts()

Priority
Low     447
High    117
Name: count, dtype: int64

In [17]:
#we will import a file called maven_text_preprocessing, which has all the functions needed to clean the text
#we could paste that code here, but since we already have it in the file, let's just import it.
#Make sure the file is in the same folder as this notebook.

In [18]:
import maven_text_preprocessing

In [19]:
reviews['Text_Clean'] = maven_text_preprocessing.clean_and_normalize(reviews.Text)
reviews.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",like puff nature chip make unique chip market order salt vinegar absolutely love flavor hand favorite chip try cheddar regular flavor cheddar 45 regular 35 prefer strong flavor obviously case regular salt vinegar kind weak compare regular sv chip flavorful make want come


In [20]:
#now let's build a document-term matrix with CountVectorizer, which we will later feed to a Naive Bayes model

In [21]:
cv = CountVectorizer()
X = cv.fit_transform(reviews.Text_Clean)
X_df = pd.DataFrame(X.toarray(), columns = cv.get_feature_names_out())
X_df
#we basically created a doc term matrix, with every row as doc, and col as terms

Unnamed: 0,08,08ounce,0br,10,100,1000,100150,100cal,100calories,100cals,...,yuck,yucky,yum,yummy,yummybr,zero,zesty,zip,ziplock,zowie
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
560,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
561,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
562,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
#from the above output, see that there are many columns: 100cal, 100calories, 100cals, etc. Calories shows up in many different forms. Let's add some
#parameters to the CountVectorizer to reduce the number of columns (the size of the DTM). As a general rule of thumb, aim for about 1 column for every 20 rows, which helps prevent overfitting.

In [23]:
# stop_words='english' drops common English words, ngram_range=(1,2) keeps single words and word pairs,
# and min_df=.2 keeps only the terms that appear in at least 20% of the reviews
cv = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=.2)
X = cv.fit_transform(reviews.Text_Clean)
X_df = pd.DataFrame(X.toarray(), columns = cv.get_feature_names_out())
X_df

Unnamed: 0,bag,buy,calorie,chip,eat,flavor,good,great,like,love,popchip,potato,potato chip,salt,snack,taste,try
0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,4,0,3,0,0,1,1,0,0,0,2,0,0,1
2,0,0,0,3,0,0,0,1,0,2,1,1,1,1,0,0,0
3,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0
4,1,0,0,2,1,2,0,1,2,0,0,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0,0,0,3,3,1,1,5,0,1,1,4,3,0,0,1,0
560,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1
561,0,0,0,2,0,1,0,2,0,0,0,0,0,0,0,2,0
562,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0


In [24]:
#the above X_df will be our input, and the below y is our output.
y = reviews.Priority
y.head()

0     Low
1     Low
2     Low
3    High
4     Low
Name: Priority, dtype: object

In [25]:
# Train/test split: hold out 20% of the reviews for testing (random_state fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Model: fit a multinomial Naive Bayes classifier on the training rows only
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict the priority label of the held-out reviews
y_pred = model.predict(X_test)

# Evaluate on the held-out reviews
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# In the classification report, "support" is the number of TEST reviews with each true label: 94 Low and 19 High (113 in total).
# Precision: out of all the reviews the model predicted as a given class, how often was it right? 85% for Low, but only 60% for High.
# Recall: out of all the reviews that actually belong to a class, how many did the model find? 98% for Low, but only 16% for High; the remaining High reviews were labelled Low.
# F1 score is the harmonic mean of precision and recall.

Accuracy: 0.8407079646017699

Classification Report:
               precision    recall  f1-score   support

        High       0.60      0.16      0.25        19
         Low       0.85      0.98      0.91        94

    accuracy                           0.84       113
   macro avg       0.73      0.57      0.58       113
weighted avg       0.81      0.84      0.80       113



In [26]:
#now lets make some predictions with our model.
#lets bring in some sample reviews
new_reviews = pd.Series([
    "Pop chips are my favorite! I love these chips so much.",
    "Taste bad. I don't like the flavor options or taste.",
    "Solid snack."
])

In [27]:
new_reviews

0    Pop chips are my favorite! I love these chips so much.
1      Taste bad. I don't like the flavor options or taste.
2                                              Solid snack.
dtype: object

In [28]:
#lets follow the process to know which are the low and high priority reviews by our naive bayes model
#first thing to do is get data into same input format
#for that clean and normalize the data

In [29]:
# clean the new reviews with the same preprocessing function used for the original reviews
new_reviews_clean = maven_text_preprocessing.clean_and_normalize(new_reviews)
# now vectorize the data: call transform (not fit_transform) so the columns match the vocabulary cv was already fitted on
new_reviews_df = pd.DataFrame(cv.transform(new_reviews_clean).toarray(), columns=cv.get_feature_names_out())
new_reviews_df

Unnamed: 0,bag,buy,calorie,chip,eat,flavor,good,great,like,love,popchip,potato,potato chip,salt,snack,taste,try
0,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,2,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [30]:
model.predict(new_reviews_df)

array(['Low', 'High', 'Low'], dtype='<U4')

In [31]:
#now instead of cv, lets do tfidf, and instead of Naivebayes, lets do the logistic regression

In [32]:
tv = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=.2)
Xt = tv.fit_transform(reviews.Text_Clean)
Xt_df = pd.DataFrame(Xt.toarray(), columns = tv.get_feature_names_out())
Xt_df

Unnamed: 0,bag,buy,calorie,chip,eat,flavor,good,great,like,love,popchip,potato,potato chip,salt,snack,taste,try
0,0.000000,0.000000,0.0,0.392603,0.656435,0.000000,0.000000,0.000000,0.000000,0.000000,0.644170,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.0,0.561185,0.000000,0.537701,0.000000,0.000000,0.195524,0.213766,0.000000,0.000000,0.000000,0.513094,0.000000,0.000000,0.220814
2,0.000000,0.000000,0.0,0.517908,0.000000,0.000000,0.000000,0.295101,0.000000,0.526082,0.283255,0.277355,0.333330,0.315684,0.000000,0.000000,0.000000
3,0.000000,0.690063,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.512918,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.510616,0.000000
4,0.252776,0.000000,0.0,0.340747,0.284866,0.435318,0.000000,0.291234,0.474884,0.000000,0.000000,0.273721,0.328962,0.000000,0.000000,0.236376,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0.000000,0.000000,0.0,0.216106,0.361330,0.092028,0.103897,0.615680,0.000000,0.109758,0.118193,0.462925,0.417263,0.000000,0.000000,0.099942,0.000000
560,0.381673,0.000000,0.0,0.000000,0.000000,0.328649,0.000000,0.439742,0.000000,0.000000,0.422089,0.000000,0.000000,0.000000,0.459181,0.000000,0.404891
561,0.000000,0.000000,0.0,0.399843,0.000000,0.255407,0.000000,0.683486,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.554742,0.000000
562,0.000000,0.000000,0.0,0.000000,0.000000,0.537244,0.606536,0.000000,0.586074,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [33]:
#lets use the same y-value as before

In [34]:
y.head()

0     Low
1     Low
2     Low
3    High
4     Low
Name: Priority, dtype: object

In [35]:
#now we have input and output, lets fit a model., just changing the names

In [36]:
# Train/test split: same test_size and random_state as the Naive Bayes split, now on the TF-IDF features
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt_df, y, test_size=0.2, random_state=42)

# Model: fit a logistic regression classifier on the training rows only
model_lr = LogisticRegression()
model_lr.fit(Xt_train, yt_train)

# Predict the priority label of the held-out reviews
y_pred_lr = model_lr.predict(Xt_test)

# Evaluate on the held-out reviews
print("Accuracy:", accuracy_score(yt_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(yt_test, y_pred_lr))


# When you compare the NB and LR reports, check the recall for the High class: NB finds 16% of the high-priority reviews, while LR finds only 5%.
# Our goal is to predict which reviews are high priority, so a model that captures only 5% of them is a problem even though its accuracy is the same 84%.
# We need to optimize the model specifically for that metric, which we will see later.

Accuracy: 0.8407079646017699

Classification Report:
               precision    recall  f1-score   support

        High       1.00      0.05      0.10        19
         Low       0.84      1.00      0.91        94

    accuracy                           0.84       113
   macro avg       0.92      0.53      0.51       113
weighted avg       0.87      0.84      0.78       113



In [37]:
#lets compare the models using their prediction scores

In [38]:
reviews.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",like puff nature chip make unique chip market order salt vinegar absolutely love flavor hand favorite chip try cheddar regular flavor cheddar 45 regular 35 prefer strong flavor obviously case regular salt vinegar kind weak compare regular sv chip flavorful make want come


In [39]:
# So far we used .predict() to get a High/Low label; predict_proba gives the probability of each class instead.
# We apply it to the full input data (before the train/test split) so that every review gets a predicted probability.
# Every row is a review and every column is a class, in the order given by model.classes_.
# We are only interested in the High-priority column, so we look its position up by label instead of hard-coding 0.
model.predict_proba(X_df)[:, list(model.classes_).index('High')]

array([2.00606025e-01, 3.05999182e-01, 4.50902259e-02, 4.56157826e-01,
       3.20883714e-01, 4.18204899e-01, 1.54005429e-01, 3.75503577e-01,
       1.78683480e-01, 2.83890796e-01, 7.26104077e-02, 1.33490244e-05,
       1.93733762e-01, 1.38622499e-01, 1.10357785e-01, 1.47288781e-01,
       2.39527407e-01, 2.50905850e-02, 3.54375244e-01, 6.02176877e-01,
       2.46526372e-01, 7.57451449e-01, 1.93450910e-01, 1.88707238e-01,
       5.37637671e-01, 3.74549055e-01, 4.91842071e-01, 8.87985501e-02,
       3.07093068e-01, 2.22732121e-01, 1.43487730e-01, 2.02852387e-01,
       1.54249969e-01, 1.68274969e-01, 4.21893217e-02, 9.62578354e-02,
       3.87608333e-01, 1.75239894e-01, 9.01422882e-02, 1.92344597e-01,
       1.99513854e-01, 9.23946972e-02, 1.37767006e-01, 2.53028853e-01,
       2.13769548e-01, 1.42396287e-01, 1.05727556e-01, 2.34878470e-01,
       1.64877877e-01, 8.20076513e-02, 2.11173147e-01, 1.15203761e-01,
       1.89430799e-01, 2.74138256e-01, 3.97848365e-01, 2.53163800e-01,
      

In [40]:
# probability of the High class for every review; the column is located via classes_ rather than a hard-coded 0
model_lr.predict_proba(Xt_df)[:, list(model_lr.classes_).index('High')]

array([0.2590195 , 0.22819851, 0.07517457, 0.55084249, 0.28135196,
       0.42143348, 0.13014302, 0.45950577, 0.19000063, 0.20928769,
       0.08128715, 0.11913647, 0.18970972, 0.12345081, 0.17529228,
       0.12367452, 0.22577408, 0.09994249, 0.27437717, 0.33074725,
       0.27090502, 0.24832781, 0.23109895, 0.13185631, 0.26086998,
       0.34978928, 0.53080851, 0.15765899, 0.28110348, 0.20123437,
       0.20087253, 0.16332925, 0.13850747, 0.15590012, 0.11812842,
       0.09899561, 0.32010859, 0.21458308, 0.09494073, 0.19633111,
       0.20512783, 0.10352704, 0.19914808, 0.27270382, 0.22237213,
       0.22405244, 0.11570263, 0.24937863, 0.19781639, 0.13421686,
       0.17134759, 0.1937814 , 0.17032817, 0.33000877, 0.39657518,
       0.20886955, 0.12573129, 0.17000677, 0.30940847, 0.13326123,
       0.17506651, 0.07552167, 0.13144144, 0.15706054, 0.12930077,
       0.28183166, 0.25899659, 0.14538294, 0.57666407, 0.1204422 ,
       0.16132391, 0.30459573, 0.10361138, 0.36380046, 0.07670

In [41]:
#lets add the above two outputs as two new columns to compare

In [42]:
# Naive Bayes probability that each review is High priority (column located via classes_ rather than a hard-coded 0)
reviews['Predictions_NB'] = model.predict_proba(X_df)[:, list(model.classes_).index('High')]

In [43]:
# Logistic Regression probability that each review is High priority (column located via classes_ rather than a hard-coded 0)
reviews['Predictions_LR'] = model_lr.predict_proba(Xt_df)[:, list(model_lr.classes_).index('High')]

In [44]:
reviews.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean,Predictions_NB,Predictions_LR
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save,0.200606,0.259019
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",like puff nature chip make unique chip market order salt vinegar absolutely love flavor hand favorite chip try cheddar regular flavor cheddar 45 regular 35 prefer strong flavor obviously case regular salt vinegar kind weak compare regular sv chip flavorful make want come,0.305999,0.228199


In [45]:
#lets sort the data based on Predictions_NB, so that we could know which reviews does Naive Bayes think are high priority compared to LR

In [46]:
reviews.sort_values(by='Predictions_NB', ascending=False).head()

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean,Predictions_NB,Predictions_LR
550,24239,A2ZKS33N6Y3EPC,3,High,"Taste more like ""Tomato and Basil"" than ""Chili and Lime""","NOTE: This review is for the Chili and Lime Flavor Popchip. Amazon had a separate page for it but then merged the product and its reviews into one.<br /><br />It's hard to objectively review food since everyone's palate and tastes are different. So what I can say about this particular Popchip flavor that should be useful for most folks out there is that it doesn't really taste like Chili and Lime you're ""probably"" expecting. The Chili and Lime most folks probably are expecting if they grew up on Frito Lay products is very sharp and sweet (and of course artificial) - but it's what we liked if we ate more than a bag.<br /><br />The best way I can describe this flavor is that it has a ""tomato"" like taste to it with a somewhat tangy sour note that is suppose to be the lime component. Together they turn into an odd combination that registers other flavors in your mind than Chili and Lime - at least it did to me and others who tasted it with me. If you eat the skin of a green bell pepper, you can kind of get at what Popchips were trying to do with the Chili taste on this version, but I have no idea how some sour salt can be akin to lime. For myself personally, I thought it tasted like ""Tomato and Basil"" you would find on Pita chip flavors and baked snacks.<br /><br />Whether or not you agree with my above description of the flavor, I would highly suggest you try to get this in a sample pack and try it out first. 
BBQ + Salt & Vinegar Popchips are still my staples for now.",note review chili lime flavor popchip amazon separate page merge product review onebr br hard objectively review food everyone palate taste different particular popchip flavor useful folk not taste like chili lime probably expect chili lime folk probably expect grow frito lay product sharp sweet course artificial like eat bagbr br good way describe flavor tomato like taste somewhat tangy sour note suppose lime component turn odd combination register flavor mind chili lime taste eat skin green bell pepper kind popchip try chili taste version idea sour salt akin lime personally think taste like tomato basil find pita chip flavor bake snacksbr br agree description flavor highly suggest try sample pack try bbq salt vinegar popchip staple,0.973989,0.478529
96,23785,AE5AHEH3NLPBZ,3,High,Tastes Like Celery,"I really like pop chips, but this flavor isn't the best. I was expecting these to taste like chili peppers and lime (Spicy, Sweet, and Tart), but instead of going for a chili pepper taste, they went for a chili the food taste. This wouldn't be so bad, except they taste overwhelmingly of tomato and celery. The reason they didn't call them Tomato and Celery Chips is because it is sounds gross and no one would buy that, and unfortunately it tastes like it sounds.",like pop chip flavor not good expect taste like chili pepper lime spicy sweet tart instead go chili pepper taste go chili food taste not bad taste overwhelmingly tomato celery reason not tomato celery chip sound gross buy unfortunately taste like sound,0.854032,0.495605
463,24152,A2ZMMQ4W17EK2N,2,High,Original PopChips,"Bought the Original flavor from the store and just tried them tonight. They were very greasy and salty. I did not like them a lot. I will not purchase this original flavor again. However I can't complain because I got the 3 ounce bag for only $1.00 at the store while they were on sale. I tried the BBq flavor and they are delicious. I bought the sea salt & vinegar, and cheddar but haven't tried those yet.",buy original flavor store try tonight greasy salty like lot purchase original flavor not complain get 3 ounce bag 100 store sale try bbq flavor delicious buy sea salt vinegar cheddar not try,0.760037,0.439521
21,23710,ASIMCC20UVK58,5,Low,Great Chips Less Fat,"I eat chips almost every day and decided I wanted to find something that tastes as good but is lighter on unnecessary fat than regular types of chips. I bought a case of Popchips BBQ. These are satisfying and taste great. They don't taste exactly like any full fat chip products I've had mainly because they're not greasy at all, but they have a nice BBQ potato chip flavor. These are thick, crunchy, and light. I first bought the .8 oz bags and this serving size is on the small side for me with lunch (would probably be alright for a snack). 3 of the .8 oz bags works for me which of course bumps up the fat intake, but considering the same volume of ""regular"" chips has much more fat it is a significant fat decrease overall which is what I was looking for. I find the 3 ounce bags to be perfect. Even eating all 3 ounces works out to significantly less fat and calories than eating the same volume of other chips. This makes Popchips very satisfying to me, and I have bought many cases through Amazon.<br /><br />Heads up (mid-2011): Unfortunately the price has gone up significantly for these chips through Amazon, causing me to cancel my subscribe & save subscriptions. Popchips have popped up in local stores for significantly less per ounce. I love the convenience of the portioned bags and subscription but it's hard to justify paying double for the same product.<br /><br />The flavors are pretty straight forward but here's my thoughts...<br />Original flavor: Tastes like a plain potato chip minus the grease. Not my favorite flavor, but good for what it is. This flavor would probably be good with some kind of dip.<br />Chedder: Cheddar quickly became tied with BBQ for my favorite. Like BBQ the cheddar flavor is very strong. Great chips.<br />Salt & Pepper: Very strong pepper. To enjoy these you have to really like pepper. 
I like them, but they're not a favorite.<br />Sea Salt & Vinegar: I'm not a fan of vinegar, but strangely I enjoy this flavor. They're indeed salty with a fairly strong vinegar flavor.",eat chip day decide want find taste good light unnecessary fat regular type chip buy case popchip bbq satisfy taste great not taste exactly like fat chip product ve mainly greasy nice bbq potato chip flavor thick crunchy light buy 8 oz bag serve size small lunch probably alright snack 3 8 oz bag work course bump fat intake consider volume regular chip fat significant fat decrease overall look find 3 ounce bag perfect eat 3 ounce work significantly fat calorie eat volume chip make popchip satisfying buy case amazonbr br head mid2011 unfortunately price go significantly chip amazon cause cancel subscribe save subscription popchip pop local store significantly ounce love convenience portion bag subscription hard justify pay double productbr br flavor pretty straight forward here thoughtsbr original flavor taste like plain potato chip minus grease favorite flavor good flavor probably good kind dipbr chedder cheddar quickly tie bbq favorite like bbq cheddar flavor strong great chipsbr salt pepper strong pepper enjoy like pepper like favoritebr sea salt vinegar m fan vinegar strangely enjoy flavor salty fairly strong vinegar flavor,0.757451,0.248328
157,23846,A1HYH206E18XVC,5,Low,Tangy and terrific,"When I asked my older daughter to describe this flavor, she said to be sure to mention the word tangy. That is a fair description as the lime does heighten the taste buds and enhances the slight heat from the chili.<br /><br />My family really enjoys this flavor and it is among our favorites. We have tried most of the other varieties of Popchips and have our own preferences. My older daughter likes salt and pepper, barbecue, and this flavor the best. I like barbecue, sour cream and onion, and this flavor the best. My wife likes salt and vinegar and this flavor the best. My younger daughter does not like this flavor. She prefers barbecue, cheese, and sour cream and onion. Our least favorite is the original, probably because it is so plain by comparison.<br /><br />To me, Popchips are sort of a cross of potato chips, popcorn, and rice cakes. They are potato, but popped like popcorn and sort of puffy like rice cakes. They definitely have more flavor than many rice cakes and are a nice alternative to popcorn. They also can be used with dips although they never seem to last very long in our house.",ask old daughter describe flavor say sure mention word tangy fair description lime heighten taste bud enhance slight heat chilibr br family enjoy flavor favorite try variety popchip preference old daughter like salt pepper barbecue flavor good like barbecue sour cream onion flavor good wife like salt vinegar flavor good young daughter like flavor prefer barbecue cheese sour cream onion favorite original probably plain comparisonbr br popchip sort cross potato chip popcorn rice cake potato pop like popcorn sort puffy like rice cake definitely flavor rice cake nice alternative popcorn dip long house,0.728147,0.305065


### 3. Topic Modeling (Non-Negative Matrix Factorization)

In [47]:
#Goal: Find the main themes in the reviews

In [48]:
#First thing we need to do in the Topic Modeling is to input the vectorized data, be it a cv or tfidf, lets bring the data from above
#for NMF tfidf tend to work better, lets pull that down here

In [49]:
#we set both min_df and max_df because, if we only use min_df as before, we get just 16 columns, which is too few for topic modeling; we need more columns
#so we give a range: only terms that appear in between 5% and 20% of the reviews are kept.
tv2 = TfidfVectorizer(stop_words='english', min_df = 0.05, max_df=.2)
Xt2 = tv2.fit_transform(reviews.Text_Clean)
Xt_df2 = pd.DataFrame(Xt2.toarray(), columns = tv2.get_feature_names_out())
Xt_df2

Unnamed: 0,100,alternative,amazon,bad,bake,baked,bbq,big,bit,box,...,thing,think,time,variety,ve,vinegar,want,way,weight,work
0,0.0,0.465515,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.348295,0.193511,0.000000,0.000000,0.000000
2,0.0,0.000000,0.354088,0.000000,0.0,0.000000,0.000000,0.428869,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.354475,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,0.0,0.324462,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.299888,0.0,0.000000,0.000000,0.000000,0.337388,0.657147,0.000000
560,0.0,0.000000,0.190702,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.197896,0.000000,0.208227,0.000000,0.000000,0.247474
561,0.0,0.378621,0.000000,0.380993,0.0,0.396437,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
562,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.543142,0.000000,0.000000,0.000000,0.000000,0.000000


In [50]:
#lets create an NMF model

In [51]:
from sklearn.decomposition import NMF

In [52]:
#Instantiate and fit the model: n_components is the number of topics we want (5 here).
#random_state is fixed so W and H come out the same on every run, and max_iter is raised
#because 5 components hit the max-iterations message with the default setting.
nmf = NMF(n_components=5, max_iter=500, random_state=42)
W = nmf.fit_transform(Xt_df2)  #documents-to-topics matrix
H = nmf.components_  #topics-to-terms matrix

In [53]:
#the steps above created the matrices, but we want to see the actual topics that were found.
#to do that, we are going to write a function that displays the top terms in each topic

In [54]:
H.shape #(topics, terms): from the output we can see that we have 5 topics and 81 terms

(5, 81)

In [55]:
#let's take a look at the first topic
H[0]
#The output holds the weight of every term for this topic: the bigger the weight, the more relevant that term is to the topic.
#For example, the term with the largest weight in this row is the most important term for topic 1, while terms with a weight of 0 are not relevant to topic 1 at all.

array([2.27467301e-01, 0.00000000e+00, 1.05812554e+00, 1.90686508e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.35972253e-01,
       1.35628300e-02, 5.60885872e-01, 0.00000000e+00, 9.25628481e-01,
       0.00000000e+00, 2.64011798e-01, 1.48778731e-02, 4.32519339e-02,
       2.06228857e-01, 4.84590839e-02, 2.34325144e-01, 0.00000000e+00,
       6.92355490e-02, 9.77982923e-02, 7.64422035e-02, 8.50347323e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.36187942e-01,
       0.00000000e+00, 6.54084363e-02, 8.57621722e-02, 1.39633702e-01,
       0.00000000e+00, 5.47357474e-02, 0.00000000e+00, 3.98725569e-01,
       2.82558641e-02, 0.00000000e+00, 1.24328433e-01, 1.08049224e-01,
       1.46707201e-01, 0.00000000e+00, 1.36364600e-02, 0.00000000e+00,
       0.00000000e+00, 2.29103253e+00, 1.17250511e-01, 2.75259158e-01,
       0.00000000e+00, 2.09892619e-01, 4.42006137e-01, 2.90102692e-01,
       3.15638492e-02, 1.70030147e-01, 1.67667917e-01, 0.00000000e+00,
      

In [56]:
#For every topic, look at its row of term weights and report the terms that carry the highest weights.
def display_topics(H, num_words=10, feature_names=None):
    """Print the highest-weighted terms of every topic in a topic-term matrix.

    Parameters
    ----------
    H : numpy array, one row per topic and one column per term
        Topic-term weights, e.g. ``nmf.components_``.
    num_words : int, default 10
        How many top terms to print for each topic.
    feature_names : sequence of str, optional
        The term behind each column of ``H``. When omitted, the vocabulary of
        the notebook's fitted ``tv2`` vectorizer is used, which matches the
        original behaviour of this function.

    Returns
    -------
    None
        One line per topic is printed, e.g. ``Topic 1 : term_a, term_b``.
    """
    if feature_names is None:
        # Fetch the vocabulary once up front instead of once per printed term.
        feature_names = tv2.get_feature_names_out()

    for topic_num, topic_array in enumerate(H):
        # argsort() orders indices from lowest to highest weight; reverse it to
        # get highest first, then keep only the first num_words indices.
        top_features = topic_array.argsort()[::-1][:num_words]
        # Translate the column indices back into the actual terms.
        top_words = [feature_names[i] for i in top_features]
        # topic_num + 1 so topics are labelled 1, 2, ... rather than 0, 1, ...
        print('Topic', topic_num+1, ':', ', '.join(top_words))

In [57]:
display_topics(H) #print the top 10 terms (the default num_words) of every topic in H

Topic 1 : order, amazon, case, time, store, box, thing, price, know, product
Topic 2 : sweet, salty, br, light, rice, texture, think, little, crunchy, fry
Topic 3 : healthy, alternative, bbq, delicious, regular, feel, work, enjoy, nice, look
Topic 4 : br, vinegar, bbq, favorite, pepper, original, lime, think, sea, sour
Topic 5 : fat, low, weight, pop, regular, serve, diet, crunch, single, tasty


In [58]:
#let's tune the NMF now. With n_components=2 the output above has only 2 topics, and we can't draw any conclusion from the terms in them,
#so change the n_components value to 3, then 4, then 5, and look at the output each time.
#At 5 components you will get a message saying the maximum number of iterations was reached; what we need to do is increase the number of iterations (max_iter).
#To check how many iterations are used by default, press Shift+Tab to see the max_iter value. We also keep random_state fixed so the model converges consistently and gives the same W and H weights on every run.

In [59]:
#the next step is to bring the topic output above back to our original reviews data; first, a reminder of what that data looks like
reviews.head(2)

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean,Predictions_NB,Predictions_LR
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save,0.200606,0.259019
1,23690,AQJYXC0MPRQJL,5,Low,Great chip that is different from the rest,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",like puff nature chip make unique chip market order salt vinegar absolutely love flavor hand favorite chip try cheddar regular flavor cheddar 45 regular 35 prefer strong flavor obviously case regular salt vinegar kind weak compare regular sv chip flavorful make want come,0.305999,0.228199


In [60]:
#lets add few more columns about the topics into the reviews data.
#for that we are gonna use the W matrix

In [61]:
W #564 rows are reviews and 5 columns are topics; the raw array below isn't easy to read, so we tidy it up in the next steps

array([[0.        , 0.        , 0.40301218, 0.        , 0.        ],
       [0.05508035, 0.        , 0.02375526, 0.11517912, 0.08804791],
       [0.06778713, 0.        , 0.        , 0.        , 0.15389049],
       ...,
       [0.08472662, 0.        , 0.20048196, 0.        , 0.09120349],
       [0.01907328, 0.        , 0.        , 0.08550474, 0.03763075],
       [0.        , 0.        , 0.        , 0.03905461, 0.07404591]],
      shape=(564, 5))

In [62]:
#lets change it into a dataframe

In [63]:
#Wrap W (one row per review, one column per topic) in a dataframe.
#The column names are our own labels for Topics 1-5 printed by display_topics, in that order,
#so this list must be updated whenever n_components or the topics change.
#Reusing reviews.index keeps each row tied to its review, so the later side-by-side
#concat with the reviews data lines up even if reviews does not have a default 0..n-1 index.
doc_to_topics = pd.DataFrame(
    W,
    index=reviews.index,
    columns=['orders', 'taste & texture', 'good', 'flavour', 'health'],
)
doc_to_topics

Unnamed: 0,orders,taste & texture,good,flavour,health
0,0.000000,0.000000,0.403012,0.000000,0.000000
1,0.055080,0.000000,0.023755,0.115179,0.088048
2,0.067787,0.000000,0.000000,0.000000,0.153890
3,0.017647,0.002463,0.000000,0.000000,0.029204
4,0.000000,0.016166,0.040860,0.044669,0.190659
...,...,...,...,...,...
559,0.025953,0.010370,0.050308,0.000000,0.168847
560,0.108660,0.000000,0.022080,0.157261,0.032282
561,0.084727,0.000000,0.200482,0.000000,0.091203
562,0.019073,0.000000,0.000000,0.085505,0.037631


In [64]:
#now display this side by side with our reviews data

In [65]:
reviews.head(1) #reminder of the reviews columns before placing the topic weights next to them

Unnamed: 0,Id,UserId,Rating,Priority,Title,Text,Text_Clean,Predictions_NB,Predictions_LR
0,23689,A21SYVGVNG8RAS,5,Low,Yummy snacks!,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,popchip bomb use parmesan garlic scoop cottage cheese healthy alternative chip dip healthy eat program save,0.200606,0.259019


In [66]:
#Place each review's original text next to its topic weights (rows are matched on the index).
reviews_and_topics = pd.concat(objs=[reviews['Text'], doc_to_topics], axis='columns')
reviews_and_topics

Unnamed: 0,Text,orders,taste & texture,good,flavour,health
0,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.000000,0.000000,0.403012,0.000000,0.000000
1,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.055080,0.000000,0.023755,0.115179,0.088048
2,"I just love these chips! I was always a big fan of potato chips, but haven't had one since I discovered popchips. They are great for dipping or all alone. I am constantly re-ordering them. One note however-if you are on a low salt diet these chips are probably not for you. They are high in sodium. We go through a case every two months. If you love them it pays to join the subscribe and save program through Amazon. You save money and stay supplied!",0.067787,0.000000,0.000000,0.000000,0.153890
3,"These tasted like potatoe stix, that we got in grade school with our lunches usually on pizza day. They were the bomb then, not so much now. Won't buy again unless I get them for cheap or free.",0.017647,0.002463,0.000000,0.000000,0.029204
4,"These chips are great! They look almost like a flattened rice cake, but taste so much better, more like a potato chip. The bbq flavor is delicious. They are very low in fat and full of flavor. It is easy to eat an entire bag of these!",0.000000,0.016166,0.040860,0.044669,0.190659
...,...,...,...,...,...,...
559,"I love potato chips. I could eat them by the bagful but thanks to the powers that be, this would not provide anyone with enough nutrition to survive. Nonetheless I have eaten my share of potato chips over the years, and perhaps as a result I have been watching my weight lately. I discovered these great popchips and they hit the spot in a number of ways. First they are a low-calorie alternative to regular potato chips, meaning I can eat a normal-sized portion and still keep on target with my weight loss goals. Second, they are gluten-free, which is great for those of us who have gluten intolerance issues. Third, they taste great - light and airy, crispy, rich in that great potato flavor that keeps me reaching for another one. This is a great product and I hope it stays around for a good long time.",0.025953,0.010370,0.050308,0.000000,0.168847
560,"When PopChips were really hard to find, I was ordering them by the case from Amazon on a regular basis. The price was always great and these really are my go-to snack. They are always fresh and delicious from Amazon.<br /><br />I definitely prefer Original over barbecue and sour cream & onion. I haven't tried other flavors, but Original works just fine for me because I use it for various dips and will even use dry seasonings on them if I'm in the mood.<br /><br />I don't know anybody that I've offered some of these to who didn't want a bag of their own! Highly recommended.",0.108660,0.000000,0.022080,0.157261,0.032282
561,These are a much healthy alternative to most chips and they taste great. They have a great crunch and flavor and don't have that bad after taste that most baked chips have. My only regret is that I didn't order more when they were on sale there all gone now but even at regular price are worth it.,0.084727,0.000000,0.200482,0.000000,0.091203
562,"These are so good, I've started getting them automatically. I like the original flavor, but they have a lot of others.",0.019073,0.000000,0.000000,0.085505,0.037631


### 4. Combining topic modeling, Sentiment analyses, and EDA

In [67]:
#Work on an independent copy so the steps below leave reviews_and_topics untouched.
final_topics = reviews_and_topics.copy(deep=True)
final_topics.head(n=2)

Unnamed: 0,Text,orders,taste & texture,good,flavour,health
0,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.0,0.0,0.403012,0.0,0.0
1,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.05508,0.0,0.023755,0.115179,0.088048


In [68]:
#now I am going to add a column that assigns each review its single largest topic; for example, the largest value is 'good' for the first review, and 'flavour' for the second review.
#I will use a pandas method called idxmax to do that

In [69]:
#For every row, find the topic column that holds the largest weight.
#The topic columns are selected by name (taken from doc_to_topics) instead of by position with iloc[:, 1:],
#so re-running this cell after 'top_topic' or 'sentiment' have been added still compares topic weights only.
#NOTE: idxmax returns the first column on ties, so a review whose weights are all 0 is labelled with the first topic.
topic_cols = list(doc_to_topics.columns)
final_topics['top_topic'] = final_topics[topic_cols].idxmax(axis=1)
final_topics.head(2)

Unnamed: 0,Text,orders,taste & texture,good,flavour,health,top_topic
0,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.0,0.0,0.403012,0.0,0.0,good
1,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.05508,0.0,0.023755,0.115179,0.088048,flavour


In [70]:
#now let's do the sentiment analysis here; this re-declares the get_sentiment helper defined earlier in the notebook
def get_sentiment(text, analyzer=None):
    """Return the VADER compound sentiment score for a piece of text.

    Parameters
    ----------
    text : str
        The text to score.
    analyzer : object with a ``polarity_scores(text)`` method, optional
        Analyzer to use. When omitted, a single ``SentimentIntensityAnalyzer``
        is created on first use and reused on later calls, rather than being
        rebuilt for every row when the function is used with ``.apply``.

    Returns
    -------
    The ``'compound'`` entry of the analyzer's polarity scores.
    """
    if analyzer is None:
        # Build the default analyzer only once and cache it on the function.
        if not hasattr(get_sentiment, '_analyzer'):
            get_sentiment._analyzer = SentimentIntensityAnalyzer()
        analyzer = get_sentiment._analyzer
    return analyzer.polarity_scores(text)['compound']

In [71]:
#Score every review with get_sentiment and store the result in a new 'sentiment' column.
final_topics['sentiment'] = final_topics['Text'].map(get_sentiment)
final_topics.head(n=2)

Unnamed: 0,Text,orders,taste & texture,good,flavour,health,top_topic,sentiment
0,Popchips are the bomb!! I use the parmesan garlic to scoop up cottage cheese as a healthy alternative to chips and dip. My healthy eating program is saved.,0.0,0.0,0.403012,0.0,0.0,good,0.9244
1,"I like the puffed nature of this chip that makes it more unique in the chip market. I ordered the Salt and Vinegar and absolutely love that flavor, hands down my favorite chip ever. I have tried the cheddar and regular flavors as well. The cheddar is about a 4/5 and the regular is about a 3/5 because I prefer strong flavors and obviously that would not be the case for the regular. The Salt and Vinegar is kind of weak compared to some regular S&V chips, but is quite flavorful and makes you wanting to come back for more.",0.05508,0.0,0.023755,0.115179,0.088048,flavour,0.7269


In [72]:
#now we can do some EDA here, my goal is for every topic, I wanna know the avg sentiment score, and see how customers feel about that topic

In [73]:
#Average sentiment score of the reviews in each top topic, listed from lowest to highest.
sentiment_by_topic = final_topics.groupby('top_topic')['sentiment'].mean()
sentiment_by_topic.sort_values()

top_topic
orders             0.504758
health             0.711142
flavour            0.768537
good               0.816834
taste & texture    0.842701
Name: sentiment, dtype: float64