In [2]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('stopwords')
nltk.download('vader_lexicon')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mikail\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Mikail\AppData\Roaming\nltk_data...


True

In [3]:
df = pd.read_csv("reviews.csv", encoding="latin1")
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
df = df[['ProductId', 'Score', 'Summary', 'Text']]
df.dropna(inplace=True)


In [5]:
df = df.sample(20000, random_state=42)


In [6]:
df['review_text'] = df['Summary'] + " " + df['Text']


In [7]:
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_review'] = df['review_text'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mikail\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()
df['sentiment'] = df['clean_review'].apply(lambda x: sia.polarity_scores(x)['compound'])


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Mikail\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
df[['Score', 'sentiment']].corr()


Unnamed: 0,Score,sentiment
Score,1.0,0.52093
sentiment,0.52093,1.0


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf.fit_transform(df['clean_review'])

feature_names = tfidf.get_feature_names_out()


In [11]:
top_words = np.argsort(tfidf.idf_)[:20]
[feature_names[i] for i in top_words]


['like',
 'good',
 'great',
 'taste',
 'one',
 'love',
 'product',
 'flavor',
 'br',
 'would',
 'get',
 'best',
 'really',
 'dont',
 'much',
 'also',
 'coffee',
 'little',
 'buy',
 'time']

In [12]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X_tfidf)

def display_topics(model, feature_names, n_top_words=8):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        print([feature_names[i] for i in topic.argsort()[-n_top_words:]])
        print()

display_topics(lda, feature_names)


Topic 0:
['taste', 'sauce', 'good', 'use', 'like', 'great', 'product', 'br']

Topic 1:
['great', 'loves', 'cat', 'love', 'treats', 'dogs', 'dog', 'food']

Topic 2:
['great', 'chocolate', 'good', 'like', 'drink', 'taste', 'flavor', 'tea']

Topic 3:
['great', 'like', 'strong', 'kcups', 'good', 'flavor', 'cup', 'coffee']

Topic 4:
['love', 'find', 'order', 'good', 'price', 'amazon', 'product', 'great']



In [None]:
### Marketing Insights

- Topics associated with negative sentiment highlight customer pain points and areas for product or messaging improvement.
- Positive sentiment topics reveal features and language that resonate with customers and can be emphasized in marketing and SEO content.
- Insights can inform keyword strategy, content optimization, and customer experience improvements.


In [13]:
tfidf = TfidfVectorizer(
    max_features=1000,
    min_df=5,
    max_df=0.9
)
X_tfidf = tfidf.fit_transform(df['clean_review'])


In [14]:
df['topic'] = lda.transform(X_tfidf).argmax(axis=1)

df.groupby('topic')['sentiment'].mean().sort_values()


topic
0    0.697909
3    0.713487
4    0.722097
1    0.739784
2    0.784496
Name: sentiment, dtype: float64

In [None]:
Topics with lower average sentiment scores indicate potential customer pain points, while higher-scoring topics highlight messaging and features that resonate positively with customers.


In [None]:
Topic Interpretation

- Topic 0 (Product taste & quality): Mixed sentiment, potential opportunity to refine messaging around flavor consistency.
- Topic 1 (Pet food satisfaction): Strong positive sentiment, suitable for trust-building and testimonial-driven content.
- Topic 2 (Beverages & flavor variety): Highest sentiment, highlighting features to emphasize in SEO and product descriptions.
- Topic 3 (Coffee strength & flavor): Moderate sentiment, indicating room for expectation-setting in content.
- Topic 4 (Price & ordering experience): Generally positive sentiment, useful for conversion-focused messaging.
