In [None]:
import pandas as pd
import numpy as np
import nltk
import string
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
df = pd.read_csv("Reviews.csv")

In [None]:
reviews = df.dropna()

In [None]:
reviews = df.head(10000).reset_index(drop=True)
reviews

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
9995,9996,B000P41A28,A3A63RACXR1XIL,"A. Boodhoo ""deaddodo""",10,15,1,1204502400,constipation,we switched from the advance similac to the or...
9996,9997,B000P41A28,A5VVRGL8JA7R,Adam,2,3,5,1306368000,Constipation Not A Problem if...,"Like the bad reviews say, the organic formula ..."
9997,9998,B000P41A28,A2TGDTJ8YCU6PD,geena77,0,0,5,1347494400,Love this formula!,I wanted to solely breastfeed but was unable t...
9998,9999,B000P41A28,AUV4GIZZE693O,"Susan Coe ""sueysis""",1,2,5,1203638400,very convenient,i love the fact that i can get this delieved t...


In [None]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
def preprocess_text(text):
    # a. lowercase
    text = text.lower()
    # b. remove punctuation and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    # c. tokenize
    tokens = nltk.word_tokenize(text)
    # d. remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # e. join tokens
    return " ".join(tokens)

In [None]:
cleaned_reviews = reviews['Text'].apply(preprocess_text)
cleaned_reviews

Unnamed: 0,Text
0,bought several vitality canned dog food produc...
1,product arrived labeled jumbo salted peanutsth...
2,confection around centuries light pillowy citr...
3,looking secret ingredient robitussin believe f...
4,great taffy great price wide assortment yummy ...
...,...
9995,switched advance similac organic product think...
9996,like bad reviews say organic formula constipat...
9997,wanted solely breastfeed unable keep supplemen...
9998,love fact get delieved house delievy chargeit ...


In [None]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)

In [None]:
def retrieve_reviews(query, top_k=5):
    # a. preprocess query
    cleaned_query = preprocess_text(query)
    # b. convert query to tf-idf vector
    query_vec = vectorizer.transform([cleaned_query])
    # c. compute cosine similarity
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # d. top k most similar
    top_indices = similarity.argsort()[-top_k:][::-1]
    # e. return results
    results = []
    for idx in top_indices:
        results.append({
            "original": reviews.iloc[idx],
            "cleaned": cleaned_reviews.iloc[idx],
            "score": similarity[idx]
        })
    return results

In [None]:
queries = ["great product with fast shipping", "disappointed"]

for q in queries:
    print(f"\n🔎 Query: {q}")
    results = retrieve_reviews(q, top_k=3)
    for i, res in enumerate(results, 1):
        print(f"\nResult {i}:")
        print(f"Original: {res['original']}")
        print(f"Cleaned: {res['cleaned']}")
        print(f"Score: {res['score']:.4f}")


🔎 Query: great product with fast shipping

Result 1:
Original: Id                                                                     5227
ProductId                                                        B0009OMU00
UserId                                                       A20HSF8AHEU7PT
ProfileName                                                   Mark Mitchell
HelpfulnessNumerator                                                      0
HelpfulnessDenominator                                                    0
Score                                                                     5
Time                                                             1229040000
Summary                                   Great product and would buy again
Text                      Enjoyed the product and they also provided ver...
Name: 5226, dtype: object
Cleaned: enjoyed product also provided fast shipping im need order
Score: 0.5024

Result 2:
Original: Id                                              

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
df = pd.read_csv("Reviews.csv")

In [None]:
df = df.dropna(subset=['Text'])

In [None]:
df = df.head(1000).reset_index(drop=True)

In [None]:
df = df.head(1000).reset_index(drop=True)

reviews = df['Text']

In [None]:
def preprocess_text_spacy(text):
    # a. lowercase
    text = text.lower()
    # b. tokenize with spacy
    doc = nlp(text)
    # c/d/e. filter non-alpha, remove stopwords, lemmatize
    tokens = [
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop
    ]
    # f. join back
    return " ".join(tokens)

In [None]:
cleaned_reviews = reviews.apply(preprocess_text_spacy)

In [None]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)

In [None]:
def retrieve_reviews(query, top_k=5):
    # Preprocess query
    cleaned_query = preprocess_text_spacy(query)
    # Convert to vector
    query_vec = vectorizer.transform([cleaned_query])
    # Cosine similarity
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Sort by score
    top_indices = similarity.argsort()[-top_k:][::-1]

    results = []
    for idx in top_indices:
        results.append({
            "original_review": reviews.iloc[idx],
            "cleaned_review": cleaned_reviews.iloc[idx],
            "similarity_score": float(similarity[idx])
        })
    return results

In [None]:
queries = ["great product with fast shipping", "disappointed"]

for q in queries:
    print(f"\n🔎 Query: {q}")
    results = retrieve_reviews(q, top_k=5)
    for i, res in enumerate(results, 1):
        print(f"\nResult {i}:")
        print(f"Original: {res['original_review']}")
        print(f"Cleaned: {res['cleaned_review']}")
        print(f"Score: {res['similarity_score']:.4f}")


🔎 Query: great product with fast shipping

Result 1:
Original: Use frequently as we like to do Asian dishes at least once a week.  Love this product.  Fast shipping, as usual.  Would buy again.
Cleaned: use frequently like asian dish week love product fast shipping usual buy
Score: 0.4004

Result 2:
Original: This stuff is great because it's low glycemic. Substitute this to sugar and you'll be doing your body a great favor.  This size is economical and shipping is fast, too.  I got mine very soon.
Cleaned: stuff great low glycemic substitute sugar body great favor size economical shipping fast get soon
Score: 0.3831

Result 3:
Original: good products and fast shipping equals a happy me. a little pricey but you can hand pick a few good flavors...a few i cant find anywhere else so def worth the price
Cleaned: good product fast shipping equal happy little pricey hand pick good flavor not find def worth price
Score: 0.3399

Result 4:
Original: These chips were a great deal.  Kettle was ha