In [57]:
import pandas as pd

In [59]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
import nltk
from nltk.corpus import stopwords

In [63]:
df = pd.read_csv("Reviews.csv")

In [65]:
reviews = df["Text"]

In [67]:
reviews = reviews.dropna()

In [69]:
reviews = reviews.sample(10000, random_state=42)

In [71]:
stop_words = set(stopwords.words("english"))

In [73]:
def preprocess_text(text):
    text = text.lower()                                    
    text = re.sub(r'[^a-zA-Z\s]', '', text)                 
    tokens = text.split()                                   
    tokens = [word for word in tokens if word not in stop_words]  
    return " ".join(tokens)     

In [75]:
cleaned_reviews = reviews.apply(preprocess_text)

In [77]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)

In [79]:
def search_reviews(query, top_k=5):
    cleaned_query = preprocess_text(query)                      
    query_vector = vectorizer.transform([cleaned_query])         
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()  
    top_indices = similarities.argsort()[-top_k:][::-1]          
    results = []
    for i in top_indices:
        results.append({
            "Original Review": reviews.iloc[i],
            "Cleaned Review": cleaned_reviews.iloc[i],
            "Similarity Score": similarities[i]
        })
    return results 

In [81]:
query1 = "great product with fast shipping"
query2 = "disappointed"

In [83]:
print("Query 1 Results:")
for r in search_reviews(query1, top_k=5):
    print(r, "\n")

Query 1 Results:
{'Original Review': 'I have ordered this product several times and have never been disappointed! Shipping is super fast.', 'Cleaned Review': 'ordered product several times never disappointed shipping super fast', 'Similarity Score': 0.5121833905266558} 

{'Original Review': 'I enjoy the mild coffee blends that Green Mountain has.  Great price and fast shipping.', 'Cleaned Review': 'enjoy mild coffee blends green mountain great price fast shipping', 'Similarity Score': 0.44996442527678004} 

{'Original Review': 'Great seller, fast shipping! I love these cookies! I have many allergies and I have no problems when eating these. I will be ordering more.', 'Cleaned Review': 'great seller fast shipping love cookies many allergies problems eating ordering', 'Similarity Score': 0.4320415307052864} 

{'Original Review': 'Nice size and quality for the price.  My dogs chew this things 24/7. Fast shipping.  No problems.', 'Cleaned Review': 'nice size quality price dogs chew things 

In [85]:
print("Query 2 Results:")
for r in search_reviews(query2, top_k=5):
    print(r, "\n")

Query 2 Results:
{'Original Review': 'I have ordered this product several times and have never been disappointed! Shipping is super fast.', 'Cleaned Review': 'ordered product several times never disappointed shipping super fast', 'Similarity Score': 0.366278704897463} 

{'Original Review': "I was disappointed with the taste and price of this coffee.  I don't recommend it unless you're looking for a VERY subtle flavor of chocolate only.", 'Cleaned Review': 'disappointed taste price coffee dont recommend unless youre looking subtle flavor chocolate', 'Similarity Score': 0.3363398391767886} 

{'Original Review': 'I was disappointed that there was a lot of juice and very few clams. It was expensive and not worth the cost.', 'Cleaned Review': 'disappointed lot juice clams expensive worth cost', 'Similarity Score': 0.33313266349762477} 

{'Original Review': "Bought several of these cakes and was not disappointed. Most delicious and moist cake I've ever had. Will definitely buy more again!!!"