In [1]:
!pip install pandas numpy scikit-learn nltk
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

df = pd.read_csv('Reviews.csv')
reviews = df['Text'].dropna().head(10000)  # a. Select column, b. Remove nulls, c. Limit to 10,000

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()  # a. Lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # b. Remove punctuation/special chars
    tokens = word_tokenize(text)  # c. Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # d. Remove stopwords
    return ' '.join(tokens)  # e. Join tokens

cleaned_reviews = reviews.apply(preprocess)

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)

def retrieve_reviews(query, top_k=5):
    query_clean = preprocess(query)  # a. Preprocess query
    query_vec = vectorizer.transform([query_clean])  # b. TF-IDF vector
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()  # c. Cosine similarity
    top_indices = similarity.argsort()[-top_k:][::-1]  # d. Top k reviews
    results = []
    for idx in top_indices:
        results.append({
            'Original': reviews.iloc[idx],
            'Cleaned': cleaned_reviews.iloc[idx],
            'Similarity': similarity[idx]
        })
    return pd.DataFrame(results)

print("Query: great product with fast shipping")
print(retrieve_reviews("great product with fast shipping"))

print("\nQuery: disappointed")
print(retrieve_reviews("disappointed"))


Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TCS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TCS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Query: great product with fast shipping
                                            Original  \
0  Enjoyed the product and they also provided ver...   
1  The tea is good and fresh. We enjoy it. The sh...   
2  My daughter lives in Hawaii and sent me some g...   
3  The energy drink is a great product. The shipp...   
4  Fast shipping, items were packaged nicely and ...   

                                             Cleaned  Similarity  
0  enjoyed product also provided fast shipping im...    0.502383  
1  tea good fresh enjoy shipping fast cost reason...    0.443917  
2  daughter lives hawaii sent great coffee keurig...    0.405738  
3  energy drink great product shipping price craz...    0.403072  
4  fast shipping items packaged nicely described ...    0.390348  

Query: disappointed
                                            Original  \
0  I am a bit disappointed.  The flavor was not w...   
1  The product is very good. Way too expensive an...   
2  Disappointed.  The big boxes 