In [None]:
# Step 1: Download the dataset manually from Kaggle and place it in your working directory as 'Reviews.csv'

# Step 2: Install and import required libraries
!pip install pandas numpy scikit-learn nltk

import pandas as pd
import numpy as np
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 3: Download and import NLTK resources
# 'stopwords' backs stopwords.words('english') used below.
nltk.download('stopwords')
# word_tokenize relies on the Punkt sentence-tokenizer models. Newer NLTK
# releases load them from the 'punkt_tab' package instead of the older
# pickled 'punkt' package, so request both; without 'punkt_tab' a freshly
# installed NLTK raises LookupError on the first word_tokenize call.
# On a release that does not know one of these ids, nltk.download() reports
# the problem and returns False rather than raising.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Step 4: Read the review data from the CSV in the working directory
df = pd.read_csv('Reviews.csv')
# Keep only the non-null entries of the 'Text' column, capped at the first
# 10,000 of them. The original row labels are retained, so later lookups
# into `reviews` are positional (.iloc).
reviews = df.loc[df['Text'].notna(), 'Text'].iloc[:10000]

# Step 5: Build the English stopword set (a set gives O(1) membership tests)
stop_words = {*stopwords.words('english')}

# Step 6: Preprocess review text
def preprocess(text):
    """Normalize a review or query string before TF-IDF vectorization.

    The text is lowercased, every character other than a-z or whitespace is
    replaced by a space, the result is tokenized with ``word_tokenize``,
    tokens present in the module-level ``stop_words`` set are dropped, and
    the remaining tokens are joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text as one space-separated string; an empty string when
        no token survives the filtering.
    """
    text = text.lower()
    # Substitute a space instead of deleting the character. Deleting glues
    # neighbouring words together ("good,but" -> "goodbut", "<br />" markup
    # fused onto the next word) and turns contractions into tokens such as
    # "dont" that the stopword filter no longer recognizes. With a space,
    # "don't" becomes the fragments "don" and "t", which NLTK's English
    # stopword list is expected to contain.
    text = re.sub(r'[^a-z\s]', ' ', text)
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

# Step 7: Clean every review by running it through preprocess()
cleaned_reviews = reviews.map(preprocess)

# Step 8: Fit a TF-IDF model on the cleaned reviews and vectorize them.
# The fitted `vectorizer` is reused later to transform queries into the
# same feature space as `tfidf_matrix`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)

# Step 9: Retrieve documents based on query
def retrieve_reviews(query, top_k=5):
    """Return the reviews most similar to ``query`` under TF-IDF cosine similarity.

    The query is cleaned with ``preprocess``, projected into the fitted
    ``vectorizer``'s feature space, and compared against every row of
    ``tfidf_matrix``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of reviews to return. ``0`` yields an empty
            result; values larger than the corpus return every review.

    Returns:
        A ``pandas.DataFrame`` with columns ``'Original'`` (raw review text),
        ``'Cleaned'`` (preprocessed text) and ``'Similarity'`` (cosine score),
        ordered from most to least similar.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query_clean = preprocess(query)
    query_vec = vectorizer.transform([query_clean])
    similarity = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Reverse first, then take the head. Slicing with [-top_k:] breaks for
    # top_k == 0 because -0 == 0, which selects the whole array.
    top_indices = similarity.argsort()[::-1][:top_k]

    results = [
        {
            # Positional lookup: `reviews` keeps the CSV's original row labels.
            'Original': reviews.iloc[idx],
            'Cleaned': cleaned_reviews.iloc[idx],
            'Similarity': similarity[idx],
        }
        for idx in top_indices
    ]
    # Explicit columns keep the schema stable even when `results` is empty.
    return pd.DataFrame(results, columns=['Original', 'Cleaned', 'Similarity'])

# Step 10 & 11: Run the example queries and print their top matches.
# Each pair is (header line to print, query text to search for).
for header, example_query in (
    ("Query: great product with fast shipping", "great product with fast shipping"),
    ("\nQuery: disappointed", "disappointed"),
):
    print(header)
    print(retrieve_reviews(example_query))
