In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

In [None]:
# Load dataset
# Only the first 10,000 reviews are used, so ask the parser to stop after
# 10,000 rows instead of reading the entire file and discarding the rest.
df = pd.read_csv('/content/Reviews.csv', nrows=10000)

# Display first few rows
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [None]:
#Data Cleaning
# Remove rows that are missing the review text, the score or the product id,
# then renumber the index from 0. The matrices built from this frame further
# down (TF-IDF, cosine similarity) are addressed by row position, and the
# recommendation helper looks rows up through the index, so index labels must
# coincide with row positions even when some rows were dropped here.
df = df.dropna(subset=['Text', 'Score', 'ProductId']).reset_index(drop=True)

# Text preprocessing: lowercase the text and strip punctuation / special characters
def clean_text(text):
    """Normalise a piece of review text.

    The text is lowercased and every character that is neither a word
    character (letter or digit) nor whitespace is deleted. Underscores are
    deleted as well. Whitespace is left exactly as it was, so removed
    characters are not replaced by a space.

    Parameters
    ----------
    text : str
        Raw text to normalise.

    Returns
    -------
    str
        The lowercased text without punctuation.
    """
    text = text.lower()  # Convert text to lowercase
    # `\w` treats "_" as a word character, so `[^\w\s]` on its own would let
    # underscores survive; the second alternative removes them too.
    return re.sub(r'[^\w\s]|_', '', text)

# Normalised copy of every review, stored next to the raw text
df['Cleaned_Text'] = df['Text'].map(clean_text)

In [None]:
#Feature Engineering
# Helpfulness ratio: HelpfulnessNumerator divided by HelpfulnessDenominator.
# A denominator of 0 is swapped for NaN before dividing, so those rows end up
# with NaN as their ratio.
df['Helpfulness_Score'] = df['HelpfulnessNumerator'].div(
    df['HelpfulnessDenominator'].replace(0, np.nan)
)

# Sentiment analysis: 1 for positive polarity, 0 otherwise
def get_sentiment(text):
    """Return a binary sentiment flag for ``text``.

    The text is wrapped in a ``TextBlob`` and its ``sentiment.polarity`` is
    inspected: a polarity strictly greater than zero gives 1, anything else
    (including a polarity of exactly zero) gives 0.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 1
    return 0

# Binary sentiment flag per review, computed from the cleaned text
df['Sentiment'] = df['Cleaned_Text'].map(get_sentiment)

In [None]:
#Feature Extraction
# TF-IDF vectoriser for the cleaned review text, configured with the built-in
# English stop-word list and a limit of 5,000 features.
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')

# Learn the vocabulary and encode every review in one call
X_text = tfidf.fit_transform(df['Cleaned_Text'])

# Report the dimensions of the resulting matrix
print(X_text.shape)

(10000, 5000)


In [None]:
#Feature Selection(Collaborative Filtering)
# Ensure 'Score' is numeric; values that cannot be parsed become NaN
df['Score'] = pd.to_numeric(df['Score'], errors='coerce')

# Rows without a usable score are excluded from the ratings table only.
# `df` itself keeps all of its rows: X_text was built from it row by row, and
# the content-based cell pairs similarity-matrix positions with `df`
# positions, so removing rows from `df` at this point would shift that pairing.
ratings = df.dropna(subset=['Score'])

# Create user-item matrix with numeric scores: one row per user, one column
# per product, the mean score where several reviews share a pair, and 0 where
# a user has no review for a product.
user_item_matrix = ratings.pivot_table(
    index='UserId',
    columns='ProductId',
    values='Score',
    aggfunc='mean'
).fillna(0)

# Check the user-item matrix
print(user_item_matrix.head())

ProductId       B00002NCJC  B00002Z754  B00005V3DC  B000084DVR  B000084E1U  \
UserId                                                                       
A10012K7DF3SBQ         0.0         0.0         0.0         0.0         0.0   
A1001TYW5FZYD9         0.0         0.0         0.0         0.0         0.0   
A1008DPSP6KC9J         0.0         0.0         0.0         0.0         0.0   
A101C99CG8EFUH         0.0         0.0         0.0         0.0         0.0   
A102XKYZE9Q9L4         0.0         0.0         0.0         0.0         0.0   

ProductId       B000084EK4  B000084EK5  B000084EK6  B000084EK7  B000084EK8  \
UserId                                                                       
A10012K7DF3SBQ         0.0         0.0         0.0         0.0         0.0   
A1001TYW5FZYD9         0.0         0.0         0.0         0.0         0.0   
A1008DPSP6KC9J         0.0         0.0         0.0         0.0         0.0   
A101C99CG8EFUH         0.0         0.0         0.0         0.0 

In [None]:
#Feature Selection(Content-Based Filtering)
# Pairwise cosine similarity between all review vectors. Passing the matrix
# once compares it against itself, giving one row and one column per review.
cosine_sim = cosine_similarity(X_text)

# Function to get recommendations based on content similarity
def get_content_based_recommendations(product_id, cosine_sim, df, top_n=10):
    """Return the reviews most similar to the first review of ``product_id``.

    Parameters
    ----------
    product_id
        Value looked up in ``df['ProductId']``; the first matching row is
        used as the query.
    cosine_sim
        Square similarity matrix whose row ``i`` / column ``j`` refer to the
        rows of ``df`` at *positions* ``i`` and ``j``.
    df : pandas.DataFrame
        Frame with at least the columns ``'ProductId'`` and ``'Summary'``.
    top_n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The ``'ProductId'`` and ``'Summary'`` columns of the ``top_n`` rows
        with the highest similarity to the query row, most similar first.
        The query row is itself a candidate, so it is normally part of the
        result.

    Raises
    ------
    IndexError
        If ``product_id`` does not occur in ``df['ProductId']``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    # The similarity matrix is addressed by row position, so the query row has
    # to be located by position as well. Index labels only coincide with
    # positions while the index is an untouched 0..n-1 range.
    matching_positions = (df['ProductId'] == product_id).to_numpy().nonzero()[0]
    if matching_positions.size == 0:
        raise IndexError(f"ProductId {product_id!r} not found in df['ProductId']")
    product_pos = matching_positions[0]

    # argsort is ascending: reverse it, then keep the leading top_n positions.
    similar_positions = cosine_sim[product_pos].argsort()[::-1][:top_n]
    return df.iloc[similar_positions][['ProductId', 'Summary']]

# Example: content-based recommendations for the product of the first row
example_product_id = df['ProductId'].iat[0]
print(get_content_based_recommendations(example_product_id, cosine_sim, df))


       ProductId                                            Summary
0     B001E4KFG0                              Good Quality Dog Food
2172  B000N5XCPM          Grammy's Pot Pie for Dogs - Dogs Love It!
4818  B00139TT72                                    Still the Best!
5564  B001ASZV5K                                         Good stew.
1591  B001CWZXIY                               Harmony Farms Review
6414  B002AYEBQI  Neither cat will touch it :( But dogs go wild....
2803  B000J2DQ46                                             MMMMMM
4791  B00139TT72  I agree with the previous reviewer, the name i...
4690  B000N5Z5RU               My cats love this real food cat food
5996  B003SE52K8        The "paws down" favorite among canned food!
