### ***Imports & Load Data***

In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora needed for stopword removal and lemmatization.
# quiet=True keeps machine-specific download logs out of the cell output.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Relative path instead of an absolute path into one user's Desktop, so the
# notebook runs on any checkout. NOTE(review): assumes the notebook is run from
# a sibling folder of Data/ -- the same assumption the save cell already makes
# with "../Data/Processed"; confirm against the repo layout.
INPUT_PATH = "../Data/Processed/flipkart_cleaned.csv"

df = pd.read_csv(INPUT_PATH)
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aakash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aakash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,rating,review_title,review_text,reviewer_name,verified_purchase,review_date,helpful_upvotes,helpful_total,review_date_raw,product
0,4.0,Pretty good,Good compact phone and having good camera but ...,Swayam Vish,True,2024-11-01,37.0,4.0,1 month ago,Samsung S24
1,4.0,Good quality product,After using S24 for 4 days am writing this rev...,Jayasuriya Anbu,True,2024-11-01,127.0,27.0,1 month ago,Samsung S24
2,5.0,Awesome,Today received this phone. It's just outstandi...,Arnab Deb,True,2024-11-01,10.0,0.0,1 month ago,Samsung S24
3,5.0,Mind-blowing purchase,"Powerful performance, excellent display,compac...",Amit Kumar Behera,True,2024-11-01,34.0,6.0,1 month ago,Samsung S24
4,5.0,Brilliant,Just looking like a wow,Raja,True,2024-11-01,64.0,15.0,1 month ago,Samsung S24


### ***Handle missing values & duplicates***

In [2]:
rows_before = len(df)
print("Initial rows:", rows_before)

# A review is unusable without both its text and its rating; after that,
# the same text repeated for the same product counts as a duplicate.
df = (
    df
    .dropna(subset=["review_text", "rating"])
    .drop_duplicates(subset=["product", "review_text"])
)

rows_after = len(df)
print("After cleaning rows:", rows_after)
df.head()

Initial rows: 1623
After cleaning rows: 1429


Unnamed: 0,rating,review_title,review_text,reviewer_name,verified_purchase,review_date,helpful_upvotes,helpful_total,review_date_raw,product
0,4.0,Pretty good,Good compact phone and having good camera but ...,Swayam Vish,True,2024-11-01,37.0,4.0,1 month ago,Samsung S24
1,4.0,Good quality product,After using S24 for 4 days am writing this rev...,Jayasuriya Anbu,True,2024-11-01,127.0,27.0,1 month ago,Samsung S24
2,5.0,Awesome,Today received this phone. It's just outstandi...,Arnab Deb,True,2024-11-01,10.0,0.0,1 month ago,Samsung S24
3,5.0,Mind-blowing purchase,"Powerful performance, excellent display,compac...",Amit Kumar Behera,True,2024-11-01,34.0,6.0,1 month ago,Samsung S24
4,5.0,Brilliant,Just looking like a wow,Raja,True,2024-11-01,64.0,15.0,1 month ago,Samsung S24


### ***Create sentiment labels***

In [3]:
def label_sentiment(r):
    """Map a numeric rating to a sentiment label.

    Returns "negative" for ratings <= 2, "positive" for ratings >= 4,
    and "neutral" for everything else (including values such as NaN that
    satisfy neither comparison).
    """
    if r <= 2:
        return "negative"
    if r >= 4:
        return "positive"
    return "neutral"

# Label every review from its rating, then show the class balance.
sentiment_labels = df["rating"].map(label_sentiment)
df["sentiment"] = sentiment_labels
df["sentiment"].value_counts()


sentiment
positive    1079
negative     257
neutral       93
Name: count, dtype: int64

### ***Text cleaning function***

In [4]:
def clean_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip URLs, replace every character that is
    not a-z or whitespace with a space, split on whitespace, drop stopwords
    and tokens of length <= 2, lemmatize each remaining token.

    Uses the module-level ``stop_words`` set and ``lemmatizer`` object.

    Parameters
    ----------
    text : Any
        The raw review text. Anything that is not a ``str`` is treated as
        missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for non-string
        input or when no token survives filtering.
    """
    if not isinstance(text, str):
        return ""

    # lowercase
    text = text.lower()

    # remove URLs
    # The dot after "www" must be escaped: an unescaped "." also matches a
    # space, so "awww amazing" was treated as a URL and "amazing" was deleted.
    # \b keeps the match from starting in the middle of a word like "awww".
    text = re.sub(r"http\S+|\bwww\.\S+", " ", text)

    # keep only letters and spaces
    text = re.sub(r"[^a-z\s]", " ", text)

    # tokenize
    tokens = text.split()

    # remove stopwords and very short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]

    # lemmatize (better than stemming for readability)
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return " ".join(tokens)


### ***Apply text cleaning + add word count***

In [5]:
# Clean every review once and reuse the result for both derived columns.
cleaned_reviews = df["review_text"].astype(str).map(clean_text)
df["clean_text"] = cleaned_reviews
df["word_count"] = cleaned_reviews.str.split().str.len()

preview_columns = ["review_text", "clean_text", "sentiment", "product"]
df[preview_columns].head(10)

Unnamed: 0,review_text,clean_text,sentiment,product
0,Good compact phone and having good camera but ...,good compact phone good camera facing issu bat...,positive,Samsung S24
1,After using S24 for 4 days am writing this rev...,using day writing review overall good compact ...,positive,Samsung S24
2,Today received this phone. It's just outstandi...,today received phone outstanding battery good ...,positive,Samsung S24
3,"Powerful performance, excellent display,compac...",powerful performance excellent display compact...,positive,Samsung S24
4,Just looking like a wow,looking like wow,positive,Samsung S24
5,"Excellent product and worth it to buy , if th...",excellent product worth buy option choose ipho...,positive,Samsung S24
6,Everything is just excellent but only battery ...,everything excellent battery average,positive,Samsung S24
7,I got the phone today the behaviour of deliver...,got phone today behaviour delivery agent aweso...,positive,Samsung S24
8,"Camera qulity and build qulity , sound qulity ...",camera qulity build qulity sound qulity phon o...,positive,Samsung S24
9,Hi All. I am writing this review based on my 1...,writing review based week usage bought bbd sal...,positive,Samsung S24


### ***Save processed data for EDA & Model***

In [6]:
# Reviews that cleaned down to an empty string (e.g. only emoji, digits or
# stopwords) would be written as empty CSV fields, which pandas.read_csv loads
# back as NaN by default. Filter them into a separate frame so the saved file
# contains only usable text and `df` itself is left untouched.
model_ready = df[df["clean_text"] != ""]
dropped_empty = len(df) - len(model_ready)

# Create directory if not exists
OUTPUT_DIR = "../Data/Processed"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save final processed dataset
output_path = f"{OUTPUT_DIR}/flipkart_model_ready.csv"
model_ready.to_csv(output_path, index=False, encoding="utf-8")

print("✅ Saved processed dataset to:", output_path)
print("Dropped rows with empty clean_text:", dropped_empty)
print("Rows:", len(model_ready))

✅ Saved processed dataset to: ../Data/Processed/flipkart_model_ready.csv
Rows: 1429
