<a href="https://www.kaggle.com/code/nadaarfaoui/preprocessing-the-amazon-electronics-dataset?scriptVersionId=289227868" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora used later in this notebook: the stop-word list and
# WordNet (needed by the lemmatizer). Only required once per environment.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Read the merged Amazon electronics CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/merged-amazon-electronics-dataset/merged_electronics_dataset.csv")

In [None]:
# Preview the first five rows (the default for head) to sanity-check the load.
df.head(n=5)

In [None]:
# Normalise product names: drop the "(Renewed)" marker, remove every character
# that is not an ASCII letter, digit or whitespace, then trim both ends.
# astype(str) can turn a missing name into the literal text "nan", which would
# then look like a real product name, so rows that were missing in the raw
# column are restored to NaN at the end of the chain.
raw_names = df['name']
df['name'] = (
    raw_names.astype(str)
    .str.replace(r'\(Renewed\)', '', regex=True)
    .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
    .str.strip()  # remove leading/trailing spaces
    .where(raw_names.notna())
)

In [None]:
# Drop unwanted columns
df = df.drop(columns=['main_category', 'sub_category'])

# Brand = the leading run of word characters in the product name.
# expand=False makes str.extract return a Series rather than a one-column frame.
df['brand'] = df['name'].str.extract(r'^(\w+)', expand=False)

# Keep only the first numeric value of review_rating.
# The decimal part is optional so integer ratings such as "5" are kept instead
# of becoming NaN, and the column is coerced to text first so the .str accessor
# also works if pandas already parsed it as numeric.
df['review_rating'] = (
    df['review_rating']
    .astype(str)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

In [None]:
# Per-column count of missing entries at this point in the pipeline.
print("Missing values before cleaning:\n", df.isna().sum(), "\n")

In [None]:
# Discard every row that has no review text.
df = df.dropna(axis="index", how="any", subset=["review_text"])

In [None]:
def clean_numeric(col):
    """Parse a column of messy numeric text into numbers.

    Every value is converted to text, all characters other than digits and
    dots are removed, and the remainder is parsed numerically.

    Parameters
    ----------
    col : pandas.Series
        Column to clean.

    Returns
    -------
    pandas.Series
        Numeric column; values that cannot be parsed become NaN.
    """
    digits_and_dots = col.astype(str).str.replace(r'[^\d.]', '', regex=True)
    return pd.to_numeric(digits_and_dots, errors='coerce')

# Convert the count, price and rating columns to numbers.
# Missing counts/prices are imputed with the column mean; review_rating is
# left un-imputed so the raw ratings are kept as they are.
for col in ('no_of_ratings', 'discount_price', 'actual_price', 'review_rating'):
    cleaned = clean_numeric(df[col])
    if col == 'review_rating':
        df[col] = cleaned
        continue
    df[col] = cleaned.fillna(cleaned.mean())

In [None]:
# Per-column count of missing entries once the numeric columns are cleaned.
print("Missing values after cleaning:\n", df.isna().sum(), "\n")

In [None]:
# Shared NLP helpers used by the text preprocessing step.
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

# Define text preprocessing function
def preprocess_text(text):
    """Normalise one review into space-separated, lemmatized tokens.

    Steps: lowercase, remove URLs, replace every character that is not a
    lowercase ASCII letter or whitespace with a space, drop stop words, and
    lemmatize what remains.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Non-string values are converted with ``str``.

    Returns
    -------
    str
        Cleaned text; an empty string when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    # Coerce so a stray non-string cell does not crash on .lower().
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    # Substitute a space rather than deleting: deletion fuses neighbouring
    # words ("good,but" -> "goodbut") and turns contractions into tokens the
    # stop-word list no longer recognises ("don't" -> "dont").
    text = re.sub(r'[^a-z\s]', ' ', text)
    # Single pass over the tokens: filter stop words, then lemmatize.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Run the text cleaner over every review and keep the result in a new column.
df['cleaned_review_text'] = df['review_text'].map(preprocess_text)

# Show raw and cleaned text side by side for the first few rows.
print(df[['review_text', 'cleaned_review_text']].head())

# The labelling step below refers to the rating column as 'rating'.
df = df.rename(columns={'review_rating': 'rating'})


# 1️⃣ Create binary sentiment label
def label_sentiment(rating):
    """Turn a numeric rating into a binary sentiment label.

    Returns "Positive" for ratings of 4 or more, "Negative" for ratings of 2
    or less, and None otherwise so that neutral reviews can be dropped.
    """
    if rating >= 4:
        return "Positive"
    if rating <= 2:
        return "Negative"
    # Anything strictly between 2 and 4 is neutral; a NaN rating also ends up
    # here because it fails both comparisons.
    return None

df['sentiment'] = df['rating'].map(label_sentiment)

# Keep only rows that received a label; unlabelled (neutral) reviews are dropped.
has_label = df['sentiment'].notna()
df_binary = df.loc[has_label]

# Optional sanity check: how many reviews fall into each class.
print("Class distribution (binary):")
print(df_binary['sentiment'].value_counts())

# Write the labelled subset to disk without the index column.
df_binary.to_csv("cleaned_dataset.csv", index=False)