In [3]:
import pandas as pd
import re



In [4]:
DATA_PATH = "../data/raw/Amazon_Unlocked_Mobile.csv"

df = pd.read_csv(DATA_PATH)
df.shape


(413840, 6)

In [5]:
df = df.dropna(subset=["Reviews"])
df["Reviews"] = df["Reviews"].astype(str)

def clean_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["clean_review"] = df["Reviews"].apply(clean_text)


In [6]:
ASPECT_KEYWORDS = {
    "battery": ["battery", "charge", "charging", "drain"],
    "camera": ["camera", "photo", "picture", "video"],
    "screen": ["screen", "display", "touch"],
    "performance": ["performance", "slow", "lag", "fast", "speed"],
    "build": ["build", "quality", "design", "durable"],
    "price": ["price", "cost", "value", "worth"],
    "storage": ["storage", "memory", "ram", "gb"],
    "network": ["signal", "network", "call", "wifi", "bluetooth"]
}


In [7]:
def extract_aspects(text):
    found = []
    for aspect, keywords in ASPECT_KEYWORDS.items():
        if any(k in text for k in keywords):
            found.append(aspect)
    return found

df["aspects"] = df["clean_review"].apply(extract_aspects)


In [8]:
absa_df = df[df["aspects"].map(len) > 0].copy()


In [9]:
absa_df["aspect_count"] = absa_df["aspects"].apply(len)


In [10]:
absa_df = absa_df[
    [
        "Product Name",
        "Brand Name",
        "Price",
        "Rating",
        "clean_review",
        "aspects",
        "aspect_count"
    ]
]


In [11]:
# Full cleaned dataset (all mobile reviews)
df.to_csv(
    "../data/processed/mobile_reviews_full.csv",
    index=False
)

# ABSA-ready dataset (only aspect mentions)
absa_df.to_csv(
    "../data/processed/mobile_reviews_absa.csv",
    index=False
)


In [12]:
print("Full dataset:", df.shape)
print("ABSA dataset:", absa_df.shape)

absa_df["aspect_count"].value_counts().head()


Full dataset: (413770, 8)
ABSA dataset: (174249, 7)


aspect_count
1    97215
2    39141
3    17932
4     9482
5     5232
Name: count, dtype: int64

# ðŸ”’ PREPROCESSING FROZEN
# Any change beyond this point invalidates downstream analysis
