In [13]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt")
nltk.download("punkt_tab")


DATA_PATH = "../data/processed/mobile_reviews_absa.csv"
df = pd.read_csv(DATA_PATH)

df.shape


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aryanpatel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aryanpatel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


(174249, 7)

In [14]:
ASPECTS = {
    "battery": ["battery", "charge", "charging", "power"],
    "camera": ["camera", "photo", "picture", "video"],
    "display": ["screen", "display", "resolution"],
    "performance": ["performance", "speed", "lag", "slow", "fast"],
    "build_quality": ["build", "quality", "durable", "sturdy"],
    "price": ["price", "cost", "value", "worth"],
    "software": ["software", "ui", "interface", "os", "update"]
}


In [15]:
def split_into_sentences(text):
    if not text or not isinstance(text, str):
        return []
    return sent_tokenize(text)


In [16]:
df["sentences"] = df["clean_review"].apply(split_into_sentences)
df[["clean_review", "sentences"]].head(10)


Unnamed: 0,clean_review,sentences
0,it works good but it goes slow sometimes but i...,[it works good but it goes slow sometimes but ...
1,i already had a phone with problems... i know ...,[i already had a phone with problems... i know...
2,the charging port was loose. i got that solder...,"[the charging port was loose., i got that sold..."
3,"phone looks good but wouldn't stay charged, ha...","[phone looks good but wouldn't stay charged, h..."
4,i originally was using the samsung s2 galaxy f...,[i originally was using the samsung s2 galaxy ...
5,it's battery life is great. it's very responsi...,"[it's battery life is great., it's very respon..."
6,i was able to get the phone i previously owned...,[i was able to get the phone i previously owne...
7,i love the phone. it does everything i need an...,"[i love the phone., it does everything i need ..."
8,the battery was old & had been over used becau...,[the battery was old & had been over used beca...
9,"pros-beautiful screen,capable of running chrom...","[pros-beautiful screen,capable of running chro..."


In [17]:
sentence_df = df.explode("sentences").reset_index(drop=True)
sentence_df = sentence_df[sentence_df["sentences"].str.len() > 0]


In [18]:
sentence_df = sentence_df.rename(columns={"sentences": "sentence"})


In [19]:
ASPECT_DICT = {
    "battery": ["battery", "battery life", "charge", "charging"],
    "camera": ["camera", "photo", "picture", "video"],
    "display": ["screen", "display", "resolution"],
    "performance": ["performance", "speed", "lag", "slow", "fast"],
    "build": ["build", "quality", "design"],
    "price": ["price", "cost", "value", "worth"]
}


In [20]:
def extract_sentence_aspects(sentence):
    sentence = sentence.lower()
    aspects = []
    for aspect, keywords in ASPECT_DICT.items():
        if any(k in sentence for k in keywords):
            aspects.append(aspect)
    return aspects


In [21]:
sentence_df["sentence_aspects"] = sentence_df["sentence"].apply(extract_sentence_aspects)


In [22]:
sentence_df = sentence_df[sentence_df["sentence_aspects"].str.len() > 0]


In [23]:
sentence_df = sentence_df.explode("sentence_aspects")
sentence_df = sentence_df.rename(columns={"sentence_aspects": "aspect"})


In [24]:
sentence_df[[
    "Product Name",
    "Brand Name",
    "Rating",
    "sentence",
    "aspect"
]].head()


Unnamed: 0,Product Name,Brand Name,Rating,sentence,aspect
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,4,it works good but it goes slow sometimes but i...,performance
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,1,i already had a phone with problems... i know ...,battery
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,1,i wish i would have read these comments then i...,battery
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,2,the charging port was loose.,battery
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,2,then needed a new battery as well.,battery


cos diffrent sentence have diffrent sentiment

In [27]:
import joblib
import re

# load trained pipeline
sentiment_model = joblib.load("../data/sentiment_model.joblib")




# reuse SAME cleaning function
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

sentence_df["sentiment"] = sentiment_model.predict(
    sentence_df["sentence"].apply(clean_text)
)

sentence_df.head()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,Product Name,Brand Name,Price,Rating,clean_review,aspects,aspect_count,sentence,aspect,sentiment
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,it works good but it goes slow sometimes but i...,['performance'],1,it works good but it goes slow sometimes but i...,performance,neutral
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,i already had a phone with problems... i know ...,['battery'],1,i already had a phone with problems... i know ...,battery,negative
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,i already had a phone with problems... i know ...,['battery'],1,i wish i would have read these comments then i...,battery,negative
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,the charging port was loose. i got that solder...,"['battery', 'price']",2,the charging port was loose.,battery,negative
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,the charging port was loose. i got that solder...,"['battery', 'price']",2,then needed a new battery as well.,battery,positive


In [28]:
aspect_summary = (
    sentence_df
    .groupby("aspect")["sentiment"]
    .value_counts()
    .unstack()
    .fillna(0)
)

aspect_summary


sentiment,negative,neutral,positive
aspect,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
battery,29353,22627,31458
build,5945,7721,17602
camera,12645,17250,28562
display,21037,18470,25702
performance,8642,11349,33513
price,7487,12514,34751


In [29]:
sentence_df.to_csv("../data/processed/final_absa_results.csv", index=False)


In [30]:
probs = sentiment_model.predict_proba(
    sentence_df["sentence"].apply(clean_text)
)


In [31]:
preds = sentiment_model.predict(
    sentence_df["sentence"].apply(clean_text)
)

sentence_df["sentiment"] = preds
sentence_df["confidence"] = probs.max(axis=1)


In [32]:
final_output = (
    sentence_df[["sentence", "aspect", "sentiment", "confidence"]]
    .to_dict(orient="records")
)

final_output[:2]


[{'sentence': 'it works good but it goes slow sometimes but its a very good phone i love it',
  'aspect': 'performance',
  'sentiment': 'neutral',
  'confidence': 0.623313972280907},
 {'sentence': 'i already had a phone with problems... i know it stated it was used, but dang, it did not state that it did not charge.',
  'aspect': 'battery',
  'sentiment': 'negative',
  'confidence': 0.5267655634257146}]