# Data Cleaning

In [2]:
'''
Import required packages and libraries for data exploration
'''
import pandas as pd
import numpy as np
import re

In [None]:
'''
Set up file path and data handling objects
'''
PATH = "../data/reviews.csv"
data = pd.read_csv(PATH)

In [413]:
data.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


## Case Sensitivity
Convert the input features in the raw dataset into a case insensitive format (all lowercase/uppercase) to reduce the amount of distinct words in the data.

In [414]:
# Remove null values from tokenizer strings
data["Summary"] = data["Summary"].fillna("")
data["Text"] = data["Text"].fillna("")

In [415]:
# Convert all words to lowercase to reduce the number of unique features
data["Summary"] = data["Summary"].str.lower()
data["Text"] = data["Text"].str.lower()

data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,good quality dog food,i have bought several of the vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,not as advertised,product arrived labeled as jumbo salted peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""delight"" says it all",this is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,cough medicine,if you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,great taffy,great taffy at a great price. there was a wid...


## Punctuation Handling
Some words that contain punctuation can be recorded as separate features without punctuation handling (e.g., "Steve's pizza is great!" and "Steve makes great pizza!").

| is | great | great! | makes | pizza | pizza! | Steve | Steve's |
|----|-------|--------|-------|-------|--------|-------|---------|
|1   | 1     | 1      | 1     | 1     | 1      | 1     | 1       |

We want to remove uncessesary punctuation so that we don't have duplicates of effectively the same word.
| is | great | makes | pizza | Steve |
|----|-------|-------|-------|-------|
| 1  | 2     | 1     | 2     | 2     |

Doing this prevents our model from interpreting duplicate words as two separate features and reduces the number of dimensions our model has to process (increasing efficiency).

In [416]:
pattern = r"(?u)\b\w\w+\b"
tokenizer = lambda string : " ".join(re.findall(pattern=pattern, string=string))

data["Summary"] = data["Summary"].apply(tokenizer)
data["Text"] = data["Text"].apply(tokenizer)

## Remove Filler Words
Some words like "I", "the", "a", etc. don't impact the sentiment of the text content. Remove these words from all review content so there is less redundant features for the final model.

In [417]:
import nltk
from nltk.corpus import stopwords    

print("Downloading NLTK stopwords...")
nltk.download('stopwords', quiet=True)
    
stop_words = set(stopwords.words('english'))

# Print sample of stopwords
print("\nSample of English stopwords:")
print(sorted(list(stop_words))[:10])  # Print first 10 stopwords

def remove_stopwords_from_text(text):
    if pd.isna(text):
        return text
    words = text.lower().split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Process both columns
print("Removing stop words from Summary...")
data['Summary'] = data['Summary'].apply(remove_stopwords_from_text)

print("Removing stop words from Text...")
data['Text'] = data['Text'].apply(remove_stopwords_from_text)

# Print samples for verification
print("\nSample of processed Summary:")
print(data['Summary'].head())
print("\nSample of processed Text:")
print(data['Text'].head())
    

Downloading NLTK stopwords...

Sample of English stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']
Removing stop words from Summary...
Removing stop words from Text...

Sample of processed Summary:
0    good quality dog food
1               advertised
2             delight says
3           cough medicine
4              great taffy
Name: Summary, dtype: object

Sample of processed Text:
0    bought several vitality canned dog food produc...
1    product arrived labeled jumbo salted peanuts p...
2    confection around centuries light pillowy citr...
3    looking secret ingredient robitussin believe f...
4    great taffy great price wide assortment yummy ...
Name: Text, dtype: object


## Lemmatization

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
def lemmatize(text):
    doc = nlp(text)
    result = " ".join([token.lemma_ for token in doc])
    return result

In [None]:
# Process both columns
print("Lemmatizing Summary...")
data['Summary'] = data['Summary'].apply(lemmatize)

print("Lemmatizing Text...")
data['Text'] = data['Text'].apply(lemmatize)

# Print samples for verification
print("\nSample of processed Summary:")
print(data['Summary'].head())
print("\nSample of processed Text:")
print(data['Text'].head())

In [None]:
data.to_csv("../data/lemmatize.csv")
data = pd.read_csv("../data/lemmatize.csv")

In [91]:
data = pd.read_csv("../data/lemmatize.csv")
data["Summary"] = data["Summary"].fillna("")
data["Text"] = data["Text"].fillna("")

## Remove Irrelevant Data Points
The first stage of data cleaning is to identify and remove data points that aren't related to our task. In "Amazon Fine Food Reviews", we have many different product reviews including: pet food, medicine, microwavable food, fine foods, etc.
- Is this category of food or type of review relevant to our task?
- Would removing this type of review from the data improve the accuracy of our model?
- If we remove this type of review, how will it effect our training process (would there be too little data remaining?)

In [92]:
non_aspects = {
    "pet_species":[
        "dog","cat","puppy","kitten","fish","hamster","rabbit","guinea pig","bird","parrot","turtle",
        "lizard", "snake", "ferret", "gerbil", "chinchilla", "mouse", "rat", "iguana", "gecko",
        "dogs","cats","puppys","kittens","fishs","hamsters","rabbits","guinea pigs","birds","parrots","turtles",
        "lizards", "snakes", "ferrets", "gerbils", "chinchillas", "mouses", "rats", "iguanas", "geckos"
    ],
    "pet_food_brands":[
        "purina", "pedigre", "iams", "blue buffalo", "hill science diet", "royal canin", "fancy feast", "friskies",
        "cesar", "meow mix", "nutro", "wellness", "orijen", "acana", "greenies", "temptations", "whiskas"
    ],
    "digestive": [
        "nausea", "vomiting", "diarrhea", "constipation", "bloating",
        "stomach ache", "indigestion", "heartburn", "cramps", "gas"
    ],
    "neurological": [
        "headache", "migraine", "dizziness", "fatigue", "insomnia",
        "brain fog", "numbness", "tingling", "vertigo"
    ],
    "respiratory": [
        "cough", "shortness of breath", "wheezing", "congestion",
        "runny nose", "sore throat", "sneezing"
    ],
    "skin": [
        "rash", "itching", "hives", "acne", "eczema", "redness",
        "dry skin", "swelling"
    ],
    "pain": [
        "pain", "ache", "soreness", "stiffness", "joint pain",
        "back pain", "chest pain", "muscle pain"
    ],
    "psychological": [
        "anxiety", "depression", "irritability", "mood swings",
        "panic attacks", "restlessness"
    ],
    "general": [
        "fever", "chills", "sweating", "weakness", "loss of appetite",
        "weight loss", "weight gain"
    ],
    "allergic": [
        "allergy", "anaphylaxis", "sensitivity", "intolerance",
        "swelling of the lips", "swelling of the throat"
    ]
}

In [93]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284226.5,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,0.0,1.0,0.0,0.0,1.0,939340800.0
25%,142113.25,142114.25,0.0,0.0,4.0,1271290000.0
50%,284226.5,284227.5,0.0,1.0,5.0,1311120000.0
75%,426339.75,426340.75,2.0,2.0,5.0,1332720000.0
max,568453.0,568454.0,866.0,923.0,5.0,1351210000.0


In [94]:
data.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,good quality dog food,buy several vitality can dog food product find...
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,advertise,product arrive label jumbo salt peanut peanut ...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,delight say,confection around century light pillowy citrus...
3,3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,cough medicine,look secret ingredient robitussin believe find...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,great taffy,great taffy great price wide assortment yummy ...


In [95]:
def search_prod(value, dataframe, series):
    products = set()

    for i, string in enumerate(series):
        for word in string.split():
            if value == word:
                products.add(dataframe.iloc[i]["ProductId"])
                break
    return products

In [99]:
for key in non_aspects.keys():
    for value in non_aspects[key]:
        sum_id = search_prod(value=value, dataframe=data, series=data["Summary"])
        txt_id = search_prod(value=value, dataframe=data, series=data["Text"])
        prod_id = sum_id.union(txt_id)

        data = data[~data["ProductId"].isin(prod_id)]

In [100]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,213513.0,213513.0,213513.0,213513.0,213513.0,213513.0
mean,286390.488851,286391.488851,1.555643,1.978381,4.20188,1296387000.0
std,164426.745625,164426.745625,5.470705,6.003509,1.319164,49575230.0
min,1.0,2.0,0.0,0.0,1.0,961718400.0
25%,142760.0,142761.0,0.0,0.0,4.0,1270253000.0
50%,285247.0,285248.0,0.0,1.0,5.0,1312848000.0
75%,429994.0,429995.0,2.0,2.0,5.0,1333930000.0
max,568453.0,568454.0,538.0,544.0,5.0,1351210000.0


In [101]:
data.head()

Unnamed: 0.1,Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
1,1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,advertise,product arrive label jumbo salt peanut peanut ...
2,2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,delight say,confection around century light pillowy citrus...
4,4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,great taffy,great taffy great price wide assortment yummy ...
5,5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,nice taffy,get wild hair taffy order five pound bag taffy...
6,6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,great good expensive brand,saltwater taffy great flavor soft chewy candy ...


## Remove Uncecessary Columns
- What columns are necessary for our model? 
- Is there anything that needs to be removed?

In [102]:
data["Text"] = data["Summary"].fillna("") + " " + data["Text"].fillna("")

In [103]:
# As seen in the data exploration stage, most numerical features excluding 
# the newly created "Helpfulness" were not indicative of Score
data = data.drop(columns=[
    "Id",
    "UserId", 
    "ProfileName", 
    "HelpfulnessNumerator", 
    "HelpfulnessDenominator",
    "Time",
    "Summary"
])

In [104]:
data.head()

Unnamed: 0.1,Unnamed: 0,ProductId,Score,Text
1,1,B00813GRG4,1,advertise product arrive label jumbo salt pean...
2,2,B000LQOCH0,4,delight say confection around century light pi...
4,4,B006K2ZZ7K,5,great taffy great taffy great price wide assor...
5,5,B006K2ZZ7K,4,nice taffy get wild hair taffy order five poun...
6,6,B006K2ZZ7K,5,great good expensive brand saltwater taffy gre...


## Convert Columns to Numerical
For the more complex columns we will be doing word embedding. However, features such as 'ProductId' can be converted into numerical form so the model has an easier time interpreting it.

In [105]:
# Convert ProductId to numerical values for modelling input
data["ProductId"] = pd.factorize(data["ProductId"])[0]

In [106]:
data.head()

Unnamed: 0.1,Unnamed: 0,ProductId,Score,Text
1,1,0,1,advertise product arrive label jumbo salt pean...
2,2,1,4,delight say confection around century light pi...
4,4,2,5,great taffy great taffy great price wide assor...
5,5,2,4,nice taffy get wild hair taffy order five poun...
6,6,2,5,great good expensive brand saltwater taffy gre...


In [107]:
len(data)

213513

## Dependency Parsing Split
In this section we need to split the dataset into single entity and multiple entity data points. This step is necessary because the framework for our model requires that single entity data points are handled by **model A** and multiple entity data points are handled by **model B**.

In [111]:
import pyabsa
from pyabsa.framework.checkpoint_class.checkpoint_template import CheckpointManager
from pyabsa.framework.checkpoint_class.checkpoint_template import available_checkpoints
from pyabsa import AspectTermExtraction as ATEPC
import json

[2025-05-31 21:18:59] (2.4.1.post1) PyABSA(2.4.1.post1): If your code crashes on Colab, please use the GPU runtime. Then run "pip install pyabsa[dev] -U" and restart the kernel.
Or if it does not work, you can use v1.x versions, e.g., pip install pyabsa<2.0 -U




Try to downgrade transformers<=4.29.0.






  _warn(f"unclosed running multiprocessing pool {self!r}",


In [112]:
checkpoint = CheckpointManager()
checkpoint_path = checkpoint._get_remote_checkpoint(checkpoint="multilingual", task_code="ATEPC")
print("Checkpoint downloaded to:", checkpoint_path)

[2025-05-31 21:19:16] (2.4.1.post1) ********** Available ATEPC model checkpoints for Version:2.4.1.post1 (this version) **********
[2025-05-31 21:19:16] (2.4.1.post1) ********** Available ATEPC model checkpoints for Version:2.4.1.post1 (this version) **********
[2025-05-31 21:19:16] (2.4.1.post1) Downloading checkpoint:multilingual 
[2025-05-31 21:19:16] (2.4.1.post1) Notice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets
[2025-05-31 21:19:16] (2.4.1.post1) Checkpoint already downloaded, skip
Checkpoint downloaded to: ./checkpoints\ATEPC_MULTILINGUAL_CHECKPOINT


In [113]:
aspect_extractor = ATEPC.AspectExtractor(
    'english',
    auto_device=True,  # False means load model on CPU
    cal_perplexity=False,
)

[2025-05-31 21:19:16] (2.4.1.post1) ********** Available ATEPC model checkpoints for Version:2.4.1.post1 (this version) **********
[2025-05-31 21:19:16] (2.4.1.post1) ********** Available ATEPC model checkpoints for Version:2.4.1.post1 (this version) **********
[2025-05-31 21:19:16] (2.4.1.post1) Downloading checkpoint:english 
[2025-05-31 21:19:16] (2.4.1.post1) Notice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets
[2025-05-31 21:19:16] (2.4.1.post1) Checkpoint already downloaded, skip
[2025-05-31 21:19:16] (2.4.1.post1) Load aspect extractor from checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
[2025-05-31 21:19:16] (2.4.1.post1) config: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43\fast_lcf_atepc.config
[2025-05-31 21:19:16] (2.4.1.post1) state_dict: checkpoints\ATEPC_ENGLISH_CHECKPOINT\fast_lcf_atepc_English_cdw_apc

  torch.load(


In [136]:
batch_size  = 5000
results     = []

for i in range(15000, len(data), batch_size):
    batch   = data["Text"][i:i+batch_size].to_list()
    res     = aspect_extractor.batch_predict(
        target_file=batch,
        save_result=False,
        print_result=False,
        pred_sentiment=True
    )
    results.extend(res)

    if res is not None:
        results.extend(res)

        with open(f"./absa_batch/results_batch{i}.json", "w", encoding="utf-8") as f:
            json.dump(res, f, ensure_ascii=False, indent=4)
    else:
        print(f"Warning: batch starting at index {i} returned None.")

preparing ate inference dataloader: 100%|██████████| 5000/5000 [00:11<00:00, 446.67it/s]
extracting aspect terms: 100%|██████████| 157/157 [04:11<00:00,  1.60s/it]
preparing apc inference dataloader: 100%|██████████| 8924/8924 [00:27<00:00, 323.05it/s]
classifying aspect sentiments: 100%|██████████| 279/279 [07:26<00:00,  1.60s/it]
preparing ate inference dataloader: 100%|██████████| 5000/5000 [00:10<00:00, 490.56it/s]
extracting aspect terms: 100%|██████████| 157/157 [03:52<00:00,  1.48s/it]
preparing apc inference dataloader: 100%|██████████| 8924/8924 [00:28<00:00, 313.39it/s]
classifying aspect sentiments: 100%|██████████| 279/279 [07:06<00:00,  1.53s/it]
preparing ate inference dataloader: 100%|██████████| 5000/5000 [00:09<00:00, 524.30it/s]
extracting aspect terms: 100%|██████████| 157/157 [03:48<00:00,  1.46s/it]
preparing apc inference dataloader: 100%|██████████| 8731/8731 [00:26<00:00, 332.71it/s]
classifying aspect sentiments: 100%|██████████| 273/273 [07:00<00:00,  1.54s/it

In [None]:
with open("Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

## Word Embedding

In [137]:
import json
from pathlib import Path
from scipy.sparse import lil_matrix
from scipy.sparse import vstack
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer

In [138]:
sentiment_to_score = {
    "Negative" : 1,
    "Neutral"  : 2,
    "Positive" : 3
}

In [139]:
RESULTS_PATH    = Path("./absa_batch")
results         = []

# Open and load the JSON file
for json_file in RESULTS_PATH.glob("*.json"):
    with open(json_file, "r", encoding="utf-8") as f:
        content = json.load(f)
        results.extend(content)

In [140]:
# Build vocabulary of all unique aspect terms
all_aspects = set()

for result in results:
    for aspect in result.get("aspect", []):
        all_aspects.add(aspect)

In [141]:
len(all_aspects)

35513

In [142]:
# Assign each aspect a column index
aspect_names = sorted(all_aspects)
aspect_vocab = {aspect: idx for idx, aspect in enumerate(aspect_names)}
n_samples    = len(results)
n_features   = len(aspect_vocab)

In [143]:
# Create sparse matrix using LIL (good for row-wise construction)
sentiment_matrix = lil_matrix((n_samples, n_features), dtype=np.float32)

# Keep track of rows with one or fewer aspect counts
nonzero_counts = []

In [144]:
# Fill the matrix
for i, result in enumerate(results):
    aspects     = result.get("aspect", [])
    sentiments  = result.get("sentiment", [])
    count       = 0
    for aspect, sentiment in zip(aspects, sentiments):
        col_idx = aspect_vocab.get(aspect)
        if col_idx is not None:
            sentiment_score = sentiment_to_score.get(sentiment, 0)
            sentiment_matrix[i, col_idx] = sentiment_score
            count += 1

    nonzero_counts.append(count)

# Convert to CSR for efficient arithmetic / storage
sentiment_matrix = sentiment_matrix.tocsr()

In [145]:
# Filter indicies
low_aspect_indicies     = [i for i, count in enumerate(nonzero_counts) if count <= 1]
high_aspect_indicies    = [i for i, count in enumerate(nonzero_counts) if count > 1]

# Create two separate matrices
low_aspect_matrix   = sentiment_matrix[low_aspect_indicies]
high_aspect_matrix  = sentiment_matrix[high_aspect_indicies]

In [146]:
'''
For single aspect and multi aspect split, let's save a copy of the raw copy to compare
'''
single_data     = data.iloc[low_aspect_indicies]
multiple_data   = data.iloc[high_aspect_indicies]

In [147]:
print(f"Single Aspect Size: {len(single_data)}")
print(f"Multiple Aspect Size: {len(multiple_data)}")

Single Aspect Size: 104247
Multiple Aspect Size: 109266


In [148]:
len(single_data)

104247

In [149]:
single_data.to_csv("./../data/single_raw.csv")
multiple_data.to_csv("./../data/multiple_raw.csv")

In [150]:
'''
Let's say:
    - "sentiment_matrix" is your sparse matrix (CSR format)
    - "data" is your original DataFrame with columns like "Score", "ProductID", "Helpfulness"
'''

# Dense columns for inclusion into sparse matrix
dense_features = ["ProductId", "Score"]

# Get dense data for each split
low_dense   = data.iloc[low_aspect_indicies][dense_features].values
high_dense  = data.iloc[high_aspect_indicies][dense_features].values

# Convert dense to sparse and stack
low_combined    = hstack([csr_matrix(low_dense), low_aspect_matrix])
high_combined   = hstack([csr_matrix(high_dense), high_aspect_matrix])

In [151]:
column_names = dense_features + aspect_names

## Save to External Dataset

In [152]:
import joblib

In [153]:
# Save both matrix and column names
joblib.dump({
    "matrix": low_combined,
    "columns": column_names
}, "single_aspect.pkl")

joblib.dump({
    "matrix": high_combined,
    "columns": column_names
}, "multi_aspect.pkl")

['multi_aspect.pkl']

In [154]:
PATH = "../cleaning/multi_aspect.pkl"

data    = joblib.load(PATH)
matrix  = data["matrix"]
columns = data["columns"]

In [155]:
matrix.shape

(109266, 35515)

In [158]:
PATH = "../cleaning/single_aspect.pkl"

data    = joblib.load(PATH)
matrix  = data["matrix"]
columns = data["columns"]

In [159]:
matrix.shape

(104247, 35515)