# Data Cleaning

In [2]:
'''
Import required packages and libraries for data exploration
'''
import pandas as pd

In [6]:
'''
Set up file path and data handling objects
'''
PATH = "../data/reviews.csv"
data = pd.read_csv(PATH)

## Remove Irrelevant Data Points
The first stage of data cleaning is to identify and remove data points that aren't related to our task. In "Amazon Fine Food Reviews", we have many different product reviews including: pet food, medicine, microwavable food, fine foods, etc.
- Is this category of food or type of review relevant to our task?
- Would removing this type of review from the data improve the accuracy of our model?
- If we remove this type of review, how will it effect our training process (would there be too little data remaining?)

## Remove Uncecessary Columns
- What columns are necessary for our model? 
- Is there anything that needs to be removed?

## Case Sensitivity
Convert the input features in the raw dataset into a case insensitive format (all lowercase/uppercase) to reduce the amount of distinct words in the data.

In [8]:
# Convert text features to lowercase
data['Summary'] = data['Summary'].str.lower()
data['Text'] = data['Text'].str.lower()

# Verify the conversion
print("Sample of converted Summary:")
print(data['Summary'].head())
print("\nSample of converted Text:")
print(data['Text'].head())

Sample of converted Summary:
0    good quality dog food
1        not as advertised
2    "delight" says it all
3           cough medicine
4              great taffy
Name: Summary, dtype: object

Sample of converted Text:
0    i have bought several of the vitality canned d...
1    product arrived labeled as jumbo salted peanut...
2    this is a confection that has been around a fe...
3    if you are looking for the secret ingredient i...
4    great taffy at a great price.  there was a wid...
Name: Text, dtype: object


## Remove Filler Words
Some words like "I", "the", "a", etc. don't impact the sentiment of the text content. Remove these words from all review content so there is less redundant features for the final model.

In [14]:
import nltk
from nltk.corpus import stopwords    

print("Downloading NLTK stopwords...")
nltk.download('stopwords', quiet=True)
    
stop_words = set(stopwords.words('english'))

# Print sample of stopwords
print("\nSample of English stopwords:")
print(sorted(list(stop_words))[:10])  # Print first 10 stopwords

def remove_stopwords_from_text(text):
    if pd.isna(text):
        return text
    words = text.lower().split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

# Process both columns
print("Removing stop words from Summary...")
data['Summary'] = data['Summary'].apply(remove_stopwords_from_text)

print("Removing stop words from Text...")
data['Text'] = data['Text'].apply(remove_stopwords_from_text)

# Print samples for verification
print("\nSample of processed Summary:")
print(data['Summary'].head())
print("\nSample of processed Text:")
print(data['Text'].head())
    

Downloading NLTK stopwords...

Sample of English stopwords:
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']
Removing stop words from Summary...
Removing stop words from Text...

Sample of processed Summary:
0    good quality dog food
1               advertised
2           "delight" says
3           cough medicine
4              great taffy
Name: Summary, dtype: object

Sample of processed Text:
0    bought several vitality canned dog food produc...
1    product arrived labeled jumbo salted peanuts.....
2    confection around centuries. light, pillowy ci...
3    looking secret ingredient robitussin believe f...
4    great taffy great price. wide assortment yummy...
Name: Text, dtype: object


## Punctuation Handling
Some words that contain punctuation can be recorded as separate features without punctuation handling (e.g., "Steve's pizza is great!" and "Steve makes great pizza!").

| is | great | great! | makes | pizza | pizza! | Steve | Steve's |
|----|-------|--------|-------|-------|--------|-------|---------|
|1   | 1     | 1      | 1     | 1     | 1      | 1     | 1       |

We want to remove uncessesary punctuation so that we don't have duplicates of effectively the same word.
| is | great | makes | pizza | Steve |
|----|-------|-------|-------|-------|
| 1  | 2     | 1     | 2     | 2     |

Doing this prevents our model from interpreting duplicate words as two separate features and reduces the number of dimensions our model has to process (increasing efficiency).

In [15]:
import string
import re
    
# Function to remove punctuation from text
def clean_text(text):
    if pd.isna(text):
        return text
    
    # Remove other punctuation
    text = ''.join(char for char in text if char not in string.punctuation)
    
    # Handle multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()
    
# Process both columns
print("Removing punctuation from Summary...")
data['Summary'] = data['Summary'].apply(clean_text)

print("Removing punctuation from Text...")
data['Text'] = data['Text'].apply(clean_text)

    # Print samples for verification
print("\nSample of processed Summary:")
print(data['Summary'].head())
print("\nSample of processed Text:")
print(data['Text'].head())

Removing punctuation from Summary...
Removing punctuation from Text...

Sample of processed Summary:
0    good quality dog food
1               advertised
2             delight says
3           cough medicine
4              great taffy
Name: Summary, dtype: object

Sample of processed Text:
0    bought several vitality canned dog food produc...
1    product arrived labeled jumbo salted peanutsth...
2    confection around centuries light pillowy citr...
3    looking secret ingredient robitussin believe f...
4    great taffy great price wide assortment yummy ...
Name: Text, dtype: object


## Dependency Parsing Split
In this section we need to split the dataset into single entity and multiple entity data points. This step is necessary because the framework for our model requires that single entity data points are handled by **model A** and multiple entity data points are handled by **model B**.

In [30]:
import pandas as pd

food_aspects = {
    'taste': ['taste', 'flavor', 'flavour', 'delicious', 'yummy', 'tasty', 'sweet', 'sour', 'bitter', 'spicy'],
    'delivery': ['delivery', 'shipping', 'arrived', 'received', 'package', 'shipment'],
    'quality': ['quality', 'fresh', 'freshness', 'premium', 'grade', 'standard'],
    'price': ['price', 'cost', 'expensive', 'cheap', 'affordable', 'value', 'worth'],
    'packaging': ['packaging', 'package', 'container', 'box', 'seal', 'wrapped'],
    'service': ['service', 'customer service', 'support', 'help', 'assistance'],
    'texture': ['texture', 'consistency', 'soft', 'hard', 'crunchy', 'smooth', 'creamy'],
    'appearance': ['appearance', 'look', 'color', 'colour', 'shape', 'size']
}

def count_aspects(text):
    """Count the number of food-related aspects mentioned in the text."""
    if pd.isna(text):
        return 0
    
    text = text.lower()
    aspects_found = set()
    
    for aspect, keywords in food_aspects.items():
        for keyword in keywords:
            if keyword in text:
                aspects_found.add(aspect)
                break
    
    return len(aspects_found)

# Count aspects in each text
print("Analyzing aspects in Summary...")
data['summary_aspect_count'] = data['Summary'].apply(count_aspects)

print("Analyzing aspects in Text...")
data['text_aspect_count'] = data['Text'].apply(count_aspects)

# Split the data
print("\nSplitting data into single and multiple aspect sets...")

# Single aspect: either Summary or Text mentions exactly one aspect
single_aspect_mask = (data['summary_aspect_count'] <= 1) | (data['text_aspect_count'] <= 1)
single_aspect_data = data[single_aspect_mask].copy()

# Multiple aspect: both Summary and Text mention more than one aspect
multiple_aspect_mask = (data['summary_aspect_count'] > 1) & (data['text_aspect_count'] > 1)
multiple_aspect_data = data[multiple_aspect_mask].copy()


# Remove temporary columns before saving
single_aspect_data = single_aspect_data.drop(['summary_aspect_count', 'text_aspect_count'], axis=1)
multiple_aspect_data = multiple_aspect_data.drop(['summary_aspect_count', 'text_aspect_count'], axis=1)

# Save the split datasets
single_entity_file = "../data/reviews_single_aspect.csv"
multiple_entity_file = "../data/reviews_multiple_aspect.csv"
print(f"Saving single aspect data to {single_entity_file}...")
single_aspect_data.to_csv(single_entity_file, index=False)

print(f"Saving multiple aspect data to {multiple_entity_file}...")
multiple_aspect_data.to_csv(multiple_entity_file, index=False)

Analyzing aspects in Summary...
Analyzing aspects in Text...

Splitting data into single and multiple aspect sets...
Saving single aspect data to ../data/reviews_single_aspect.csv...
Saving multiple aspect data to ../data/reviews_multiple_aspect.csv...


## Word Embedding