### Data Cleaning

#### 1. Load the dependencies

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import spacy
import string
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

#### 2. Load the data

In [5]:
# Raw review export, resolved relative to the notebook's working directory
raw_reviews_path = 'Amazon_Food_Reviews.csv'
df = pd.read_csv(raw_reviews_path)

In [6]:
# Transpose the first record so every column shows up as its own row
df.iloc[:1].T

Unnamed: 0,0
Id,1
ProductId,B001E4KFG0
UserId,A3SGXH7AUHU8GW
ProfileName,delmartian
HelpfulnessNumerator,1
HelpfulnessDenominator,1
Score,5
Time,1303862400
Summary,Good Quality Dog Food
Text,I have bought several of the Vitality canned d...


#### 3. Handling missing values

In [8]:
# Tally the null entries in every column before any rows are dropped
missing_values = df.isnull().sum()
missing_values

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [9]:
# Discard the reviews that lack a profile name or a summary
required_columns = ['ProfileName', 'Summary']
df = df.dropna(subset=required_columns)

In [10]:
# Recount nulls per column to confirm the incomplete rows are gone
missing_values = df.isnull().sum()
missing_values

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

In [11]:
# `Time` is read as seconds since the Unix epoch and stored as timestamps
review_times = pd.to_datetime(df['Time'], unit='s')
df['Time_converted'] = review_times
df['Time_converted']

0        2011-04-27
1        2012-09-07
2        2008-08-18
3        2011-06-13
4        2012-10-21
            ...    
568449   2011-03-09
568450   2012-03-09
568451   2012-02-21
568452   2012-03-13
568453   2012-05-31
Name: Time_converted, Length: 568401, dtype: datetime64[ns]

#### 4. Stratified sampling and lemmatization

In [13]:
# Target number of reviews in the working sample
total_sample_size = 5000

# Population size of each Score stratum
stratum_counts = df['Score'].value_counts()

# Proportional allocation using the largest-remainder method: floor every
# stratum's ideal share, then hand the leftover slots to the strata with the
# largest fractional parts. Rounding each share independently can leave the
# total above or below `total_sample_size`; this always hits it exactly.
exact_sizes = stratum_counts / stratum_counts.sum() * total_sample_size
sample_sizes = np.floor(exact_sizes).astype(int)
shortfall = int(total_sample_size - sample_sizes.sum())
if shortfall > 0:
    top_up = (exact_sizes - sample_sizes).nlargest(shortfall).index
    sample_sizes.loc[top_up] += 1

# Draw each stratum independently; the fixed seed keeps the sample
# reproducible across kernel restarts.
samples = [
    df[df['Score'] == score].sample(n=size, random_state=1)
    for score, size in sample_sizes.items()
]

# Stack the per-stratum draws into one DataFrame (original row index is kept)
final_sample = pd.concat(samples)

In [14]:
# Check the per-score make-up of the drawn sample
stratum_counts = final_sample.loc[:, 'Score'].value_counts()
stratum_counts

Score
5    3194
4     709
1     460
3     375
2     262
Name: count, dtype: int64

In [15]:
# spaCy English pipeline used by the lemmatization and tokenization helpers
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

def lemmatize_batch(texts):
    """Lemmatize each text and return one space-joined lemma string per input.

    The texts are streamed through the module-level spaCy pipeline ``nlp`` in
    batches of 100 with the "ner" and "parser" components disabled.
    """
    lemma_strings = []
    for parsed in nlp.pipe(texts, disable=["ner", "parser"], batch_size=100):
        lemma_strings.append(" ".join(tok.lemma_ for tok in parsed))
    return lemma_strings

# Lemmatize the sampled review bodies in a single batched pass
review_texts = final_sample['Text'].tolist()
final_sample['lemmas'] = lemmatize_batch(review_texts)

# Show the raw text next to its lemmatized form
print(final_sample[['Text', 'lemmas']])

                                                     Text  \
519868  I love these chips! This is the best snack eve...   
518974  This is a really good chocolate candy, has jus...   
55123   My wife has a male cat who throws up EVERYTHIN...   
508510  This entire product line is absolutely great! ...   
155616  I use this to keep my blood level stable all d...   
...                                                   ...   
226450  As with Mr. Carlson's review, my order was als...   
437066  I've been ordering a lot of these. Many of the...   
366627  The Dogswell Happy Hips Sweet Potato Chews I b...   
230944  Just so you understand me, my last meal on ear...   
126886  These traps work and they require less excavat...   

                                                   lemmas  
519868  I love these chip ! this be the good snack eve...  
518974  this be a really good chocolate candy , have j...  
55123   my wife have a male cat who throw up everythin...  
508510  this entire product

#### 5. Tokenization

In [17]:
def tokenize_batch(texts):
    """Tokenize texts into lowercased tokens without punctuation or stop words.

    Args:
        texts: Iterable of strings to tokenize with the module-level spaCy
            pipeline ``nlp``.

    Returns:
        A list with one entry per input text; each entry is the list of
        lowercased token strings that survived filtering.
    """
    punctuation_chars = set(string.punctuation)

    def keep(token):
        text = token.text
        # Whitespace-only (or empty) tokens carry no content.
        if not text.strip():
            return False
        # Check character by character: a substring test against
        # `string.punctuation` lets multi-character tokens such as "..."
        # or "--" slip through.
        if all(ch in punctuation_chars for ch in text):
            return False
        return not token.is_stop

    # Only the token text and the lexical `is_stop` flag are read, so the
    # tokenizer alone is enough; skipping the remaining pipeline components
    # avoids needless tagging/lemmatizing work.
    docs = nlp.tokenizer.pipe(texts, batch_size=100)
    return [[token.text.lower() for token in doc if keep(token)] for doc in docs]

def clean_text(text):
    """Strip every character that is neither an ASCII letter nor whitespace.

    Whitespace is kept so the result can still be split into tokens.
    """
    non_letter_pattern = re.compile(r'[^a-zA-Z\s]')
    return non_letter_pattern.sub('', text)

# The lemmas were already computed in the lemmatization step; the spaCy pass
# is expensive, so only recompute when the column is missing (e.g. when this
# cell is executed on its own).
if 'lemmas' not in final_sample.columns:
    final_sample['lemmas'] = lemmatize_batch(final_sample['Text'].tolist())

# Keep only letters and whitespace in the lemma strings
final_sample['cleaned_lemmas'] = final_sample['lemmas'].apply(clean_text)

# Tokenize the cleaned lemmas (lowercased, stop words removed)
final_sample['tokens'] = tokenize_batch(final_sample['cleaned_lemmas'].tolist())

# Trailing expression gives the notebook's rich table display of the
# original text, lemmas and tokens
final_sample[['Text', 'lemmas', 'tokens']]

                                                     Text  \
519868  I love these chips! This is the best snack eve...   
518974  This is a really good chocolate candy, has jus...   
55123   My wife has a male cat who throws up EVERYTHIN...   
508510  This entire product line is absolutely great! ...   
155616  I use this to keep my blood level stable all d...   
...                                                   ...   
226450  As with Mr. Carlson's review, my order was als...   
437066  I've been ordering a lot of these. Many of the...   
366627  The Dogswell Happy Hips Sweet Potato Chews I b...   
230944  Just so you understand me, my last meal on ear...   
126886  These traps work and they require less excavat...   

                                                   lemmas  \
519868  I love these chip ! this be the good snack eve...   
518974  this be a really good chocolate candy , have j...   
55123   my wife have a male cat who throw up everythin...   
508510  this entire pro

In [18]:
def tokenize(text):
    """Lowercase ``text`` and return its runs of word characters, in order."""
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\b\w+\b', lowered)]

# Regex-based tokens taken straight from the raw review text (no lemmatization)
final_sample['tokens_simple'] = final_sample['Text'].map(tokenize)

In [19]:
# Transpose the first sampled record to inspect all of its columns
final_sample.iloc[:1].T

Unnamed: 0,519868
Id,519869
ProductId,B000YSTIL0
UserId,A3R8GMMQBX0OAD
ProfileName,Ma Princesse
HelpfulnessNumerator,0
HelpfulnessDenominator,0
Score,5
Time,1272067200
Summary,Best Snack Ever!!
Text,I love these chips! This is the best snack eve...


In [20]:
# Persist the sample, including the derived text columns, as CSV.
# NOTE(review): the list-valued token columns are presumably written as their
# string representation and would need parsing when read back -- confirm.
cleaned_reviews_path = 'Amazon_Food_Reviews_cleaned.csv'
final_sample.to_csv(cleaned_reviews_path)