### Data Preparation

Data Cleaning

In [1]:
import pandas as pd

df = pd.read_csv('amazon_sales_2023.csv')

# define the categories to select
selected_categories = ['Health and Personal Care','Electronics','Home and Kitchen','Toys and Games','Clothing Shoes and Jewelry']

df = df[df['category'].isin(selected_categories)]
df.head()

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
1191062,"This sunglasses are Made in China , originals ...",1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1191063,"I ordered the lenses that fade down, and I got...",2,"03 22, 2014",negative,Clothing Shoes and Jewelry
1191064,The material of this 3-pack cotton boyshorts s...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
1191065,This product arrived this evening. One item m...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
1191066,Not sized the same as the others. I will retur...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry


Drop missing values

In [2]:
# drop missing values
print("Before Drop: ")
print(df.isnull().sum())

df = df.dropna()

print("\nAfter Drop: ")
print(df.isnull().sum())

Before Drop: 
reviewText    175
overall         0
reviewTime      0
sentiment       0
category        0
dtype: int64

After Drop: 
reviewText    0
overall       0
reviewTime    0
sentiment     0
category      0
dtype: int64


Drop duplicated

In [3]:
# drop duplicates
df = df.drop_duplicates()
print('Total Duplicated:', df.duplicated().sum())

Total Duplicated: 0


Last checking

In [4]:
# last check missing values 
print(df.isnull().sum())

# last check duplicated
print(df.duplicated().sum())

# dimensions 
print(df.shape)

reviewText    0
overall       0
reviewTime    0
sentiment     0
category      0
dtype: int64
0
(749810, 5)


Text Cleaning

In [5]:
# lowercase text
def lowercase_text(text):
    return text.lower()

# change apostrophe
def replace_apostrophe(text):
    return text.replace("’", "'")

df[['reviewText']] = df[['reviewText']].applymap(lowercase_text)
df[['reviewText']] = df[['reviewText']].applymap(replace_apostrophe)
df.head(10)

  df[['reviewText']] = df[['reviewText']].applymap(lowercase_text)
  df[['reviewText']] = df[['reviewText']].applymap(replace_apostrophe)


Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
1191062,"this sunglasses are made in china , originals ...",1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1191063,"i ordered the lenses that fade down, and i got...",2,"03 22, 2014",negative,Clothing Shoes and Jewelry
1191064,the material of this 3-pack cotton boyshorts s...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
1191065,this product arrived this evening. one item m...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
1191066,not sized the same as the others. i will retur...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry
1191067,"i have an original messenger bag, which is per...",1,"03 16, 2014",negative,Clothing Shoes and Jewelry
1191068,"hi, ken here ..well now writing a review on th...",2,"05 4, 2013",negative,Clothing Shoes and Jewelry
1191069,"looks cheap, what do you expect though for the...",1,"03 18, 2014",negative,Clothing Shoes and Jewelry
1191070,"watch arrived on time, item itself too small f...",2,"05 15, 2013",negative,Clothing Shoes and Jewelry
1191071,"good price on this shoe, but it's an older sty...",1,"02 24, 2012",negative,Clothing Shoes and Jewelry


Word segmentation

In [6]:
import wordninja

# Word Segmentation on text column
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join(wordninja.split(x)))
df.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
1191062,this sunglasses are made in china originals bo...,1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1191063,i ordered the lenses that fade down and i got ...,2,"03 22, 2014",negative,Clothing Shoes and Jewelry
1191064,the material of this 3 pack cotton boy shorts ...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
1191065,this product arrived this evening one item mis...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
1191066,not sized the same as the others i will return...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry
1191067,i have an original messenger bag which is perf...,1,"03 16, 2014",negative,Clothing Shoes and Jewelry
1191068,hi ken here well now writing a review on this ...,2,"05 4, 2013",negative,Clothing Shoes and Jewelry
1191069,looks cheap what do you expect though for the ...,1,"03 18, 2014",negative,Clothing Shoes and Jewelry
1191070,watch arrived on time item itself too small fo...,2,"05 15, 2013",negative,Clothing Shoes and Jewelry
1191071,good price on this shoe but it's an older styl...,1,"02 24, 2012",negative,Clothing Shoes and Jewelry


Expand word

In [7]:
import contractions

# expand the words like cnnt to cannot / ve to have and so on
def expand_contractions(text):

    expanded_text = contractions.fix(text)
    return expanded_text

df['reviewText'] = df['reviewText'].apply(expand_contractions)
df.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
1191062,this sunglasses are made in china originals bo...,1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1191063,i ordered the lenses that fade down and i got ...,2,"03 22, 2014",negative,Clothing Shoes and Jewelry
1191064,the material of this 3 pack cotton boy shorts ...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
1191065,this product arrived this evening one item mis...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
1191066,not sized the same as the others i will return...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry
1191067,i have an original messenger bag which is perf...,1,"03 16, 2014",negative,Clothing Shoes and Jewelry
1191068,hi ken here well now writing a review on this ...,2,"05 4, 2013",negative,Clothing Shoes and Jewelry
1191069,looks cheap what do you expect though for the ...,1,"03 18, 2014",negative,Clothing Shoes and Jewelry
1191070,watch arrived on time item itself too small fo...,2,"05 15, 2013",negative,Clothing Shoes and Jewelry
1191071,good price on this shoe but it is an older sty...,1,"02 24, 2012",negative,Clothing Shoes and Jewelry


Remove emoji and emoticons

In [9]:
import re
import emoji
from emoticons_lib import emoticons_lib

# convert emoticon with text   { :), :( }
def convert_emojis_with_text(text):
    emoticon_pattern = re.compile('|'.join(re.escape(emoticon) for emoticon in emoticons_lib.keys()))
    return emoticon_pattern.sub(lambda match: emoticons_lib[match.group(0)], text)

columns_to_transform = ['reviewText']

for column in columns_to_transform:
    # convert emoticons to text
    df[column] = df[column].apply(lambda e: convert_emojis_with_text(e))

    # convert emojis to text
    df[column] = df[column].apply(lambda e: emoji.demojize(e, language="en"))

    # replace underscore with whitespace (caused by emojis text like 😊 convert to :smiling_smiling_face ）
    df[column] = df[column].apply(lambda e: e.replace('_', ' '))

    # remove semicolon with whitespace (caused by emojis text)
    df[column] = df[column].apply(lambda e: e.replace(':', ' '))

df.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
1191062,this sunglasses are made in china originals bo...,1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1191063,i ordered the lenses that fade down and i got ...,2,"03 22, 2014",negative,Clothing Shoes and Jewelry
1191064,the material of this 3 pack cotton boy shorts ...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
1191065,this product arrived this evening one item mis...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
1191066,not sized the same as the others i will return...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry
1191067,i have an original messenger bag which is perf...,1,"03 16, 2014",negative,Clothing Shoes and Jewelry
1191068,hi ken here well now writing a review on this ...,2,"05 4, 2013",negative,Clothing Shoes and Jewelry
1191069,looks cheap what do you expect though for the ...,1,"03 18, 2014",negative,Clothing Shoes and Jewelry
1191070,watch arrived on time item itself too small fo...,2,"05 15, 2013",negative,Clothing Shoes and Jewelry
1191071,good price on this shoe but it is an older sty...,1,"02 24, 2012",negative,Clothing Shoes and Jewelry


General cleaning

In [10]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# remove words =  or > 2 characters
def remove_short_words(text):
    return ' '.join([word for word in text.split() if len(word) > 2])

# remove symbols
def remove_symbols(text):
    symbol_pattern = re.compile(r'[\(\)\[\]:]')
    return symbol_pattern.sub('', text)

# remove symbols and digits
def remove_symbols_digits(text):
    return re.sub('[^a-zA-Z\s]', ' ', text)

# remove URLs
def remove_urls(text):
    return re.sub(r'http\S+', '', text)

# remove HTML tags
def remove_html_tags(text):
    return re.sub(r'<[^>]+>', '', text)

# remove extra whitespace
def remove_whitespace(text):
    return ' '.join(text.split())

# remove punctuation
def remove_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

# remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([token for token in text.split() if token.lower() not in stop_words])

# lemmatizing text
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(token) for token in text.split()])

# list of columns to apply the functions
columns_to_transform = ['reviewText']

# apply the functions to both columns
for column in columns_to_transform:
    df[column] = df[column].apply(remove_short_words)
    df[column] = df[column].apply(remove_symbols)
    df[column] = df[column].apply(remove_symbols_digits)
    df[column] = df[column].apply(remove_urls)
    df[column] = df[column].apply(remove_html_tags)
    df[column] = df[column].apply(remove_whitespace)
    df[column] = df[column].apply(remove_punctuation)
    df[column] = df[column].apply(remove_stopwords)
    df[column] = df[column].apply(lemmatize_text)

df.head(10)

  return re.sub('[^a-zA-Z\s]', ' ', text)


Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
1191062,sunglass made china original bol sunglass fran...,1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1191063,ordered lens fade got plain black much large l...,2,"03 22, 2014",negative,Clothing Shoes and Jewelry
1191064,material pack cotton boy short stretch much ma...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
1191065,product arrived evening one item missing instr...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
1191066,sized others return get different style still ...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry
1191067,original messenger bag perfect every way ten y...,1,"03 16, 2014",negative,Clothing Shoes and Jewelry
1191068,ken well writing review product well look nice...,2,"05 4, 2013",negative,Clothing Shoes and Jewelry
1191069,look cheap expect though price center stone lo...,1,"03 18, 2014",negative,Clothing Shoes and Jewelry
1191070,watch arrived time item small taste watch litt...,2,"05 15, 2013",negative,Clothing Shoes and Jewelry
1191071,good price shoe older style part white trim tu...,1,"02 24, 2012",negative,Clothing Shoes and Jewelry


Reduce repeat characters

In [11]:
# reduce repeated characters (like wayssss to wayss or helloooooo to helloo)
def reduce_repeated_characters(text):
    return re.sub(r'(.)\1+', r'\1\1', text)

df['reviewText'] = df['reviewText'].apply(reduce_repeated_characters)
df.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
1191062,sunglass made china original bol sunglass fran...,1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1191063,ordered lens fade got plain black much large l...,2,"03 22, 2014",negative,Clothing Shoes and Jewelry
1191064,material pack cotton boy short stretch much ma...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
1191065,product arrived evening one item missing instr...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
1191066,sized others return get different style still ...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry
1191067,original messenger bag perfect every way ten y...,1,"03 16, 2014",negative,Clothing Shoes and Jewelry
1191068,ken well writing review product well look nice...,2,"05 4, 2013",negative,Clothing Shoes and Jewelry
1191069,look cheap expect though price center stone lo...,1,"03 18, 2014",negative,Clothing Shoes and Jewelry
1191070,watch arrived time item small taste watch litt...,2,"05 15, 2013",negative,Clothing Shoes and Jewelry
1191071,good price shoe older style part white trim tu...,1,"02 24, 2012",negative,Clothing Shoes and Jewelry


Spelling check

In [11]:
# define the categories to select
df_health = df.copy()
select_health = ['Health and Personal Care']
df_health = df_health[df_health['category'].isin(select_health)]

df_elec = df.copy()
select_elec = ['Electronics']
df_elec = df_elec[df_elec['category'].isin(select_elec)]

df_homenkit = df.copy()
select_homenkit = ['Home and Kitchen']
df_homenkit = df_homenkit[df_homenkit['category'].isin(select_homenkit)]

df_toyngame = df.copy()
select_toyngame= ['Toys and Games']
df_toyngame = df_toyngame[df_toyngame['category'].isin(select_toyngame)]

df_cloth = df.copy()
select_cloth = ['Clothing Shoes and Jewelry']
df_cloth = df_cloth[df_cloth['category'].isin(select_cloth)]

In [14]:
from textblob import TextBlob

# correct the spelling (like wayss to ways or helloo to hello)
def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())

# process texts in batches
def batch_process_texts(texts, batch_size=10000):
    cleaned_texts = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # process each text in the batch
        cleaned_texts.extend([correct_spelling(text) for text in batch])
    return cleaned_texts

df_health['reviewText'] = batch_process_texts(df_health['reviewText'].tolist())
df_health.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
0,used product immediately threw trash briefly c...,1,"03 25, 2013",negative,Health and Personal Care
1,read review bought got gal bladder ten year ag...,2,"02 27, 2014",negative,Health and Personal Care
2,doubt quality product cannot handle taste good...,2,"06 24, 2014",negative,Health and Personal Care
3,first time used product water oil leaned bowl ...,2,"11 14, 2013",negative,Health and Personal Care
4,understand amazon allows company post face rev...,1,"06 27, 2013",negative,Health and Personal Care
5,product get even near edge lint sundry boy pro...,1,"12 22, 2013",negative,Health and Personal Care
6,hard adjust put hand head strap go around back...,1,"12 18, 2012",negative,Health and Personal Care
7,would expect price purchased much higher quali...,2,"08 16, 2013",negative,Health and Personal Care
8,work husband arch much high foot hard,1,"05 31, 2014",negative,Health and Personal Care
9,first could take loss weight tried start lower...,1,"02 14, 2011",negative,Health and Personal Care


In [15]:
from textblob import TextBlob

# correct the spelling (like wayss to ways or helloo to hello)
def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())

# process texts in batches
def batch_process_texts(texts, batch_size=10000):
    cleaned_texts = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # process each text in the batch
        cleaned_texts.extend([correct_spelling(text) for text in batch])
    return cleaned_texts

df_elec['reviewText'] = batch_process_texts(df_elec['reviewText'].tolist())
df_elec.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
0,ordered three ring album adored camera sent sc...,1,"08 17, 2007",negative,Electronics
1,motorola analogy telephone adapted voice quali...,1,"01 23, 2005",negative,Electronics
2,bought play walking exercising unable purpose ...,2,"04 22, 2004",negative,Electronics
3,plastic causing surrounding input computer cam...,2,"06 11, 2013",negative,Electronics
4,ate two video tape horrible job auto setting c...,1,"01 15, 2005",negative,Electronics
5,small print remote difficult see especially da...,2,"11 29, 2005",negative,Electronics
6,got system mail time took three week get free ...,1,"02 25, 2002",negative,Electronics
7,miracle get device running install software co...,1,"08 15, 2005",negative,Electronics
8,bought called connect single monitor trip like...,2,"02 4, 2014",negative,Electronics
9,bought outer previous link outer kept resectin...,1,"12 30, 2011",negative,Electronics


In [16]:
from textblob import TextBlob

# correct the spelling (like wayss to ways or helloo to hello)
def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())

# process texts in batches
def batch_process_texts(texts, batch_size=10000):
    cleaned_texts = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # process each text in the batch
        cleaned_texts.extend([correct_spelling(text) for text in batch])
    return cleaned_texts

df_homenkit['reviewText'] = batch_process_texts(df_homenkit['reviewText'].tolist())
df_homenkit.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
0,bought teenage daughter room loved couple day ...,1,"08 25, 2011",negative,Home and Kitchen
1,re lie red product poor condition miss manufac...,1,"11 28, 2008",negative,Home and Kitchen
2,used shower curtain month along entire bottom ...,1,"03 30, 2014",negative,Home and Kitchen
3,lost brain mixed flood trying save buck bought...,2,"12 13, 2008",negative,Home and Kitchen
4,product size description come dis pointed expe...,1,"07 4, 2013",negative,Home and Kitchen
5,suppose wood nightstand instead june plastic n...,1,"07 13, 2010",negative,Home and Kitchen
6,really need small ice cure tray fine bought se...,2,"10 20, 2008",negative,Home and Kitchen
7,okay used lit mill grew unfortunately stopped ...,2,"06 16, 2010",negative,Home and Kitchen
8,description incorrect separated plastic glass ...,1,"01 25, 2014",negative,Home and Kitchen
9,summary good entry level can her vacuum people...,2,"07 20, 2012",negative,Home and Kitchen


In [17]:
from textblob import TextBlob

# correct the spelling (like wayss to ways or helloo to hello)
def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())

# process texts in batches
def batch_process_texts(texts, batch_size=10000):
    cleaned_texts = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # process each text in the batch
        cleaned_texts.extend([correct_spelling(text) for text in batch])
    return cleaned_texts

df_toyngame['reviewText'] = batch_process_texts(df_toyngame['reviewText'].tolist())
df_toyngame.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
0,old daughter got doll christmas love play hair...,1,"02 1, 2013",negative,Toys and Games
1,kid uncle bought liked couple week sound cocke...,2,"12 29, 2013",negative,Toys and Games
2,purchased different tablecloth could alternati...,1,"03 25, 2014",negative,Toys and Games
3,rip daughter got set birthday get lot air box ...,1,"10 6, 2011",negative,Toys and Games
4,first child liked sent ken doll peter pan joe ...,2,"05 1, 2014",negative,Toys and Games
5,spending lot time constructing elaborate track...,2,"12 29, 2010",negative,Toys and Games
6,excited set month daughter love swing slide ho...,1,"06 18, 2014",negative,Toys and Games
7,mother bought youngest daughter last christmas...,1,"10 2, 2009",negative,Toys and Games
8,supposed two pack puzzle fearing two different...,2,"01 13, 2013",negative,Toys and Games
9,doll moved made sound right taken packing soon...,1,"05 12, 2013",negative,Toys and Games


In [18]:
from textblob import TextBlob

# correct the spelling (like wayss to ways or helloo to hello)
def correct_spelling(text):
    blob = TextBlob(text)
    return str(blob.correct())

# process texts in batches
def batch_process_texts(texts, batch_size=10000):
    cleaned_texts = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # process each text in the batch
        cleaned_texts.extend([correct_spelling(text) for text in batch])
    return cleaned_texts

df_cloth['reviewText'] = batch_process_texts(df_cloth['reviewText'].tolist())
df_cloth.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
0,spyglass made china original boy spyglass fran...,1,"04 22, 2014",negative,Clothing Shoes and Jewelry
1,ordered lens fade got plain black much large l...,2,"03 22, 2014",negative,Clothing Shoes and Jewelry
2,material pack cotton boy short stretch much ma...,2,"07 10, 2013",negative,Clothing Shoes and Jewelry
3,product arrived evening one item missing instr...,2,"11 22, 2011",negative,Clothing Shoes and Jewelry
4,sized others return get different style still ...,2,"02 24, 2014",negative,Clothing Shoes and Jewelry
5,original messenger bag perfect every way ten y...,1,"03 16, 2014",negative,Clothing Shoes and Jewelry
6,ken well writing review product well look nice...,2,"05 4, 2013",negative,Clothing Shoes and Jewelry
7,look cheap expect though price center stone lo...,1,"03 18, 2014",negative,Clothing Shoes and Jewelry
8,watch arrived time item small taste watch litt...,2,"05 15, 2013",negative,Clothing Shoes and Jewelry
9,good price shoe older style part white trim tu...,1,"02 24, 2012",negative,Clothing Shoes and Jewelry


In [19]:
df = pd.concat([df_health, df_elec, df_homenkit, df_toyngame, df_cloth])
df.reset_index(drop=True, inplace=True)

df.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
0,used product immediately threw trash briefly c...,1,"03 25, 2013",negative,Health and Personal Care
1,read review bought got gal bladder ten year ag...,2,"02 27, 2014",negative,Health and Personal Care
2,doubt quality product cannot handle taste good...,2,"06 24, 2014",negative,Health and Personal Care
3,first time used product water oil leaned bowl ...,2,"11 14, 2013",negative,Health and Personal Care
4,understand amazon allows company post face rev...,1,"06 27, 2013",negative,Health and Personal Care
5,product get even near edge lint sundry boy pro...,1,"12 22, 2013",negative,Health and Personal Care
6,hard adjust put hand head strap go around back...,1,"12 18, 2012",negative,Health and Personal Care
7,would expect price purchased much higher quali...,2,"08 16, 2013",negative,Health and Personal Care
8,work husband arch much high foot hard,1,"05 31, 2014",negative,Health and Personal Care
9,first could take loss weight tried start lower...,1,"02 14, 2011",negative,Health and Personal Care


In [20]:
# after check spelling, contains duplicated and missing values （Important as remove english function cannot have missing values）
print('\nBefore drop')
print(df.isnull().sum()) 

df.dropna(inplace=True)

print('\nAfter drop')
print(df.isnull().sum())


Before drop
reviewText    121
overall         0
reviewTime      0
sentiment       0
category        0
dtype: int64

After drop
reviewText    0
overall       0
reviewTime    0
sentiment     0
category      0
dtype: int64


Remove non-English

In [21]:
from nltk.corpus import words
from nltk.tokenize import word_tokenize

# tokenize text
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# remove any non-English words
english_words = set(words.words())
def remove_non_english(tokens):
    english_tokens = []
    for word in tokens:
        if word.lower() in english_words:
            english_tokens.append(word)
    return english_tokens

# preprocess a column
def preprocess_column(column):
    column = column.apply(tokenize_text)
    column = column.apply(remove_non_english)
    column = column.apply(lambda tokens: ' '.join(tokens))
    
    return column

df['reviewText'] = preprocess_column(df['reviewText'])
df.head(10)

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
0,used product immediately threw trash briefly c...,1,"03 25, 2013",negative,Health and Personal Care
1,read review bought got gal bladder ten year ag...,2,"02 27, 2014",negative,Health and Personal Care
2,doubt quality product can not handle taste goo...,2,"06 24, 2014",negative,Health and Personal Care
3,first time used product water oil bowl tried h...,2,"11 14, 2013",negative,Health and Personal Care
4,understand company post face review entire pur...,1,"06 27, 2013",negative,Health and Personal Care
5,product get even near edge lint sundry boy pro...,1,"12 22, 2013",negative,Health and Personal Care
6,hard adjust put hand head strap go around back...,1,"12 18, 2012",negative,Health and Personal Care
7,would expect price much higher quality glass h...,2,"08 16, 2013",negative,Health and Personal Care
8,work husband arch much high foot hard,1,"05 31, 2014",negative,Health and Personal Care
9,first could take loss weight tried start lower...,1,"02 14, 2011",negative,Health and Personal Care


In [22]:
# check missing values 
print(df.isnull().sum())

reviewText    0
overall       0
reviewTime    0
sentiment     0
category      0
dtype: int64


In [23]:
# check duplicated
print(df.duplicated().sum())

170


In [24]:
# check one of the duplicated
duplicated = df[df.duplicated(keep=False)]
duplicated

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
280,small,1,"07 15, 2014",negative,Health and Personal Care
1990,waste money,1,"07 13, 2014",negative,Health and Personal Care
2600,work,1,"07 10, 2014",negative,Health and Personal Care
3509,work,1,"07 19, 2014",negative,Health and Personal Care
3714,work,1,"07 20, 2014",negative,Health and Personal Care
...,...,...,...,...,...
745202,great fit,5,"07 8, 2014",positive,Clothing Shoes and Jewelry
746638,perfect,5,"07 11, 2014",positive,Clothing Shoes and Jewelry
747249,perfect,5,"07 14, 2014",positive,Clothing Shoes and Jewelry
747592,love,5,"07 18, 2014",positive,Clothing Shoes and Jewelry


In [25]:
# drop duplicates
df = df.drop_duplicates()
print('Total Duplicated:', df.duplicated().sum())

Total Duplicated: 0


In [26]:
# check empty string in text column
df[df['reviewText'] == '']

Unnamed: 0,reviewText,overall,reviewTime,sentiment,category
3624,,1,"07 4, 2014",negative,Health and Personal Care
5632,,2,"07 8, 2014",negative,Health and Personal Care
6610,,2,"07 22, 2014",negative,Health and Personal Care
21541,,2,"07 7, 2014",negative,Health and Personal Care
22168,,2,"07 12, 2014",negative,Health and Personal Care
...,...,...,...,...,...
733879,,5,"06 27, 2014",positive,Clothing Shoes and Jewelry
741200,,5,"07 8, 2014",positive,Clothing Shoes and Jewelry
744028,,4,"07 17, 2014",positive,Clothing Shoes and Jewelry
746475,,4,"07 6, 2014",positive,Clothing Shoes and Jewelry


In [27]:
# filter out empty string
df = df[df['reviewText'] != '']

Last check

In [28]:
# last check missing values 
print(df.isnull().sum())

# last check duplicated
print(df.duplicated().sum())

#last check empty string
df[df['reviewText'] == '']

# dimensions 
print(df.shape)

reviewText    0
overall       0
reviewTime    0
sentiment     0
category      0
dtype: int64
0
(749402, 5)


In [29]:
df.to_csv('amazon_sales_2023_cleaned.csv', index=False)