In [38]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from num2words import num2words

In [3]:
# Download the NLTK data packages used by this notebook:
#   'punkt'     - presumably the tokenizer models behind word_tokenize
#   'stopwords' - the stopword corpus (the English list is read later on)
#   'wordnet'   - presumably the dictionary data behind WordNetLemmatizer
# The last call is the final expression of the cell, so its return value is
# what the cell displays as its output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\57317\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\57317\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\57317\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 1. Dataset Exploration

In [25]:
# Read the reviews CSV into the working DataFrame used by every later cell
df = pd.read_csv(filepath_or_buffer="./data/amazon.csv")

In [None]:
# Add a column holding the length (in characters) of each review.
# The vectorized .str.len() accessor is used instead of .apply(len): it gives
# the same counts for strings, but a missing review yields NaN instead of
# raising TypeError.
df['text_length'] = df['Text'].str.len()
print("\nText length statistics:")
print(df['text_length'].describe())


Text length statistics:
count    19996.000000
mean       175.787257
std         58.964840
min          3.000000
25%        123.000000
50%        164.000000
75%        254.000000
max        254.000000
Name: text_length, dtype: float64


In [27]:
# Dataset shape and info
print("Shape of dataset:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line to
# the cell output.
df.info()
print("\nSample rows:")
print(df.head())

Shape of dataset: (19996, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19996 entries, 0 to 19995
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Text         19996 non-null  object
 1   label        19996 non-null  int64 
 2   text_length  19996 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 468.8+ KB
None

Sample rows:
                                                Text  label  text_length
0  This is  the best apps acording to a bunch of ...      1          121
1  This is a pretty good version of the game for ...      1          129
2  this is a really . there are a bunch of levels...      1           87
3  This is a silly game and can be frustrating, b...      1          105
4  This is a terrific game on any pad. Hrs of fun...      1          117


## 2. Text Cleaning

**Importance of lowercasing for text preprocessing:**

Lowercasing unifies words so that the same word is not treated as several different tokens merely because of capitalization (e.g., "Game" and "game").

With lowercasing, words are treated consistently. As a result, it improves the quality, accuracy and efficiency of text analysis.

In [29]:
# a) Lowercase every review; the result becomes the working 'clean' column
df['clean'] = df['Text'].str.lower()
print("\nLowercased sample:", df['clean'].head(), sep="\n")


Lowercased sample:
0    this is  the best apps acording to a bunch of ...
1    this is a pretty good version of the game for ...
2    this is a really . there are a bunch of levels...
3    this is a silly game and can be frustrating, b...
4    this is a terrific game on any pad. hrs of fun...
Name: clean, dtype: object


**Trade-offs of punctuation removal:**

Punctuation usually doesn't carry meaning. Removing it simplifies the text and makes it cleaner, which makes it easier to process. It also improves consistency during tokenization and reduces vocabulary size, since punctuation attached to words creates duplicates.

However, emotional or contextual information and grammatical structure may be lost, since punctuation can express strong sentiment, emphasis and sentence structure.

Whether to remove punctuation depends on the goal. For simple frequency analysis or classification, removing it is fine. But if tone, emotion or sentence structure is important, certain punctuation marks might be kept.


In [31]:
# b) Remove punctuation
def remove_punct(text):
    """Return `text` with every character that is neither a word character
    nor whitespace deleted (nothing is inserted in its place).

    Word characters include letters, digits and the underscore, so those are
    kept as-is.
    """
    non_word_non_space = re.compile(r'[^\w\s]')
    return non_word_non_space.sub('', text)

# Strip punctuation from the lowercased reviews, overwriting the 'clean' column
df['clean'] = df['clean'].apply(remove_punct)
print("\nAfter punctuation removal:", df['clean'].head(), sep="\n")


After punctuation removal:
0    this is  the best apps acording to a bunch of ...
1    this is a pretty good version of the game for ...
2    this is a really  there are a bunch of levels ...
3    this is a silly game and can be frustrating bu...
4    this is a terrific game on any pad hrs of fun ...
Name: clean, dtype: object


**Stop words removal:**
Stopwords are common words in a language that appear very frequently but carry little meaningful information on their own.

These words are essential for grammar, but usually do not help distinguish topics, sentiment, or meaning in text analysis.

Removing stop words reduces noise in the data, decreases vocabulary size, improves model focus on meaningful words and simplifies text for tasks such as tokenization, vectorization or text embeddings.

In [42]:
# c) Remove stopwords
# Rationale: Stopwords carry little semantic meaning
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop every whitespace-separated word of `text` that is in `stop_words`
    and rejoin the remaining words with single spaces."""
    kept_words = filter(lambda word: word not in stop_words, text.split())
    return " ".join(kept_words)


# NOTE(review): punctuation has already been stripped from 'clean' at this
# point, so contractions arrive without their apostrophe (e.g. "dont");
# confirm that such forms are matched by the stopword list.
df['clean'] = df['clean'].apply(remove_stopwords)
print("\nAfter stopword removal:", df['clean'].head(), sep="\n")



After stopword removal:
0    best apps acording bunch people agree bombs eg...
1    pretty good version game free lots different l...
2       really bunch levels find golden eggs super fun
3    silly game frustrating lots fun definitely rec...
4    terrific game pad hrs fun grandkids love great...
Name: clean, dtype: object


**Tokenization:**

Tokenization is the process of splitting text into smaller units, called tokens. Tokens are usually words, but they can also be subwords, characters, or sentences depending on the task.

It enables analysis at the word level, prepares text for preprocessing steps, reduces complexity for machine learning algorithms and helps in handling punctuation and special characters.

In [43]:
# d) Tokenization: split each cleaned review into a list of tokens
df['tokens'] = df['clean'].apply(word_tokenize)
print("\nTokenized sample:", df['tokens'].head(), sep="\n")


Tokenized sample:
0    [best, apps, acording, bunch, people, agree, b...
1    [pretty, good, version, game, free, lots, diff...
2    [really, bunch, levels, find, golden, eggs, su...
3    [silly, game, frustrating, lots, fun, definite...
4    [terrific, game, pad, hrs, fun, grandkids, lov...
Name: tokens, dtype: object


## 3. Text Normalization

**Lemmatization:**

Lemmatization is the process of reducing words to their base or dictionary form, considering context and part of speech (e.g., "running" → "run", "better" → "good"). 

Its main advantage is that it produces real, meaningful words, reduces vocabulary redundancy, and improves the consistency of NLP tasks like sentiment analysis or topic modeling. 

However, it is slower than stemming, depends on accurate part-of-speech tagging, may not handle slang or misspellings, and requires external libraries or dictionaries.

In [44]:
# a) Lemmatization: map every token of every review to its lemma
# NOTE(review): lemmatize() is called with the token only (no part-of-speech
# argument) - check which POS NLTK assumes in that case; forms such as
# "frustrating" appear unchanged in the sample output.
lemmatizer = WordNetLemmatizer()
df['lemmas'] = df['tokens'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)
print("\nAfter lemmatization:", df['lemmas'].head(), sep="\n")


After lemmatization:
0    [best, apps, acording, bunch, people, agree, b...
1    [pretty, good, version, game, free, lot, diffe...
2    [really, bunch, level, find, golden, egg, supe...
3    [silly, game, frustrating, lot, fun, definitel...
4    [terrific, game, pad, hr, fun, grandkids, love...
Name: lemmas, dtype: object


**Handling emojis and slangs:**

Handling emojis and slang in text data can be approached in different ways. Each of them has pros and cons.

**Pros and cons:**

**Replacing with descriptive words:** Preserves meaning and sentiment, but requires comprehensive mappings and may not cover all cases.

**Ignoring them:** Easy to implement and reduces noise, but valuable information about emotion or emphasis is lost.

**Using libraries:** Automates handling and covers many cases, but adds dependencies and may still miss rare or creative expressions.

In [45]:
# b) Handling slangs
# Lookup table: slang / abbreviated token -> expanded form.
# A replacement may consist of several words (e.g. "omg").
# NOTE(review): "&" is removed by the earlier punctuation-stripping step, so
# that entry is presumably never hit in the current pipeline - confirm.
words_mapping = {
    "hrs": "hours",
    "hr": "hour",
    "&": "and",
    "ad": "advertising",
    "omg": "oh my god",
    "fav": "favorite",
}


def norm_text(tok_list):
    """Expand known slang/abbreviations in a list of tokens.

    Parameters
    ----------
    tok_list : list of str
        Tokens of a single review.

    Returns
    -------
    list of str
        The tokens in their original order, with every token found in
        `words_mapping` replaced by its expansion. A multi-word expansion is
        split on whitespace and contributes one token per word, so the result
        never contains a token with embedded spaces. Tokens that are not in
        the mapping are passed through untouched.
    """
    normalized = []
    for token in tok_list:
        if token in words_mapping:
            # extend (not append): "oh my god" must become three tokens
            normalized.extend(words_mapping[token].split())
        else:
            normalized.append(token)
    return normalized

# Expand slang in the lemmatized tokens and keep the result in 'normalized'
df['normalized'] = df['lemmas'].apply(norm_text)
print("\nAfter handling slang:", df['normalized'].head(), sep="\n")


After handling slang:
0    [best, apps, acording, bunch, people, agree, b...
1    [pretty, good, version, game, free, lot, diffe...
2    [really, bunch, level, find, golden, egg, supe...
3    [silly, game, frustrating, lot, fun, definitel...
4    [terrific, game, pad, hour, fun, grandkids, lo...
Name: normalized, dtype: object


**Handling Numbers:**

I selected the approach to convert numbers to text as it preserves the semantic meaning of numerical information, which is often important in product reviews for ratings, quantities, or measurements. In this particular case, some reviews mention ages.

Converting numbers to words allows NLP models to process them as meaningful tokens rather than isolated digits, improving consistency and interpretability during text analysis. Keeping numbers as digits could cause models to treat them differently from words, while removing them would result in loss of information. 

**Note:** num2words library was used for this task.


In [48]:
# c) Handling numbers
def numbers_to_words(words_list):
    """Spell out the purely numeric tokens of a token list.

    Parameters
    ----------
    words_list : list of str
        Tokens of a single review.

    Returns
    -------
    list of str
        A new list of the same length in which every token made up only of
        decimal digits is replaced by ``num2words(int(token))``; all other
        tokens are kept unchanged.

    Notes
    -----
    ``str.isdecimal()`` is used rather than ``str.isdigit()``: the latter is
    also true for characters such as "²" or "①", which ``int()`` cannot parse
    and would raise ValueError.
    """
    return [
        num2words(int(token)) if token.isdecimal() else token
        for token in words_list
    ]

# Spell out the numeric tokens of the 'normalized' column
df['normalized_numbers'] = df['normalized'].apply(numbers_to_words)

# Glue each token list back into one space-separated string: the final cleaned text
df['final_text'] = df['normalized_numbers'].apply(' '.join)

# Show the first rows of the result
print(df['final_text'].head())

0    best apps acording bunch people agree bomb egg...
1    pretty good version game free lot different le...
2         really bunch level find golden egg super fun
3    silly game frustrating lot fun definitely reco...
4    terrific game pad hour fun grandkids love grea...
Name: final_text, dtype: object
