In [1]:
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
df = pd.read_csv("reviews_updated.csv")
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   536 non-null    object 
 1   stars   469 non-null    float64
 2   name    536 non-null    object 
 3   text    536 non-null    object 
 4   label   536 non-null    object 
dtypes: float64(1), object(4)
memory usage: 21.1+ KB


In [3]:
# Drop unnamed junk columns automatically
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   536 non-null    object 
 1   stars   469 non-null    float64
 2   name    536 non-null    object 
 3   text    536 non-null    object 
 4   label   536 non-null    object 
dtypes: float64(1), object(4)
memory usage: 21.1+ KB


In [4]:
# drop unnecessary columns ie "author_name" and "photo"
df = df.drop(columns=["author_name", "photo"])

KeyError: "['author_name', 'photo'] not found in axis"

In [None]:
# handle duplicates
df.duplicated().sum() # no duplicates

In [None]:
# handle noise in text column

# 1. Remove HTML tags only
df["clean_text"] = df["text"].apply(lambda x: re.sub(r"<.*?>", " ", str(x)))

# 2. Keep URLs, numbers, promo words intact
#    - Allow letters (a-zA-Z), digits (0-9), colons, slashes, dots, hyphens (for urls/phones),
#      and @ (for emails/mentions)
df["clean_text"] = df["clean_text"].str.replace(r"[^a-zA-Z0-9\s:/\.\-\@]", " ", regex=True)

# 3. Normalize spaces, lowercase
df["clean_text"] = df["clean_text"].str.lower().str.strip()
df["clean_text"] = df["clean_text"].str.replace(r"\s+", " ", regex=True)

In [None]:
# text-specific coding

# 1. Initialize stopwords
stop_words = set(stopwords.words("english"))

# 2. Keep important words for classification
negation_words = {"not", "no", "never"} # if removed, it'll change the meaning
promo_words = {"free", "discount", "sale", "voucher", "promo", "offer", "deal"}

# 3. Adjust stopword list: remove negations and promo words
custom_stopwords = stop_words - negation_words - promo_words

# 4. Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# 5. Define cleaning function
def clean_tokens(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in custom_stopwords]
    return " ".join(tokens)

# 6. Apply cleaning
df["processed_text"] = df["clean_text"].apply(clean_tokens)

In [None]:
# Drop the clean_text column
df = df.drop(columns=["clean_text"])

In [None]:
# Show the cleaned dataframe preview
df.head()

In [None]:
df.to_csv("cleaned_dataset.csv", index=False)