In [1]:
# preprocessing.py

import pandas as pd
import re
import string
import nltk

In [2]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dev\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dev\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\dev\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [4]:
def clean_text(text):
    if pd.isna(text):
        return ""

    text = text.lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)

    tokens = text.split()
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
    ]

    return " ".join(tokens)

In [5]:
if __name__ == "__main__":
    df = pd.read_csv("data/reviews_clean.csv")

    # ✅ CORRECT COLUMN
    df["text_clean"] = df["text"].apply(clean_text)

    df.to_csv("data/reviews_preprocessed.csv", index=False)

    print("✅ Preprocessing done on 'text' column")

✅ Preprocessing done on 'text' column


In [6]:
df[["text", "text_clean"]].head()

Unnamed: 0,text,text_clean
0,"If you decide to eat here, just be aware it is...",decide eat aware going take hour beginning end...
1,I've taken a lot of spin classes over the year...,ive taken lot spin class year nothing compare ...
2,Family diner. Had the buffet. Eclectic assortm...,family diner buffet eclectic assortment large ...
3,"Wow! Yummy, different, delicious. Our favo...",wow yummy different delicious favorite lamb cu...
4,Cute interior and owner (?) gave us tour of up...,cute interior owner gave u tour upcoming patio...


In [7]:
df.head()

Unnamed: 0,review_id,stars,text,text_clean
0,KU_O5udG6zpxOg-VcAEodg,3.0,"If you decide to eat here, just be aware it is...",decide eat aware going take hour beginning end...
1,BiTunyQ73aT9WBnpR9DZGw,5.0,I've taken a lot of spin classes over the year...,ive taken lot spin class year nothing compare ...
2,saUsX_uimxRlCVr67Z4Jig,3.0,Family diner. Had the buffet. Eclectic assortm...,family diner buffet eclectic assortment large ...
3,AqPFMleE6RsU23_auESxiA,5.0,"Wow! Yummy, different, delicious. Our favo...",wow yummy different delicious favorite lamb cu...
4,Sx8TMOWLNuJBWer-0pcmoA,4.0,Cute interior and owner (?) gave us tour of up...,cute interior owner gave u tour upcoming patio...
