In [12]:
import pandas as pd
import re

In [13]:
df = pd.read_csv("labelreviews.csv")
df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   business_name    312 non-null    object 
 1   author_name      312 non-null    object 
 2   text             312 non-null    object 
 3   photo            312 non-null    object 
 4   rating           312 non-null    int64  
 5   rating_category  312 non-null    object 
 6   label            312 non-null    object 
 7   Unnamed: 7       0 non-null      float64
 8   Unnamed: 8       0 non-null      float64
 9   Unnamed: 9       0 non-null      float64
 10  Unnamed: 10      4 non-null      object 
dtypes: float64(3), int64(1), object(7)
memory usage: 26.9+ KB


In [14]:
# Drop unnamed junk columns automatically
df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_name    312 non-null    object
 1   author_name      312 non-null    object
 2   text             312 non-null    object
 3   photo            312 non-null    object
 4   rating           312 non-null    int64 
 5   rating_category  312 non-null    object
 6   label            312 non-null    object
dtypes: int64(1), object(6)
memory usage: 17.2+ KB


In [15]:
# drop unnecessary columns ie "author_name" and "photo"
df = df.drop(columns=["author_name", "photo"])

In [16]:
# handle duplicates
df.duplicated().sum() # no duplicates

np.int64(0)

In [17]:
# handle noise in text column

# 1. Remove HTML tags only
df["clean_text"] = df["text"].apply(lambda x: re.sub(r"<.*?>", " ", str(x)))

# 2. Keep URLs, numbers, promo words intact
#    - Allow letters (a-zA-Z), digits (0-9), colons, slashes, dots, hyphens (for urls/phones),
#      and @ (for emails/mentions)
df["clean_text"] = df["clean_text"].str.replace(r"[^a-zA-Z0-9\s:/\.\-\@]", " ", regex=True)

# 3. Normalize spaces, lowercase
df["clean_text"] = df["clean_text"].str.lower().str.strip()
df["clean_text"] = df["clean_text"].str.replace(r"\s+", " ", regex=True)

In [18]:
# text-specific coding

# 1. Initialize stopwords
stop_words = set(stopwords.words("english"))

# 2. Keep important words for classification
negation_words = {"not", "no", "never"} # if removed, it'll change the meaning
promo_words = {"free", "discount", "sale", "voucher", "promo", "offer", "deal"}

# 3. Adjust stopword list: remove negations and promo words
custom_stopwords = stop_words - negation_words - promo_words

# 4. Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# 5. Define cleaning function
def clean_tokens(text):
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in custom_stopwords]
    return " ".join(tokens)

# 6. Apply cleaning
df["processed_text"] = df["clean_text"].apply(clean_tokens)

In [19]:
# Drop the clean_text column
df = df.drop(columns=["clean_text"])

In [20]:
# Show the cleaned dataframe preview
df.head()

Unnamed: 0,business_name,text,rating,rating_category,label,processed_text
0,Abidin Tantuni,It was too oily and the place had problems wit...,2,taste,Clean Review,oily place problem cleanliness.
1,Abidin Tantuni,It was too spicy on my first try; but after a ...,4,outdoor_atmosphere,Clean Review,spicy first try bite got used it. taste really...
2,Abidin Tantuni,it tasted so good that I even ordered and ate ...,5,indoor_atmosphere,Clean Review,tasted good even ordered ate second one
3,Abidin Tantuni,Tantuni is delicious; service is fast; service...,5,indoor_atmosphere,Clean Review,tantuni delicious service fast service quality...
4,Ahsap Iskender,We wanted to try this place as it is famous. W...,3,outdoor_atmosphere,Clean Review,wanted try place famous. paid 800 lira 2 peopl...


In [21]:
df.to_csv("cleaned_dataset.csv", index=False)