In [70]:
import pandas as pd
import re

In [71]:
# Data cleaning steps for Google Maps Reviews dataset

# 1) clean date columns for readability
# 2) encode categorical variables as needed (e.g. location)
# 3) rough clean up the text data (remove special characters, lowercasing, etc.)
# 4) handle missing values appropriately
# 5) save cleaned dataframe to new CSV file for analysis

In [72]:
# Load the combined reviews CSV from the interim data folder.
combined_df = pd.read_csv(filepath_or_buffer='../data/interim/all_reviews_data.csv')

In [73]:
# Print the column names, non-null counts and dtypes of the freshly loaded frame.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266 entries, 0 to 1265
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   review_id            1266 non-null   object 
 1   rating               1266 non-null   float64
 2   likes                1266 non-null   int64  
 3   date_review_scraped  1266 non-null   object 
 4   review_date          1266 non-null   object 
 5   source               1266 non-null   object 
 6   location             1266 non-null   object 
 7   review_text          848 non-null    object 
 8   owner_response_text  1209 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 89.1+ KB


In [74]:
# 1) Clean up date columns for readability
# Each column is parsed to datetime and immediately re-rendered as a
# 'YYYY/MM/DD' string, so both columns end up holding strings again.
combined_df['review_date'] = (
    pd.to_datetime(combined_df['review_date'])
    .dt.strftime('%Y/%m/%d')
)
combined_df['date_review_scraped'] = (
    pd.to_datetime(combined_df['date_review_scraped'])
    .dt.strftime('%Y/%m/%d')
)

In [75]:
# 2) encode categorical variables as needed (e.g. location)
# IDs start at 1 and are handed out in the order the distinct locations
# are returned by .unique().
unique_locations = combined_df['location'].unique()
location_mapping = dict(zip(unique_locations, range(1, len(unique_locations) + 1)))
combined_df['location_id'] = combined_df['location'].map(location_mapping)

In [76]:
# 3) rough clean up the text data (lowercasing and whitespace normalization; punctuation and other special characters are left as-is)
def clean_text(text):
    """Lowercase a piece of text and normalise its whitespace.

    Parameters
    ----------
    text : str or scalar
        Raw text. Missing values (None, NaN, pd.NA) are passed through
        unchanged. Any other non-string scalar (e.g. a number) is converted
        with ``str()`` first instead of raising ``AttributeError``.

    Returns
    -------
    str or missing value
        The lowercased text with every run of whitespace (spaces, tabs,
        newlines, carriage returns, ...) collapsed to a single space and
        leading/trailing whitespace removed; or the original missing value.
    """
    if pd.isna(text):
        return text  # Return as is if missing

    if not isinstance(text, str):
        text = str(text)  # tolerate numeric cells in a mixed-type column

    # `\s+` already matches \n, \r and \t, so a single pass both replaces
    # line breaks/tabs and collapses repeated spaces.
    return re.sub(r'\s+', ' ', text.lower()).strip()


# Run the text clean-up element-wise over both free-text columns.
combined_df['review_text'] = combined_df['review_text'].map(clean_text)
combined_df['owner_response_text'] = combined_df['owner_response_text'].map(clean_text)


In [77]:
# Preview the first five rows after cleaning.
combined_df.head(n=5)

Unnamed: 0,review_id,rating,likes,date_review_scraped,review_date,source,location,review_text,owner_response_text,location_id
0,Ci9DQUlRQUNvZENodHljRjlvT25jdFNHdDJTamhUYlhkRU...,1.0,0,2025/12/16,2025/08/18,Google Maps,"12407 N Mopac Expy #125, Austin, TX 78758",i brought my son to this location today for a ...,"hi divya, thank you for sharing your experienc...",1
1,Ci9DQUlRQUNvZENodHljRjlvT2pCSVpGQXdWWEZyZW5OWF...,1.0,0,2025/12/16,2025/10/17,Google Maps,"12407 N Mopac Expy #125, Austin, TX 78758",9/16/25 9:30am got my hair cut at the parmer a...,"hi dale, thank you for sharing your experience...",1
2,ChZDSUhNMG9nS0VJQ0FnTURvckt6S2FBEAE,1.0,0,2025/12/16,2025/05/20,Google Maps,"12407 N Mopac Expy #125, Austin, TX 78758",worst great clips i’ve ever been to. older man...,"thank you for your feedback, arturo. we're sor...",1
3,Ci9DQUlRQUNvZENodHljRjlvT25KdGRWOTBNV3A0V1ZSQm...,5.0,0,2025/12/16,2025/10/17,Google Maps,"12407 N Mopac Expy #125, Austin, TX 78758",myra did my long hair straight cut since glenn...,"hi cheryl, thank you for your wonderful review...",1
4,ChZDSUhNMG9nS0VJQ0FnSUR2LVBiOUVREAE,1.0,1,2025/12/16,2025/01/20,Google Maps,"12407 N Mopac Expy #125, Austin, TX 78758",terrible experience at great clips i had the w...,we're sorry to hear this was your experience. ...,1


In [78]:
# Spot-check the cleaned review text of the row labelled 0.
combined_df.loc[0, 'review_text']

'i brought my son to this location today for a back-to-school haircut. there is no proper welcoming into the store too they are not showing any respect towards customers… i clearly showed reference pictures of the style we wanted. the stylist (ben)did not listen patiently, rushed through the process, and completely ignored the details we explained. the haircut looked nothing like what we asked for. my kid was so upset with the result that he cried the entire time while doing..the experience upset him so much that now he doesn’t even want to go to school tomorrow. this was a very disappointing experience, and i would not recommend this stylist or this location. i hope management addresses this so other customers don’t go through the same frustration.'

In [79]:
# Spot-check the cleaned owner response of the row labelled 0.
combined_df.loc[0, 'owner_response_text']

"hi divya, thank you for sharing your experience with us. we're sorry to hear about your visit and understand how important it is to get the right haircut, especially for back-to-school. please reach out to our customer service at www.greatclips.com/customer-service so we can see if there's a way we can make things right."

In [82]:
# Remove the raw 'location' column from the frame.
combined_df = combined_df.drop(labels=['location'], axis='columns')

In [83]:
# 5) Write the cleaned frame to the processed data folder (row index omitted).
combined_df.to_csv(
    path_or_buf='../data/processed/cleaned_combined_reviews_data.csv',
    index=False,
)