In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK corpora this notebook relies on; nltk reports a package as
# up-to-date when it is already present locally.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Read the scraped reviews into a DataFrame
df = pd.read_csv('scraped_reviews.csv')

# Column names, dtypes and non-null counts
print("Initial Data Info")
df.info()

# Number of missing entries in each column
print("\nMissing Values")
print(df.isna().sum())

# Preview of the raw rows; kept as the cell's last expression so the notebook
# renders it as a table.
print("\nFirst 5 Rows")
df.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gagan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gagan\AppData\Roaming\nltk_data...


--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Cafe_URL       150 non-null    object
 1   Reviewer_Name  150 non-null    object
 2   Rating         150 non-null    object
 3   Review_Text    150 non-null    object
dtypes: object(4)
memory usage: 4.8+ KB

--- Missing Values ---
Cafe_URL         0
Reviewer_Name    0
Rating           0
Review_Text      0
dtype: int64

--- First 5 Rows ---


[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Cafe_URL,Reviewer_Name,Rating,Review_Text
0,https://www.google.com/maps/place/Sri+Sri+Cafe...,Akash Raj,5 stars,Had a great experience at Sri Sri Cafe! The at...
1,https://www.google.com/maps/place/Sri+Sri+Cafe...,Simran Fathima,5 stars,Pizza at its best🍕as said as is💖! From service...
2,https://www.google.com/maps/place/Sri+Sri+Cafe...,Uday,5 stars,The Unlimited Gujrati Thali was amazing and su...
3,https://www.google.com/maps/place/Sri+Sri+Cafe...,Amrita Chattopadhyay,5 stars,The food is delicious. The place is clean. The...
4,https://www.google.com/maps/place/Sri+Sri+Cafe...,Dharam Hinduja,5 stars,This is one of those places that we discovered...


In [None]:
def extract_rating_number(rating_text):
    """Return the numeric rating as an int, or None when none can be found.

    Parameters
    ----------
    rating_text : str, int, float or None
        Raw rating value. Strings such as "5 stars" are searched for their
        first run of digits. Plain ints and floats are accepted as-is
        (floats are truncated towards zero), so a 'Rating' column that pandas
        already parsed as numeric is not silently turned into missing values.

    Returns
    -------
    int or None
        The parsed rating, or None for missing values (None, NaN), booleans,
        infinities, strings without digits, and any other type.
    """
    # bool is a subclass of int, but True/False are never ratings.
    if isinstance(rating_text, bool):
        return None

    if isinstance(rating_text, (int, float)):
        # NaN is the only value that differs from itself; infinities cannot be
        # converted with int(). Neither represents a rating.
        if rating_text != rating_text or abs(rating_text) == float('inf'):
            return None
        return int(rating_text)

    if isinstance(rating_text, str):
        # Using regex to find the first number in the string
        match = re.search(r'\d+', rating_text)
        if match:
            return int(match.group(0))

    return None  # no number could be extracted

# Parse each raw rating into its numeric value and store it in a new column
df['Rating_Clean'] = df['Rating'].map(extract_rating_number)

# Spot-check: raw rating next to the parsed number for the first rows
print(df.loc[:, ['Rating', 'Rating_Clean']].head(5))

    Rating  Rating_Clean
0  5 stars             5
1  5 stars             5
2  5 stars             5
3  5 stars             5
4  5 stars             5


In [None]:
# Shared text-cleaning helpers, built once and reused for every review:
# the English stop-word set and a WordNet lemmatizer.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one review into a space-separated string of lemmatized words.

    Steps: lowercase, turn every character that is not a-z or whitespace into
    a space, drop stop words, lemmatize each remaining word.

    Uses the module-level `stop_words` set and `lemmatizer` object.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        is treated as having no text.

    Returns
    -------
    str
        The cleaned text; an empty string when the input is not a string or
        nothing survives cleaning.
    """
    if not isinstance(text, str):
        return ""  # Returning empty string for non-text data

    text = text.lower()

    # Replace (rather than delete) punctuation, digits, emoji, etc. with a
    # space. Deleting them glues neighbouring words together whenever the
    # removed character was the only separator, e.g. "best🍕as" -> "bestas"
    # or "good.the" -> "goodthe", producing junk tokens.
    # NOTE(review): contractions now split ("don't" -> "don", "t"); NLTK's
    # English stop-word list is expected to contain such fragments - confirm.
    text = re.sub(r'[^a-z\s]', ' ', text)

    # split() with no argument also collapses the extra whitespace created above
    words = [word for word in text.split() if word not in stop_words]

    # Lemmatization to get the root form of words
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# Run clean_text over every review and keep the result next to the original
df['Review_Text_Clean'] = df['Review_Text'].map(clean_text)

# Compare raw and cleaned text for the first rows
print(df.loc[:, ['Review_Text', 'Review_Text_Clean']].head(5))

                                         Review_Text  \
0  Had a great experience at Sri Sri Cafe! The at...   
1  Pizza at its best🍕as said as is💖! From service...   
2  The Unlimited Gujrati Thali was amazing and su...   
3  The food is delicious. The place is clean. The...   
4  This is one of those places that we discovered...   

                                   Review_Text_Clean  
0  great experience sri sri cafe atmosphere pleas...  
1  pizza bestas said service hygiene taste price ...  
2  unlimited gujrati thali amazing super deliciou...  
3  food delicious place clean staff member good o...  
4  one place discovered chance delighted result l...  


In [None]:
# Keep only rows that have both the original review text and a parsed rating.
# One non-inplace dropna over both columns filters the same rows as two
# successive in-place calls; `df` is rebound so it still names the filtered frame.
df = df.dropna(subset=['Review_Text', 'Rating_Clean'])

# Select the final columns and give the cleaned ones their public names.
# The rating is cast to int64 explicitly: if any rating failed to parse, the
# column is no longer an integer column even after the missing rows are
# dropped, and ratings could be written to the CSV as e.g. 5.0 instead of 5.
df_clean = (
    df[['Cafe_URL', 'Reviewer_Name', 'Rating_Clean', 'Review_Text_Clean']]
    .rename(columns={
        'Rating_Clean': 'Rating',
        'Review_Text_Clean': 'Review_Text'
    })
    .astype({'Rating': 'int64'})
)

# Saving to a new CSV file (the index is not written)
df_clean.to_csv('cleaned_reviews.csv', index=False)

print("\n--- Final Clean Data Info ---")
df_clean.info()

print("\nCleaning complete! Data saved to 'cleaned_reviews.csv'")
# Last expression of the cell so the notebook renders the preview as a table
df_clean.head()


--- Final Clean Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Cafe_URL       150 non-null    object
 1   Reviewer_Name  150 non-null    object
 2   Rating         150 non-null    int64 
 3   Review_Text    150 non-null    object
dtypes: int64(1), object(3)
memory usage: 4.8+ KB

Cleaning complete! Data saved to 'cleaned_reviews.csv'


Unnamed: 0,Cafe_URL,Reviewer_Name,Rating,Review_Text
0,https://www.google.com/maps/place/Sri+Sri+Cafe...,Akash Raj,5,great experience sri sri cafe atmosphere pleas...
1,https://www.google.com/maps/place/Sri+Sri+Cafe...,Simran Fathima,5,pizza bestas said service hygiene taste price ...
2,https://www.google.com/maps/place/Sri+Sri+Cafe...,Uday,5,unlimited gujrati thali amazing super deliciou...
3,https://www.google.com/maps/place/Sri+Sri+Cafe...,Amrita Chattopadhyay,5,food delicious place clean staff member good o...
4,https://www.google.com/maps/place/Sri+Sri+Cafe...,Dharam Hinduja,5,one place discovered chance delighted result l...
