In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re


In [2]:
df = pd.read_csv("/content/drive/MyDrive/CSV_data/restaurant_reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Customer Name,Review
0,0,Binaya,Loved the pasta! Best I've had in years. 🍝😍 #F...
1,1,Rachel,The <b>service</b> was very slow... waited 40 ...
2,2,Liam,Amazing ambiance and music 🎶 #ChillVibes
3,3,Sophie,I got food poisoning after eating sushi here. 😷
4,4,Vicky,<div>Great customer service and friendly staff...


In [3]:
df_reviews = df[['Review']]
df_reviews

Unnamed: 0,Review
0,Loved the pasta! Best I've had in years. 🍝😍 #F...
1,The <b>service</b> was very slow... waited 40 ...
2,Amazing ambiance and music 🎶 #ChillVibes
3,I got food poisoning after eating sushi here. 😷
4,<div>Great customer service and friendly staff...
5,"The steak was raw in the middle, not what I as..."
6,Absolutely delicious desserts. Will come again...
7,Found a hair in my soup. 🤢 pls fix hygiene iss...
8,Tables were dirty when we arrived. Not good. :(
9,The best vegan burger I've tried! https://vega...


In [10]:
#pip install contractions
import contractions
def expand_contractions(text):
    text = contractions.fix(text)
    return text

df_reviews = df_reviews.apply(expand_contractions)
df_reviews


Unnamed: 0,Review
0,Loved the pasta! Best I have had in years. 🍝😍 ...
1,The <b>service</b> was very slow... waited 40 ...
2,Amazing ambiance and music 🎶 #ChillVibes
3,I got food poisoning after eating sushi here. 😷
4,<div>Great customer service and friendly staff...
5,"The steak was raw in the middle, not what I as..."
6,Absolutely delicious desserts. Will come again...
7,Found a hair in my soup. 🤢 pls fix hygiene iss...
8,Tables were dirty when we arrived. Not good. :(
9,The best vegan burger I have tried! https://ve...


In [11]:
#removing noise (like HTML tags, URLs, punctuation, emojis, and emoticons)
def remove_noise(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    #Remove emojis
    text = re.sub(r'[\U0001F600-\U0001F64F'  # emoticons
                  r'\U0001F300-\U0001F5FF'  # symbols & pictographs
                  r'\U0001F680-\U0001F6FF'  # transport & map symbols
                  r'\u2600-\u26FF\u2700-\u27BF]+', '', text, flags=re.UNICODE)  # Remove emojis
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

In [12]:
df_review_1 = df_reviews.apply(remove_noise)
df_review_1

Unnamed: 0,Review
0,Loved the pasta Best I have had in years Foodie
1,The service was very slow waited 40 mins for food
2,Amazing ambiance and music ChillVibes
3,I got food poisoning after eating sushi here
4,Great customer service and friendly staff
5,The steak was raw in the middle not what I ask...
6,Absolutely delicious desserts Will come again
7,Found a hair in my soup pls fix hygiene issues
8,Tables were dirty when we arrived Not good
9,The best vegan burger I have tried


In [13]:
#converting text to lower
df_review_2 = df_review_1.str.lower()
df_review_2.head()

Unnamed: 0,Review
0,loved the pasta best i have had in years foodie
1,the service was very slow waited 40 mins for food
2,amazing ambiance and music chillvibes
3,i got food poisoning after eating sushi here
4,great customer service and friendly staff


In [14]:
#spelling correction
'''
from textblob import TextBlob

def correct_spell(text):
    return str(TextBlob(text).correct())

df_review_2 = df_review_2.apply(correct_spell)
df_review_2.head()
'''
#Skipped as the text already has correct spelling also, caused more harm than help

'\nfrom textblob import TextBlob\n\ndef correct_spell(text):\n    return str(TextBlob(text).correct())\n\ndf_review_2 = df_review_2.apply(correct_spell)\ndf_review_2.head()\n'

In [15]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [16]:
#Removing stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

df_review_2 = df_review_2.apply(remove_stopwords)
df_review_2.head()

Unnamed: 0,Review
0,loved pasta best years foodie
1,service slow waited 40 mins food
2,amazing ambiance music chillvibes
3,got food poisoning eating sushi
4,great customer service friendly staff


In [17]:
#Stemming using porter stemmer
from nltk.stem import PorterStemmer

ps = PorterStemmer()

def stem_words(text):
    words = word_tokenize(text)
    stemmed_words = [ps.stem(word) for word in words]
    return ' '.join(stemmed_words)

df_review_3 = df_review_2.apply(stem_words)
df_review_3.head()

Unnamed: 0,Review
0,love pasta best year foodi
1,servic slow wait 40 min food
2,amaz ambianc music chillvib
3,got food poison eat sushi
4,great custom servic friendli staff


In [19]:
Stemmed_df = df_review_3
Stemmed_df

Unnamed: 0,Review
0,love pasta best year foodi
1,servic slow wait 40 min food
2,amaz ambianc music chillvib
3,got food poison eat sushi
4,great custom servic friendli staff
5,steak raw middl ask
6,absolut delici dessert come
7,found hair soup pl fix hygien issu
8,tabl dirti arriv good
9,best vegan burger tri


In [28]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [40]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_words(text):
  tokens = word_tokenize(text)
  pos_tags = pos_tag(tokens)
  lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
  return ' '.join(lemmatized)

df_review_4 = df_review_2.apply(lemmatize_words)

In [41]:
lemmatized_df = df_review_4
lemmatized_df

Unnamed: 0,Review
0,love pasta best year foodie
1,service slow wait 40 min food
2,amaze ambiance music chillvibes
3,get food poison eat sushi
4,great customer service friendly staff
5,steak raw middle ask
6,absolutely delicious dessert come
7,find hair soup pls fix hygiene issue
8,table dirty arrive good
9,best vegan burger try
