In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
import string

from nltk.stem import PorterStemmer
from stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk import word_tokenize

In [2]:
porter_stemmer = PorterStemmer()
stop_words = list(get_stop_words('en'))         #About 900 stopwords
nltk_words = list(stopwords.words('english')) #About 150 stopwords
stop_words.extend(nltk_words)

In [3]:
data = r'C:\Users\AS\Desktop\European Restaurant Reviews\European Restaurant Reviews.csv'
df = pd.read_csv(data)
df.head()

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review
0,France,The Frog at Bercy Village,Negative,Rude manager,May 2024 •,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,Negative,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...


In [4]:
df['Sentiment'] = df['Sentiment'].replace({'Negative': 0, 'Positive': 1})
df.head()

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review
0,France,The Frog at Bercy Village,0,Rude manager,May 2024 •,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,0,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,0,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,0,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,0,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...


In [5]:
df['Sentiment'].unique()

array([0, 1], dtype=int64)

In [6]:

  # [] for range
  # + one or more
  # | or
  # ^ not
  # \w word
    
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r"(^\w+\S+)", '', text)
    tokens = nltk.word_tokenize(text)
    return tokens
    
    

In [7]:
def remove_punctuation(text):
    without_punct =" ".join(i for i in text if i not in string.punctuation)
    return without_punct

In [8]:
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return filtered_tokens

In [9]:
def perform_lemmatization(tokens):
    lemma = nltk.WordNetLemmatizer()
    lemma_tokens = [lemma.lemmatize(token) for token in tokens]
    return lemma_tokens

In [10]:
def clean_text(text):
    tokens = preprocess(text)
   
    filtered_tokens = remove_stopwords(tokens)
    remove_punct = remove_punctuation(filtered_tokens) 
    lemma_tokens = perform_lemmatization(remove_punct)
    
    clean_text = ''.join(lemma_tokens)
    return clean_text
    

In [11]:
CT=[]
for i in df['Review']:
    data = clean_text(i)
    CT.append(data)
    

In [12]:
type(CT)
len(CT)

1502

In [13]:
CT

['manager became agressive said carbonara good rude bad experience worst rome many years',
 'ordered beef fillet ask done medium got well done cooked dry told took steak minutes brought another steak completely rare left steak asked charged fool price',
 'attractive venue welcoming albeit somewhat slow service offerings pleasingly presented everything taste though ingredients assembled without seasoning lovely restaurant except food',
 'used high tripadvisor rating literally fortunate eat number fine restaurants know amazing buzz mouthful fine freshly cooked food great chef never eaten tasting menu never ad hoc great ambience attentive friendly service paired wines exceptional maybe hit night chef every single course food certainly freshly cooked passed kitchen way bathroom noticed plates precooked food ready go tasting menu plates odd dish disappointed would accepted might palates liking every plate food hot bland lacking spark probably sitting around while.l serving staff great wine 

In [14]:
df['Review']

0       The manager became agressive when I said the c...
1       I ordered a beef fillet ask to be done medium,...
2       This is an attractive venue with welcoming, al...
3       Sadly I  used the high TripAdvisor rating too ...
4       From the start this meal was bad- especially g...
                              ...                        
1497    Despite the other reviews saying that this is ...
1498    beer is good.  food is awfull  The only decent...
1499    for terrible service of a truly comedic level,...
1500    We visited the Havana's Club Museum which is l...
1501    Food and service was awful. Very pretty stop. ...
Name: Review, Length: 1502, dtype: object

In [15]:
#new_colum = {'newReview': CT} 
new_colum = pd.Series(CT)

In [16]:
df['New_Review'] = new_colum

In [17]:
#df=df.drop(['newReview'],axis=1)

In [18]:
df.head()

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review,New_Review
0,France,The Frog at Bercy Village,0,Rude manager,May 2024 •,The manager became agressive when I said the c...,manager became agressive said carbonara good r...
1,France,The Frog at Bercy Village,0,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,...",ordered beef fillet ask done medium got well d...
2,France,The Frog at Bercy Village,0,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al...",attractive venue welcoming albeit somewhat slo...
3,France,The Frog at Bercy Village,0,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...,used high tripadvisor rating literally fortuna...
4,France,The Frog at Bercy Village,0,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...,start meal bad- especially given price visited...


# Create N-grams


In [19]:
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [118]:
df['New_Review'] = df['New_Review'].apply(lambda x: list(ngrams(x.split(), 2)))

# Print the first few rows of the dataset
print(df['New_Review'])

0       [(manager, became), (became, agressive), (agre...
1       [(ordered, beef), (beef, fillet), (fillet, ask...
2       [(attractive, venue), (venue, welcoming), (wel...
3       [(used, high), (high, tripadvisor), (tripadvis...
4       [(start, meal), (meal, bad-), (bad-, especiall...
                              ...                        
1497    [(reviews, saying), (saying, 'lovely), ('lovel...
1498    [(good, food), (food, awfull), (awfull, decent...
1499    [(terrible, service), (service, truly), (truly...
1500    [(visited, havana), (havana, 's), ('s, club), ...
1501    [(service, awful), (awful, pretty), (pretty, s...
Name: New_Review, Length: 1502, dtype: object


In [23]:
"""    ngram_vect = CountVectorizer(ngram_range=(2,3))
    x_count = ngram_vect.fit_transform(df['New_Review'])

    x_count_df = pd.DataFrame(x_count.toarray())

    x_count_df.columns = ngram_vect.get_feature_names_out()
    x_count_df"""

"    ngram_vect = CountVectorizer(ngram_range=(2,3))\n    x_count = ngram_vect.fit_transform(df['New_Review'])\n\n    x_count_df = pd.DataFrame(x_count.toarray())\n\n    x_count_df.columns = ngram_vect.get_feature_names_out()\n    x_count_df"

In [26]:
import unicodedata
data = unicodedata.normalize('NFKD',str(df['New_Review'])).encode('ascii','ignore').decode('utf-8','ignore')

In [49]:
vectorizer = TfidfVectorizer(ngram_range=(1, 1))

tfidf_matrix = vectorizer.fit_transform(df['New_Review'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray())
tfidf_df.columns=vectorizer.get_feature_names_out()


In [50]:
print(tfidf_df )

      _black  abbie  able  abound  abrupt  absolute  absolutely  absolutey  \
0        0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
1        0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
2        0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
3        0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
4        0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
...      ...    ...   ...     ...     ...       ...         ...        ...   
1497     0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
1498     0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
1499     0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
1500     0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   
1501     0.0    0.0   0.0     0.0     0.0       0.0         0.0        0.0   

      absolutny  absurd  ...  zoubair  zoubir  zucca  zucchini 

In [51]:
print(tfidf_matrix.shape)

(1502, 6020)


In [58]:
print(tfidf_matrix)

  (0, 5979)	0.28747670097260236
  (0, 3175)	0.20345265066227647
  (0, 4466)	0.1665996387274238
  (0, 5948)	0.32266654899739955
  (0, 1929)	0.17876529293399562
  (0, 445)	0.26590426754118046
  (0, 4492)	0.31874458633420905
  (0, 2345)	0.12161336430674521
  (0, 827)	0.2812691676912178
  (0, 4521)	0.25887731954635423
  (0, 112)	0.4332835850191821
  (0, 505)	0.33637562592417936
  (0, 3161)	0.2793420030866631
  (1, 4034)	0.1292365471286803
  (1, 2149)	0.25044549729186133
  (1, 929)	0.188941948694637
  (1, 341)	0.13506337613950428
  (1, 2985)	0.13656858275817632
  (1, 4196)	0.20928229689362357
  (1, 1146)	0.1649264966583671
  (1, 226)	0.14339118288116304
  (1, 712)	0.1637315310801496
  (1, 3312)	0.13980311997417963
  (1, 5057)	0.5747159596151418
  (1, 5417)	0.1309219387063835
  :	:
  (1500, 2659)	0.1476795401118057
  (1500, 889)	0.15395397476329023
  (1500, 60)	0.18027330499909167
  (1500, 3589)	0.20872038967051754
  (1500, 301)	0.11508577522451975
  (1500, 2596)	0.11972334920470369
  (1500,