In [1]:
import pandas as pd
import os
from dotenv import load_dotenv
import nltk
import string
import re
nltk.download('punkt') 
load_dotenv()
home_path = os.getenv('LOCAL_ENV')
google_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/processed_google_reviews.csv')

[nltk_data] Downloading package punkt to /Users/mylene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
import numpy as np

We want to analyse at the sentence level so what we will do is:
1. count the amount of sentences per review.
2. split the reviews in the sentence count column that have more than one sentence.
3. count the amount of words in each review sentence.
4. remove reviews that are less than 5 words.
5. create a pattern for the fuzzy matcher 
6. get the synonyms for the words in that matcher



In [2]:
google_reviews["Sentence Count"] = google_reviews["Text"].apply(lambda x: len(nltk.sent_tokenize(x)))

split the reviews in the sentence count column that have more than one sentence.

In [3]:
# Custom tokenization pattern excluding certain punctuation marks
pattern = r'\b\w+\b|[' + re.escape(string.punctuation.replace('.', '')) + '](?<!\.)'

# split the google reviews 
split_google_reviews = google_reviews.copy()
split_google_reviews['Sentences'] = split_google_reviews['Text'].apply(nltk.sent_tokenize)
split_google_reviews = split_google_reviews.explode('Sentences').reset_index(drop=True)


In [4]:
# Count words with custom tokenization pattern
split_google_reviews['Word Count'] = split_google_reviews['Sentences'].apply(lambda x: len(nltk.regexp_tokenize(x, pattern)))

In [5]:
pd.set_option('display.max_colwidth', None)

In [6]:
# Assign unique numeric ID to each review
split_google_reviews['Review ID'] = split_google_reviews.groupby('Sentences').ngroup()

split_google_reviews = split_google_reviews[split_google_reviews['Sentence Count'] > 1]

In [7]:
# Filter out sentences with less than 5 words
split_google_reviews = split_google_reviews[split_google_reviews['Word Count'] >= 5]

In [8]:
split_google_reviews

Unnamed: 0.1,Unnamed: 0,Name,Sentiment,Text,Sentence Count,Sentences,Word Count,Review ID
0,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case!",24,360688
1,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,After a little while the place was cozily busy.,9,152807
3,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,"The burgers (and nachos) were lovely, as was the staff.",13,501320
4,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,I would definitely recommend this place if you’re around and in the need of a good burger.,18,342036
5,1,Ellis,positive,Nice cozy place which serves very tasty burgers! The staff is friendly and you are served pretty quickly. They have a good selection of burgers and other goodies. The best thing is you can even get a gluten-free bun for no additional cost. …,5,Nice cozy place which serves very tasty burgers!,9,402895
...,...,...,...,...,...,...,...,...
894092,575689,Amstelhoeck,positive,Located next to the Stopera on the Amstel.\n\n …,2,Located next to the Stopera on the Amstel.,8,65897
894111,575707,Amstelhoeck,negative,To say the least so-so!\n\n\nМягко говоря так себе!,2,To say the least so-so!,8,109168
894112,575707,Amstelhoeck,negative,To say the least so-so!\n\n\nМягко говоря так себе!,2,Мягко говоря так себе!,5,644131
894113,575708,Amstelhoeck,positive,It is advisable to reserve a table.\n\n …,2,It is advisable to reserve a table.,7,62105


In [9]:
import nltk
import spacy
from spacy.util import filter_spans
from spaczz.matcher import FuzzyMatcher
from spacy import matcher
from spacy.tokens import Doc
from nltk.corpus import wordnet
from spacy.tokens import Span


2023-05-25 18:21:29.189668: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)

Use the Fuzzy matcher from spaczz and phrase matcher to look for the synonyms related to my previous regex expressipn.

In [11]:
#'breakfast', 'lunch', 'dinner', 'alcohol'
word_patterns = ['food', 'drink', 'lunch', 'breakfast', 'dinner', 'alcohol', 'beer', 'wine','pancakes', 'drink', 'desserts', 'gin', 'wine', 'breakfast', 'lunch', 'pasta',
                 'vegeterian', 'vegan', 'burgers', 'pasta', 'dish', 'beer', 'pizza', 'taste',
                 'food', 'cocktail', 'coffee', 'menu', 'tasty', 'delicious', 'staff', 'host',
                 'ambience', 'atmosphere', 'cozy', 'gezellig', 'service', 'pricey', 'cheap',
                 'nice place', 'great place', 'amazing place', 'good place', 'bad place',
                 'terrible place', 'great experience', 'chicken', 'burger']


In [12]:
def get_synonyms(word):
    synonyms = []
    synsets = wordnet.synsets(word)
    for synset in synsets:
        for lemma in synset.lemmas():
            synonyms.append(lemma.name())
    return list(set(synonyms))

In [13]:
def has_synset(word):
    synsets = wordnet.synsets(word)
    return len(synsets) > 0

In [14]:
patterns = []
for word in word_patterns:
    synonyms = get_synonyms(word)
    pattern = nlp(" ".join([word] + synonyms))
    patterns.append(pattern)


In [15]:
fuzzy_matcher = FuzzyMatcher(nlp.vocab)
fuzzy_matcher.add("FOOD_PATTERN", patterns)

In [16]:
@spacy.Language.component("filter_noisy_tokens")
def filter_noisy_tokens(doc):
    noisy_tokens = []
    matches = fuzzy_matcher(doc)
    spans = [Span(doc, start, end) for _, start, end, _, _ in matches]
    filtered_spans = filter_spans(spans)

    with doc.retokenize() as retokenizer:
        for span in filtered_spans:
            retokenizer.merge(span)

    for span in filtered_spans:
        noisy_tokens.extend(range(span.start, span.end))

    words = [token.text for token in doc if token.i not in noisy_tokens]
    doc = Doc(doc.vocab, words=words)

    return doc



In [17]:
# Register max_token_index as an extension
Doc.set_extension("max_token_index", default=-1, force=True)

In [18]:
nlp.add_pipe("filter_noisy_tokens", last=True)



<function __main__.filter_noisy_tokens(doc)>

In [19]:
# clean_reviews = []
# max_token_index = -1  # Initialize max_token_index

# for _, row in split_google_reviews.iterrows():
#     text = row['Sentences']
#     doc = nlp(text)

#     # Check if the document has any tokens after noise filtering
#     if len(doc) > 0:
#         # Ensure that doc._.max_token_index is a valid index before comparing it to max_token_index
#         if doc._.max_token_index < len(doc):
#             max_token_index = max(max_token_index, doc._.max_token_index)
#             clean_reviews.append(row)

# clean_split_google_reviews = pd.DataFrame(clean_reviews)

noisy_reviews = []

for _, row in split_google_reviews.iterrows():
    text = row['Sentences']
    doc = nlp(text)

    # Check if any matches were detected in the document
    if len(fuzzy_matcher(doc)) > 0:
        noisy_reviews.append(row)

noisy_split_google_reviews = pd.DataFrame(noisy_reviews)

# Drop noisy rows from original dataframe
clean_split_google_reviews = split_google_reviews.drop(noisy_split_google_reviews.index)



In [32]:
clean_split_google_reviews.to_excel(home_path+'data/processed/aspect_classification_data/sample.xlsx')

In [34]:
aspects_patterns = ['Wheelchair', 'Access', 'Staff', 'Toilets', 'Transport & Parking', 'Atmosphere', 'Overview', 'Noise levels']

synonyms = {word: get_synonyms(word) for word in aspects_patterns}

# Create a set of all your keywords and their synonyms for efficient lookup
keywords = set(aspects_patterns)
for word, synonym_list in synonyms.items():
    for synonym in synonym_list:
        keywords.add(synonym)

# Define a function to check if a review contains any keyword
def contains_keyword(review):
    return any(keyword in review for keyword in keywords)

# Create patterns with both the word and its synonyms
synonym_aspect_patterns = [nlp(word) for word in keywords]

# Add patterns to the fuzzy matcher
fuzzy_matcher.add("RELEVANT_PATTERN", synonym_aspect_patterns)

# Filter reviews to only include those that contain a keyword
relevant_reviews = []
for _, row in split_google_reviews.iterrows():
    if contains_keyword(row['Sentences']):
        relevant_reviews.append(row)
relevant_split_google_reviews = pd.DataFrame(relevant_reviews)


In [36]:
relevant_split_google_reviews.to_excel(home_path+'data/processed/aspect_classification_data/better_sample.xlsx')

In [37]:
relevant_split_google_reviews

Unnamed: 0.1,Unnamed: 0,Name,Sentiment,Text,Sentence Count,Sentences,Word Count,Review ID
3,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,"The burgers (and nachos) were lovely, as was the staff.",13,501320
6,1,Ellis,positive,Nice cozy place which serves very tasty burgers! The staff is friendly and you are served pretty quickly. They have a good selection of burgers and other goodies. The best thing is you can even get a gluten-free bun for no additional cost. …,5,The staff is friendly and you are served pretty quickly.,10,537486
8,1,Ellis,positive,Nice cozy place which serves very tasty burgers! The staff is friendly and you are served pretty quickly. They have a good selection of burgers and other goodies. The best thing is you can even get a gluten-free bun for no additional cost. …,5,The best thing is you can even get a gluten-free bun for no additional cost.,17,500015
19,4,Ellis,positive,I had a very nice experience! The staff were really nice and were kind to switch the buns for iceberg lettuce (making it Keto friendly). Would highly recommend if looking for a keto friendly place in Amsterdam.,3,The staff were really nice and were kind to switch the buns for iceberg lettuce (making it Keto friendly).,21,539366
35,9,Ellis,positive,Really cosy. Has an actual fireplace. Great food and friendly staff. No problems at all.,4,Great food and friendly staff.,5,291011
...,...,...,...,...,...,...,...,...
893980,575613,Amstelhoeck,negative,Bad food and paying for the toilet!\n\n …,2,Bad food and paying for the toilet!,8,10200
893985,575615,Amstelhoeck,positive,Good views over the canal. Food type rations and combined. …,3,Good views over the canal.,5,46656
893988,575616,Amstelhoeck,positive,Beautifully situated restaurant on Amstel with terrace.\n\n …,2,Beautifully situated restaurant on Amstel with terrace.,7,11888
894050,575657,Amstelhoeck,negative,Paper cups in the restaurant?\n\n\nPappbecher im Restaurant?,2,Paper cups in the restaurant?,6,83133


In [57]:
def contains_keyword(review):
    return any(keyword in review for keyword in keywords)

relevant_split_google_reviews['relevant'] = relevant_split_google_reviews['Sentences'].apply(contains_keyword)


In [71]:
split_google_reviews['relevant'] = split_google_reviews['Sentences'].apply(contains_keyword)

In [60]:
relevant_split_google_reviews

Unnamed: 0.1,Unnamed: 0,Name,Sentiment,Text,Sentence Count,Sentences,Word Count,Review ID,is_relevant,relevant
3,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,"The burgers (and nachos) were lovely, as was the staff.",13,501320,True,True
6,1,Ellis,positive,Nice cozy place which serves very tasty burgers! The staff is friendly and you are served pretty quickly. They have a good selection of burgers and other goodies. The best thing is you can even get a gluten-free bun for no additional cost. …,5,The staff is friendly and you are served pretty quickly.,10,537486,True,True
8,1,Ellis,positive,Nice cozy place which serves very tasty burgers! The staff is friendly and you are served pretty quickly. They have a good selection of burgers and other goodies. The best thing is you can even get a gluten-free bun for no additional cost. …,5,The best thing is you can even get a gluten-free bun for no additional cost.,17,500015,True,True
19,4,Ellis,positive,I had a very nice experience! The staff were really nice and were kind to switch the buns for iceberg lettuce (making it Keto friendly). Would highly recommend if looking for a keto friendly place in Amsterdam.,3,The staff were really nice and were kind to switch the buns for iceberg lettuce (making it Keto friendly).,21,539366,True,True
35,9,Ellis,positive,Really cosy. Has an actual fireplace. Great food and friendly staff. No problems at all.,4,Great food and friendly staff.,5,291011,True,True
...,...,...,...,...,...,...,...,...,...,...
893980,575613,Amstelhoeck,negative,Bad food and paying for the toilet!\n\n …,2,Bad food and paying for the toilet!,8,10200,True,True
893985,575615,Amstelhoeck,positive,Good views over the canal. Food type rations and combined. …,3,Good views over the canal.,5,46656,True,True
893988,575616,Amstelhoeck,positive,Beautifully situated restaurant on Amstel with terrace.\n\n …,2,Beautifully situated restaurant on Amstel with terrace.,7,11888,True,True
894050,575657,Amstelhoeck,negative,Paper cups in the restaurant?\n\n\nPappbecher im Restaurant?,2,Paper cups in the restaurant?,6,83133,True,True


In [74]:
sample_size = 2000  # Desired sample size

# Calculate the number of samples needed from each DataFrame
num_samples_relevant = int(sample_size * len(relevant_split_google_reviews) / len(split_google_reviews))
num_samples_split = sample_size - num_samples_relevant

# Take stratified samples from relevant_split_google_reviews
stratified_samples_relevant = relevant_split_google_reviews.groupby('relevant').apply(lambda x: x.sample(n=num_samples_relevant, random_state=1)).reset_index(drop=True)

# Exclude overlapping rows from split_google_reviews
split_google_reviews_excluded = split_google_reviews[~split_google_reviews.index.isin(stratified_samples_relevant.index)]

# Take stratified samples from split_google_reviews_excluded
stratified_samples_split = split_google_reviews_excluded.groupby('relevant').apply(lambda x: x.sample(n=num_samples_split, random_state=1)).reset_index(drop=True)

# Concatenate the samples from both DataFrames
stratified_sample = pd.concat([stratified_samples_split, stratified_samples_relevant])


In [75]:
stratified_sample

Unnamed: 0.1,Unnamed: 0,Name,Sentiment,Text,Sentence Count,Sentences,Word Count,Review ID,is_relevant,relevant
0,24872,Eastwood Coffeeshop,positive,"Good shop. Bit cold staff at times but still friendly. The weed stinks in the wind for an hour. A lot is very expensive, but there are affordable options. …",5,"A lot is very expensive, but there are affordable options.",11,145119,False,False
1,463901,Dolphins Coffeeshop,positive,"Beautiful, spotlessly clean, organized and comfortable. Relatively spacious, it has perhaps the only room in Amsterdam for smoking tobacco. A little bit iron on the rules, but it's all there. There lives a splendid …",4,"A little bit iron on the rules, but it's all there.",14,144518,False,False
2,97944,Chao Phraya,positive,"Chao Phraya River is one of long River in Thailand. Start from Nakorn Sawan district until Samut Prakarn district then out to gulf of Thailand. If you traveling to Bangkok, you can take the boat for sightseeing along the river. I suggest you for dinner on the ship.",4,Chao Phraya River is one of long River in Thailand.,10,204912,False,False
3,264449,Bosco,positive,Yummy!!!!\nDelicious !!!!\nGood coffee !!! …,4,Delicious !!!!,5,219700,False,False
4,468841,Kantjil & De Tijger,positive,The food is very good. We ordered a menu consisting of 3 parts for € 11.50. The three dishes are served together in the same bowl. We liked it a lot. Good quality price …,5,The food is very good.,5,103134,False,False
...,...,...,...,...,...,...,...,...,...,...
407,529180,Geisha,positive,"One of the best sushi restaurants I've ever been to, the fish was consistently amazing. All dishes, both the fish and the dim sum, were just super tasty. The ambience is coherent, but the light could be a little …",3,"The ambience is coherent, but the light could be a little …",12,495422,True,True
408,485770,Black and Blue,negative,"Steak really nice. Burger was burned, yet replaced in no time with a good one. Staff is a little stressed- the owner has each and every centimeter in this restaurant covered with tables. Result is, it gets very loud, feels extremely cramped and one nearly gets claustrophobia. Since the food is nice: COULD be angeraten place",5,Staff is a little stressed- the owner has each and every centimeter in this restaurant covered with tables.,19,476874,True,True
409,121632,Genroku,positive,No.1 from Adam for years! The staff is also very friendly. 😍 …,3,The staff is also very friendly.,6,537247,True,True
410,14755,Big Shots,positive,"Good music, you can drink and consume what you take from the coffeshop. For mixed visitor groups, who only like alcohol and others want to smoke, it is the ideal place …",2,"Good music, you can drink and consume what you take from the coffeshop.",14,44686,True,True


I still need to think about how relevant the relevant_google_reviews dataframe really is but i combined it with the original to keep the noise.