In [79]:
import pandas as pd
import os
from dotenv import load_dotenv
import nltk
import re
import string
nltk.download('punkt') 
load_dotenv()
home_path = os.getenv('LOCAL_ENV')
google_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/processed_google_reviews.csv')

[nltk_data] Downloading package punkt to /Users/mylene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


We want to analyse at the sentence level so what we will do is:
1. count the amount of sentences per review.
2. split the reviews in the sentence count column that have more than one sentence.
3. count the amount of words in each review sentence.
4. remove reviews that are less than 5 words.
5. create a pattern for the fuzzy matcher 
6. get the synonyms for the words in that matcher



In [80]:
google_reviews["Sentence Count"] = google_reviews["Text"].apply(lambda x: len(nltk.sent_tokenize(x)))

In [81]:
google_reviews

Unnamed: 0.1,Unnamed: 0,Name,Sentiment,Text,Sentence Count
0,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5
1,1,Ellis,positive,Nice cozy place which serves very tasty burgers! The staff is friendly and you are served pretty quickly. They have a good selection of burgers and other goodies. The best thing is you can even get a gluten-free bun for no additional cost. …,5
2,2,Ellis,positive,Really nice place. One of my favourite burger joints whole visiting Amsterdam. Reasonably priced and good quality food. 40euros for 2 burgers and 2 drinks,4
3,3,Ellis,negative,"The Service was quite good but the burgers we ordered were small for the price we paid for. The classic burger was okay but a bit too cold. The Portobello burger says ""grilled vegetables"" but it had almost no vegetables on it. The Burger …",4
4,4,Ellis,positive,I had a very nice experience! The staff were really nice and were kind to switch the buns for iceberg lettuce (making it Keto friendly). Would highly recommend if looking for a keto friendly place in Amsterdam.,3
...,...,...,...,...,...
399718,575747,Amstelhoeck,positive,In a word super\n\n\nEn un mot super,1
399719,575748,Amstelhoeck,negative,Recommended\n\n\nEmpfehlenswert,1
399720,575749,Amstelhoeck,positive,Location location location\n\n\nLokatie lokatie lokatie,1
399721,575750,Amstelhoeck,negative,Nice moment\n\n\nNice moment,1


split the reviews in the sentence count column that have more than one sentence.

In [82]:
# Custom tokenization pattern excluding certain punctuation marks
pattern = r'\b\w+\b|[' + re.escape(string.punctuation.replace('.', '')) + '](?<!\.)'

# split the google reviews 
split_google_reviews = google_reviews.copy()
split_google_reviews['Sentences'] = split_google_reviews['Text'].apply(nltk.sent_tokenize)
split_google_reviews = split_google_reviews.explode('Sentences').reset_index(drop=True)


In [83]:
# Count words with custom tokenization pattern
split_google_reviews['Word Count'] = split_google_reviews['Sentences'].apply(lambda x: len(nltk.regexp_tokenize(x, pattern)))

In [84]:
pd.set_option('display.max_colwidth', None)

In [85]:
# Assign unique numeric ID to each review
split_google_reviews['Review ID'] = split_google_reviews.groupby('Sentences').ngroup()

split_google_reviews = split_google_reviews[split_google_reviews['Sentence Count'] > 1]

In [86]:
# Filter out sentences with less than 5 words
split_google_reviews = split_google_reviews[split_google_reviews['Word Count'] >= 5]

In [87]:
split_google_reviews

Unnamed: 0.1,Unnamed: 0,Name,Sentiment,Text,Sentence Count,Sentences,Word Count,Review ID
0,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case!",24,360688
1,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,After a little while the place was cozily busy.,9,152807
3,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,"The burgers (and nachos) were lovely, as was the staff.",13,501320
4,0,Ellis,positive,"It was a bit quite when we went in, but don’t let that fool you if it’s ever the case! After a little while the place was cozily busy. Rightfully so! The burgers (and nachos) were lovely, as was the staff. I would definitely recommend this place if you’re around and in the need of a good burger.",5,I would definitely recommend this place if you’re around and in the need of a good burger.,18,342036
5,1,Ellis,positive,Nice cozy place which serves very tasty burgers! The staff is friendly and you are served pretty quickly. They have a good selection of burgers and other goodies. The best thing is you can even get a gluten-free bun for no additional cost. …,5,Nice cozy place which serves very tasty burgers!,9,402895
...,...,...,...,...,...,...,...,...
894092,575689,Amstelhoeck,positive,Located next to the Stopera on the Amstel.\n\n …,2,Located next to the Stopera on the Amstel.,8,65897
894111,575707,Amstelhoeck,negative,To say the least so-so!\n\n\nМягко говоря так себе!,2,To say the least so-so!,8,109168
894112,575707,Amstelhoeck,negative,To say the least so-so!\n\n\nМягко говоря так себе!,2,Мягко говоря так себе!,5,644131
894113,575708,Amstelhoeck,positive,It is advisable to reserve a table.\n\n …,2,It is advisable to reserve a table.,7,62105


In [88]:
import nltk
import spacy
from spacy.util import filter_spans
from spaczz.matcher import FuzzyMatcher
from spacy import matcher
from spacy.tokens import Doc
from nltk.corpus import wordnet
from spacy.tokens import Span


In [89]:
nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)

Use the Fuzzy matcher from spaczz and phrase matcher to look for the synonyms related to my previous regex expressipn.

In [90]:
#'breakfast', 'lunch', 'dinner', 'alcohol'
word_patterns = ['food', 'drink', 'lunch', 'breakfast', 'dinner', 'alcohol', 'beer', 'wine','pancakes', 'drink', 'desserts', 'gin', 'wine', 'breakfast', 'lunch', 'pasta',
                 'vegeterian', 'vegan', 'burgers', 'pasta', 'dish', 'beer', 'pizza', 'taste',
                 'food', 'cocktail', 'coffee', 'menu', 'tasty', 'delicious', 'staff', 'host',
                 'ambience', 'atmosphere', 'cozy', 'gezellig', 'service', 'pricey', 'cheap',
                 'nice place', 'great place', 'amazing place', 'good place', 'bad place',
                 'terrible place', 'great experience', 'chicken', 'burger', 'atmosphere']


In [91]:
def has_synset(word):
    synsets = wordnet.synsets(word)
    return len(synsets) > 0

In [92]:
def get_synonyms(word):
    synonyms = []
    synsets = wordnet.synsets(word)
    if synsets:
        for synset in synsets:
            for lemma in synset.lemmas():
                synonyms.append(lemma.name())
    return list(set(synonyms))

In [93]:
def make_pattern(word_patterns):
    patterns = []
    for word in word_patterns:
        if has_synset(word):
            synonyms = get_synonyms(word)
            pattern = [Doc(nlp.vocab, words=[word])] + [Doc(nlp.vocab, words=[synonym]) for synonym in synonyms]
            patterns.extend(pattern)
        else:
            patterns.append(Doc(nlp.vocab, words=[word]))
    return patterns


In [94]:
new_pattern = make_pattern(word_patterns)
print(new_pattern)

[food , intellectual_nourishment , food_for_thought , solid_food , food , nutrient , drink , drunkenness , toast , crapulence , beverage , tope , imbibe , wassail , pledge , boozing , fuddle , potable , drinkable , drink , drink_in , booze , drinking , deglutition , swallow , salute , lunch , tiffin , dejeuner , luncheon , lunch , breakfast , breakfast , dinner , dinner , dinner_party , alcohol , alcoholic_drink , intoxicant , alcohol , alcoholic_beverage , inebriant , beer , beer , wine , wine-coloured , wine , wine-colored , vino , pancakes , flapjack , griddlecake , hotcake , flannel-cake , flannel_cake , flapcake , pancake , hot_cake , battercake , drink , drunkenness , toast , crapulence , beverage , tope , imbibe , wassail , pledge , boozing , fuddle , potable , drinkable , drink , drink_in , booze , drinking , deglutition , swallow , salute , desserts , afters , sweet , dessert , gin , gin_rummy , noose , cotton_gin , snare , knock_rummy , gin , wine , wine-coloured , wine , win

In [95]:
print(len(new_pattern))

327


In [96]:
fuzzy_matcher = FuzzyMatcher(nlp.vocab)
fuzzy_matcher.add("FOOD_PATTERN", new_pattern)

In [104]:
def is_row_noisy(review):
    doc = nlp(review)
    matches = fuzzy_matcher(doc)
    if len(matches) > 0:
        return True
    else:
        return False
    

In [105]:
# Register max_token_index as an extension
Doc.set_extension("max_token_index", default=-1, force=True)

In [106]:
# nlp.add_pipe("is_row_noisy", last=True)

In [107]:
test_pattern = make_pattern(['Staff', 'Waitress'])
print(test_pattern)
fuzzy_matcher.add("TEST_PATTERN", test_pattern)

[Staff , stave , staff , faculty , Waitress , waitress , wait ]


In [108]:
def remove_noise_test(reviews):
    noisy_reviews = []
    # docs = [Doc(nlp.vocab, words=[review]) for review in reviews]
    for text in reviews:
        print('Before: ', text)
        matches = is_row_noisy(text)
        print('Matches: ', matches)
        
        # Check if any matches were detected in the document
        if is_row_noisy(text):
            print('Noisy Review: ', text)
            noisy_reviews.append(text)
    
    return noisy_reviews


In [109]:
test_reviews = ["The food is really delicious here."," The staff is very friendly and the waitress was very helpful. I would definitely recommend this place to my friends.", "The walls look really disgusting"]

In [110]:
remove_noise_test(test_reviews)

Before:  The food is really delicious here.
Matches:  True
Noisy Review:  The food is really delicious here.
Before:   The staff is very friendly and the waitress was very helpful. I would definitely recommend this place to my friends.
Matches:  True
Noisy Review:   The staff is very friendly and the waitress was very helpful. I would definitely recommend this place to my friends.
Before:  The walls look really disgusting
Matches:  False


['The food is really delicious here.',
 ' The staff is very friendly and the waitress was very helpful. I would definitely recommend this place to my friends.']

In [220]:
len(split_google_reviews)

483998

In [112]:
def remove_noise_better(sample_df):
    sample_df.loc[:, 'has_fuzzy_match'] = sample_df['Sentences'].apply(lambda x: is_row_noisy(x))
    matching_rows = sample_df[sample_df['has_fuzzy_match'] == True]
    noiseless_df = sample_df.drop(matching_rows.index)
    # Print the count of true and false values in the has_fuzzy_match column of the sample_df
    print("Count of true values in has_fuzzy_match column:", sample_df['has_fuzzy_match'].value_counts()[True])
    print("Count of false values in has_fuzzy_match column:", sample_df['has_fuzzy_match'].value_counts()[False])
    return noiseless_df


In [113]:
def clean_data_in_batches(df, batch_size, output_path):
    total_batches = len(df) // batch_size + 1  # Calculate the total number of batches
    
    for i in range(total_batches):
        start_idx = i * batch_size
        end_idx = (i + 1) * batch_size
        
        # Extract the current batch of data
        batch = df[start_idx:end_idx]
        
        # Clean the current batch using the remove_noise_better function
        clean_batch = remove_noise_better(batch)
        
        # Save the cleaned batch to a CSV file
        batch_output_path = output_path+'/clean_google_reviews_batch.csv'
        clean_batch.to_csv(batch_output_path, index=False)
        return clean_batch


In [228]:
clean_dataframe = clean_data_in_batches(split_google_reviews[120000:483998], 363998, home_path+'data/processed/aspect_classification_data')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df.loc[:, 'has_fuzzy_match'] = sample_df['Sentences'].apply(lambda x: is_row_noisy(x))


Count of true values in has_fuzzy_match column: 270588
Count of false values in has_fuzzy_match column: 93410


In [213]:
num_rows = clean_dataframe.shape[0]
print("Number of rows:", num_rows)

Number of rows: 7723


In [None]:
clean_dataframe

In [221]:
aspects_patterns = ['width','space','entrance','wheelchair', 'access', 'staff', 'toilets', 'transport & parking', 'overview', 'noise levels']

synonyms = {word: get_synonyms(word) for word in aspects_patterns}

# Create a set of all your keywords and their synonyms for efficient lookup
keywords = set(aspects_patterns)
for word, synonym_list in synonyms.items():
    for synonym in synonym_list:
        keywords.add(synonym)

# Define a function to check if a review contains any keyword
def contains_keyword(review):
    return any(keyword in review for keyword in keywords)

# Create patterns with both the word and its synonyms
synonym_aspect_patterns = [nlp(word) for word in keywords]

print(synonym_aspect_patterns)
# Add patterns to the fuzzy matcher
fuzzy_matcher.add("RELEVANT_PATTERN", synonym_aspect_patterns)


[capture, crapper, bewitch, sewer, infinite, entranceway, privy, trance, entryway, catch, access_code, width, charm, memory_access, blank, bathroom, becharm, overview, transport & parking, admission, lavatory, entering, ingress, space, breadth, beguile, admittance, can, blank_space, get_at, outer_space, enchant, stave, john, entry, spellbind, captivate, access, entree, toilet, toilette, commode, gutter, stool, incoming, fascinate, place, entrance, pot, noise levels, wheelchair, faculty, throne, enamour, quad, lav, enamor, distance, potty, accession, approach, staff, toilets]


In [222]:
def contains_keyword(review):
    return any(keyword in review for keyword in keywords)

clean_dataframe['relevant'] = clean_dataframe['Sentences'].apply(contains_keyword)


In [225]:
true_count = clean_dataframe['relevant'].value_counts()[True]

In [224]:
false_count = clean_dataframe['relevant'].value_counts()[False]

In [226]:
print(true_count, false_count)

128 7595


In [227]:
true_rows = clean_dataframe[clean_dataframe['relevant'] == True]
true_rows.to_csv(home_path+'data/processed/aspect_classification_data/true_rows3.csv', index=False)


In [193]:
def check_relevance(review, pattern= "RELEVANT_PATTERN"):
    doc = nlp(review)
    matches = fuzzy_matcher(doc)
    if matches != []:
        match_id = matches[0][0]
        if match_id == pattern:
            return True
    else: 
        return False

In [194]:
clean_dataframe['truly relevant'] = clean_dataframe['Sentences'].apply(lambda x: check_relevance(x))

In [None]:
clean_dataframe

In [196]:
true_count = clean_dataframe['truly relevant'].value_counts()[True]

In [197]:
print(true_count)

3931


In [198]:
true_rows['truly relevant'] = true_rows['Sentences'].apply(lambda x: check_relevance(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  true_rows['truly relevant'] = true_rows['Sentences'].apply(lambda x: check_relevance(x))
