In [170]:
%pip install top2vec[sentence_encoders]
from top2vec import Top2Vec

zsh:1: no matches found: top2vec[sentence_encoders]
Note: you may need to restart the kernel to use updated packages.


In [171]:
import pandas as pd
import os
from dotenv import load_dotenv
import nltk
import re
import string
nltk.download('punkt') 
load_dotenv()
home_path = os.getenv('LOCAL_ENV')
google_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/processed_google_reviews.csv')

[nltk_data] Downloading package punkt to /Users/mylene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Splitting reviews up

In [172]:
google_reviews["Sentence Count"] = google_reviews["Text"].apply(lambda x: len(nltk.sent_tokenize(x)))

In [173]:
# Custom tokenization pattern excluding certain punctuation marks
pattern = r'\b\w+\b|[' + re.escape(string.punctuation.replace('.', '')) + '](?<!\.)'

# split the google reviews 
split_google_reviews = google_reviews.copy()
split_google_reviews['Sentences'] = split_google_reviews['Text'].apply(nltk.sent_tokenize)
split_google_reviews = split_google_reviews.explode('Sentences').reset_index(drop=True)


In [174]:
# Count words with custom tokenization pattern
split_google_reviews['Word Count'] = split_google_reviews['Sentences'].apply(lambda x: len(nltk.regexp_tokenize(x, pattern)))

In [175]:
pd.set_option('display.max_colwidth', None)

In [176]:
# Assign unique numeric ID to each review
split_google_reviews['Review ID'] = split_google_reviews.groupby('Sentences').ngroup()

split_google_reviews = split_google_reviews[split_google_reviews['Sentence Count'] > 1]

In [177]:
# Filter out sentences with less than 5 words
split_google_reviews = split_google_reviews[split_google_reviews['Word Count'] >= 5]

### Create SpaCy pipeline

In [178]:
import nltk
import spacy
from spacy.util import filter_spans
from spaczz.matcher import FuzzyMatcher
from spacy import matcher
from spacy.tokens import Doc
from nltk.corpus import wordnet
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = FuzzyMatcher(nlp.vocab)

### Investigate the top noisy words

In [179]:
documents = split_google_reviews['Sentences'].to_list()
model = Top2Vec(documents)
model.get_num_topics()

2023-06-18 13:39:07,124 - top2vec - INFO - Pre-processing documents for training
2023-06-18 13:39:21,060 - top2vec - INFO - Creating joint document/word embedding
2023-06-18 13:52:21,469 - top2vec - INFO - Creating lower dimension embedding of documents




2023-06-18 13:57:11,226 - top2vec - INFO - Finding dense areas of documents






2023-06-18 13:57:35,911 - top2vec - INFO - Finding topics




4269

1. Look for the top synonyms in the actual dataset.
2. use those most common words as a list and feed this to your synonyms function.
3. use this with the fuzzy matcher to remove a higher degree of noise from the dataset.

In [180]:
topic_words, word_scores, topic_nums = model.get_topics(4302)

ValueError: num_topics cannot exceed the number of topics: 4269.

In [None]:
print(topic_words)

[['helpfull' 'tidy' 'importantly' ... 'reccomend' 'recommendable' 'humor']
 ['relaxed' 'atmosphere' 'friendly' ... 'nice' 'modest' 'sober']
 ['sehr' 'ditto' 'halal' ... 'furthermore' 'phenomenal' 'sublime']
 ...
 ['slow' 'inattentive' 'disorganized' ... 'poor' 'crowded' 'gentle']
 ['notch' 'top' 'sehr' ... 'remains' 'african' 'et']
 ['decent' 'hop' 'reasonable' ... 'pretty' 'premium' 'basic']]


In [None]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["wheelchair"], num_topics=4302)

In [None]:
topic_words, word_scores, topic_scores, topic_nums = model.search_topics(keywords=["wheelchair"], num_topics=50)
for topic in topic_nums:
    model.generate_topic_wordcloud(topic)

In [None]:
# %pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.10.1
Note: you may need to restart the kernel to use updated packages.


In [235]:
relevant_docs = []
keywords = ["toilet"]
excluded_keywords = ["credit","money","€ ","price", "cents", "cent", "charge", "€", "charging", "euros", "pay", "paid", "paying", "payed", "dirty", "filthy", "stinky", "clean", "stink", "cost"]
documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=keywords, num_docs=500)

for doc, score, doc_id in zip(documents, document_scores, document_ids):
    # Convert document to lowercase and check if it contains excluded keywords
    if any(word in doc.lower() for word in excluded_keywords):
        continue  # Skip this document and move to the next one
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    relevant_docs.append((doc_id, score, doc))
    print("-----------")
    print()


Document: 79940, Score: 0.3964807987213135
-----------
Toilets without toilet paper and soap.
-----------

Document: 29464, Score: 0.3883405923843384
-----------
The toilets didn't have soap or paper towels.
-----------

Document: 386279, Score: 0.37139904499053955
-----------
The toilet is old and leaking.
-----------

Document: 158339, Score: 0.3659678101539612
-----------
(Only in the hallway to the toilet).
-----------

Document: 266475, Score: 0.36508306860923767
-----------
The men's bathrooms had no functioning lock.
-----------

Document: 313227, Score: 0.3647236227989197
-----------
The male bathroom could use a lock.
-----------

Document: 453437, Score: 0.3627600073814392
-----------
Bathroom without soap and towels, so one can't either wash its hands.
-----------

Document: 135856, Score: 0.3621731698513031
-----------
Warm, cozy, toilet.
-----------

Document: 44626, Score: 0.3620632588863373
-----------
No toilet paper for women.
-----------

Document: 90512, Score: 0.361

In [236]:
print(relevant_docs)

[(79940, 0.3964807987213135, 'Toilets without toilet paper and soap.'), (29464, 0.3883405923843384, "The toilets didn't have soap or paper towels."), (386279, 0.37139904499053955, 'The toilet is old and leaking.'), (158339, 0.3659678101539612, '(Only in the hallway to the toilet).'), (266475, 0.36508306860923767, "The men's bathrooms had no functioning lock."), (313227, 0.3647236227989197, 'The male bathroom could use a lock.'), (453437, 0.3627600073814392, "Bathroom without soap and towels, so one can't either wash its hands."), (135856, 0.3621731698513031, 'Warm, cozy, toilet.'), (44626, 0.3620632588863373, 'No toilet paper for women.'), (90512, 0.3610458970069885, 'No soap in the bathroom either.'), (313036, 0.35660603642463684, "The toilet didn't have a lock."), (201903, 0.35598379373550415, "I go to the bathroom and I can't find toilet paper, much less a sink to wash my hands .. That is, I wonder how it is possible …"), (128454, 0.35471636056900024, 'Toilet unfortunately not wheel

In [237]:
# Reset the index of the dataframe
split_google_reviews = split_google_reviews.reset_index(drop=True)


In [238]:
relevant_ids = [document[0] for document in relevant_docs]
filtered_reviews = split_google_reviews[split_google_reviews.index.isin(relevant_ids)]


In [239]:
filtered_reviews.to_csv(home_path + 'data/processed/aspect_classification_data/filtered_new_toilets_google_reviews.csv', index=False)

Join all data together

In [147]:
access_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_access_google_reviews.csv')
toilet_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_toilet_google_reviews.csv')
noise_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_noise_google_reviews.csv')
staff_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_staff_google_reviews.csv')
entrance_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_entrance_google_reviews.csv')
parking_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_parking_google_reviews.csv')
wheelchair_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_wheelchair_google_reviews.csv')
transport_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_transport_google_reviews.csv')

In [149]:
concatenated_sample = pd.concat([access_reviews, toilet_reviews, noise_reviews, staff_reviews, entrance_reviews, parking_reviews, wheelchair_reviews, transport_reviews], ignore_index=True)

In [150]:
len(concatenated_sample)

1601

In [151]:
concatenated_sample.drop_duplicates(subset=["Sentences"], inplace=True)

In [152]:
len(concatenated_sample)

1307

In [153]:
other_reviews = pd.read_csv(home_path + 'data/processed/aspect_classification_data/filtered_other_google_reviews.csv')
full_concatenated_sample = pd.concat([concatenated_sample, other_reviews], ignore_index=True)

In [157]:
full_concatenated_sample.drop_duplicates(subset=["Sentences"], inplace=True)

In [164]:
len(other_reviews)

1100

In [163]:
full_concatenated_sample.to_excel(home_path + 'data/processed/aspect_classification_data/full_concatenated_google_reviews.xlsx', index=False)