In [None]:
%pip install top2vec[sentence_encoders]
from top2vec import Top2Vec

In [2]:
import pandas as pd
import os
from dotenv import load_dotenv
import nltk
import re
import string
# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize below.
nltk.download('punkt')

# Read environment variables from a .env file; LOCAL_ENV is expected to hold
# the project root directory.
load_dotenv()
home_path = os.getenv('LOCAL_ENV')
if home_path is None:
    # Fail early with an actionable message instead of a TypeError on None + str.
    raise EnvironmentError(
        "Environment variable LOCAL_ENV is not set; define it (e.g. in .env) as the project root directory."
    )

# os.path.join works whether or not LOCAL_ENV ends with a path separator.
google_reviews = pd.read_csv(
    os.path.join(home_path, 'data/processed/aspect_classification_data/processed_google_reviews.csv')
)

[nltk_data] Downloading package punkt to /Users/mylene/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Splitting reviews up

In [3]:
# Number of sentences in each review, as detected by NLTK's sentence tokenizer.
google_reviews["Sentence Count"] = (
    google_reviews["Text"]
    .map(nltk.sent_tokenize)
    .map(len)
)

In [4]:
# Custom tokenization pattern: a token is either a run of word characters or a
# single punctuation character other than '.' (periods are never tokens).
# Both literals are raw strings so that `\.` reaches the regex engine as written
# instead of being an invalid string escape (SyntaxWarning on Python 3.12+).
pattern = r'\b\w+\b|[' + re.escape(string.punctuation.replace('.', '')) + r'](?<!\.)'

# Split the Google reviews into sentences: tokenize each review's text, then
# explode so every sentence gets its own row (other columns are repeated).
# `assign` returns a new frame, so `google_reviews` itself is left untouched.
split_google_reviews = (
    google_reviews
    .assign(Sentences=google_reviews['Text'].apply(nltk.sent_tokenize))
    .explode('Sentences')
    .reset_index(drop=True)
)


In [5]:
def _count_tokens(sentence):
    """Return how many tokens the custom `pattern` finds in `sentence`."""
    return len(nltk.regexp_tokenize(sentence, pattern))


# Token count per sentence, using the custom tokenization pattern
split_google_reviews['Word Count'] = split_google_reviews['Sentences'].map(_count_tokens)

In [6]:
# Show full cell contents when displaying frames (no column-width truncation).
pd.options.display.max_colwidth = None

In [7]:
# Assign a numeric ID to each review: every sentence exploded from the same
# review text gets the same ID. Grouping by 'Sentences' would instead number
# distinct sentences, giving one review several IDs and letting identical
# sentences from different reviews share an ID.
# NOTE(review): reviews whose 'Text' is byte-identical share one ID — confirm
# that is acceptable, or carry a pre-explode row index through instead.
split_google_reviews['Review ID'] = split_google_reviews.groupby('Text', sort=False).ngroup()

# Keep only sentences that come from reviews with more than one sentence.
split_google_reviews = split_google_reviews[split_google_reviews['Sentence Count'] > 1]

In [8]:
# Drop sentences with fewer than 5 tokens
has_enough_words = split_google_reviews['Word Count'].ge(5)
split_google_reviews = split_google_reviews.loc[has_enough_words]

### Create spaCy pipeline

In [9]:
import nltk
import spacy
from spacy.util import filter_spans
from spaczz.matcher import FuzzyMatcher
from spacy import matcher
from spacy.tokens import Doc
from nltk.corpus import wordnet
from spacy.tokens import Span

# Start from a blank English spaCy pipeline.
nlp = spacy.blank("en")
# Fuzzy matcher built on the pipeline's vocabulary.
# NOTE(review): this rebinds `matcher`, hiding the module brought in by
# `from spacy import matcher` earlier in this cell — confirm the module is not needed.
matcher = FuzzyMatcher(nlp.vocab)

### Investigate the top noisy words

In [11]:
# Train Top2Vec on the filtered review sentences (one document per sentence),
# then display how many topics it found.
documents = split_google_reviews['Sentences'].tolist()
model = Top2Vec(documents=documents)
model.get_num_topics()

2023-06-13 11:19:13,844 - top2vec - INFO - Pre-processing documents for training
2023-06-13 11:19:28,520 - top2vec - INFO - Creating joint document/word embedding
2023-06-13 11:32:33,127 - top2vec - INFO - Creating lower dimension embedding of documents




2023-06-13 11:38:44,311 - top2vec - INFO - Finding dense areas of documents






2023-06-13 11:39:10,264 - top2vec - INFO - Finding topics




4302

1. Look for the top synonyms in the actual dataset.
2. Collect the most common of those words into a list and feed that list to the synonyms function.
3. Use the resulting list with the fuzzy matcher to remove more of the noise from the dataset.

In [12]:
# Retrieve every topic the model found. The count is read from the model
# rather than hard-coded (previously 4302, copied from one training run's
# output), because retraining can produce a different number of topics.
topic_words, word_scores, topic_nums = model.get_topics(model.get_num_topics())

In [38]:
# Look up the 50 topics that best match the keyword "beer".
# Note: this rebinds topic_words, word_scores and topic_nums from the previous cell.
beer_topic_search = model.search_topics(keywords=["beer"], num_topics=50)
topic_words, word_scores, topic_scores, topic_nums = beer_topic_search