In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer

from nltk.stem.wordnet import WordNetLemmatizer

from gensim.corpora import Dictionary
import pickle as pkl

In [2]:
# Relative locations of the train/test parquet shards under the dataset root.
splits = {
    'train': 'yelp_review_full/train-00000-of-00001.parquet',
    'test': 'yelp_review_full/test-00000-of-00001.parquet',
}
# Read both shards (train first, then test) through the hf:// URL scheme.
df_train, df_test = (
    pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits[split_name])
    for split_name in ("train", "test")
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Stack the train and test frames row-wise into a single frame with a fresh
# 0..n-1 index (row-wise is pd.concat's default axis).
df_combined = pd.concat((df_train, df_test), ignore_index=True)
# Preview the first five rows (the default for head()).
df_combined.head(5)

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


In [6]:
# Tokens are maximal runs of word characters (regex \w+), so punctuation and
# whitespace act purely as separators.
tokenizer = RegexpTokenizer(r'\w+')
# Lowercase each review text, then split it into its list of tokens.
reviews = [
    tokenizer.tokenize(text.lower())
    for text in df_combined["text"].tolist()
]

In [8]:
# Drop tokens that are a single character long and tokens made up entirely of
# numeric characters; tokens that merely contain digits are kept.
reviews = [
    [token for token in doc if len(token) > 1 and not token.isnumeric()]
    for doc in reviews
]


In [9]:
# Lemmatize every token of every review. lemmatize() is called with the token
# only, so the lemmatizer's default part of speech applies.
lemmatizer = WordNetLemmatizer()
reviews = [list(map(lemmatizer.lemmatize, doc)) for doc in reviews]


In [11]:
# Checkpoint the preprocessed token lists to disk as a pickle.
with open("clean_reviews.pkl", "wb") as out_handle:
    pkl.dump(reviews, out_handle)


In [12]:
# Read the pickled token lists back and show the first document as a check.
# NOTE: unpickling can execute arbitrary code; only load files that are trusted.
with open("clean_reviews.pkl", "rb") as in_handle:
    temp_list = pkl.load(in_handle)
print(temp_list[0])

['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']


In [13]:
# Vocabulary pruning thresholds, named so they can be tuned in one place.
NO_BELOW = 20    # drop tokens that appear in fewer than this many reviews
NO_ABOVE = 0.5   # drop tokens that appear in more than this fraction of reviews

# Build the token -> integer id mapping over all tokenized reviews, then prune
# very rare and very common tokens.
# NOTE: filter_extremes also applies its default `keep_n` cap on vocabulary
# size after the frequency filters; pass keep_n explicitly to change that.
dictionary = Dictionary(reviews)
dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE)


In [14]:
# Convert each tokenized review into its bag-of-words form: a list of
# (token_id, count) pairs using the ids assigned by `dictionary`.
corpus = list(map(dictionary.doc2bow, reviews))


In [19]:
# Checkpoint the bag-of-words corpus to disk as a pickle.
with open("clean_review_corpus.pkl", "wb") as out_handle:
    pkl.dump(corpus, out_handle)

In [18]:
# Spot-check: the tokens of the first review, then the number of distinct
# in-vocabulary tokens in the second review and its bag-of-words pairs.
# NOTE(review): the tokens come from document 0 but the bag-of-words from
# document 1 -- confirm whether both were meant to show the same document.
print(reviews[0], len(corpus[1]), corpus[1], sep="\n")

['dr', 'goldberg', 'offer', 'everything', 'look', 'for', 'in', 'general', 'practitioner', 'he', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'he', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patient', 'he', 'affiliated', 'with', 'top', 'notch', 'hospital', 'nyu', 'which', 'my', 'parent', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referral', 'to', 'see', 'specialist', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaint', 'have', 'about', 'him', 'but', 'really', 'drawing', 'blank']
59
[(4, 1), (10, 2), (17, 2), (19, 4), (24, 1), (30, 1), (36, 2), (49, 1), (56, 2), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 1), (63, 1), (64, 3), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (7