In [1]:
import spacy
import pandas as pd
import re

from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

In [3]:
nlp = spacy.load("en_core_web_trf")
text = "I love the pasta at Olive Garden, especially on Halloween. Las Vegas, Frankfurt"
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

Olive Garden ORG
Halloween DATE
Las Vegas GPE
Frankfurt GPE


In [14]:
# View all spacy labels
nlp = spacy.load("en_core_web_trf", disable=["tagger", "parser", "attribute_ruler", "lemmatizer"])
for label in nlp.get_pipe('ner').labels:
    print(f"{label}: {spacy.explain(label)}")

  model.load_state_dict(torch.load(filelike, map_location=device))


CARDINAL: Numerals that do not fall under another type
DATE: Absolute or relative dates or periods
EVENT: Named hurricanes, battles, wars, sports events, etc.
FAC: Buildings, airports, highways, bridges, etc.
GPE: Countries, cities, states
LANGUAGE: Any named language
LAW: Named documents made into laws.
LOC: Non-GPE locations, mountain ranges, bodies of water
MONEY: Monetary values, including unit
NORP: Nationalities or religious or political groups
ORDINAL: "first", "second", etc.
ORG: Companies, agencies, institutions, etc.
PERCENT: Percentage, including "%"
PERSON: People, including fictional
PRODUCT: Objects, vehicles, foods, etc. (not services)
QUANTITY: Measurements, as of weight or distance
TIME: Times smaller than a day
WORK_OF_ART: Titles of books, songs, etc.


In [82]:
def replace_specific_entities(doc, target_labels):
    replaced_text = doc.text
    for ent in sorted(doc.ents, key=lambda e: e.start_char, reverse=True):
        if ent.label_ in target_labels:
            token = f"<{ent.label_}>"
            replaced_text = replaced_text[:ent.start_char] + token + replaced_text[ent.end_char:]
    return replaced_text

# Specify the labels to target
result = replace_specific_entities(doc, target_labels=["FAC", "ORG", "DATE"])
print(result)


I love the pasta at <ORG>, especially on <DATE>.


In [None]:
splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])

In [None]:
reviews = df.head(1000)["text"].tolist()

In [57]:
TARGET_LABELS = ["FAC", "ORG", "PERSON"]
REPLACEMENT_LABELS = ["FAC", "ORG", "PERSON"]

tag_list = "|".join([label.lower() for label in REPLACEMENT_LABELS])
token_regex = fr"(?:\b\w+\b|<(?:{tag_list})>)"

nlp = spacy.load("en_core_web_trf")


  model.load_state_dict(torch.load(filelike, map_location=device))


In [58]:
token_regex

'(?:\\b\\w+\\b|<(?:fac|org|person)>)'

In [None]:
for review in reviews:
    doc = nlp(review)
    containFlag = False
    for ent in doc.ents:
        # print(ent.text, ent.label_)
        if ent.label_ in TARGET_LABELS:
            containFlag = True
    if containFlag:
        print(review)
        for ent in doc.ents:
            print(ent.text, ent.label_)
        break

dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.
goldberg PERSON
nyu ORG
first ORDINAL


In [72]:
text = "Compared to other Whole Foods locations, this one is tiny! For comparison, the Whole Foods in Las Vegas"
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)

Whole Foods ORG
the Whole Foods ORG
Las Vegas GPE


# Running spaCy NER on Datset

In [8]:
splits = {'train': 'yelp_review_full/train-00000-of-00001.parquet', 'test': 'yelp_review_full/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/Yelp/yelp_review_full/" + splits["train"])

In [None]:
reviews = df.head(1000)["text"].tolist()


In [62]:
def clean_text(text):
    #removes 's and apostrophe, converts to lower case
    text = text.lower()
    text = re.sub(r"'s(\s|$)", r"\1", text)
    text = re.sub(r"'", r"", text)
    return text.strip()

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n'

tokenizer = RegexpTokenizer(token_regex)
lemmatizer = WordNetLemmatizer()

english_stopwords = set(nltk.corpus.stopwords.words('english'))

In [69]:
print(lemmatizer.lemmatize("<person>"))
print(lemmatizer.lemmatize("<person>s"))
print(lemmatizer.lemmatize("las"))

<person>
<person>s
la


In [49]:
# TARGET_LABELS = ["FAC", "ORG", "PERSON", "PRODUCT", "WORK_OF_ART"]
TARGET_LABELS = ["FAC", "ORG", "PERSON"]
REPLACEMENT_LABELS = ["FAC", "ORG", "PERSON"]

In [50]:
# 3m 41.9s
filtered_reviews = []

for idx, review in enumerate(reviews):
    doc = nlp(review)
    contain_flag = False
    for ent in doc.ents:
        if ent.label_ in TARGET_LABELS:
            contain_flag = True
    if contain_flag:
        # print(review)
        result = replace_specific_entities(doc, target_labels=REPLACEMENT_LABELS)
        filtered_reviews.append((result, idx))

  with torch.cuda.amp.autocast(self._mixed_precision):


In [66]:
doc = nlp("Compared to other Whole Foods locations, this one is tiny! For comparison, the Whole Foods in Las Vegas must be at least three times the size. Still, it's nice to have and it is pretty decent overall albeit pricey.")
print(replace_specific_entities(doc, target_labels=REPLACEMENT_LABELS))

  with torch.cuda.amp.autocast(self._mixed_precision):


Compared to other <ORG> locations, this one is tiny! For comparison, <ORG> in Las Vegas must be at least three times the size. Still, it's nice to have and it is pretty decent overall albeit pricey.


In [79]:
len(filtered_reviews)
filtered_reviews[0]

("dr. <PERSON> offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (<ORG>) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 0)

In [77]:
def clean_tokens(tokens):
    output = []
    for token in tokens:
        pos_tag = nltk.pos_tag([token])[0][1]
        lemma = lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_tag))
        if lemma not in english_stopwords and len(lemma) >= 2:
            output.append(lemma)
    return output
    

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^\w\s<>/]", "", text)
    tokens = tokenizer.tokenize(text)
    cleaned_tokens = clean_tokens(tokens)
    return cleaned_tokens

In [None]:
output = []

for filtered_review in filtered_reviews[:5]:
    print(preprocess_text(filtered_review[0]))

for filtered_review in reviews:
    break
    posTokens = nltk.word_tokenize(review)
    properNounSearch = nltk.pos_tag(posTokens)
    nounFlag = False
    for _, tag in properNounSearch:
        if tag == "NNP" or tag == "NNPS":
            nounFlag = True
    if not nounFlag:
        continue
    tokens = tokenizer.tokenize(clean_text(review))
    tags = nltk.pos_tag(tokens)
    processedReview = []
    for word, tag in tags:
        lemma = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
        if lemma not in english_stopwords and len(lemma) >= 2:
            processedReview.append(lemma)
    output.append(processedReview)


['dr', '<person>', 'offers', 'everything', 'i', 'look', 'for', 'in', 'a', 'general', 'practitioner', 'hes', 'nice', 'and', 'easy', 'to', 'talk', 'to', 'without', 'being', 'patronizing', 'hes', 'always', 'on', 'time', 'in', 'seeing', 'his', 'patients', 'hes', 'affiliated', 'with', 'a', 'topnotch', 'hospital', '<org>', 'which', 'my', 'parents', 'have', 'explained', 'to', 'me', 'is', 'very', 'important', 'in', 'case', 'something', 'happens', 'and', 'you', 'need', 'surgery', 'and', 'you', 'can', 'get', 'referrals', 'to', 'see', 'specialists', 'without', 'having', 'to', 'see', 'him', 'first', 'really', 'what', 'more', 'do', 'you', 'need', 'im', 'sitting', 'here', 'trying', 'to', 'think', 'of', 'any', 'complaints', 'i', 'have', 'about', 'him', 'but', 'im', 'really', 'drawing', 'a', 'blank']
['unfortunately', 'the', 'frustration', 'of', 'being', 'dr', '<person>', 's', 'patient', 'is', 'a', 'repeat', 'of', 'the', 'experience', 'ive', 'had', 'with', 'so', 'many', 'other', 'doctors', 'in', 'nyc'

In [78]:
for filtered_review in filtered_reviews[:5]:
    print(preprocess_text(filtered_review[0]))


['dr', '<person>', 'offer', 'everything', 'look', 'general', 'practitioner', 'nice', 'easy', 'talk', 'without', 'patronize', 'always', 'time', 'see', 'patient', 'affiliate', 'topnotch', 'hospital', '<org>', 'parent', 'explain', 'important', 'case', 'something', 'happens', 'need', 'surgery', 'get', 'referral', 'see', 'specialist', 'without', 'see', 'first', 'really', 'need', 'im', 'sit', 'try', 'think', 'complaint', 'im', 'really', 'draw', 'blank']
['unfortunately', 'frustration', 'dr', '<person>', 'patient', 'repeat', 'experience', 'ive', 'many', 'doctor', 'nyc', 'good', 'doctor', 'terrible', 'staff', 'seem', 'staff', 'simply', 'never', 'answer', 'phone', 'usually', 'take', 'hour', 'repeat', 'call', 'get', 'answer', 'time', 'want', 'deal', 'run', 'problem', 'many', 'doctor', 'dont', 'get', 'office', 'worker', 'patient', 'medical', 'need', 'isnt', 'anyone', 'answer', 'phone', 'incomprehensible', 'work', 'aggravation', 'regret', 'feel', 'give', 'dr', '<person>', 'star']
['go', 'dr', '<pe

In [84]:
nltk.pos_tag(preprocess_text("I love the pasta at <ORG>, especially on <DATE>."))

[('love', 'NN'),
 ('pasta', 'NN'),
 ('<org>', 'NNP'),
 ('especially', 'RB'),
 ('date', 'NN')]

In [85]:
lemmatizer.lemmatize('<org>', pos='n')

'<org>'