In [7]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import flair
from flair.data import Sentence
from flair.nn import Classifier

In [8]:
def get_locations_bert(article_text, nlp=None):
    """
    Get location and organization names from an article using NER - bert model
    https://huggingface.co/dslim/bert-large-NER

    input:
        article_text - the article as a string, aggregate of h1, h2, lede, and body
        nlp - optional, an already-built "ner" pipeline to reuse across calls.
              When omitted, a pipeline is built from the dslim/bert-large-NER
              checkpoint on every call, which is slow for repeated use.
    returns: (locations, orgs)
        locations - set of tuples of (NAME, 'LOC') mentioned in the article
        orgs - set of tuples of (NAME, 'ORG') mentioned in the article
    """
    if nlp is None:
        # Tokenizer and model weights must come from the same checkpoint;
        # a single name avoids the two strings drifting apart.
        checkpoint = "dslim/bert-large-NER"
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForTokenClassification.from_pretrained(checkpoint)
        # aggregation_strategy="simple" yields grouped entities with an
        # 'entity_group' key, which the filters below rely on.
        nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

    ner_results = nlp(article_text)
    locations = {(ent['word'], ent['entity_group']) for ent in ner_results if ent['entity_group'] == 'LOC'}
    orgs = {(ent['word'], ent['entity_group']) for ent in ner_results if ent['entity_group'] == 'ORG'}
    return locations, orgs

In [45]:
def get_locations_flair_test(article_text : str) -> flair.data.Sentence:
    """
    Tag an article with the flair "ner-ontonotes-large" NER model.
    https://flairnlp.github.io/docs/tutorial-basics/tagging-entities

    input: article_text as a string, aggregate of h1, h2, lede, and body
    returns: the flair Sentence built from article_text, after the tagger has
             run its prediction on it (entities are read from the Sentence,
             e.g. via get_labels('ner'))

    Note: the tagger is loaded inside the function, so every call reloads it.
    """
    sentence = Sentence(article_text)
    # predict() annotates the sentence in place; nothing is returned from it here.
    Classifier.load('ner-ontonotes-large').predict(sentence)
    return sentence

In [46]:
# Smoke-test the flair tagger on a short sentence that mentions a venue,
# a city and a restaurant chain.
test = get_locations_flair_test("I went to TD Garden which is next to Somerville and ate dinner at Burger King")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

2023-04-05 18:48:08,364 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


In [47]:
# Check what kind of object the helper returned.
type(test)

flair.data.Sentence

In [48]:
# Display the tagged sentence; its repr lists each detected span with its tag.
test

Sentence[16]: "I went to TD Garden which is next to Somerville and ate dinner at Burger King" → ["TD Garden"/FAC, "Somerville"/FAC, "Burger King"/ORG]

In [50]:
# List the NER labels on the sentence: each shows the span, its tag and a score.
test.get_labels('ner')

['Span[3:5]: "TD Garden"'/'FAC' (1.0),
 'Span[9:10]: "Somerville"'/'FAC' (0.9999),
 'Span[14:16]: "Burger King"'/'ORG' (1.0)]

In [51]:
# Dict form of the sentence. In the output below the 'ner' entries carry only
# the tag and confidence, not the entity text.
test.to_dict('ner')

{'text': 'I went to TD Garden which is next to Somerville and ate dinner at Burger King',
 'ner': [{'value': 'FAC', 'confidence': 0.9999774098396301},
  {'value': 'FAC', 'confidence': 0.9998844861984253},
  {'value': 'ORG', 'confidence': 0.9999557137489319}]}

In [52]:
# Explore how the string form of each label splits on double quotes:
# piece 1 is the quoted entity text, and the third space-separated word of
# piece 2 is what the later extraction code uses as the tag.
for label in test.get_labels('ner'):
    pieces = str(label).split('"')
    print(pieces)
    print(pieces[1])
    print(pieces[2].split(' ')[2])

In [55]:
# Load the 'ner-ontonotes-large' tagger at notebook level and inspect its
# concrete type (presumably to know what to use in type hints).
tagger = Classifier.load('ner-ontonotes-large')
type(tagger)

2023-04-05 18:50:19,935 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


flair.models.sequence_tagger_model.SequenceTagger

In [70]:
def get_locations_flair(article_text : str, tagger : flair.models.sequence_tagger_model.SequenceTagger, 
                        entities = ['FAC', 'GPE', 'LOC', 'ORG']) -> list:
    """
    Get location-like entity names from an article using NER - flair model
    (tagger should be using the "ner-ontonotes-large" model)
        https://flairnlp.github.io/docs/tutorial-basics/tagging-entities

    input:
        article_text - the article as a string, aggregate of h1, h2, lede, and body
        tagger - an already-loaded flair tagger; it is passed in so the model
                 is not reloaded on every call
        entities - the tags to keep; labels with any other tag are dropped
    returns: final_entities - a list of tuples of (entity, tag) mentioned in
             the article, in the order the labels are reported, duplicates kept
    """
    text = Sentence(article_text)
    tagger.predict(text)

    # Read the span text and tag straight from each label object instead of
    # parsing str(label): splitting the string on '"' breaks when the entity
    # text itself contains a double quote and depends on flair's display format.
    final_entities = [
        (label.data_point.text, label.value)
        for label in text.get_labels('ner')
        if label.value in entities
    ]

    return final_entities

In [60]:
# Load the exported search results (one row per article) from the project CSV.
df = pd.read_csv('gbh_rss_feed_location_analysis/search-result-2022-11-08-04-04-19(1).csv')

# Preview the first rows. Besides Type/Label/Headline/Body/Byline/Publish Date,
# the frame has many trailing "Unnamed: N" columns that appear empty here.
df.head()

Unnamed: 0,Type,Label,Headline,Body,Byline,Publish Date,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53
0,Article,Watertown Police Detective Alleges Hostile Wor...,Watertown Police Detective Alleges Hostile Wor...,BOSTON (AP) — A Watertown police detective all...,Associated Press,Wed Nov 18 14:39:02 EST 2020,,,,,...,,,,,,,,,,
1,Article,"In 2020, Larissa FastHorse's 'The Thanksgiving...","In 2020, Larissa FastHorse's 'The Thanksgiving...","In late October, Native American playwright <a...",Jill Kaufman,Fri Nov 20 16:34:53 EST 2020,,,,,...,,,,,,,,,,
2,Article,In From The Cold: Martha’s Vineyard To Open It...,In From The Cold: Martha’s Vineyard To Open It...,"For the past five winters, three Martha’s Vine...",Jennette Barnes,Tue Nov 24 15:48:23 EST 2020,,,,,...,,,,,,,,,,
3,Article,Retail Behemoth Amazon May Be Coming To The Si...,Retail Behemoth Amazon May Be Coming To The Si...,A site plan filed with the city of Worcester’s...,Aaron Schachter,Fri Nov 27 15:11:17 EST 2020,,,,,...,,,,,,,,,,
4,Article,Rollins Talking Criminal Justice With Biden Team,Rollins Talking Criminal Justice With Biden Team,While she said &quot;right now I&#39;m stickin...,Michael P. Norton | State House News Service,Mon Nov 30 08:49:08 EST 2020,,,,,...,,,,,,,,,,


In [63]:
# Use the Body of the row at position 2 (the Martha's Vineyard article in the
# preview above) as a sample article for the extractor.
sample_text = df.Body.iloc[2]

In [64]:
# Load the 'ner-ontonotes-large' tagger that get_locations_flair expects to be
# handed in. NOTE(review): an earlier cell already loads the same model into
# `tagger`; this reload looks redundant - confirm before removing.
tagger = Classifier.load('ner-ontonotes-large')

2023-04-05 18:54:54,385 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY


In [71]:
# Run the extractor on the sample article with the default tag filter
# (FAC, GPE, LOC, ORG). The result keeps duplicates in order of appearance.
get_locations_flair(article_text = sample_text, tagger = tagger)

[('Martha’s Vineyard', 'GPE'),
 ('Vineyard', 'LOC'),
 ('Oak Bluffs', 'GPE'),
 ('the Portuguese-American Club.', 'ORG'),
 ('Seadale', 'ORG'),
 ('Vineyard', 'LOC'),
 ('Martha’s Vineyard Hospital', 'ORG'),
 ('The Vineyard Trust', 'ORG'),
 ('the Old Whaling Church.', 'ORG'),
 ('Harbor Homes', 'ORG'),
 ('Edgartown.', 'GPE'),
 ('Martha’s Vineyard Hospital', 'ORG'),
 ('Vineyard', 'ORG')]