In [341]:
import pandas as pd
import spacy
import requests
import googlemaps 

from bs4 import BeautifulSoup 
import nltk

# bert ner
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [230]:
# read gbh rss feed 
df = pd.read_csv("../gbh_rss/search-result-2022-11-08-04-04-19.csv")
clean_df = df.iloc[:, :6]
clean_df

Unnamed: 0,Type,Label,Headline,Body,Byline,Publish Date
0,Article,Watertown Police Detective Alleges Hostile Wor...,Watertown Police Detective Alleges Hostile Wor...,BOSTON (AP) — A Watertown police detective all...,Associated Press,Wed Nov 18 14:39:02 EST 2020
1,Article,"In 2020, Larissa FastHorse's 'The Thanksgiving...","In 2020, Larissa FastHorse's 'The Thanksgiving...","In late October, Native American playwright <a...",Jill Kaufman,Fri Nov 20 16:34:53 EST 2020
2,Article,In From The Cold: Martha’s Vineyard To Open It...,In From The Cold: Martha’s Vineyard To Open It...,"For the past five winters, three Martha’s Vine...",Jennette Barnes,Tue Nov 24 15:48:23 EST 2020
3,Article,Retail Behemoth Amazon May Be Coming To The Si...,Retail Behemoth Amazon May Be Coming To The Si...,A site plan filed with the city of Worcester’s...,Aaron Schachter,Fri Nov 27 15:11:17 EST 2020
4,Article,Rollins Talking Criminal Justice With Biden Team,Rollins Talking Criminal Justice With Biden Team,While she said &quot;right now I&#39;m stickin...,Michael P. Norton | State House News Service,Mon Nov 30 08:49:08 EST 2020
...,...,...,...,...,...,...
351,Article,Baker recommends pardons of four people,Baker recommends pardons of four people,Gov. Charlie Baker announced pardons of four p...,Jenifer B. McKim,Wed Oct 26 15:29:34 EDT 2022
352,Article,Ketanji Brown Jackson's recusal in Harvard adm...,Ketanji Brown Jackson's recusal in Harvard adm...,When the U.S. Supreme Court announces its deci...,Diane Adame,Mon Oct 31 15:15:03 EDT 2022
353,Article,It's now illegal in Mass. to throw out used je...,It's now illegal in Mass. to throw out used je...,It&#39;s now against the law in Massachusetts ...,Craig LeMoult,Tue Nov 01 09:58:52 EDT 2022
354,Article,Salem takes in record numbers of Halloween tou...,Salem takes in record numbers of Halloween tou...,Saying that Halloween in Salem is “a big deal”...,James Bennett II,Mon Oct 31 16:32:14 EDT 2022


In [311]:
def get_locations(article_text):
    """
    get location names from article using NER 
    input: article_text as a string, aggregate of h1, h2, lede, and body
    returns: locations - set of tuples of (NAME, 'GPE')
    """
    # get locations using NER 

    
    nlp = spacy.load("en_core_web_lg")

    doc = nlp(article_text)

    # get the locations only, remove duplicates from results 
    locations = set([(X.text, X.label_) for X in doc.ents if X.label_ == 'GPE' or X.label_ == 'LOC' or X.label_ == 'FAC']) # or X.label_ == 'LOC' or X.label_ == 'FAC' or X.label_ == 'ORG'
    orgs = set([(X.text, X.label_) for X in doc.ents if X.label_ == 'ORG'])
    
    return locations, orgs

In [388]:
def get_locations_bert(article_text):
    # https://huggingface.co/dslim/bert-base-NER
    tokenizer = AutoTokenizer.from_pretrained("dslim/bert-large-NER")
    model = AutoModelForTokenClassification.from_pretrained("dslim/bert-large-NER")
    nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
    
    ner_results = nlp(article_text)
    locations = set([(X['word'],X['entity_group']) for X in ner_results if X['entity_group'] == 'LOC'])
    orgs = set([(X['word'], X['entity_group']) for X in ner_results if X['entity_group'] == 'ORG'])

    return locations, orgs

In [369]:
test = "Long closed sporting-goods store in Packards Corner could be replaced by six-story residential building."
#A developer has filed plans to replace the old City Sports at 1035 Commonwealth Ave. in Allston with a six-story, 55-unit residential building with retail space on the ground floor. In a filing with the BPDA, developers BGI Homes of Brighton and D2 Development of Roxbury say the $15-million project would have units ranging from studios to three bedrooms. The plans do not specify whether the units would be rented as apartments or sold as condos. Seven of the units would be offered as affordable. The building, right on the B Line and the 57 bus route, would have no parking spaces for residents but two for the retail space. The filing notes that Fenway Park is a 29-minute walk away. Residents would have easier access to Thai food - at Brown Sugar Cafe next door. The developer expect the units will be very attractive to those seeking an active neighborhood within the urban context of Boston. City Sports closed in 2015."

In [389]:
bert_loc, bert_org = get_locations_bert(test)

Downloading (…)"pytorch_model.bin";: 100%|██████████| 1.33G/1.33G [08:10<00:00, 2.72MB/s]


In [390]:
print(bert_loc)
print(bert_org)

{('Packards Corner', 'LOC')}
set()


In [297]:
i = 133
text =  df['Headline'][i] + " " + df['Body'][i]
soup = BeautifulSoup(text,  "html.parser")
clean_text = soup.get_text()
loc, org = get_locations(clean_text)
print(loc)
print()
print(org)


{}
{('Boston', 'GPE'), ('Atkinson', 'GPE'), ('Southampton Streets', 'FAC'), ('Massachusetts Avenue', 'FAC'), ('Pine Street Inn', 'FAC')}

{('GBH News', 'ORG'), ('ModuleBut', 'ORG'), ('Downie', 'ORG')}


In [324]:
# entity recognition on first 5 sentences
org_5 = {}
loc_5 = {}
lede_5 = {}
for i in range(len(df)):
    try:
        headline = str(df['Headline'][i]) + " "
        body = df['Body'][i]
        soup = BeautifulSoup(body, "html.parser")
        lede = nltk.sent_tokenize(soup.get_text())[:5]
        lede = headline + ".".join(lede)
        loc, org = get_locations(lede)
        lede_5[i] = lede
        loc_5[i] = loc
        org_5[i] = org
        #print(lede)
        #print(loc)
        #print()
        #print(org)
        #print()
    except Exception as e:
        print("error:")
        print("index: " + str(i))
        print(e)

error:
index: 159
object of type 'float' has no len()


In [337]:
# entity recognition on first 3 sentences
org_3 = {}
loc_3 = {}
lede_3 = {}
for i in range(len(df)):
    try:
        headline = str(df['Headline'][i]) + " "
        body = df['Body'][i]
        soup = BeautifulSoup(body, "html.parser")
        lede = nltk.sent_tokenize(soup.get_text())[:3]
        lede = headline + ".".join(lede)
        loc, org = get_locations(lede)
        lede_3[i] = lede
        loc_3[i] = loc
        org_3[i] = org
        #print(lede)
        #print(loc)
        #print()
        #print(org)
        #print()
    except Exception as e:
        print("error:")
        print("index: " + str(i))
        print(e)

error:
index: 159
object of type 'float' has no len()


In [336]:
result_df = pd.DataFrame()
result_df["lede_5"] = lede_5
result_df["locations_5_spacy"] = loc_5
result_df["orgs_5_spacy"] = org_5
result_df.head()

Unnamed: 0,lede_5,locations_5_spacy,orgs_5_spacy
0,Watertown Police Detective Alleges Hostile Wor...,"{(Watertown, GPE)}","{(Middlesex Superior Court, ORG), (AP, ORG), (..."
1,"In 2020, Larissa FastHorse's 'The Thanksgiving...","{(Massachusetts, GPE), (Lenox, GPE)}","{(Native American Heritage Month, ORG), (Nativ..."
2,In From The Cold: Martha’s Vineyard To Open It...,"{(Vineyard, GPE), (Vineyard, LOC)}","{(Episcopal, ORG)}"
3,Retail Behemoth Amazon May Be Coming To The Si...,"{(Worcester, GPE), (Worcester Mall, FAC), (Wor...","{(I-190, ORG), (Retail Behemoth, ORG), (the Wo..."
4,Rollins Talking Criminal Justice With Biden Te...,"{(Washington, GPE), (Boston, GPE), (Suffolk Co...","{(Rollins, ORG), (CBS Boston, ORG), (City Coun..."


In [338]:
result_df["lede_3"] = lede_3
result_df["locations_3_spacy"] = loc_3
result_df["orgs_3_spacy"] = org_3
result_df.head()

Unnamed: 0,lede_5,locations_5_spacy,orgs_5_spacy,lede_3,locations_3_spacy,orgs_3_spacy
0,Watertown Police Detective Alleges Hostile Wor...,"{(Watertown, GPE)}","{(Middlesex Superior Court, ORG), (AP, ORG), (...",Watertown Police Detective Alleges Hostile Wor...,"{(Watertown, GPE)}","{(Middlesex Superior Court, ORG), (AP, ORG), (..."
1,"In 2020, Larissa FastHorse's 'The Thanksgiving...","{(Massachusetts, GPE), (Lenox, GPE)}","{(Native American Heritage Month, ORG), (Nativ...","In 2020, Larissa FastHorse's 'The Thanksgiving...","{(Massachusetts, GPE), (Lenox, GPE)}","{(Native American Heritage Month, ORG), (WAM T..."
2,In From The Cold: Martha’s Vineyard To Open It...,"{(Vineyard, GPE), (Vineyard, LOC)}","{(Episcopal, ORG)}",In From The Cold: Martha’s Vineyard To Open It...,"{(Vineyard, GPE)}",{}
3,Retail Behemoth Amazon May Be Coming To The Si...,"{(Worcester, GPE), (Worcester Mall, FAC), (Wor...","{(I-190, ORG), (Retail Behemoth, ORG), (the Wo...",Retail Behemoth Amazon May Be Coming To The Si...,"{(Worcester, GPE), (Worcester Mall, FAC), (the...","{(I-190, ORG), (Retail Behemoth, ORG), (the Wo..."
4,Rollins Talking Criminal Justice With Biden Te...,"{(Washington, GPE), (Boston, GPE), (Suffolk Co...","{(Rollins, ORG), (CBS Boston, ORG), (City Coun...",Rollins Talking Criminal Justice With Biden Te...,"{(Washington, GPE), (Boston, GPE), (Suffolk Co...","{(Rollins, ORG), (CBS Boston, ORG), (Biden Tea..."


In [391]:
# entity recognition on first 5 sentences
b_org_5 = {}
b_loc_5 = {}
b_lede_5 = {}
for i in range(len(df)):
    try:
        headline = str(df['Headline'][i]) + " "
        body = df['Body'][i]
        soup = BeautifulSoup(body, "html.parser")
        lede = nltk.sent_tokenize(soup.get_text())[:5]
        lede = headline + ".".join(lede)
        loc, org = get_locations_bert(lede)
        b_lede_5[i] = lede
        b_loc_5[i] = loc
        b_org_5[i] = org
        #print(lede)
        #print(loc)
        #print()
        #print(org)
        #print()
    except Exception as e:
        print("error:")
        print("index: " + str(i))
        print(e)

error:
index: 159
object of type 'float' has no len()


In [393]:
#result_df["lede_5"] = lede_5
result_df["locations_5_bert"] = b_loc_5
result_df["orgs_5_bert"] = b_org_5
result_df.head()

Unnamed: 0,lede_5,locations_5_spacy,orgs_5_spacy,lede_3,locations_3_spacy,orgs_3_spacy,locations_5_bert,orgs_5_bert
0,Watertown Police Detective Alleges Hostile Wor...,"{(Watertown, GPE)}","{(Middlesex Superior Court, ORG), (AP, ORG), (...",Watertown Police Detective Alleges Hostile Wor...,"{(Watertown, GPE)}","{(Middlesex Superior Court, ORG), (AP, ORG), (...","{(##town, LOC), (Watertown, LOC)}","{(Middlesex Superior Court, ORG), (Watertown P..."
1,"In 2020, Larissa FastHorse's 'The Thanksgiving...","{(Massachusetts, GPE), (Lenox, GPE)}","{(Native American Heritage Month, ORG), (Nativ...","In 2020, Larissa FastHorse's 'The Thanksgiving...","{(Massachusetts, GPE), (Lenox, GPE)}","{(Native American Heritage Month, ORG), (WAM T...","{(Massachusetts, LOC), (Lenox, LOC)}","{(WAM Theatre, ORG)}"
2,In From The Cold: Martha’s Vineyard To Open It...,"{(Vineyard, GPE), (Vineyard, LOC)}","{(Episcopal, ORG)}",In From The Cold: Martha’s Vineyard To Open It...,"{(Vineyard, GPE)}",{},"{(Martha ’ s Vineyard, LOC), (Vineyard, LOC)}",{}
3,Retail Behemoth Amazon May Be Coming To The Si...,"{(Worcester, GPE), (Worcester Mall, FAC), (Wor...","{(I-190, ORG), (Retail Behemoth, ORG), (the Wo...",Retail Behemoth Amazon May Be Coming To The Si...,"{(Worcester, GPE), (Worcester Mall, FAC), (the...","{(I-190, ORG), (Retail Behemoth, ORG), (the Wo...","{(I - 290, LOC), (Worcester Mall, LOC), (Worce...","{(Worcester Telegram, ORG), (Worcester, ORG), ..."
4,Rollins Talking Criminal Justice With Biden Te...,"{(Washington, GPE), (Boston, GPE), (Suffolk Co...","{(Rollins, ORG), (CBS Boston, ORG), (City Coun...",Rollins Talking Criminal Justice With Biden Te...,"{(Washington, GPE), (Boston, GPE), (Suffolk Co...","{(Rollins, ORG), (CBS Boston, ORG), (Biden Tea...","{(Washington, LOC), (Boston, LOC), (Suffolk Co...","{(CBS Boston, ORG)}"


In [395]:
# entity recognition on first 5 sentences
b_org_3 = {}
b_loc_3 = {}
for i in range(len(df)):
    try:
        headline = str(df['Headline'][i]) + " "
        body = df['Body'][i]
        soup = BeautifulSoup(body, "html.parser")
        lede = nltk.sent_tokenize(soup.get_text())[:3]
        lede = headline + ".".join(lede)
        loc, org = get_locations_bert(lede)
        b_loc_3[i] = loc
        b_org_3[i] = org
        #print(lede)
        #print(loc)
        #print()
        #print(org)
        #print()
    except Exception as e:
        print("error:")
        print("index: " + str(i))
        print(e)

error:
index: 159
object of type 'float' has no len()


In [396]:
result_df["locations_3_bert"] = b_loc_3
result_df["orgs_3_bert"] = b_org_3
result_df.head()

Unnamed: 0,lede_5,locations_5_spacy,orgs_5_spacy,lede_3,locations_3_spacy,orgs_3_spacy,locations_5_bert,orgs_5_bert,locations_3_bert,orgs_3_bert
0,Watertown Police Detective Alleges Hostile Wor...,"{(Watertown, GPE)}","{(Middlesex Superior Court, ORG), (AP, ORG), (...",Watertown Police Detective Alleges Hostile Wor...,"{(Watertown, GPE)}","{(Middlesex Superior Court, ORG), (AP, ORG), (...","{(##town, LOC), (Watertown, LOC)}","{(Middlesex Superior Court, ORG), (Watertown P...","{(##town, LOC), (Watertown, LOC)}","{(Middlesex Superior Court, ORG), (Watertown P..."
1,"In 2020, Larissa FastHorse's 'The Thanksgiving...","{(Massachusetts, GPE), (Lenox, GPE)}","{(Native American Heritage Month, ORG), (Nativ...","In 2020, Larissa FastHorse's 'The Thanksgiving...","{(Massachusetts, GPE), (Lenox, GPE)}","{(Native American Heritage Month, ORG), (WAM T...","{(Massachusetts, LOC), (Lenox, LOC)}","{(WAM Theatre, ORG)}","{(Massachusetts, LOC), (Lenox, LOC)}","{(WAM Theatre, ORG)}"
2,In From The Cold: Martha’s Vineyard To Open It...,"{(Vineyard, GPE), (Vineyard, LOC)}","{(Episcopal, ORG)}",In From The Cold: Martha’s Vineyard To Open It...,"{(Vineyard, GPE)}",{},"{(Martha ’ s Vineyard, LOC), (Vineyard, LOC)}",{},"{(Martha ’ s Vineyard, LOC)}",{}
3,Retail Behemoth Amazon May Be Coming To The Si...,"{(Worcester, GPE), (Worcester Mall, FAC), (Wor...","{(I-190, ORG), (Retail Behemoth, ORG), (the Wo...",Retail Behemoth Amazon May Be Coming To The Si...,"{(Worcester, GPE), (Worcester Mall, FAC), (the...","{(I-190, ORG), (Retail Behemoth, ORG), (the Wo...","{(I - 290, LOC), (Worcester Mall, LOC), (Worce...","{(Worcester Telegram, ORG), (Worcester, ORG), ...","{(I - 290, LOC), (Worcester Mall, LOC), (Worce...","{(Worcester Telegram, ORG), (Planning Board, O..."
4,Rollins Talking Criminal Justice With Biden Te...,"{(Washington, GPE), (Boston, GPE), (Suffolk Co...","{(Rollins, ORG), (CBS Boston, ORG), (City Coun...",Rollins Talking Criminal Justice With Biden Te...,"{(Washington, GPE), (Boston, GPE), (Suffolk Co...","{(Rollins, ORG), (CBS Boston, ORG), (Biden Tea...","{(Washington, LOC), (Boston, LOC), (Suffolk Co...","{(CBS Boston, ORG)}","{(Washington, LOC), (Boston, LOC), (Suffolk Co...","{(CBS Boston, ORG)}"


In [397]:
result_df.to_csv("../gbh_rss/search-result-2022-11-08-04-04-19-experiment-results.csv")

In [235]:
def get_location_geocode(API_KEY, locations):
    """
    getting coordinates from location names in articles 
    input: google maps platform API KEY, locations article 
    return: dictionary of location names (key) with coordinates (value as a dictionary with lat and lon as keys)
    """
    gmaps = googlemaps.Client(key=API_KEY)
    results = {}

    # getting coordinates
    for place in locations:
        # we can constrain google geocode api search to massachusetts or us - census geocoder will not work for places outside of U.S 
        #geocode_result = gmaps.geocode(place[0] + ", Suffok County, MA, USA") # place is a tuple, where first value is the location name 
        geocode_result = gmaps.geocode(place[0] + ", Suffolk County",  components={"administrative_area_level": "MA", 
                                                                                   "country": "US"})
        print(geocode_result)
        print()
        temp = {}
        try:
            geocode_components = geocode_result[0]['address_components']
            for i, addr_comp in enumerate(geocode_components):
                if 'administrative_area_level_2' in addr_comp['types']:
                    if "Suffolk County" == addr_comp['short_name'] and i != 0:
                        temp['lat'] = geocode_result[0]['geometry']['location']['lat']
                        temp['lon'] = geocode_result[0]['geometry']['location']['lng']
                        results[place[0]] = temp
        except IndexError: # unable to get coordinates for location
            print("Unable to locate " + place[0])

    return results 

In [221]:
import secret
get_location_geocode(secret.API_KEY, org)

{'GBH News': {'lat': 42.3571222, 'lon': -71.1480528},
 'ModuleBut': {'lat': 40.9848784, 'lon': -72.61511689999999},
 'Downie': {'lat': 40.9848784, 'lon': -72.61511689999999}}

In [236]:
get_location_geocode(secret.API_KEY, loc)

[{'address_components': [{'long_name': 'Boston', 'short_name': 'Boston', 'types': ['locality', 'political']}, {'long_name': 'Suffolk County', 'short_name': 'Suffolk County', 'types': ['administrative_area_level_2', 'political']}, {'long_name': 'Massachusetts', 'short_name': 'MA', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']}], 'formatted_address': 'Boston, MA, USA', 'geometry': {'bounds': {'northeast': {'lat': 42.40081989999999, 'lng': -70.749455}, 'southwest': {'lat': 42.22788, 'lng': -71.191113}}, 'location': {'lat': 42.3600825, 'lng': -71.0588801}, 'location_type': 'APPROXIMATE', 'viewport': {'northeast': {'lat': 42.40081989999999, 'lng': -70.749455}, 'southwest': {'lat': 42.22788, 'lng': -71.191113}}}, 'place_id': 'ChIJGzE9DS1l44kRoOhiASS_fHg', 'types': ['locality', 'political']}]

[{'address_components': [{'long_name': 'Suffolk County', 'short_name': 'Suffolk County', 'types': ['adminis

{'Boston': {'lat': 42.3600825, 'lon': -71.0588801},
 'Southampton Streets': {'lat': 42.3310002, 'lon': -71.064521},
 'Massachusetts Avenue': {'lat': 42.3734217, 'lon': -71.1195063},
 'Pine Street Inn': {'lat': 42.34319869999999, 'lon': -71.0645177}}

In [237]:
get_location_geocode(secret.API_KEY, [('ModuleBut', 'ORG'), ('Downie', 'ORG'), ('Atkinson', 'GPE')])

[{'address_components': [{'long_name': 'Suffolk County', 'short_name': 'Suffolk County', 'types': ['administrative_area_level_2', 'political']}, {'long_name': 'New York', 'short_name': 'NY', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']}], 'formatted_address': 'Suffolk County, NY, USA', 'geometry': {'bounds': {'northeast': {'lat': 41.3103871, 'lng': -71.777491}, 'southwest': {'lat': 40.534265, 'lng': -73.49740489999999}}, 'location': {'lat': 40.9848784, 'lng': -72.61511689999999}, 'location_type': 'APPROXIMATE', 'viewport': {'northeast': {'lat': 41.3103871, 'lng': -71.777491}, 'southwest': {'lat': 40.534265, 'lng': -73.49740489999999}}}, 'partial_match': True, 'place_id': 'ChIJ1_mbyICu4IkRU18dDUkMxJU', 'types': ['administrative_area_level_2', 'political']}]

[{'address_components': [{'long_name': 'Suffolk County', 'short_name': 'Suffolk County', 'types': ['administrative_area_level_2', 'polit

{}