In [54]:
import pandas as pd
import spacy
import requests
import googlemaps 

from bs4 import BeautifulSoup 

In [2]:
# read gbh rss feed 
df = pd.read_csv("../gbh_rss/search-result-2022-11-08-04-04-19.csv")
clean_df = df.iloc[:, :6]
clean_df

Unnamed: 0,Type,Label,Headline,Body,Byline,Publish Date
0,Article,Watertown Police Detective Alleges Hostile Wor...,Watertown Police Detective Alleges Hostile Wor...,BOSTON (AP) — A Watertown police detective all...,Associated Press,Wed Nov 18 14:39:02 EST 2020
1,Article,"In 2020, Larissa FastHorse's 'The Thanksgiving...","In 2020, Larissa FastHorse's 'The Thanksgiving...","In late October, Native American playwright <a...",Jill Kaufman,Fri Nov 20 16:34:53 EST 2020
2,Article,In From The Cold: Martha’s Vineyard To Open It...,In From The Cold: Martha’s Vineyard To Open It...,"For the past five winters, three Martha’s Vine...",Jennette Barnes,Tue Nov 24 15:48:23 EST 2020
3,Article,Retail Behemoth Amazon May Be Coming To The Si...,Retail Behemoth Amazon May Be Coming To The Si...,A site plan filed with the city of Worcester’s...,Aaron Schachter,Fri Nov 27 15:11:17 EST 2020
4,Article,Rollins Talking Criminal Justice With Biden Team,Rollins Talking Criminal Justice With Biden Team,While she said &quot;right now I&#39;m stickin...,Michael P. Norton | State House News Service,Mon Nov 30 08:49:08 EST 2020
...,...,...,...,...,...,...
350,Article,Baker recommends pardons of four people,Baker recommends pardons of four people,Gov. Charlie Baker announced pardons of four p...,Jenifer B. McKim,Wed Oct 26 15:29:34 EDT 2022
351,Article,Ketanji Brown Jackson's recusal in Harvard adm...,Ketanji Brown Jackson's recusal in Harvard adm...,When the U.S. Supreme Court announces its deci...,Diane Adame,Mon Oct 31 15:15:03 EDT 2022
352,Article,It's now illegal in Mass. to throw out used je...,It's now illegal in Mass. to throw out used je...,It&#39;s now against the law in Massachusetts ...,Craig LeMoult,Tue Nov 01 09:58:52 EDT 2022
353,Article,Salem takes in record numbers of Halloween tou...,Salem takes in record numbers of Halloween tou...,Saying that Halloween in Salem is “a big deal”...,James Bennett II,Mon Oct 31 16:32:14 EDT 2022


In [20]:
def get_locations(article_text):
    """
    get location names from article using NER 
    input: article_text as a string, aggregate of h1, h2, lede, and body
    returns: locations - set of tuples of (NAME, 'GPE')
    """
    # get locations using NER  
    nlp = spacy.load("en_core_web_lg")
    doc = nlp(article_text)

    # get the locations only, remove duplicates from results 
    locations = set([(X.text, X.label_) for X in doc.ents if X.label_ == 'GPE' or X.label_ == 'LOC' or X.label_ == 'FAC']) # or X.label_ == 'LOC' or X.label_ == 'FAC' or X.label_ == 'ORG'
    orgs = set([(X.text, X.label_) for X in doc.ents if X.label_ == 'ORG'])
    
    return locations, orgs

In [160]:
test = "Long closed sporting-goods store in Packards Corner in Center of Computing and Data Sciences could be replaced by six-story residential building. A developer has filed plans to replace the old City Sports at 1035 Commonwealth Ave. in Allston with a six-story, 55-unit residential building with retail space on the ground floor. In a filing with the BPDA, developers BGI Homes of Brighton and D2 Development of Roxbury say the $15-million project would have units ranging from studios to three bedrooms. The plans do not specify whether the units would be rented as apartments or sold as condos. Seven of the units would be offered as affordable. The building, right on the B Line and the 57 bus route, would have no parking spaces for residents but two for the retail space. The filing notes that Fenway Park is a 29-minute walk away. Residents would have easier access to Thai food - at Brown Sugar Cafe next door. The developer expect the units will be very attractive to those seeking an active neighborhood within the urban context of Boston. City Sports closed in 2015."

In [161]:
#!python3 -m spacy download en_core_web_lg
i = 11
text = df['Label'][i] + df['Headline'][i] + df['Body'][i]
loc, org = get_locations(test)
print(loc)
print()
print(org)

{('Allston', 'GPE'), ('Brown Sugar Cafe', 'FAC'), ('Fenway Park', 'FAC'), ('Boston', 'GPE')}

{('D2 Development of Roxbury', 'ORG'), ('Packards Corner', 'ORG'), ('City Sports', 'ORG'), ('BGI Homes of Brighton', 'ORG'), ('Center of Computing and Data Sciences', 'ORG'), ('the B Line', 'ORG')}


In [147]:
print(test)

Long closed sporting-goods store in Center for Computing and Data Sciences could be replaced by six-story residential building. A developer has filed plans to replace the old City Sports at 1035 Commonwealth Ave. in Allston with a six-story, 55-unit residential building with retail space on the ground floor. In a filing with the BPDA, developers BGI Homes of Brighton and D2 Development of Roxbury say the $15-million project would have units ranging from studios to three bedrooms. The plans do not specify whether the units would be rented as apartments or sold as condos. Seven of the units would be offered as affordable. The building, right on the B Line and the 57 bus route, would have no parking spaces for residents but two for the retail space. The filing notes that Fenway Park is a 29-minute walk away. Residents would have easier access to Thai food - at Brown Sugar Cafe next door. The developer expect the units will be very attractive to those seeking an active neighborhood within 

In [151]:
def get_location_geocode(API_KEY, locations):
    """
    getting coordinates from location names in articles 
    input: google maps platform API KEY, locations article 
    return: dictionary of location names (key) with coordinates (value as a dictionary with lat and lon as keys)
    """
    gmaps = googlemaps.Client(key=API_KEY)
    results = {}

    # getting coordinates
    for place in locations:
        # we can constrain google geocode api search to massachusetts or us - census geocoder will not work for places outside of U.S 
        #geocode_result = gmaps.geocode(place[0] + ", Suffok County, MA, USA") # place is a tuple, where first value is the location name 
        geocode_result = gmaps.geocode(place[0] + ", Suffolk County",  components={"administrative_area_level": "MA", 
                                                                                   "country": "US"})
        print(geocode_result)
        print()
        temp = {}
        try:
            geocode_components = geocode_result[0]['address_components']
            for addr_comp in geocode_components:
                if 'administrative_area_level_2' in addr_comp['types']:
                    if "Suffolk County" == addr_comp['short_name']:
                        temp['lat'] = geocode_result[0]['geometry']['location']['lat']
                        temp['lon'] = geocode_result[0]['geometry']['location']['lng']
                        results[place[0]] = temp
        except IndexError: # unable to get coordinates for location
            print("Unable to locate " + place[0])

    return results 

In [162]:
import secret
get_location_geocode(secret.API_KEY, org)

[{'address_components': [{'long_name': '407', 'short_name': '407', 'types': ['street_number']}, {'long_name': 'Dudley Street', 'short_name': 'Dudley St', 'types': ['route']}, {'long_name': 'Roxbury', 'short_name': 'Roxbury', 'types': ['neighborhood', 'political']}, {'long_name': 'Boston', 'short_name': 'Boston', 'types': ['locality', 'political']}, {'long_name': 'Suffolk County', 'short_name': 'Suffolk County', 'types': ['administrative_area_level_2', 'political']}, {'long_name': 'Massachusetts', 'short_name': 'MA', 'types': ['administrative_area_level_1', 'political']}, {'long_name': 'United States', 'short_name': 'US', 'types': ['country', 'political']}, {'long_name': '02119', 'short_name': '02119', 'types': ['postal_code']}, {'long_name': '3392', 'short_name': '3392', 'types': ['postal_code_suffix']}], 'formatted_address': '407 Dudley St, Boston, MA 02119, USA', 'geometry': {'location': {'lat': 42.32550490000001, 'lng': -71.0754742}, 'location_type': 'ROOFTOP', 'viewport': {'northea

{'D2 Development of Roxbury': {'lat': 42.32550490000001, 'lon': -71.0754742},
 'Packards Corner': {'lat': 42.3512369, 'lon': -71.1285122},
 'Center of Computing and Data Sciences': {'lat': 42.34989710000001,
  'lon': -71.10323009999999}}

In [168]:
i = 11
text = df['Label'][i] + df['Headline'][i] + df['Body'][i]



soup = BeautifulSoup(test)
print(soup.get_text())

Long closed sporting-goods store in Packards Corner in Center of Computing and Data Sciences could be replaced by six-story residential building. A developer has filed plans to replace the old City Sports at 1035 Commonwealth Ave. in Allston with a six-story, 55-unit residential building with retail space on the ground floor. In a filing with the BPDA, developers BGI Homes of Brighton and D2 Development of Roxbury say the $15-million project would have units ranging from studios to three bedrooms. The plans do not specify whether the units would be rented as apartments or sold as condos. Seven of the units would be offered as affordable. The building, right on the B Line and the 57 bus route, would have no parking spaces for residents but two for the retail space. The filing notes that Fenway Park is a 29-minute walk away. Residents would have easier access to Thai food - at Brown Sugar Cafe next door. The developer expect the units will be very attractive to those seeking an active ne

In [167]:
print(text)

Statehouses Evacuate Amid Protests In Support Of TrumpStatehouses Evacuate Amid Protests In Support Of TrumpProtesters who back President Donald Trump massed outside statehouses from Georgia to New Mexico on Wednesday, leading to some evacuations as cheers rang out in reaction to the news that pro-Trump demonstrators had <a href="https://apnews.com/article/congress-confirm-joe-biden-78104aea082995bbd7412a6e6cd13818" style="box-sizing: border-box; text-decoration: none; background-color: transparent; font-size: inherit; cursor: pointer; color: rgb(28, 167, 218); font-family: &quot;AP Serif&quot;; font-style: normal; font-weight: 400;">stormed the U.S. Capitol</a>.<br/><br/>Hundreds of people gathered in state capitals across the country to oppose President-elect Joe Biden&#39;s win, waving signs saying “Stop the Steal” and “Four more years,” most of them not wearing masks during the coronavirus pandemic and a few carrying long guns in places like Oklahoma and Georgia.<br/><br/>New Mexic