In [None]:
import spacy
import os
import pandas as pd
import geopy 
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup #Parses HTML content
from collections import Counter #Counts item frequencies in a list
from geopy.extra.rate_limiter import RateLimiter
from spacy import displacy

### Get URL Text

In [None]:
# Download the article HTML.
# The timeout keeps this cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() stops the notebook on an HTTP error instead
# of letting an error page flow into the parsing step below.
url = "https://www.cnn.com/travel/article/experts-guide-to-great-american-road-trips/index.html"
data = requests.get(url, timeout=30)
data.raise_for_status()

In [None]:
# Drop the HTML markup, trim leading/trailing whitespace, and map every
# newline / carriage return to a single space.
soup = BeautifulSoup(data.content, 'html.parser')
text = soup.get_text().strip().translate(str.maketrans("\n\r", "  "))

### Load the spacy model

In addition to installing spaCy itself, you need to install the language model, for example with conda:

`conda install -c conda-forge spacy-model-en_core_web_lg`

In [None]:
# Load the spaCy pipeline used for entity recognition in the cells below.
nlp = spacy.load(name="en_core_web_lg")


In [None]:
# Fixed sample excerpt. NOTE: this assignment replaces whatever `text` held
# before (including the text scraped from the URL above), so every cell below
# works on this excerpt only.
text = (
    "Time!In 2013, a bespectacled Sunday school teacher from Atlanta, Georgia, "
    "drove from New York to Redondo Beach, California in a world record-setting "
    "28 hours and 50 minutes.We can't recommend this -- Ed Bolian and his "
    "co-driver averaged 98 miles per hour during the journey, and could easily "
    "have gone to jail in any number of states.It's also a loss from a travel "
    "perspective. Any itinerary should leave room to sample America's rich and "
    "nutty menu of roadside attractions. We've broken them down into a few "
    "helpful categories."
)


## Natural Language Processing to tokenize and label text

In [24]:
# Display the text that the next cell passes to the spaCy pipeline.
text

"Time!In 2013, a bespectacled Sunday school teacher from Atlanta, Georgia, drove from New York to Redondo Beach, California in a world record-setting 28 hours and 50 minutes.We can't recommend this -- Ed Bolian and his co-driver averaged 98 miles per hour during the journey, and could easily have gone to jail in any number of states.It's also a loss from a travel perspective. Any itinerary should leave room to sample America's rich and nutty menu of roadside attractions. We've broken them down into a few helpful categories."

In [25]:
# Run the loaded spaCy pipeline over the excerpt, then render the entities it
# found using displaCy's "ent" style.
doc = nlp(text)
displacy.render(docs=doc, style="ent")

### Detected places to dataframe

In [None]:
# Collect one row per detected entity: its text, start and end attributes,
# label, and spaCy's explanation of that label.
locations = []
for ent in doc.ents:
    locations.append(
        [ent.text, ent.start, ent.end, ent.label_, spacy.explain(ent.label_)]
    )

df_all = pd.DataFrame(
    locations,
    columns=['Location', 'start', 'end', 'label', 'meaning'],
)
df_all

In [None]:
# List every distinct entity label that was detected, each followed on the
# next line by spaCy's explanation of it.
label_unique = list(df_all['label'].unique())

for label in label_unique:
    print(label, spacy.explain(label), sep="\n")

In [None]:
# Keep only the rows whose entity label is one of the codes in loc_codes.
loc_codes = ['GPE', 'LOC']

is_place = df_all['label'].isin(loc_codes)
df_places = df_all[is_place]

df_places

### Geocode locations with geopy

In [None]:
# Build the Nominatim geocoder and wrap its geocode method in a RateLimiter
# configured with a minimum delay of one second between calls.
locator = geopy.geocoders.Nominatim(user_agent='mygeocoder')
geocode = RateLimiter(func=locator.geocode, min_delay_seconds=1)

In [None]:
# Geocode every detected place name, one result (or None) per row of df_places.
# The calls go through the rate-limited `geocode` wrapper rather than
# `locator.geocode` directly; calling the locator directly bypasses the
# one-second minimum delay configured on the RateLimiter.
locations = [geocode(place, addressdetails=True) for place in df_places['Location']]

In [None]:
# Unpack each geocoding result into parallel lists, one entry per element of
# `locations`, so they can be attached to df_places as columns afterwards.
# Any address component that is missing is recorded as an empty string.
locations_raw = []
city = []
state = []
country = []

for loc in locations:
    if loc is None:
        # No geocoding result for this place: keep the lists aligned with
        # empty placeholders.
        locations_raw.append('')
        add = {}
    else:
        add = loc.raw.get('address', {})
        locations_raw.append(add)

    city.append(add.get('city', ''))
    state.append(add.get('state', ''))
    # The original tested for the key 'county' before reading 'country'.
    # That left country blank whenever no county was present, and raised
    # KeyError when a county was present without a country.
    country.append(add.get('country', ''))

In [None]:
# Attach the geocoded address parts as new columns.
# assign() returns a new DataFrame instead of writing into df_places in place.
# df_places is a filtered slice of df_all, so item assignment on it triggers
# pandas' SettingWithCopyWarning. The lists are matched to rows by position
# and must have one entry per row.
df_places = df_places.assign(
    add_raw=locations_raw,
    city=city,
    state=state,
    country=country,
)
df_places

### TODO: Convert the dataframe to an input dict for geoenrichment