In [20]:
import spacy
import os
import pandas as pd
import geopy 
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup #Parses HTML content
from collections import Counter #Counts item frequencies in a list
from geopy.extra.rate_limiter import RateLimiter
from spacy import displacy

### Get URL Text

In [21]:
url = "https://www.cnn.com/travel/article/experts-guide-to-great-american-road-trips/index.html"
data = requests.get(url)

In [22]:
#Parse out HTML tags and clean up newlines
soup = BeautifulSoup(data.content, 'html.parser')
text = soup.get_text()
text = text.strip().replace("\n", " ").replace("\r", " ")

### Load the spacy model

In addition to installing spacy, need to install the model with:

requires conda install -c conda-forge spacy-model-en_core_web_lg

In [23]:
nlp = spacy.load("en_core_web_lg")


In [7]:
text = "Time!In 2013, a bespectacled Sunday school teacher from Atlanta, Georgia, drove from New York to Redondo Beach, California in a world record-setting 28 hours and 50 minutes.We can\'t recommend this -- Ed Bolian and his co-driver averaged 98 miles per hour during the journey, and could easily have gone to jail in any number of states.It\'s also a loss from a travel perspective. Any itinerary should leave room to sample America\'s rich and nutty menu of roadside attractions. We\'ve broken them down into a few helpful categories."

## Natural Language Processing to tokenize and label text

In [24]:
text

'Great  American road trip guide | CNN TravelDestinationsFood & DrinkNewsStayVideoSearchMenuBusiness TravellerEndless highway: How to drive the ultimate American road tripCorinne Purtill, for CNN • Updated 12th August 2016FacebookTwitterEmailUp nextQantas goes back to basics08:09The ancient city that\'s back on the bucket list08:28Revisiting Rome, 1947 style07:21The Evolution of the Kangaroo Route07:26The Ultra Long Haul, Reimagined 08:33Step back in time on the Kangaroo Route08:10The Chinese travel revolution08:39Putting China\'s airlines to the test06:13Chinese travel boom: The game-changers 07:43Thailand\'s tourism boom 09:02(CNN) — Ah, the great American road trip. It\'s a rite of passage, a combination of nostalgia, discovery and misadventure ideally set against a sweeping landscape and killer tunes.The beauty of the road trip lies in its democratic simplicity: little more is needed beyond a map and a functioning vehicle for a Kerouac-worthy journey. We\'ve boiled the planning dow

In [25]:
doc = nlp(text)
displacy.render(doc, style="ent")

### Detected places to dataframe

In [26]:
locations = []

locations.extend([[ent.text, ent.start, ent.end, ent.label_, spacy.explain(ent.label_)] for ent in doc.ents ])
df_all = pd.DataFrame(locations, columns=['Location', 'start','end', 'label', 'meaning'])
df_all

Unnamed: 0,Location,start,end,label,meaning
0,American,2,3,NORP,Nationalities or religious or political groups
1,CNN,7,8,ORG,"Companies, agencies, institutions, etc."
2,American,19,20,NORP,Nationalities or religious or political groups
3,Purtill,22,23,ORG,"Companies, agencies, institutions, etc."
4,CNN,25,26,ORG,"Companies, agencies, institutions, etc."
...,...,...,...,...,...
142,NewslettersWork,1762,1763,PERSON,"People, including fictional"
143,UsePrivacy PolicyAccessibility &,1768,1771,ORG,"Companies, agencies, institutions, etc."
144,FootageCNN NewsourceSitemap© 2022 Cable News N...,1781,1788,ORG,"Companies, agencies, institutions, etc."
145,Warner Media Company,1790,1793,ORG,"Companies, agencies, institutions, etc."


In [27]:
label_unique = list(df_all['label'].unique())

for l in label_unique:
    print(l)
    print(spacy.explain(l))

NORP
Nationalities or religious or political groups
ORG
Companies, agencies, institutions, etc.
DATE
Absolute or relative dates or periods
GPE
Countries, cities, states
EVENT
Named hurricanes, battles, wars, sports events, etc.
PERSON
People, including fictional
FAC
Buildings, airports, highways, bridges, etc.
LOC
Non-GPE locations, mountain ranges, bodies of water
CARDINAL
Numerals that do not fall under another type
QUANTITY
Measurements, as of weight or distance
TIME
Times smaller than a day
WORK_OF_ART
Titles of books, songs, etc.
ORDINAL
"first", "second", etc.


In [28]:
# Just the locations

loc_codes = ['GPE', 'LOC']

df_places = df_all.loc[df_all['label'].isin(loc_codes)]

df_places

Unnamed: 0,Location,start,end,label,meaning
6,Rome,45,46,GPE,"Countries, cities, states"
11,China,70,71,GPE,"Countries, cities, states"
15,U.S.,189,190,GPE,"Countries, cities, states"
16,California,192,193,GPE,"Countries, cities, states"
18,New England,202,204,LOC,"Non-GPE locations, mountain ranges, bodies of ..."
22,Connecticut,259,260,GPE,"Countries, cities, states"
23,Florida,261,262,GPE,"Countries, cities, states"
28,"""If",329,330,GPE,"Countries, cities, states"
31,U.S.,357,358,GPE,"Countries, cities, states"
35,experienceThe Grand Canyon,400,403,LOC,"Non-GPE locations, mountain ranges, bodies of ..."


### Geocode locations with geopy

In [29]:
locator = geopy.geocoders.Nominatim(user_agent='mygeocoder')
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# geocode = RateLimiter(locator.geocode('Atlanta', addressdetails=True), min_delay_seconds=1)

In [31]:
locations = [ locator.geocode(loc, addressdetails=True) for loc in list(df_places['Location'])]

In [32]:
locations_raw = []
city = []
state = []
country = []

for loc in locations:
    if loc is not None:
        add = loc.raw['address']
        locations_raw.append(add)
        add_parts = list(add.keys())
        
        if 'city' in add_parts:
            city.append(add['city'])
        else:
            city.append('')

        if 'state' in add_parts:
            state.append(add['state'])
        else:
            state.append('')

        if 'county' in add_parts:
            country.append(add['country'])
        else:
            country.append('')

    else:
        locations_raw.append('')
        city.append('')
        state.append('')
        country.append('')

In [33]:
df_places['add_raw'] = locations_raw
df_places['city'] = city
df_places['state'] = state
df_places['country'] = country
df_places.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_places['add_raw'] = locations_raw
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_places['city'] = city
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_places['state'] = state
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value i

Unnamed: 0,Location,start,end,label,meaning,add_raw,city,state,country
111,US,1156,1157,GPE,"Countries, cities, states","{'country': 'United States', 'country_code': '...",,,
112,US,1196,1197,GPE,"Countries, cities, states","{'country': 'United States', 'country_code': '...",,,
128,California,1651,1652,GPE,"Countries, cities, states","{'state': 'California', 'country': 'United Sta...",,California,
129,Washington,1653,1654,GPE,"Countries, cities, states","{'city': 'Washington', 'state': 'District of C...",Washington,District of Columbia,
130,D.C.,1655,1656,GPE,"Countries, cities, states","{'city': 'Washington', 'state': 'District of C...",Washington,District of Columbia,


### Dataframe to input JSON for geonenrichment

In [34]:
locations_raw = []
cities = []
states = []
countries = []

for loc in locations:
    if loc is not None:
        add = loc.raw['address']
        locations_raw.append(add)
        add_parts = list(add.keys())
        
        if 'city' in add_parts:
            cities.append(add['city'])
        else:
            cities.append('')

        if 'state' in add_parts:
            states.append(add['state'])
        else:
            states.append('')

        if 'county' in add_parts:
            countries.append(add['country'])
        else:
            countries.append('')

    else:
        locations_raw.append('')
        cities.append('')
        states.append('')
        countries.append('')

In [35]:
import json

df_places['add_raw'] = locations_raw
df_places['city'] = cities
df_places['state'] = states
df_places['country'] = countries

# write the us cities to json file

df_export = df_places.loc[(df_places['country'] == 'United States') & (df_places['state'] != '') & (df_places['city'] != '')]

out_dict = []

for idx, row in df_export.iterrows():
    out_dict.append(
        {'city': row['city'],
        'state': row['state']}
    )

out_json = r"C:\Projects\HTM_2022\locately\locations.json"

f = open(out_json, "w")
json.dump(out_dict, f)
f.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_places['add_raw'] = locations_raw
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_places['city'] = cities
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_places['state'] = states
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

In [36]:
out_dict

[{'city': 'Collinsville', 'state': 'Illinois'},
 {'city': 'Buffalo', 'state': 'New York'},
 {'city': 'Rapid City', 'state': 'South Dakota'},
 {'city': 'Amarillo', 'state': 'Texas'}]