In [1]:
# do any pip installs in this cell

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import json
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Image
import folium

In [9]:
# Flag telling later cells whether a regular file named apidata.json already
# sits in the working directory (checked once, when this cell runs).
apidata_path = 'apidata.json'
APIDATA_EXISTS = os.path.isfile(apidata_path)

print(APIDATA_EXISTS)

True


In [8]:
# pull addresses
# Downloads the article page and parses it into `soup` for the next cell.

url = 'https://www.wrtv.com/news/local-news/crime/people-weve-lost-these-are-the-indianapolis-homicide-victims-of-2022'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

# NOTE(review): no parser is named, so bs4 picks the best one installed and the
# resulting tree can differ between machines. Pin one (e.g. 'html.parser') once
# it is confirmed to give the same parse as the run recorded in this notebook.
soup = BeautifulSoup(html)

In [11]:
# parse addresses

# Only <p> tags with more than 10 child nodes are kept
# (presumably these are the per-victim entries -- TODO confirm against the page).
paragraphs = [tag for tag in soup.find_all('p') if len(tag.contents) > 10]

# The address is read from child 6 of each paragraph, unless that child's text
# begins with the "What happened:" label, in which case child 4 is used instead.
addresses = []
for entry in paragraphs:
    sixth_text = entry.contents[6].text
    if sixth_text.startswith('What happened:'):
        addresses.append(entry.contents[4].text)
    else:
        addresses.append(sixth_text)

# Entries explicitly marked as having no known location are dropped.
addresses = [addr for addr in addresses if addr != 'Unknown location']

# as a sanity check, the below should not contain entries like "What happened:", only addresses without digits
bad_addr = [addr for addr in addresses if not any(ch.isdigit() for ch in addr)]

print(len(addresses)) # Should be 216 per the article, but is actually 215 because of one unknown address

215


In [7]:
# convert addresses to longitude/latitude
# if APIDATA_EXISTS is False, query the API; otherwise, read from apidata.json
#
# Result: `apidata`, a list with one Geocoding API response dict per address.

if not APIDATA_EXISTS:
    # The key is read from the environment so that no credential (or
    # placeholder that silently fails) lives in the notebook itself.
    API_KEY = os.environ.get('GOOGLE_MAPS_API_KEY') # available on Google Cloud
    if not API_KEY:
        raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable before geocoding')

    # Separate name so the article `url` from the fetch cell is not overwritten.
    geocode_url = 'https://maps.googleapis.com/maps/api/geocode/json'
    apidata = []

    for index, street_address in enumerate(addresses):
        # `params` lets requests URL-encode the address and key.
        resp = requests.get(
            geocode_url,
            params={'address': street_address + ', Indiana', 'key': API_KEY},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()

        # The API reports its outcome in the `status` field of the JSON body.
        # Anything other than a hit or a clean miss (bad key, quota, ...) must
        # not be written to the cache, or every later run would load bad data.
        status = payload.get('status')
        if status not in ('OK', 'ZERO_RESULTS'):
            raise RuntimeError(f'Geocoding failed for {street_address!r}: {status}')

        apidata.append(payload)

        # Throttle: pause after every tenth address (indexes 0, 10, 20, ...).
        if index % 10 == 0:
            print(f'Finished Address {index + 1}/{len(addresses)}; Waiting 1 Second')
            time.sleep(1)
    print(f'Finished Address {len(addresses)}/{len(addresses)}; Backing up to apidata.json')

    with open('apidata.json', 'w') as f:
        json.dump(apidata, f, indent=4) # backs up results of maps api to apidata.json

else:
    with open('apidata.json') as f:
        apidata = json.load(f)
    print('apidata.json loaded into memory')

apidata.json loaded into memory


In [12]:
# extracts coordinates from apidata into geolocs
# geolocs becomes a list of (lat, lng) tuples, one per response that has a result.

geolocs = []
n_unresolved = 0

for resp in apidata:
    # A response may carry an empty `results` list (e.g. ZERO_RESULTS);
    # indexing [0] on it would raise IndexError, so such entries are skipped.
    results = resp.get('results') or []
    if not results:
        n_unresolved += 1
        continue

    # Only the first candidate returned for each address is used.
    coords = results[0]['geometry']['location']
    geolocs.append((coords['lat'], coords['lng']))

if n_unresolved:
    print(f'Skipped {n_unresolved} response(s) with no geocoding results')

In [16]:
# plots coordinates on a map

# The map is centred on the first geocoded point.
first_lat = geolocs[0][0]
first_lng = geolocs[0][1]
homicide_map = folium.Map((first_lat, first_lng), zoom_start=13)

# One marker per coordinate pair, attached directly to the map.
for lat_lng in geolocs:
    homicide_map.add_child(folium.Marker([lat_lng[0], lat_lng[1]]))  # latitude, longitude

# Last expression of the cell, so the notebook displays the map.
homicide_map