# Data Science Capstone

## Comparing Memphis and Portland's access to fitness facilities, and food venues


In [28]:
!conda install -c conda-forge geopy --yes        # if needed
!conda install -c conda-forge folium=0.5.0 --yes # if needed

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

   

In [29]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values



import urllib.request
from bs4 import BeautifulSoup
import requests


from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


# Scraping Wikipedia for Memphis and Portland Neighbourhoods

## Memphis Neighbourhoods

In [5]:
url = 'https://raw.githubusercontent.com/Daylen-Mackey/Data_Science_Capstone/master/MemphisNeighbourhoods.csv'
mNeigh = pd.read_csv(url)
mNeigh.head(20)

Unnamed: 0,Neighbourhoods
0,Central Business District
1,Edge District
2,Harbor Town
3,Linden
4,Medical District
5,Pinch District
6,South Forum
7,South Main Arts District
8,Speedway Terrace
9,Uptown/Greenlaw


### Some Memphis Neighbourhoods have a '/' in them. Let's split that -- duplicates can be filtered out later

### *Before*


In [6]:
#before
mNeigh.tail()

Unnamed: 0,Neighbourhoods
102,Fairgrounds
103,Glenview
104,Lamar Avenue
105,Poplar Avenue
106,Union Extended


In [7]:
replicates = mNeigh[mNeigh.Neighbourhoods.str.contains('/')]
replicates

Unnamed: 0,Neighbourhoods
9,Uptown/Greenlaw
32,Balmoral/Quince
34,Cherry/Willow
37,Galloway Gardens/Walnut Grove


In [8]:
replicates['Neighbourhoods'] = replicates['Neighbourhoods'].apply(lambda x: x.split('/')[1]) 
replicates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,Neighbourhoods
9,Greenlaw
32,Quince
34,Willow
37,Walnut Grove


### Remove the repllicates and append to bottom of data frame

In [9]:
#Keep what's on the left side of the '/'
mNeigh['Neighbourhoods'] = mNeigh['Neighbourhoods'].apply(lambda x: x.split('/')[0]) 


In [10]:
copy = mNeigh
mCopy = pd.concat([copy, replicates])


### *After*


In [11]:

mCopy.reset_index(drop = True, inplace = True)
mCopy.tail(15)

Unnamed: 0,Neighbourhoods
96,Magnolia
97,Oakhaven
98,Orange Mound
99,Parkway Village
100,Riverdale
101,Southwind
102,Fairgrounds
103,Glenview
104,Lamar Avenue
105,Poplar Avenue


In [12]:
mCopy['Name'] = mCopy['Neighbourhoods'] + ', Memphis, Tennessee'
mCopy.tail()

Unnamed: 0,Neighbourhoods,Name
106,Union Extended,"Union Extended, Memphis, Tennessee"
107,Greenlaw,"Greenlaw, Memphis, Tennessee"
108,Quince,"Quince, Memphis, Tennessee"
109,Willow,"Willow, Memphis, Tennessee"
110,Walnut Grove,"Walnut Grove, Memphis, Tennessee"


In [13]:
# Meanial Data Cleaning 
mCopy.loc[mCopy['Neighbourhoods'] == 'University Street Neighborhood','Neighbourhoods'] = 'University Street'


In [14]:
geolocator = Nominatim(user_agent="ny_explorer")

mLat = []
mLong = []

for city in mCopy['Name']:
    location = geolocator.geocode(city)
    try:
        mLat.append(location.latitude)
        mLong.append(location.longitude)
    except:
        mLat.append(np.nan)
        mLong.append(np.nan)
    #pLong = pLong.append(location.latitude)

In [15]:
mCopy['Latitude'] = mLat
mCopy['Longitude'] = mLong


## Portland Neighbourhoods

In [16]:
url = 'https://raw.githubusercontent.com/Daylen-Mackey/Data_Science_Capstone/master/PortlandNeighbourhoods.csv'
pNeigh = pd.read_csv(url)
pNeigh.head(15)

Unnamed: 0,Neighbourhoods
0,Arlington Heights
1,Forest Park
2,Goose Hollow
3,Hillside
4,Linnton
5,"Northwest District (includes Uptown, Nob Hill,..."
6,Northwest Heights
7,Northwest Industrial
8,Old Town Chinatown
9,Pearl District


### Portland Dataframe contains (includes...) many times -- need to remove

In [17]:
pCopy = pNeigh
pCopy['Neighbourhoods'] = pCopy['Neighbourhoods'].apply(lambda x: x.split('(')[0])
pCopy.head(20)

Unnamed: 0,Neighbourhoods
0,Arlington Heights
1,Forest Park
2,Goose Hollow
3,Hillside
4,Linnton
5,Northwest District
6,Northwest Heights
7,Northwest Industrial
8,Old Town Chinatown
9,Pearl District


In [18]:
pCopy['Name'] = pCopy['Neighbourhoods'] + ', Portland, Oregon'
pCopy.size

190

In [20]:
geolocator = Nominatim(user_agent="ny_explorer")

pLat = []
pLong = []

for city in pCopy.Name:
    location = geolocator.geocode(city)
    try:
            pLat.append(location.latitude)
            pLong.append(location.longitude)
    except:
        pLat.append(np.nan)
        pLong.append(np.nan)
    #pLong = pLong.append(location.latitude)
    

In [21]:
pCopy['Latitude'] = pLat
pCopy['Longitude'] = pLong
pCopy

Unnamed: 0,Neighbourhoods,Name,Latitude,Longitude
0,Arlington Heights,"Arlington Heights, Portland, Oregon",45.519496,-122.710667
1,Forest Park,"Forest Park, Portland, Oregon",45.561468,-122.758581
2,Goose Hollow,"Goose Hollow, Portland, Oregon",45.517749,-122.692819
3,Hillside,"Hillside, Portland, Oregon",45.527439,-122.71312
4,Linnton,"Linnton, Portland, Oregon",45.60033,-122.786779
5,Northwest District,"Northwest District , Portland, Oregon",45.533013,-122.698845
6,Northwest Heights,"Northwest Heights, Portland, Oregon",45.540806,-122.774354
7,Northwest Industrial,"Northwest Industrial, Portland, Oregon",,
8,Old Town Chinatown,"Old Town Chinatown, Portland, Oregon",45.524934,-122.673516
9,Pearl District,"Pearl District, Portland, Oregon",45.529044,-122.681598


## We now have dataframes with Memphis and Portland' neighbourhood coordinates. Let's remove NaN values, and proceed

In [22]:
pLoc = pCopy.dropna()
pLoc

Unnamed: 0,Neighbourhoods,Name,Latitude,Longitude
0,Arlington Heights,"Arlington Heights, Portland, Oregon",45.519496,-122.710667
1,Forest Park,"Forest Park, Portland, Oregon",45.561468,-122.758581
2,Goose Hollow,"Goose Hollow, Portland, Oregon",45.517749,-122.692819
3,Hillside,"Hillside, Portland, Oregon",45.527439,-122.71312
4,Linnton,"Linnton, Portland, Oregon",45.60033,-122.786779
5,Northwest District,"Northwest District , Portland, Oregon",45.533013,-122.698845
6,Northwest Heights,"Northwest Heights, Portland, Oregon",45.540806,-122.774354
8,Old Town Chinatown,"Old Town Chinatown, Portland, Oregon",45.524934,-122.673516
9,Pearl District,"Pearl District, Portland, Oregon",45.529044,-122.681598
10,Portland Downtown,"Portland Downtown, Portland, Oregon",45.515274,-122.680025


In [23]:
mLoc = mCopy.dropna()
mLoc

Unnamed: 0,Neighbourhoods,Name,Latitude,Longitude
1,Edge District,"Edge District, Memphis, Tennessee",35.159758,-90.05426
2,Harbor Town,"Harbor Town, Memphis, Tennessee",35.163899,-90.053486
3,Linden,"Linden, Memphis, Tennessee",35.134818,-90.027293
4,Medical District,"Medical District, Memphis, Tennessee",35.141978,-90.030331
8,Speedway Terrace,"Speedway Terrace, Memphis, Tennessee",35.155645,-90.019813
9,Uptown,"Uptown, Memphis, Tennessee",35.150906,-90.044815
11,Warehouse District,"Warehouse District, Memphis, Tennessee",35.137284,-90.060127
12,Winchester Park,"Winchester Park, Memphis, Tennessee",35.152504,-90.036226
13,Annesdale,"Annesdale, Memphis, Tennessee",35.134008,-90.062926
14,Belleair,"Belleair, Memphis, Tennessee",35.141008,-89.993866


### Hidden Cell -- Connecting to FourSquare API

In [24]:
# The code was removed by Watson Studio for sharing.

### Finding the Coordinates of Portland, Oregon and Memphis, Tennessee

In [26]:
city = 'Portland, Oregon'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(city)
portland_latitude = location.latitude
portland_longitude = location.longitude

city = 'Memphis, Tennessee'

location = geolocator.geocode(city)
memphis_latitude = location.latitude
memphis_longitude = location.longitude


print('The geograpical coordinate of Portland are {}, {}.'.format(portland_latitude, portland_longitude))
print('The geograpical coordinate of Memphis are {}, {}.'.format(memphis_latitude, memphis_longitude))

The geograpical coordinate of Portland are 45.5202471, -122.6741949.
The geograpical coordinate of Memphis are 35.1490215, -90.0516285.


# Creating Maps of Portland and Memphis Neighbourhoods

## Portland

In [33]:
map_portland = folium.Map(location=[portland_latitude, portland_longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(pLoc['Latitude'], pLoc['Longitude'], pLoc['Neighbourhoods']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_portland)  
map_portland

In [35]:
map_memphis = folium.Map(location=[memphis_latitude, memphis_longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(mLoc['Latitude'], mLoc['Longitude'], mLoc['Neighbourhoods']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_memphis)  
map_memphis

## Relevant Forsquare Categories
Gym / Fitness Center
4bf58dd8d48988d175941735

Grocery Store
4bf58dd8d48988d118951735

Health Food Store
50aa9e744b90af0d42d5de0e

Fast Food Restaurant
4bf58dd8d48988d16e941735


In [None]:
radius = 50
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, portland_latitude, portland_longitude, VERSION, radius, LIMIT)


In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=50):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
results = requests.get(url).json()["response"]['groups'][0]['items']
venues_list = []
venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

NameError: name 'name' is not defined

In [None]:
results
