# Applied Data Science Capstone Week 3
# Segmenting and Clustering Toronto Neighborhoods
## Richard C. Anderson

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Scrape postal code data from Wikipedia page:

In [2]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
# Fail here on an HTTP error instead of trying to parse an error page below.
req.raise_for_status()

soup = BeautifulSoup(req.content,'lxml')
# The postal-code table is taken to be the first <table> element on the page.
table = soup.find_all('table')[0]
# pandas deprecated passing literal HTML strings to read_html, so the markup
# is wrapped in a file-like object.
df_list = pd.read_html(StringIO(str(table)))

#### Create dataframe from scraped data:

In [3]:
# Use the first table parsed by read_html as the working neighborhood frame.
df_hoods = pd.DataFrame(data=df_list[0])
# Preview the first five rows.
df_hoods.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Report (rows, columns) of the scraped table before any filtering.
print('Shape of raw dataframe:', df_hoods.shape)

Shape of raw dataframe: (180, 3)


#### Drop rows with no Borough assignment from dataframe:

In [5]:
# Remove rows whose Borough is "Not assigned" and renumber the index from 0.
# Built as a non-mutating chain (no inplace=True) so the cell's result depends
# only on its input; reset_index(drop=True) discards the old index directly
# instead of materialising it as an 'index' column and dropping it afterwards.
indices = df_hoods[df_hoods["Borough"] == "Not assigned"].index
df_hoods = df_hoods.drop(indices).reset_index(drop=True)

# Sanity check: no unassigned borough may survive the filter.
assert not df_hoods["Borough"].eq("Not assigned").any()
df_hoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Report (rows, columns) after the "Not assigned" boroughs were removed.
print('Shape of filtered dataframe:', df_hoods.shape)

Shape of filtered dataframe: (103, 3)


### Add Geo-location to the Toronto neighborhood data

In [7]:
!pip install geocoder
import geocoder as gc

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 8.1MB/s ta 0:00:011
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [8]:
def get_geoloc_for_postalcode(pcode, max_attempts=10):
    """Look up the latitude and longitude of a Toronto postal code.

    Sends the query ``'<pcode>, Toronto, Ontario'`` to ``gc.google`` (the
    ``geocoder`` package imported as ``gc``) and retries until a response
    carries coordinates or the attempt budget is used up.

    Parameters
    ----------
    pcode : str
        Postal code to geocode, e.g. ``'M5A'``.
    max_attempts : int, optional
        Maximum number of requests to send before giving up. The bound
        exists because a request that is always denied would otherwise
        keep this function looping forever.

    Returns
    -------
    tuple
        ``(lat, long)`` taken from the first response whose ``latlng`` is
        not ``None``.

    Raises
    ------
    RuntimeError
        If no response carries coordinates within ``max_attempts`` requests.
    """
    print('PCode:',pcode)
    query = '{}, Toronto, Ontario'.format(pcode)

    # Retry a bounded number of times until coordinates are returned.
    for _ in range(max_attempts):
        g = gc.google(query)
        print(g)
        lat_lng_found = g.latlng
        print('Found:',lat_lng_found)

        if lat_lng_found is not None:
            # Unpack from the value that was actually fetched; reading an
            # undefined name here would raise NameError on every success.
            lat = lat_lng_found[0]
            long = lat_lng_found[1]
            return lat,long

    raise RuntimeError(
        'No coordinates returned for postal code {!r} after {} attempt(s)'.format(
            pcode, max_attempts
        )
    )

#### Unfortunately, the geocoder call never returned coordinates; every request came back with a [Request Denied] message

In [9]:
# Smoke test for get_geoloc_for_postalcode, left disabled because the geocoder
# requests were denied when this notebook was run.
#test_lat,test_long = get_geoloc_for_postalcode('M5A')
#print('Lat:',test_lat,' Long:',test_long)

#### Importing the geospatial CSV file instead...

In [10]:
# Load the latitude/longitude table for each postal code from the hosted CSV.
df_geoloc = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
# Preview the first five rows.
df_geoloc.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging the geolocation dataframe with the neighborhood dataframe

In [11]:
# Inner-join the neighborhoods with their coordinates on the shared
# 'Postal Code' column (pandas' default join type).
df_merged = df_hoods.merge(df_geoloc, how='inner', on='Postal Code')
# Preview the first five rows.
df_merged.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [12]:
# Report (rows, columns) of the merged neighborhood + coordinates frame.
print('Shape of merged dataframe:', df_merged.shape)

Shape of merged dataframe: (103, 5)
