In [1]:
! conda install -c conda-forge folium --yes
! pip install emoji --upgrade
! pip install geocoder

Solving environment: done

# All requested packages already installed.

Requirement already up-to-date: emoji in /opt/conda/envs/Python36/lib/python3.6/site-packages (0.5.3)


In [2]:
# standard weapons of choice
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelBinarizer, CategoricalEncoder, OneHotEncoder
from sklearn.cluster import KMeans

# webscraping weapons of choice for standard HTMl sites
import requests
from bs4 import BeautifulSoup

# geolocation helper
import geocoder # module to convert an address into latitude and longitude values

# all the viz
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from tabulate import tabulate
import emoji

%matplotlib inline

##  <center>🍁 Toronto Neighborhood Exploration 🍁</center>

#### <center>Let's explore some neighborhoods in Toronto, Ontario Canada, cluster them together, and then find cool stuff in these neighborhoods by using the FourSquare API</center>
Let's get some data in here by scraping what we need from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [3]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup= BeautifulSoup(response.content, 'lxml')
toronto_table = soup.find_all('table')[0]
toronto_df = pd.read_html(str(toronto_table))[0]
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


#### Lets get rid all the postcodes that don't have boroughs assciated wtih them. 📬

In [4]:
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned']
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


#### Let's collapse all of the neighborhoods into their corresponding boroughs and postcode so they appear on in a single row (neighborhoods belong to boroughs belong to postcodes)

In [5]:
toronto_df = toronto_df.groupby(['Postcode','Borough'], sort=False).agg(lambda x: ', '.join(x))
toronto_df.reset_index(level=['Postcode', 'Borough'], inplace=True)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned


#### Some of the neighborhoods are not assigned.  If that is the case, let's add the name of its borough as the name of the neighborhood.

In [6]:
toronto_df['Neighbourhood'].replace('Not assigned', toronto_df['Borough'], inplace=True)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


#### Our final table now looks like this:

In [7]:
print(tabulate(toronto_df, headers=toronto_df.columns, tablefmt='fancy grid'))

     Postcode    Borough           Neighbourhood
---  ----------  ----------------  --------------------------------------------------------------------------------------------------------------------------------------
  0  M3A         North York        Parkwoods
  1  M4A         North York        Victoria Village
  2  M5A         Downtown Toronto  Harbourfront, Regent Park
  3  M6A         North York        Lawrence Heights, Lawrence Manor
  4  M7A         Queen's Park      Queen's Park
  5  M9A         Etobicoke         Islington Avenue
  6  M1B         Scarborough       Rouge, Malvern
  7  M3B         North York        Don Mills North
  8  M4B         East York         Woodbine Gardens, Parkview Hill
  9  M5B         Downtown Toronto  Ryerson, Garden District
 10  M6B         North York        Glencairn
 11  M9B         Etobicoke         Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
 12  M1C         Scarborough       Highland Creek, Rouge Hill, Port Union
 1

In [8]:
toronto_df.shape

(103, 3)

#### Now that everything is cleaned up, let's get the longitude and latitude values associated with each postcode 🌐

In [9]:
for i, postcode in enumerate(toronto_df['Postcode']):
    address = f'{postcode}, Toronto, Ontario'
    location = geocoder.arcgis(address)
    toronto_df.at[i, 'Latitude'] = location.latlng[0]
    toronto_df.at[i, 'Longitude'] = location.latlng[1]

In [10]:
toronto_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75244,-79.329271
1,M4A,North York,Victoria Village,43.730421,-79.31332
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65512,-79.36264
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.723125,-79.451589
4,M7A,Queen's Park,Queen's Park,43.661102,-79.391035
5,M9A,Etobicoke,Islington Avenue,43.662242,-79.528379
6,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
7,M3B,North York,Don Mills North,43.749195,-79.361905
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.707535,-79.311773
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657363,-79.37818


#### Great!  Now that we have some lat, long values, let's see how these neighborhoods cluster together. 🏙️