# Part 1 - Import Toronto Neighborhood Data to Pandas Dataframe

**Import pandas and data from url into data frame**

In [3]:
import pandas as pd

In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
df = pd.read_html(url,header=0)
df

[    Postal Code           Borough  \
 0           M1A      Not assigned   
 1           M2A      Not assigned   
 2           M3A        North York   
 3           M4A        North York   
 4           M5A  Downtown Toronto   
 5           M6A        North York   
 6           M7A  Downtown Toronto   
 7           M8A      Not assigned   
 8           M9A         Etobicoke   
 9           M1B       Scarborough   
 10          M2B      Not assigned   
 11          M3B        North York   
 12          M4B         East York   
 13          M5B  Downtown Toronto   
 14          M6B        North York   
 15          M7B      Not assigned   
 16          M8B      Not assigned   
 17          M9B         Etobicoke   
 18          M1C       Scarborough   
 19          M2C      Not assigned   
 20          M3C        North York   
 21          M4C         East York   
 22          M5C  Downtown Toronto   
 23          M6C              York   
 24          M7C      Not assigned   
 25         

In [6]:
df_tor = df[0]
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


**Remove/drop all rows where Borough is Not Assigned**

In [7]:
#Get all boroughs that have values "Not assigned"
NA_Borough = df_tor[df_tor.Borough == 'Not assigned']
NA_Borough.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
7,M8A,Not assigned,Not assigned
10,M2B,Not assigned,Not assigned
15,M7B,Not assigned,Not assigned


In [8]:
df_tor = df_tor.drop(df_tor[df_tor.Borough == 'Not assigned'].index)
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
#Check for any neighborhoods that are 'Not assigned'
NA_Neigh = df_tor[df_tor.Neighborhood.str.contains('Not assigned')]
NA_Neigh

Unnamed: 0,Postal Code,Borough,Neighborhood


In [10]:
df_tor.shape

(103, 3)

# Part 2 - Getting Coordinates for all Neighborhoods

In [11]:
import numpy as np

In [12]:
#Create new columns for the dataframe containing NaN values
df_tor["Latitude"] = np.nan
df_tor["Longitude"] = np.nan
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,,
3,M4A,North York,Victoria Village,,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


In [13]:
!wget -q -O 'Geospatial_Coordinates.csv' https://cocl.us/Geospatial_data
print('Data downloaded!')

Data downloaded!


In [14]:
df_coord = pd.read_csv ('Geospatial_Coordinates.csv')
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
df_tor['Latitude'] = df_tor['Postal Code'].map(df_coord.set_index('Postal Code')['Latitude'])
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,
3,M4A,North York,Victoria Village,43.725882,
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,


In [16]:
df_tor['Longitude'] = df_tor['Postal Code'].map(df_coord.set_index('Postal Code')['Longitude'])
df_tor

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
11,M3B,North York,Don Mills,43.745906,-79.352188
12,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


# Part 3 - Exploring Neighborhoods in Toronto

**Install geopy, follium and KMeans Clustering**

In [17]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    geopy-2.0.0                |     pyh9f0ad1d_0          63 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0          conda-forge
    geopy:           

In [18]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    branca-0.4.1               |             py_0          26 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         713 KB

The following NEW packages will be INSTALLED:

    altair:  4.1.0-py_1 conda-forge
    branca:  0.4.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Downloading and Extracting Packages
branca-0.4.1         | 26 KB     | #####

In [19]:
from sklearn.cluster import KMeans

In [41]:
import matplotlib.cm as cm
import matplotlib.colors as colors

**Cluster only boroughs that contain the word Toronto**

In [20]:
# Get coordinates for Toronto
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [48]:
# Extract boroughs containing the word Toronto
df_bor_tor = df_tor[df_tor.Borough.str.contains('Toronto')]
df_bor_tor = df_bor_tor.drop('Postal Code', 1)
df_bor_tor.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
4,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
13,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
22,Downtown Toronto,St. James Town,43.651494,-79.375418
30,East Toronto,The Beaches,43.676357,-79.293031


In [45]:
print('There are {} uniques boroughs.'.format(len(df_bor_tor['Borough'].unique())))

There are 4 uniques boroughs.


In [49]:
# drop string columns - get only lat and long values for clustering
lat_long_df = df_bor_tor[['Latitude','Longitude']]
lat_long_df.head()

Unnamed: 0,Latitude,Longitude
4,43.65426,-79.360636
6,43.662301,-79.389494
13,43.657162,-79.378937
22,43.651494,-79.375418
30,43.676357,-79.293031


In [70]:
kclusters = 4

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state = 0).fit(lat_long_df)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 3, 1, 1, 2, 1, 2], dtype=int32)

In [71]:
# if column 'Cluster Labels' already exists
df_bor_tor = df_bor_tor.drop('Cluster Labels', 1)
df_bor_tor.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
4,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
13,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
22,Downtown Toronto,St. James Town,43.651494,-79.375418
30,East Toronto,The Beaches,43.676357,-79.293031


In [72]:
# add clustering labels
df_bor_tor.insert(0, 'Cluster Labels', kmeans.labels_)

df_bor_tor.head()

Unnamed: 0,Cluster Labels,Borough,Neighborhood,Latitude,Longitude
4,1,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
13,1,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
22,1,Downtown Toronto,St. James Town,43.651494,-79.375418
30,3,East Toronto,The Beaches,43.676357,-79.293031


In [85]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)



# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 0.7, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_bor_tor['Latitude'], df_bor_tor['Longitude'], df_bor_tor['Neighborhood'], df_bor_tor['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=8,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.8).add_to(map_clusters)
       
map_clusters