# Segmenting and Clustering Toronto Neighborhoods

## Start of Week 3 assignment ##
### -- *start of submission 1 for week 3* -- prep the data

### Import Libraries

In [2]:
#Coursera example: https://www.coursera.org/learn/applied-data-science-capstone/peer/I1bDq/segmenting-and-clustering-neighborhoods-in-toronto
#example from https://labs.cognitiveclass.ai/tools/jupyterlab/lab/tree/labs/DP0701EN/DP0701EN-3-3-2-Neighborhoods-New-York-py-v1.0.ipynb

#Import libraries
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

!conda install -c conda-forge lxml --yes
import lxml   # web table reading library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ca-certificates-2020.4.5.1 |       hecc5488_0         146 KB  conda-forge
    certifi-2020.4.5.1         |   py36h9f0ad1d_0         151 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1

### Download and Explore Dataset

In [3]:
# Scrape the Toronto postal-code listing (Postal Code, Borough, Neighborhood)
# from Wikipedia. pd.read_html returns a list with one DataFrame per HTML
# table found on the page; header=0 uses each table's first row as its header.
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table = pd.read_html(io=URL, header=0)

In [4]:
# The first table scraped from the page is the postal-code listing.
# Work on a copy: the following cells rename and drop rows in place, and
# without the copy those edits would also modify the scraped `table[0]`.
df_tor = table[0].copy()
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
# Inspect the rows whose borough text contains 'Not assigned'.
unassigned_mask = df_tor['Borough'].str.contains('Not assigned')
df_tor.loc[unassigned_mask]

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
7,M8A,Not assigned,
10,M2B,Not assigned,
15,M7B,Not assigned,
16,M8B,Not assigned,
19,M2C,Not assigned,
24,M7C,Not assigned,
25,M8C,Not assigned,
28,M2E,Not assigned,


### Transform the data

In [26]:
# Target schema: PostalCode, Borough, Neighborhood.
# Both historical spellings of the Wikipedia headers are mapped; rename()
# skips keys that are not present, so a mapping entry whose source column
# is absent from the scraped table is simply a no-op.
column_renames = {"Postal Code": "PostalCode", "Neighbourhood": "Neighborhood"}
df_tor.rename(columns=column_renames, inplace=True)

In [27]:
# Keep only postal codes that have a borough: look up the index labels of the
# rows whose Borough is exactly 'Not assigned' and drop them in place.
unassigned_rows = df_tor.index[df_tor['Borough'] == 'Not assigned']
df_tor.drop(index=unassigned_rows, inplace=True)
df_tor.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [28]:
# If a row has a borough but its neighborhood is missing (NaN) or the literal
# 'Not assigned', the neighborhood becomes the borough name. Doing this before
# the join also keeps ','.join from failing on non-string NaN values.
neighborhood_missing = df_tor['Neighborhood'].isna() | df_tor['Neighborhood'].eq('Not assigned')

# Combine the neighborhoods that exist in one postal code: one row per
# (PostalCode, Borough), neighborhood names joined with commas.
df_tor = (
    df_tor
    .assign(Neighborhood=df_tor['Neighborhood'].mask(neighborhood_missing, df_tor['Borough']))
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(','.join)
    .reset_index()
)

# Manual override kept from the original cell, which wrote to row label 85.
# It is keyed by postal code here so that it cannot hit the wrong row (or
# silently append a new row) if the scraped table changes size.
# NOTE(review): M7A is presumably the row that label 85 referred to -- confirm
# against the scraped table.
df_tor.loc[df_tor['PostalCode'] == 'M7A', 'Neighborhood'] = 'Queen\'s Park'

# Assignment requirement: report the dataframe's shape via .shape.
print (df_tor.shape)
df_tor.head()

(103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### -- *end of submission 1 for week 3* --
### -- *start of submission 2 for week 3* -- Add lat-long to allow mapping

In [29]:
# Foursquare queries need a latitude/longitude for every neighborhood.
# The course-provided CSV at http://cocl.us/Geospatial_data lists the
# geographical coordinates of each postal code; load it into a dataframe.
GEOSPATIAL_CSV_URL = "http://cocl.us/Geospatial_data"
df_latlong = pd.read_csv(GEOSPATIAL_CSV_URL)
df_latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Align the key column's name with df_tor ("PostalCode") ahead of the merge.
latlong_renames = {"Postal Code": "PostalCode"}
df_latlong.rename(columns=latlong_renames, inplace=True)
df_latlong.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [31]:
# Show the last five rows of the coordinate table.
df_latlong.tail()

Unnamed: 0,PostalCode,Latitude,Longitude
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437
102,M9W,43.706748,-79.594054


In [32]:
# (rows, columns) of the coordinate table, for comparison with df_tor's shape.
df_latlong.shape

(103, 3)

In [33]:
# Join the coordinates onto the neighborhood table using the shared
# "PostalCode" column (default inner join).
# The earlier set_index("PostalCode") calls were removed: set_index returns a
# new frame and the results were discarded, so they never affected the merge.
# Naming the key explicitly keeps the join from silently switching to a
# different key if the two frames ever share another column name.
neighbor = pd.merge(df_tor, df_latlong, on="PostalCode")
neighbor.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [34]:
# Count distinct borough values and the number of rows in the merged table.
# Note: the figure reported as "neighborhoods" is the row count (one row per
# postal code), not a count of individual neighborhood names.
borough_count = len(neighbor['Borough'].unique())
row_count = neighbor.shape[0]
print('Toronto has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

Toronto has 10 boroughs and 103 neighborhoods.


### -- *end submission 2 for week 3* --
### -- *start submission 3 for week 3* -- Explore the data

In [35]:
#Use geopy library to get the latitude and longitude values of Toronto, Canada
address = 'Toronto, CA'

# Nominatim must be given an application-specific user_agent: geopy 1.x emits
# a deprecation warning when it is omitted and geopy 2.x raises an error.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto, Canada are 43.6534817, -79.3839347.


In [56]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of `neighbor`; the popup reads "Borough, Neighborhood"
# (macro first, then micro -- e.g. "York, Caledonia-Fairbanks").
marker_rows = zip(
    neighbor['Latitude'],
    neighbor['Longitude'],
    neighbor['Borough'],
    neighbor['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(borough, neighborhood)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#FF0000',
        fill_opacity=0.6,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Exploring: mapping individual boroughs

In [55]:
# List the borough names with, for each one, the number of non-null entries
# in every other column (i.e. how many postal-code rows fall in that borough).
df_tor_borough = neighbor.groupby('Borough').count()
df_tor_borough.head()

Unnamed: 0_level_0,PostalCode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12


In [63]:
# For illustration, look at Etobicoke (the author's home borough): keep only
# its rows and renumber them from 0.
is_etobicoke = neighbor['Borough'] == 'Etobicoke'
data_etobicoke = neighbor.loc[is_etobicoke].reset_index(drop=True)
data_etobicoke.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores",43.605647,-79.501321
1,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
2,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
3,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
4,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999


In [65]:
# (rows, columns) of the Etobicoke subset.
data_etobicoke.shape

(12, 5)

In [66]:
# Look up the coordinates used to centre the Etobicoke map.
address = 'Etobicoke, Toronto'

# Nominatim must be given an application-specific user_agent: geopy 1.x emits
# a deprecation warning when it is omitted and geopy 2.x raises an error.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Etobicoke, ON are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Etobicoke, ON are 43.671459150000004, -79.55249206611668.


In [67]:
# Base map centred on the geocoded Etobicoke coordinates.
map_etobicoke = folium.Map(location=[latitude, longitude], zoom_start=12)

# One circle marker per Etobicoke row; the popup shows the neighborhood text.
etobicoke_rows = zip(
    data_etobicoke['Latitude'],
    data_etobicoke['Longitude'],
    data_etobicoke['Neighborhood'],
)
for lat, lng, neighborhood_name in etobicoke_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighborhood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_etobicoke)

map_etobicoke

In [68]:
# Scarborough is the second borough of interest (the author's sister lives
# there): keep only its rows and renumber them from 0.
is_scarborough = neighbor['Borough'] == 'Scarborough'
data_scarborough = neighbor.loc[is_scarborough].reset_index(drop=True)
data_scarborough.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [69]:
# Look up the coordinates used to centre the Scarborough map.
address = 'Scarborough, Toronto'

# Nominatim must be given an application-specific user_agent: geopy 1.x emits
# a deprecation warning when it is omitted and geopy 2.x raises an error.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Scarborough, ON are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Scarborough, ON are 43.773077, -79.257774.


In [72]:
# A map object centred on the geocoded Scarborough coordinates.
# NOTE(review): no markers are added to this object below; the Scarborough
# markers are overlaid on the existing Etobicoke map instead, so
# map_scarborough stays empty unless it is used elsewhere -- confirm intent.
map_scarborough = folium.Map(location=[latitude, longitude], zoom_start=11)

# Overlay the Scarborough neighborhoods (red/pink markers) on the previously
# generated Etobicoke map so both boroughs appear together.
# NOTE(review): this mutates map_etobicoke, so re-running the cell adds the
# same markers again.
scarborough_rows = zip(
    data_scarborough['Latitude'],
    data_scarborough['Longitude'],
    data_scarborough['Neighborhood'],
)
for lat, lng, neighborhood_name in scarborough_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighborhood_name, parse_html=True),
        color='red',
        fill=True,
        fill_color='#FFB6C1',
        fill_opacity=0.6,
        parse_html=False,
    )
    marker.add_to(map_etobicoke)

# Display the combined map.
map_etobicoke

### --*end submission 3 week 3.*--