## Toronto Neighborhoods 

### Question 1 (Identify Toronto Neighborhoods)

In [1]:
# Import the libraries for web scraping 
import numpy as np 
import pandas as pd 
from bs4 import BeautifulSoup
import requests 

In [2]:
# Neighborhoods in Toronto from Wikipedia 
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
html = requests.get(url).text

In [4]:
soup = BeautifulSoup(html, 'html5lib')

In [5]:
tables = soup.find_all('table')

In [6]:
# Using beautifulsoup to scrape the data 
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
df1=pd.DataFrame(table_contents)
df1['Borough']=df1['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})


In [7]:
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [8]:
# Question 1 Answer 
# First 13 rows of the dataframe 
df1.iloc[0:12]

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
df1.shape

(103, 3)

### Question 2 (Identify the Neighborhoods Latitude and Longitude

In [10]:
# Extract the data from the downloaded csv file 
df = pd.read_csv('C:/Users/Chia/Downloads/Geospatial_Coordinates.csv')

In [11]:
# Rename the column name of Postal Code to PostalCode 
df.rename(columns={'Postal Code': 'PostalCode'}, inplace = True)

In [12]:
df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Merge the dataframe created previously with the newly extracted
# Each neighborhood will have latitude and longitude in the new dataframe 
Toronto_df = pd.merge(df1, df, on=['PostalCode'])

In [14]:
# Question 2 Answer 
Toronto_df.iloc[0:12]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [15]:
Toronto_df.dtypes

PostalCode       object
Borough          object
Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object

To check whether the postal code is correspond to the correct latitude and longitude. The dataframe shown the in the instruction shows that: 

M5A (Downtown Toronto) with latitude of 43.654260 and longitude of -70.360636
M4B (East York) latitude of 43.706397 and longitude of -79.309937 


In [16]:
Toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [17]:
M5A = Toronto_df[Toronto_df['PostalCode'] == 'M5A']
M5A

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636


In [18]:
M4B = Toronto_df[Toronto_df['PostalCode'] == 'M4B']
M4B

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937


The reult shown here is correspond to the result shown in the intruction section of the assignment 

### Question 3 (Data Visualization)

In [19]:
# Install folium and geopy 
!pip install folium
!pip install geopy



You should consider upgrading via the 'c:\users\chia\anaconda4\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\chia\anaconda4\python.exe -m pip install --upgrade pip' command.





In [20]:
# Import libraries for data visualization 
from geopy.geocoders import Nominatim 
import folium 

import matplotlib.cm as cm 
import matplotlib.colors as colors 

from sklearn.cluster import KMeans 

#### Visualize the Neighborhoods of Toronto 

In [21]:
# Number of frequency of Borough exists in the dataframe 
# And to identify which Boorough has the name Toronto in it 
Toronto_df['Borough'].value_counts()

North York                24
Downtown Toronto          17
Scarborough               17
Etobicoke                 11
Central Toronto            9
West Toronto               6
York                       5
East York                  4
East Toronto               4
Queen's Park               1
Downtown Toronto Stn A     1
Mississauga                1
Etobicoke Northwest        1
East Toronto Business      1
East York/East Toronto     1
Name: Borough, dtype: int64

In [22]:
# Exploring only the Borough that has the name Toronto in it 
Toronto_data = Toronto_df[Toronto_df['Borough'].str.contains('Toronto')]
Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [23]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="Toronto, Ontario")
location = geolocator.geocode(address)
latitude = location.latitude 
longitude = location.longitude

In [24]:
Map_Toronto = folium.Map(location=[latitude,longitude], zoom_start = 11)

for lat,lng,label in zip (Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius = 5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Map_Toronto)

Map_Toronto 

#### Cluster Neighborhoods

In [25]:
CLIENT_ID = 'XGGBYFSY4MFREH0C3YWVCFSAJP354JNGYMNZI5UVT20KTJKZ'
CLIENT_SECRET = '5QEO5JWNP2VSYMOUE3ZKXKZDWTU0BMVLXRCYTJXGR2UUC3IZ'
VERSION = '20180605'
radius = 2000
LIMIT = 100 

In [26]:
# Create a function to explore venues (name, location and category) each neighborhood in Toronto 

def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])

        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        
        nearby_venues.columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue','Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
        
        return(nearby_venues)

In [27]:
Toronto_venues = getNearbyVenues(names=Toronto_data['Neighborhood'], 
                                latitudes=Toronto_data['Latitude'],
                                longitudes=Toronto_data['Longitude']
                            )

Toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,The Distillery Historic District,43.650244,-79.359323,Historic Site
3,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
4,"Regent Park, Harbourfront",43.65426,-79.360636,Distillery Sunday Market,43.650075,-79.361832,Farmers Market


In [28]:
# Create dummie variables for Venue Category 
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']])
# Adding back the column of Neighborhood 
Toronto_onehot['Neighborhood'] = Toronto_venues['Neighborhood']
# Move Neighborhood to the first column 
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot

Unnamed: 0,Neighborhood,Venue Category_American Restaurant,Venue Category_Animal Shelter,Venue Category_Athletics & Sports,Venue Category_BBQ Joint,Venue Category_Bagel Shop,Venue Category_Bakery,Venue Category_Bar,Venue Category_Beach,Venue Category_Beer Bar,...,Venue Category_Plaza,Venue Category_Pub,Venue Category_Ramen Restaurant,Venue Category_Restaurant,Venue Category_Sandwich Place,Venue Category_Seafood Restaurant,Venue Category_Shopping Mall,Venue Category_Thai Restaurant,Venue Category_Theater,Venue Category_Vegetarian / Vegan Restaurant
0,"Regent Park, Harbourfront",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
97,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,"Regent Park, Harbourfront",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
kclusters = 5 
Toronto_onehot = Toronto_onehot.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_onehot)
kmeans.labels_[0:19]

array([3, 2, 0, 0, 0, 2, 0, 2, 4, 0, 4, 0, 2, 2, 0, 0, 3, 2, 0])

In [30]:
Toronto_onehot.head()

Unnamed: 0,Venue Category_American Restaurant,Venue Category_Animal Shelter,Venue Category_Athletics & Sports,Venue Category_BBQ Joint,Venue Category_Bagel Shop,Venue Category_Bakery,Venue Category_Bar,Venue Category_Beach,Venue Category_Beer Bar,Venue Category_Bookstore,...,Venue Category_Plaza,Venue Category_Pub,Venue Category_Ramen Restaurant,Venue Category_Restaurant,Venue Category_Sandwich Place,Venue Category_Seafood Restaurant,Venue Category_Shopping Mall,Venue Category_Thai Restaurant,Venue Category_Theater,Venue Category_Vegetarian / Vegan Restaurant
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
Toronto_onehot['Neighborhood'] = Toronto_data['Neighborhood']
Toronto_onehot.head()

Unnamed: 0,Venue Category_American Restaurant,Venue Category_Animal Shelter,Venue Category_Athletics & Sports,Venue Category_BBQ Joint,Venue Category_Bagel Shop,Venue Category_Bakery,Venue Category_Bar,Venue Category_Beach,Venue Category_Beer Bar,Venue Category_Bookstore,...,Venue Category_Pub,Venue Category_Ramen Restaurant,Venue Category_Restaurant,Venue Category_Sandwich Place,Venue Category_Seafood Restaurant,Venue Category_Shopping Mall,Venue Category_Thai Restaurant,Venue Category_Theater,Venue Category_Vegetarian / Vegan Restaurant,Neighborhood
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"Regent Park, Harbourfront"
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,


In [32]:
# Insert the Cluster Labels in new columns 
Toronto_onehot.insert(0, 'Cluster Labels', kmeans.labels_)

# Create a new dataframe 
Toronto_merged = Toronto_data 

# Merge the dataframe with venues characteristics for each neighborhood with dataframe that has the information regarding the neighborhood 
Toronto_merged = pd.merge(Toronto_data, Toronto_onehot, on='Neighborhood')

Toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Venue Category_American Restaurant,Venue Category_Animal Shelter,Venue Category_Athletics & Sports,Venue Category_BBQ Joint,...,Venue Category_Plaza,Venue Category_Pub,Venue Category_Ramen Restaurant,Venue Category_Restaurant,Venue Category_Sandwich Place,Venue Category_Seafood Restaurant,Venue Category_Shopping Mall,Venue Category_Thai Restaurant,Venue Category_Theater,Venue Category_Vegetarian / Vegan Restaurant
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### The End