In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20181001144907-0025


In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M"
# and parse it so the postcode table can be extracted from `soup`.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

In [3]:
# Parse the first <table> element of the page into a DataFrame;
# header=0 makes the table's first row the column names.
df = pd.read_html(io=str(soup.table), header=0)[0]

In [4]:
# Keep only the rows whose 'Borough' is something other than 'Not assigned'.
# .copy() yields an independent frame, so later edits do not touch `df`.
df_clean = df.loc[df['Borough'].ne('Not assigned')].copy()
df_clean.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [5]:
# Index labels of the rows whose 'Neighbourhood' still holds the placeholder 'Not assigned'.
index_list = df_clean.query("Neighbourhood == 'Not assigned'").index

In [6]:
# For every label in index_list, overwrite 'Neighbourhood' with the 'Borough'
# value of the same row (one label-aligned assignment instead of a row loop).
df_clean.loc[index_list, 'Neighbourhood'] = df_clean.loc[index_list, 'Borough']

df_clean.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [7]:
# Collapse to one row per (Postcode, Borough) pair, joining that pair's
# 'Neighbourhood' values into a single comma-separated string.
df_clean = (
    df_clean
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df_clean.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [8]:
# (rows, columns) after grouping: one row per (Postcode, Borough) pair.
df_clean.shape

(103, 3)

In [9]:
import geocoder

In [10]:
#lng = []
#lat = []

#for postal_code in df['Postcode']:
    # initialize your variable to None
#    lat_lng_coords = None
    
    # loop until you get the coordinates
#    while(lat_lng_coords is None):
#      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#      lat_lng_coords = g.latlng

#    latitude = lat_lng_coords[0]
#    longitude = lat_lng_coords[1]
    
#    lng.append(longitude)
#    lat.append(latitude)

#print(postal_code[:5],lng[:5], lat[:5])

In [11]:
# Load latitude/longitude per postal code from an alternative CSV source
# (used instead of the geocoder lookup, which is commented out in this notebook).
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
df_latlng = pd.read_csv(geospatial_csv_url)
df_latlng.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [12]:
# Check that the coordinate table has as many rows as df_clean before merging the two.
df_latlng.shape

(103, 3)

In [13]:
# Attach the coordinates to each postcode row. The key column is named differently
# in the two frames, so the redundant right-hand key ('Postal Code') is dropped
# once the merge is done.
df_canada = (
    df_clean
    .merge(df_latlng, how='outer', left_on='Postcode', right_on='Postal Code')
    .drop(columns=['Postal Code'])
)
df_canada.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [14]:
from geopy.geocoders import Nominatim
import folium
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

In [15]:
# Keep the rows whose 'Borough' contains the text 'Toronto' and renumber them from 0.
# na=False makes a missing borough count as "no match".
in_toronto = df_canada['Borough'].str.contains('Toronto', na=False)
df_toronto = df_canada[in_toronto].reset_index(drop=True)
df_toronto.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


In [16]:
# (rows, columns) of the subset whose borough name contains 'Toronto'.
df_toronto.shape

(38, 5)

In [17]:
# Distinct borough names present in the Toronto subset, as a plain Python list.
ar_toronto = list(df_toronto['Borough'].drop_duplicates())
ar_toronto

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [18]:
#getting latitude and longitude of Toronto
address = 'Toronto, CAN'

# Nominatim expects an identifying user agent; recent geopy releases refuse to
# construct the geocoder without one.
geolocator = Nominatim(user_agent='toronto_neighbourhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; stop here with a clear message
    # rather than failing on the attribute access below.
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude

# Report the address that was actually geocoded (the message used to say "East Toronto").
print('The geographical coordinates of {0} are {1:0.6f}, {2:0.6f}.'.format(address, latitude, longitude))

The geograpical coordinate of East Toronto are 43.660700, -79.385089.


In [19]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle marker per postcode row; its popup shows postcode, borough
# and neighbourhood.
marker_fields = df_toronto[['Latitude', 'Longitude', 'Borough', 'Neighbourhood', 'Postcode']]
for lat, lng, borough, neighbourhood, postcode in marker_fields.itertuples(index=False, name=None):
    label = 'Postcode: {0}<br>Borough: {1}<br>Neighbourhood: {2}'.format(postcode, borough, neighbourhood)
    popup = folium.Popup(folium.IFrame(html=label, width=300, height=100), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto

In [20]:
# The code was removed by Watson Studio for sharing.

In [21]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue, or None.

    `row` is indexed by key: the category list is read from 'categories',
    falling back to 'venue.categories' when that key is absent.

    Returns the 'name' of the first entry, or None when the list is empty.
    Raises KeyError when neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

#function to get data from Foursquare for each neighbourhood
def getNearbyVenues(names, latitudes, longitudes):
    """Collect the venues Foursquare's "explore" endpoint returns around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each location to query; iterated in lockstep.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood_Latitude', 'Neighbourhood_Longitude', 'Venue',
        'Venue_Category', 'Venue_Latitude' and 'Venue_Longitude'.
        'Venue_Category' is None for a venue that lists no category.
        With no locations the frame is empty but still has these columns.

    Notes
    -----
    Reads the notebook-level names Client_Id, Client_Secret, Version, Radius and
    Limit, which must be defined before the function is called.
    """
    columns = ['Neighbourhood',
               'Neighbourhood_Latitude',
               'Neighbourhood_Longitude',
               'Venue',
               'Venue_Category',
               'Venue_Latitude',
               'Venue_Longitude'
               ]
    venue_rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            Client_Id,
            Client_Secret,
            Version,
            lat,
            lng,
            Radius,
            Limit
        )

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # A venue may come back without any category; record None instead
            # of failing on the [0] lookup.
            categories = venue.get('categories') or []
            venue_rows.append((name,
                               lat,
                               lng,
                               venue['name'],
                               categories[0]['name'] if categories else None,
                               venue['location']['lat'],
                               venue['location']['lng']
                               ))

    # Passing the column names to the constructor keeps the schema intact even
    # when no rows were collected.
    return pd.DataFrame(venue_rows, columns=columns)

In [None]:
# Query the venues around every Toronto postcode row; each venue is labelled with
# that row's 'Neighbourhood' string.
df_toronto_venues = getNearbyVenues(
    df_toronto['Neighbourhood'],
    df_toronto['Latitude'],
    df_toronto['Longitude'],
)

In [None]:
# Discard venue rows that contain any missing value, then renumber the index from 0.
df_toronto_venues = df_toronto_venues.dropna(axis=0, how='any').reset_index(drop=True)
df_toronto_venues.head(10)

In [None]:
# Work on a copy of the venue table and count, per neighbourhood, the non-null
# entries of every column.
df_toronto_neigh = df_toronto_venues.copy()
df_toronto_neigh.groupby(by='Neighbourhood').count()

In [None]:
# One-hot encode the venue category (one indicator column per distinct category)
# and attach the neighbourhood each venue row belongs to.
df_toronto_venues_onecode = pd.get_dummies(df_toronto_venues['Venue_Category']).assign(
    Neighbourhood=df_toronto_venues['Neighbourhood']
)

# Rotate the column order so that the last column moves to the front.
fixed_columns = df_toronto_venues_onecode.columns.tolist()
fixed_columns.insert(0, fixed_columns.pop())
df_toronto_venues_onecode = df_toronto_venues_onecode.loc[:, fixed_columns]

df_toronto_venues_onecode.head(10)

In [None]:
# (rows, columns) of the one-hot frame: one row per venue; the columns are the
# category indicators plus 'Neighbourhood'.
df_toronto_venues_onecode.shape

In [None]:
# Average every indicator column per neighbourhood, i.e. the share of that
# neighbourhood's venue rows falling in each category.
df_toronto_venues_neigh_mean = df_toronto_venues_onecode.groupby('Neighbourhood', as_index=False).mean()
df_toronto_venues_neigh_mean.head(10)

In [None]:
# Clustering input: the per-neighbourhood means without the 'Neighbourhood' label column.
# drop() returns a new frame, so df_toronto_venues_neigh_mean itself is left unchanged.
df_toronto_venues_clustering = df_toronto_venues_neigh_mean.drop(columns=['Neighbourhood'])
df_toronto_venues_clustering.head(10)

In [None]:
# Number of clusters to fit.
kclusters = 5

# Fit k-means on the per-neighbourhood category means with a fixed random_state.
kmeans = KMeans(n_clusters=kclusters, random_state=1)
kmeans.fit(df_toronto_venues_clustering)

# Cluster label assigned to each row of the clustering frame.
kmeans.labels_

In [None]:
# Number of cluster labels produced: one per row of df_toronto_venues_clustering.
len(kmeans.labels_)

In [None]:
#label each neighbourhood with cluster
# kmeans.labels_ follows the row order of df_toronto_venues_neigh_mean (the frame the
# clustering input was derived from). Pair every label with its neighbourhood name
# explicitly instead of assuming that df_toronto, once sorted, has exactly the same
# rows in the same order.
neighbourhood_labels = pd.DataFrame({
    'Neighbourhood': df_toronto_venues_neigh_mean['Neighbourhood'],
    'Cluster': kmeans.labels_,
})

# Inner merge on the name: a neighbourhood without any venue rows was never
# clustered, so it has no label and is left out.
df_toronto_cluster = (
    df_toronto
    .merge(neighbourhood_labels, on='Neighbourhood', how='inner')
    .sort_values(['Neighbourhood'], ascending=True)
    .reset_index(drop=True)
)
df_toronto_cluster.head(10)

In [None]:
# Tag every venue row with the cluster of its neighbourhood. The postcode, borough
# and coordinate columns of the cluster table are removed before joining on 'Neighbourhood'.
cluster_by_neighbourhood = df_toronto_cluster.drop(columns=['Postcode', 'Borough', 'Latitude', 'Longitude'])
df_toronto_venues_cluster = df_toronto_venues.merge(cluster_by_neighbourhood, on='Neighbourhood', how='outer')
df_toronto_venues_cluster.head(10)

In [None]:
# Cluster 1 (label 0): its venue rows, without the neighbourhood coordinate columns,
# renumbered from 0.
df_toronto_cluster0 = (
    df_toronto_venues_cluster
    .loc[df_toronto_venues_cluster['Cluster'] == 0]
    .drop(columns=['Neighbourhood_Latitude', 'Neighbourhood_Longitude'])
    .reset_index(drop=True)
)
df_toronto_cluster0.head(10)

In [None]:
# Cluster 2 (label 1): its venue rows, without the neighbourhood coordinate columns,
# renumbered from 0.
df_toronto_cluster1 = (
    df_toronto_venues_cluster
    .loc[df_toronto_venues_cluster['Cluster'] == 1]
    .drop(columns=['Neighbourhood_Latitude', 'Neighbourhood_Longitude'])
    .reset_index(drop=True)
)
df_toronto_cluster1.head(10)

In [None]:
# Cluster 3 (label 2): its venue rows, without the neighbourhood coordinate columns,
# renumbered from 0.
df_toronto_cluster2 = (
    df_toronto_venues_cluster
    .loc[df_toronto_venues_cluster['Cluster'] == 2]
    .drop(columns=['Neighbourhood_Latitude', 'Neighbourhood_Longitude'])
    .reset_index(drop=True)
)
df_toronto_cluster2.head(10)

In [None]:
# Cluster 4 (label 3): its venue rows, without the neighbourhood coordinate columns,
# renumbered from 0.
df_toronto_cluster3 = (
    df_toronto_venues_cluster
    .loc[df_toronto_venues_cluster['Cluster'] == 3]
    .drop(columns=['Neighbourhood_Latitude', 'Neighbourhood_Longitude'])
    .reset_index(drop=True)
)
df_toronto_cluster3.head(10)

In [None]:
# Cluster 5 (label 4): its venue rows, without the neighbourhood coordinate columns,
# renumbered from 0.
df_toronto_cluster4 = (
    df_toronto_venues_cluster
    .loc[df_toronto_venues_cluster['Cluster'] == 4]
    .drop(columns=['Neighbourhood_Latitude', 'Neighbourhood_Longitude'])
    .reset_index(drop=True)
)
df_toronto_cluster4.head(10)

In [None]:
# Fresh base map centred on the geocoded Toronto coordinates (rebinds map_toronto).
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle marker per venue row of df_toronto_cluster3; its popup shows the
# venue name and category.
venue_fields = df_toronto_cluster3[['Venue_Latitude', 'Venue_Longitude', 'Venue', 'Venue_Category']]
for lat, lng, name, category in venue_fields.itertuples(index=False, name=None):
    label = 'Name: {0}<br>Category: {1}'.format(name, category)
    popup = folium.Popup(folium.IFrame(html=label, width=300, height=100), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7,
    )
    marker.add_to(map_toronto)

map_toronto