In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
%matplotlib inline
import requests
from tabulate import tabulate
from sklearn.cluster import KMeans 
import matplotlib.colors as colors
import matplotlib.cm as cm
import folium

<h3>Web Scraping and Data Extraction</h3>

In [17]:
result= requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup = BeautifulSoup(result,"lxml")
table1= soup.find("table",class_="wikitable")

In [18]:
n_rows=table1.find_all("tr")

In [19]:
neighbor=[]
for i in n_rows:
    outcome=i.text.split("\n")[1:-1]
    neighbor.append(outcome)

In [20]:
neighbor[0:5]

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

<h3>Convertion of extracted data to Dataframe</h3>

In [21]:
neighbor[0][-1]="Neighborhood"
Ndf=pd.DataFrame(neighbor[1:],columns=neighbor[0])
Ndf.head(2)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned


In [22]:
B_na=Ndf.index[Ndf["Borough"]=="Not assigned"]
N_na=Ndf.index[Ndf["Neighborhood"]=="Not assigned"]
Gna=B_na & N_na

In [23]:
Ndf.drop(Ndf.index[B_na],inplace=True)
Ndf.reset_index(drop=True,inplace=True)

In [24]:
Ndf.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [25]:
postalCode_g=Ndf.groupby("Postcode")
N_group=postalCode_g['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
borough_g=postalCode_g['Borough'].apply(lambda x: set(x).pop())
df_group= pd.DataFrame(list(zip(borough_g.index,borough_g,N_group)))
df_group.columns=["Postcode", "Borough", "Neighborhood"]

<h3>getting the coordinates of the neighborhood</h3>

In [26]:
df_coordinates = pd.read_csv("Geospatial_Coordinates.csv")
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [27]:
pcode_coordinates=df_group.join(df_coordinates.set_index("Postal Code"),on="Postcode")
                                

In [28]:
pcode_coordinates.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<h2>Explore and cluster the neighborhoods in Toronto</h2>

In [29]:
from geopy.geocoders import Nominatim 

In [31]:
address = 'Toronto, Ontario Canada'
geolocator = Nominatim(user_agent='foursquare_agent')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print("Coordinates of Toronto are: {}, {}.".format(latitude, longitude))

Coordinates of Toronto are: 43.653963, -79.387207.


In [34]:
To_map=folium.Map(location=[latitude, longitude], zoom_start=11)

for location in pcode_coordinates.itertuples():
    label = 'Postal Code: {};  Borough: {};  Neighborhoods: {}'.format(location[1], location[2], location[3])
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [location[-2], location[-1]],
        radius=1,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map) 
    folium.Circle(
        radius=500,
        popup=label,
        location=[location[-2], location[-1]],
        color='#3186cc',
        fill=True,
        fill_color='#3186cc'
    ).add_to(toronto_map) 
    
To_map

In [36]:
toronto_data = pcode_coordinates[pcode_coordinates['Borough'].str.contains("Toronto")].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [37]:
toronto_map=folium.Map(location=[latitude, longitude], zoom_start=11)
for lat,long,label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

<h2>Using Foursquare API to explore and segment neighbourhoods in Toronto.</h2>

In [38]:
CLIENT_ID = 'FFQU3U40EWVECVPGXCO5NGEJLUUGFC01N22V5G0M5ADQKLUL' # your Foursquare ID
CLIENT_SECRET = 'YGVOX4Z35GASJW55QOHYR5B3ZJS02V5ZRSFLX5EPOFVDQJFZ' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FFQU3U40EWVECVPGXCO5NGEJLUUGFC01N22V5G0M5ADQKLUL
CLIENT_SECRET:YGVOX4Z35GASJW55QOHYR5B3ZJS02V5ZRSFLX5EPOFVDQJFZ


In [39]:
def get_venues(names,latitudes,longitudes,radius=500):
    list_venue=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        list_venue.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in list_venue for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [44]:
pcode_coordinates=toronto_data
toronto_v=get_venues(names=pcode_coordinates['Neighborhood'],latitudes=pcode_coordinates['Latitude'],longitudes=pcode_coordinates['Longitude'])

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The 

In [46]:
toronto_v.head(3)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub


In [47]:
toronto_v.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",30,30,30,30,30,30
Berczy Park,30,30,30,30,30,30
"Brockton, Exhibition Place, Parkdale Village",21,21,21,21,21,21
Business Reply Mail Processing Centre 969 Eastern,19,19,19,19,19,19
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",17,17,17,17,17,17
"Cabbagetown, St. James Town",30,30,30,30,30,30
Central Bay Street,30,30,30,30,30,30
"Chinatown, Grange Park, Kensington Market",30,30,30,30,30,30
Christie,15,15,15,15,15,15
Church and Wellesley,30,30,30,30,30,30


In [48]:
print('{} uniques categories.'.format(len(toronto_v['Venue Category'].unique())))

191 uniques categories.


In [49]:
toron1 = pd.get_dummies(toronto_v[['Venue Category']], prefix="", prefix_sep="")

toron1['Neighborhood'] = toronto_v['Neighborhood'] 

fcolumns = [toron1.columns[-1]] + list(toron1.columns[:-1])
toron1 = toron1[fcolumns]

toron1.head(3)

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<h3>Grouping rows by neighbourhood and taking the mean of the frequency of occurrence of each category.</h3>

In [78]:
toron_g= toron1.groupby('Neighborhood').mean().reset_index()
toron_g.head()

Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<h3>To know the existing businesses in different locations</h3>

In [55]:
for i in toron_g['Neighborhood']:
    print("----"+i+"----")
    h_cond =toron_g[toron_g['Neighborhood'] == i].T.reset_index()
    h_cond.columns = ['venue','freq']
    h_cond = h_cond.iloc[1:]
    h_cond['freq'] = h_cond['freq'].astype(float)
    h_cond = h_cond.round({'freq': 2})
    print(h_cond.sort_values('freq', ascending=False).reset_index(drop=True).head())
    print('\n')

----Adelaide, King, Richmond----
              venue  freq
0        Steakhouse  0.10
1             Hotel  0.07
2       Coffee Shop  0.07
3              Café  0.07
4  Asian Restaurant  0.07


----Berczy Park----
                venue  freq
0                Café  0.07
1         Coffee Shop  0.07
2      Farmers Market  0.07
3  Seafood Restaurant  0.07
4            Beer Bar  0.07


----Brockton, Exhibition Place, Parkdale Village----
                venue  freq
0                Café  0.10
1      Breakfast Spot  0.10
2         Coffee Shop  0.10
3       Burrito Place  0.05
4  Falafel Restaurant  0.05


----Business Reply Mail Processing Centre 969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.05
2          Comic Shop  0.05
3                Park  0.05
4          Restaurant  0.05


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0   Airport Service  0.18
1

<h3>Decending order sorting of venues</h3>

In [56]:
def most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[:]

In [69]:
top_venues = 10

ind = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for x in np.arange(top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(x+1, ind[x]))
    except:
        columns.append('{}th Most Common Venue'.format(x+1))

#new dataframe
sorted_neigborhood = pd.DataFrame(columns=columns)
sorted_neigborhood['Neighborhood'] = toron_g['Neighborhood']
   
sorted_neigborhood.shape


(38, 11)

<h3>Neighbourhood Clustering</h3>

In [70]:
k_clusters = 10

toron_g_cluster = toron_g.drop('Neighborhood', 1)

k_means = KMeans(n_clusters=k_clusters, random_state=1).fit(toron_g_cluster)

print(k_means.labels_[0:10])
print(len(k_means.labels_))

[0 0 5 4 7 5 5 0 4 4]
38


In [71]:
toron_m=pcode_coordinates

#we are adding the clustering labels
toron_m['Cluster Labels'] = k_means.labels_

#we are merging group_toronto with toronto_data to add the latitude/longitude for each neighbourhood
toron_m = toron_m.join(sorted_neigborhood.set_index('Neighborhood'), on='Neighborhood')

toron_m.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,,,,,,,,,,
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,,,,,,,,,,
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,5,,,,,,,,,,
3,M4M,East Toronto,Studio District,43.659526,-79.340923,4,,,,,,,,,,
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,7,,,,,,,,,,


<h3>Cluster visualization</h3>

In [77]:
cluster_v = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(k_clusters)
ys = [i+x+(i*x)**2 for i in range(k_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toron_m['Latitude'], toron_m['Longitude'], toron_m['Neighborhood'],k_means.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(cluster_v)
       
cluster_v