# Segmentation and clustering in toronto

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
import folium

In [6]:
import json 

import requests 

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import matplotlib.cm as cm
import matplotlib.colors as colors # Matplotlib and associated plotting modules

from sklearn.cluster import KMeans # import k-means from clustering stage


In [7]:
!conda install -c conda-forge beautifulsoup4 --yes
from bs4 import BeautifulSoup
print("Library imported")

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.9.1       |   py36h9f0ad1d_0         163 KB  conda-forge
    soupsieve-2.0.1            |   py36h9f0ad1d_0          56 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         219 KB

The following NEW packages will be INSTALLED:

  beautifulsoup4     conda-forge/linux-64::beautifulsoup4-4.9.1-py36h9f0ad1d_0
  soupsieve          conda-forge/linux-64::soupsieve-2.0.1-py36h9f0ad1d_0



Downloading and Extracting Packages
beautifulsoup4-4.9.1 | 163 KB    | ##################################### | 100% 
soupsieve-2.0.1      | 56 KB     | #

In [8]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 

print("Libraries imported.")

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###############################

# 1.Download dataset

# First we need to download the dataset from the wikipedia

In [34]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text



In [35]:
soup = BeautifulSoup(result.content, 'html.parser')



# Locate table and postal code

In [50]:
postalCode = []
borough = []
neighborhood = []


In [51]:
soup.find('table').find_all('tr')

# find all the rows of the table
soup.find('table').find_all('tr')

# for each row of the table, find all the table data
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')

In [52]:
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCode.append(cells[0].text.rstrip())
        borough.append(cells[1].text.rstrip())
        neighborhood.append(cells[2].text.rstrip('\n'))

# Load the pandas dataframe

In [54]:
df = pd.DataFrame({"PostalCode": postalCode,
                           "Borough": borough,
                           "Neighborhood": neighborhood})

df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


# Remove  no assigned columns

In [55]:
df_drop = df[df.Borough != "Not assigned"].reset_index(drop=True)
df_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Group neighbourhoods in same borough

In [61]:
df_grouped = df_drop.groupby(["PostalCode", "Borough"], as_index=False).agg(lambda x: ", ".join(x))
df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Neighborhood which is "Not assigned", makes the value the same as Borough

In [64]:
for index, row in df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [66]:
df.shape

(180, 3)

# Q2

Read the csv file with panda

In [67]:
c = pd.read_csv('https://cocl.us/Geospatial_data')
c.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [68]:
c.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
c.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merge both the data

In [70]:
df_new = df_grouped.merge(c, on="PostalCode", how="left")
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Q3

In [73]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


# Define Foursquare Credentials and Version

In [74]:
CLIENT_ID = 'PXFNR0RQELQKQFRCAKISPVVDMWUDHTJESMWNN2HP5124XTEQ' 
CLIENT_SECRET = 'YKOHA3CSHQQAJCA1LK1QKT1551XIRPLZQW4W1XX1PK4AX12J'  
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PXFNR0RQELQKQFRCAKISPVVDMWUDHTJESMWNN2HP5124XTEQ
CLIENT_SECRET:YKOHA3CSHQQAJCA1LK1QKT1551XIRPLZQW4W1XX1PK4AX12J


In [76]:
map_t = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_new['Latitude'], df_new['Longitude'], df_new['Borough'], df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_t)  
    
map_t

In [85]:

    radius = 100
LIMIT = 100

venues = []

for lat, long, post, borough, neighborhood in zip(df_new['Latitude'],df_new['Longitude'], df_new['PostalCode'],df_new['Borough'], 
                                                  df_new['Neighborhood']):
         url = "https://api.foursquare.com/v2/venues/explore?client_id=PXFNR0RQELQKQFRCAKISPVVDMWUDHTJESMWNN2HP5124XTEQ &client_secret= YKOHA3CSHQQAJCA1LK1QKT1551XIRPLZQW4W1XX1PK4AX12J &v=20180605 \
             &ll=43.653963,-79.387207&radius=500&limit=100".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))     

# Convert to dataframe

In [86]:
v_df = pd.DataFrame(venues)


v_df.columns = ['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(v_df.shape)
v_df.head()

(46, 9)


Unnamed: 0,PostalCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M9W,Etobicoke,Northwest,43.706748,-79.594054,Downtown Toronto,43.653232,-79.385296,Neighborhood
1,M9W,Etobicoke,Northwest,43.706748,-79.594054,Textile Museum of Canada,43.654396,-79.3865,Art Museum
2,M9W,Etobicoke,Northwest,43.706748,-79.594054,Cafe Plenty,43.654571,-79.38945,Café
3,M9W,Etobicoke,Northwest,43.706748,-79.594054,Sansotei Ramen 三草亭,43.655157,-79.386501,Ramen Restaurant
4,M9W,Etobicoke,Northwest,43.706748,-79.594054,Japango,43.655268,-79.385165,Sushi Restaurant


# Analysing each area

In [88]:
t = pd.get_dummies(v_df[['VenueCategory']], prefix="", prefix_sep="")

# add postal, borough and neighborhood column back to dataframe
t['PostalCode'] = v_df['PostalCode'] 
t['Borough'] = v_df['Borough'] 
t['Neighborhoods'] = v_df['Neighborhood'] 

# move postal, borough and neighborhood column to the first column
f = list(t.columns[-3:]) + list(t.columns[:-3])
t = t[f]

print(t.shape)
t.head()

(46, 39)


Unnamed: 0,PostalCode,Borough,Neighborhoods,Art Gallery,Art Museum,Arts & Crafts Store,Bar,Breakfast Spot,Bubble Tea Shop,Café,...,Pub,Ramen Restaurant,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Smoke Shop,Sushi Restaurant,University,Vegetarian / Vegan Restaurant
0,M9W,Etobicoke,Northwest,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M9W,Etobicoke,Northwest,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M9W,Etobicoke,Northwest,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,M9W,Etobicoke,Northwest,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,M9W,Etobicoke,Northwest,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
