# IBM Applied data science capstone


## 3 week assignment: 
## "Segmenting and Clustering Toronto neighborhoods"


### Part 1:

### Load Libraries

In [118]:
import requests # library to handle requests
import csv # load dat from csv
from urllib.request import urlopen # open url file
# for webscraping import Beautiful Soup 
from bs4 import BeautifulSoup
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import json # library to handle JSON files

### Scrape Wikipedia

In [119]:
html = urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(html, "lxml")
table = soup.find_all("table", {"class":"wikitable"})
table = table[0]
rows = table.find_all("tr")
csvFile = open("canada_postal.csv", 'wt', newline='')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.find_all(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()

### Read canada postal code information for csv file

In [120]:
df = pd.read_csv("canada_postal.csv")


### Find postal code by Borough and Neighbourhood

In [121]:
df['Neighbourhood'] = df['Neighbourhood\n'].replace("\n","", regex=True)
df = df.drop(['Neighbourhood\n'], axis = 1)
df.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Remove "Not assigned" and aggregate

In [122]:
indexname = df[df['Borough'] == 'Not assigned'].index
df.drop(indexname, inplace = True)
df.head(11)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [123]:
df1=df.groupby("Postcode").agg(lambda x:','.join(set(x)))
df1.loc[df1['Neighbourhood']=="Not assigned",'Neighbourhood']=df1.loc[df1['Neighbourhood']=="Not assigned",'Borough']
df1.head(11)

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Morningside,Guildwood,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"Kennedy Park,East Birchmount Park,Ionview"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West"
M1N,Scarborough,"Birch Cliff,Cliffside West"


In [124]:
df1.shape

(103, 2)

### From csv file get coordinates for postal codes

In [125]:
geo_data=pd.read_csv("https://cocl.us/Geospatial_data")
geo_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [126]:
df1['Latitude']=geo_data['Latitude'].values
df1['Longitude']=geo_data['Longitude'].values

df1

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"Kennedy Park,East Birchmount Park,Ionview",43.727929,-79.262029
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


### Get coordinates from geolocator

In [127]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


### Create map of Toronto using latitude and longitude values and add markers

In [128]:
# create map of Toronto 
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df1['Latitude'], df1['Longitude'], df1['Borough'], df1['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='brown',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Part 2: 

## Explore neighbourhoods of Toronto

### Connect to Foursquare. Enter you credentials and version

In [139]:
CLIENT_ID = 'not show this' # your Foursquare ID
CLIENT_SECRET = 'not show this' # your Foursquare Secret
VERSION = '20190419' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: not show this
CLIENT_SECRET:not show this


### Select Neighbourhoods

In [130]:
df_t = df1[df1['Borough'].str.contains('Toronto')]

d_tor = df_t.reset_index(drop=True)
d_tor

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
2,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879
5,Central Toronto,Davisville North,43.712751,-79.390197
6,Central Toronto,North Toronto West,43.715383,-79.405678
7,Central Toronto,Davisville,43.704324,-79.38879
8,Central Toronto,"Moore Park,Summerhill East",43.689574,-79.38316
9,Central Toronto,"Deer Park,Rathnelly,South Hill,Summerhill West...",43.686412,-79.400049


In [131]:
# create map of Toronto using latitude and longitude values
map_ofneigh = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(d_tor['Latitude'], d_tor['Longitude'], d_tor['Borough'], d_tor['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='brown',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_ofneigh)  
    
map_ofneigh

### Explore the neighbourhood around Toronto University

#### p.s. Geoffrey Hinton works there

In [132]:
d_tor.loc[25, 'Neighbourhood']

'Harbord,University of Toronto'

### Getting latitude and longitude values

In [133]:
neighbourhood_latitude = d_tor.loc[25, 'Latitude'] 
neighbourhood_longitude = d_tor.loc[25, 'Longitude'] 

neighbourhood_name = d_tor.loc[25, 'Neighbourhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Harbord,University of Toronto are 43.6626956, -79.4000493.


### Find top 50 veneus using Foursquare API. Selected radius 2000 meters

In [134]:
LIMIT = 50
radius = 2000

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=CVNQUGD221MLI52MXAQI2QMGGNMCUEPEHKBKFOZ1MLMSNEZC&client_secret=HU0ZZQQ5DFCHMF55A5XOVGJ4HDX43MS1D5FPXZNFDZH214VH&v=20190419&ll=43.6626956,-79.4000493&radius=2000&limit=50'

### Send the GET Request and examine the results

In [135]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5cb9b7fa4434b94d03a9a986'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'University of Toronto',
  'headerFullLocation': 'University of Toronto, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 237,
  'suggestedBounds': {'ne': {'lat': 43.680695618000016,
    'lng': -79.37521381163324},
   'sw': {'lat': 43.64469558199998, 'lng': -79.42488478836677}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '527d450111d25050de4ea0d8',
       'name': 'Rasa',
       'location': {'address': '196 Robert Street',
        'lat': 43.662756751275445,
        'lng': -79.40398803188654,
        'labeledLatLngs': [{'label': 'display',
    

### Define information of interest and filter dataframe

In [136]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [137]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(50)

Unnamed: 0,name,categories,lat,lng
0,Rasa,Restaurant,43.662757,-79.403988
1,Yasu,Japanese Restaurant,43.662837,-79.403217
2,Her Father's Cider Bar + Kitchen,Beer Bar,43.662448,-79.404703
3,Harbord House,Bar,43.662466,-79.40541
4,Fresh on Bloor,Vegetarian / Vegan Restaurant,43.666755,-79.403491
5,The Planet Traveler,Hostel,43.657202,-79.403568
6,Philosopher's Walk,Park,43.666894,-79.395597
7,Koerner Hall,Concert Hall,43.667983,-79.395962
8,Royal Ontario Museum,Museum,43.668367,-79.394813
9,Queen's Park,Park,43.663946,-79.39218


In [138]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

50 venues were returned by Foursquare.


### So we found 50 different venues. Toronto is amazing
