# 1. web scraping for postcode, borough, neighborhood dataframe 

### 1.1 import packages 

In [1]:
import numpy as np
import pandas as pd
import requests # for getting html content
from bs4 import BeautifulSoup # for obtaining clean html content

### 1.2 download content from wiki, obtain & clean up content in the table

In [2]:
############1. obtain data from wiki (using BeautyfulSoup)####################
wiki_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_postal_codes = requests.get(wiki_link)

soup = BeautifulSoup(wiki_postal_codes.content, 'lxml')
postal_table = soup.table

############2. get content from the entire table, parse into array of strings #############
body = postal_table.text.split("\n")
while ("" in body):
    body.remove("")
    
############3. reshape the array into N x 3 matrix, first row is col name, remaining are content ######
postal_table = np.reshape(body,(-1,3))
col = postal_table[0]
data = postal_table[1:]

df = pd.DataFrame(data, columns = col)

############# 4. data clean up -- remove rows with "Not assigned" Borough ############
df = df[df.Borough != "Not assigned"]

############# 4. data clean up -- assign the "Not assigned" Neighbourhood with the same Borough name ############
ind = df.loc[df['Neighbourhood']=='Not assigned']
df.Neighbourhood.replace("Not assigned", ind['Borough'], inplace = True)

############# 4. data clean up -- group the Neighbourhood with the same Postcode ############
df = df.groupby('Postcode').agg({'Borough':'first',
                                    'Neighbourhood':", ".join}).reset_index()

df.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### 1.3 print shape of the resulting dataframe

In [3]:
df.shape

(103, 3)

# 2. add longitude, lattitude data into data frame (using postalcode)

In [4]:
'''
import geocoder # install geocoder

# initialize your variable to None
lat_lng_coords = None

g = geocoder.google('Mountain View, CA') # a sample request to test performance

# it takes forever to load a single lat_lng data, using .csv instead..

while lat_lng_coords == None:
    g = geocoder.google('Mountain View, CA')
    lat_lng_coords = g.latlng
print(g.latlng)
'''

"\nimport geocoder # install geocoder\n\n# initialize your variable to None\nlat_lng_coords = None\n\ng = geocoder.google('Mountain View, CA') # a sample request to test performance\n\n# it takes forever to load a single lat_lng data, using .csv instead..\n\nwhile lat_lng_coords == None:\n    g = geocoder.google('Mountain View, CA')\n    lat_lng_coords = g.latlng\nprint(g.latlng)\n"

In [5]:
postcode_latlng = pd.read_csv('http://cocl.us/Geospatial_data')
postcode_latlng.columns = ['Postcode','Latitude','Longitude']
postcode_latlng.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
postal_df = pd.merge(df, postcode_latlng, how = 'left', on = ['Postcode'])
postal_df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# 3. cluster the neighborhood data and visualize

In [7]:
import folium
from geopy.geocoders import Nominatim # convert an address intolatitude and longitude values
import datetime
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans
print("all imported!")

all imported!


### 3.1 select the borough that contains "Toronto"

In [8]:
toronto_data = postal_df[postal_df['Borough'].str.contains('Toronto', regex = False)].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### 3.2 for Boroughs that contains multiple neighbourhoods, separated them into different rows 

In [34]:
col_names = ['Postcode','Borough','Neighbourhood','Latitude','Longitude']
toronto_neigh = pd.DataFrame(columns = col_names)
#toronto_neigh
for i in range(toronto_data.shape[0]):
    postcode = toronto_data.loc[i, 'Postcode']
    borough = toronto_data.loc[i, 'Borough']
    lat = toronto_data.loc[i, 'Latitude'].astype(float)
    lng = toronto_data.loc[i, 'Longitude'].astype(float)
    neigh = toronto_data.loc[i,'Neighbourhood'].split(", ")
    for j in range(len(neigh)):
        toronto_neigh = toronto_neigh.append(pd.DataFrame(np.array([[postcode, borough, neigh[j], lat, lng]]), columns = col_names))

toronto_neigh = toronto_neigh.reset_index(drop = True)
toronto_neigh.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.67635739999999,-79.2930312
1,M4K,East Toronto,The Danforth West,43.6795571,-79.352188
2,M4K,East Toronto,Riverdale,43.6795571,-79.352188
3,M4L,East Toronto,The Beaches West,43.6689985,-79.31557159999998
4,M4L,East Toronto,India Bazaar,43.6689985,-79.31557159999998


### 3.3 create folium map of toronto with boroughs on them

In [39]:
# find lat_lng lcoation of "Toronto,CA"
address = "Toronto, Ontario"

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
lat = location.latitude
lng = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(lat, lng))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [40]:
# create folium map
toronto_map = folium.Map(location = [lat, lng], zoom_start = 11
                        )
for lat, lng, borough, neighborhood in zip(toronto_neigh['Latitude'], toronto_neigh['Longitude'], toronto_neigh['Borough'],toronto_neigh['Neighbourhood']):
    label = '{}, {} ({}, {})'.format(neighborhood, borough, lat, lng)
    label = folium.Popup(label, parse_html= True)
    folium.CircleMarker([float(lat),float(lng)],
                       radius = 3,
                       popup = label,
                       color = 'red',
                       fill = True,
                       fill_color = '#a72920',
                       fill_opacity = 0.5,
                       parse_html = False).add_to(toronto_map)
toronto_map

### 3.4 use FourSquares 

In [12]:
now = datetime.datetime.now()
date = "%4d%02d%02d" % (now.year, now.month, now.day)

CLIENT_ID = 'RITE5DZBWPGXJKX5S5A2L03WXIAXZOIDDAACBCWIGXO4XAEQ' # Foursquare ID
CLIENT_SECRET = 'HGGFICSXSLIIPMB0QTLVGXVBO3NFMIRKCQFT4PSJDNEJWGOF' # Foursquare Secret
VERSION = date # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: RITE5DZBWPGXJKX5S5A2L03WXIAXZOIDDAACBCWIGXO4XAEQ
CLIENT_SECRET:HGGFICSXSLIIPMB0QTLVGXVBO3NFMIRKCQFT4PSJDNEJWGOF


#### 3.4.1 analyze a sample neighbourhood

In [13]:
toronto_neigh.loc[0,'Neighbourhood']

'The Beaches'

In [14]:
i = 0
latitude = toronto_neigh.loc[i, 'Latitude'] # neighborhood latitude value
longitude = toronto_neigh.loc[i, 'Longitude'] # neighborhood longitude value
neighborhood_name = toronto_neigh.loc[i, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               latitude, 
                                                               longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [15]:
radius = 500
limit = 100
url = "https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        latitude,
        longitude,
        VERSION,
        radius,
        limit)

url


'https://api.foursquare.com/v2/venues/search?client_id=RITE5DZBWPGXJKX5S5A2L03WXIAXZOIDDAACBCWIGXO4XAEQ&client_secret=HGGFICSXSLIIPMB0QTLVGXVBO3NFMIRKCQFT4PSJDNEJWGOF&ll=43.67635739999999,-79.2930312&v=20190417&radius=500&limit=100'

In [16]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5cb7bf69dd5797408ed52e8e'},
 'response': {'venues': [{'id': '56afcad6498e05333bf42031',
    'name': 'Glen Stewart Ravine',
    'location': {'lat': 43.67629984029563,
     'lng': -79.2947841389563,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.67629984029563,
       'lng': -79.2947841389563}],
     'distance': 141,
     'cc': 'CA',
     'country': 'Canada',
     'formattedAddress': ['Canada']},
    'categories': [{'id': '4bf58dd8d48988d162941735',
      'name': 'Other Great Outdoors',
      'pluralName': 'Other Great Outdoors',
      'shortName': 'Other Outdoors',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/parks_outdoors/outdoors_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1555545961',
    'hasPerk': False},
   {'id': '4bd461bc77b29c74a07d9282',
    'name': 'Glen Manor Ravine',
    'location': {'address': 'Glen Manor',
     'crossStreet': 'Queen St.',
     'lat': 43.67682094413784,


In [17]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [18]:
venues = results['response']['venues']#[0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Glen Stewart Ravine,Other Great Outdoors,43.6763,-79.294784
1,Glen Manor Ravine,Trail,43.676821,-79.293942
2,Glen Stewart Park,Park,43.675278,-79.294647
3,Beaches Fitness - Personal Trainer & Health Coach,Gym / Fitness Center,43.669253,-79.31114
4,Kondope,,43.679822,-79.295899


In [19]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

100 venues were returned by Foursquare.


#### 3.4.2 analyze all the neighborhoods

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT = 100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
toronto_venues = getNearbyVenues(names=toronto_neigh['Neighbourhood'],
                                   latitudes=toronto_neigh['Latitude'],
                                   longitudes=toronto_neigh['Longitude']
                                  )


The Beaches
The Danforth West
Riverdale
The Beaches West
India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park
Summerhill East
Deer Park
Forest Hill SE
Rathnelly
South Hill
Summerhill West
Rosedale
Cabbagetown
St. James Town
Church and Wellesley
Harbourfront
Regent Park
Ryerson
Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide
King
Richmond
Harbourfront East
Toronto Islands
Union Station
Design Exchange
Toronto Dominion Centre
Commerce Court
Victoria Hotel
Roselawn
Forest Hill North
Forest Hill West
The Annex
North Midtown
Yorkville
Harbord
University of Toronto
Chinatown
Grange Park
Kensington Market
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place
Underground city
Christie
Dovercourt Village
Dufferin
Little Portugal
Trinity
Brockton
Exhibition Place
Parkdale Village
High Park
The Junction South
Parkdale
Roncesvall

In [22]:
print(toronto_venues.shape)
toronto_venues.head()

(3301, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.67635739999999,-79.2930312,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.67635739999999,-79.2930312,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.67635739999999,-79.2930312,Starbucks,43.678798,-79.298045,Coffee Shop
3,The Beaches,43.67635739999999,-79.2930312,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Danforth West,43.6795571,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


In [23]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adelaide,100,100,100,100,100,100
Bathurst Quay,14,14,14,14,14,14
Berczy Park,57,57,57,57,57,57
Brockton,20,20,20,20,20,20
Business Reply Mail Processing Centre 969 Eastern,16,16,16,16,16,16
CN Tower,14,14,14,14,14,14
Cabbagetown,44,44,44,44,44,44
Central Bay Street,87,87,87,87,87,87
Chinatown,100,100,100,100,100,100
Christie,16,16,16,16,16,16


In [24]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 238 uniques categories.


### 3.5 analyze each neighborhood

In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighborhood'] 
# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Danforth West,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
toronto_onehot.shape

(3301, 239)

In [31]:
toronto_grouped = toronto_onehot.groupby(['Neighbourhood']).mean().reset_index()

In [32]:
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,Adelaide,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0
1,Bathurst Quay,0.0,0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0
3,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625


In [33]:
toronto_grouped.shape

(73, 239)

#### 3.5.1 put the sorted categories in a dataframe

In [41]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [62]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = []
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

fixed_columns = [neighborhoods_venues_sorted.columns[-1]] + list(neighborhoods_venues_sorted.columns[:-1])
neighborhoods_venues_sorted = neighborhoods_venues_sorted[fixed_columns]

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Adelaide,Coffee Shop,Steakhouse,Café,Thai Restaurant,American Restaurant
1,Bathurst Quay,Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Boat or Ferry
2,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Café,Cheese Shop
3,Brockton,Coffee Shop,Café,Breakfast Spot,Grocery Store,Italian Restaurant
4,Business Reply Mail Processing Centre 969 Eastern,Yoga Studio,Garden Center,Restaurant,Auto Workshop,Skate Park


### 5. Cluster neighborhood