# Week 3: Segmenting and Clustering Neighborhoods in Toronto

#### Load required packages

In [1]:
import numpy as np
import pandas as pd

import json

from geopy.geocoders import Nominatim

import requests

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium

print('Libraries imported.')

Libraries imported.


## Part 1: Scraping Data

In [2]:
# Wikipedia page that is scraped for the postal-code table.
page_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Parse every table on the page carrying the "wikitable" class, taking the
# first row of each table as its header.
dfs = pd.read_html(
    page_url,
    header=0,
    attrs={"class": "wikitable"},
)

# Number of tables that matched.
print(len(dfs))

1


# Question 1

In [3]:
toronto_data = dfs[0]

# Give the three scraped columns consistent names.
toronto_data.columns = ['PostalCode', 'Borough', 'Neighborhood']

# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = toronto_data['Borough'] != 'Not assigned'
toronto_data = toronto_data[has_borough]

# Rows whose neighbourhood is 'Not assigned' take their borough name instead.
nbh_not_index = toronto_data['Neighborhood'] == 'Not assigned'
tdata = toronto_data.copy()
tdata['Neighborhood'] = tdata['Neighborhood'].mask(nbh_not_index, tdata['Borough'])

# Collapse rows that share a postal code and borough into a single row whose
# neighbourhoods are joined with ', '.
tdata = (
    tdata
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)

tdata

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Part 2: Geocoder

Function that tries to fetch the latitude and longitude of a postal code

In [4]:
import geocoder

def get_lat_lng_coords(postcode):
    """Look up the coordinates of a postal code, retrying up to ten times.

    Each attempt queries ``geocoder.google`` for
    '<postcode>, Toronto, Ontario' and prints the ``latlng`` it got back.

    Returns:
        The first two entries of the first non-None ``latlng`` as a tuple,
        or ``(0, 0)`` after ten attempts that all returned None (a warning
        is printed in that case).
    """
    query = '{}, Toronto, Ontario'.format(postcode)

    for _attempt in range(10):
        response = geocoder.google(query)
        print(response.latlng)
        coords = response.latlng
        if coords is not None:
            print('Get {}'.format(postcode))
            return coords[0], coords[1]

    # Every attempt came back empty.
    print('Warning: {} Failed 10 times Check Network Connection'.format(postcode))
    return 0, 0

In [5]:
# Column names of the coordinate table.
feature_list = ['Latitude', 'Longitude']
# Empty frame with the coordinate columns.
latlng_df = pd.DataFrame(columns=feature_list)


def unused_code():
    """Geocode every postal code in ``tdata`` and return the coordinates.

    Kept for reference only; the notebook loads the coordinates from a CSV
    file instead of calling this.

    Returns:
        A new DataFrame with one row per entry of ``tdata['PostalCode']``
        and the columns listed in ``feature_list``. The module-level
        ``latlng_df`` is left untouched.
    """
    # Collect plain records and build the frame once; growing a DataFrame
    # row by row is slow, and ``DataFrame.append`` returned a new frame
    # (which the previous code discarded) rather than appending in place.
    records = []
    for code in tdata['PostalCode']:
        lat, lng = get_lat_lng_coords(code)
        records.append({'Latitude': lat, 'Longitude': lng})
    return pd.DataFrame(records, columns=feature_list)


The Google geocoder did not return usable results, so let's download the coordinate data instead.

In [6]:
# !wget -q -O 'toronto_data.csv' "http://cocl.us/Geospatial_data"
# print('Data downloaded!')

In [7]:
# Read the local CSV file into a DataFrame.
with open('toronto_data.csv') as csv_file:
    geo_data = pd.read_csv(csv_file)

# Independent copy of the frame exactly as loaded.
geobk = geo_data.copy()
# geo_data

#### Finally, rename the columns and join the two dataframes

# Question 2

In [8]:
# Rename the coordinate columns so the key matches the one used in `tdata`.
geo_data.columns = ['PostalCode', 'Latitude', 'Longitude']

# Look up each row's coordinates through the postal code.
geo_by_postal_code = geo_data.set_index('PostalCode')
merged_data = tdata.join(geo_by_postal_code, on="PostalCode")

merged_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


Checking Correctness

In [9]:
# Spot check: select the row(s) for postal code M1R.
is_m1r = merged_data['PostalCode'] == 'M1R'
merged_data.loc[is_m1r]
# The trailing no-op keeps the cell from rendering an output.
pass

## Part 3: Cluster and Visualization

Keep only the boroughs whose name contains the string 'Toront'

In [10]:
# One boolean per row: does the borough name contain 'Toront'?
toronto_nghbr = ['Toront' in bor for bor in merged_data['Borough']]

# Keep the matching rows and renumber them from zero.
toronto_rows = merged_data.loc[toronto_nghbr]
t_data = toronto_rows.reset_index(drop=True)

t_data.head()
# The trailing no-op keeps the cell from rendering an output.
pass

Function from lab to get nearby Venues

Include Foursquare API credential

In [11]:
# Import exactly the names this notebook uses rather than a wildcard, so it
# is clear where each one comes from and nothing else leaks into the
# namespace.
from credential import FS_ID, FS_SECRET, FS_VERSION

LIMIT = 150  # sent as the `limit` query parameter of each venue request
CLIENT_ID = FS_ID  # Foursquare ID
CLIENT_SECRET = FS_SECRET  # Foursquare Secret
VERSION = FS_VERSION  # sent as the `v` query parameter of each venue request

In [12]:
import json
import sys, collections
# Path of the JSON file used as the local cache of fetched venue results;
# it is read at the start of getNearbyVenues and written back at the end.
Local_Cache = 'venues_data.json'

def read_json(file):
    """Load a JSON file and return its content if it is a JSON object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded dict when the file holds a JSON object; an empty dict
        when the file does not exist yet (e.g. an empty cache on the first
        run) or when its top-level value is not an object.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        # Read-only access is enough; the context manager closes the handle
        # even if decoding fails.
        with open(file, 'r') as fp:
            res_dict = json.load(fp)
    except FileNotFoundError:
        return {}
    # json.load decodes JSON objects to plain dicts, so a dict check replaces
    # the `collections.Mapping` alias that was removed in Python 3.10.
    return res_dict if isinstance(res_dict, dict) else {}

def write_json(file, content):
    """Serialise ``content`` as JSON into ``file``, replacing its content.

    Args:
        file: Path of the file to write; created if missing.
        content: JSON-serialisable object to store.
    """
    # Opening in 'w' mode already truncates the file, and the context manager
    # closes the handle even when serialisation raises.
    with open(file, 'w') as fp:
        json.dump(content, fp)

In [13]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues around each location into a single DataFrame.

    Results are looked up in the local JSON cache (``Local_Cache``) first,
    keyed by name; names not in the cache are requested from the Foursquare
    explore endpoint and added to the cache.

    Note: the cache key is the name only, so calling again with a different
    ``radius`` or different coordinates for a cached name returns the
    cached result.

    Args:
        names: Iterable of location names.
        latitudes: Iterable of latitudes, parallel to ``names``.
        longitudes: Iterable of longitudes, parallel to ``names``.
        radius: Value sent as the ``radius`` query parameter.

    Returns:
        DataFrame with one row per venue and the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Category'. Locations
        whose request fails are reported and skipped. The frame has zero
        rows (but all columns) when nothing was found.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venues_dict = read_json(Local_Cache)
    rows = []
    try:
        for name, lat, lng in zip(names, latitudes, longitudes):

            if name in venues_dict:
                print('Already exist in cache, Processing: ' + str(name))
                results = venues_dict[name]
            else:
                print('Fetching: ' + str(name))

                # create the API request URL
                url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                    CLIENT_ID,
                    CLIENT_SECRET,
                    VERSION,
                    lat,
                    lng,
                    radius,
                    LIMIT)
                try:
                    # make the GET request; the timeout stops a stalled
                    # connection from hanging the whole run
                    results = requests.get(url, timeout=30).json()["response"]['groups'][0]['items']
                except Exception as error:
                    # Skip this location, but let KeyboardInterrupt and
                    # SystemExit propagate (a bare except would swallow them).
                    print("Error fetching " + str(name) + " " + repr(error))
                    continue
                venues_dict[name] = results

            # keep only the relevant information for each nearby venue
            for v in results:
                venue = v['venue']
                categories = venue['categories']
                rows.append((
                    name,
                    lat,
                    lng,
                    venue['name'],
                    venue['location']['lat'],
                    venue['location']['lng'],
                    # a venue without any category gets None instead of
                    # raising IndexError
                    categories[0]['name'] if categories else None))
    finally:
        # Write the cache back even if processing stops part-way, so that
        # results already fetched are not requested again.
        write_json(Local_Cache, venues_dict)

    # Passing the column names to the constructor keeps the result valid
    # when no rows were collected at all.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [14]:
# Fetch (or load from the local cache) the venues around every selected
# neighbourhood. The credentials and LIMIT come from the credential cell
# earlier in the notebook; they are not repeated here, so a value changed
# there is no longer silently overwritten by this cell.
t_venues = getNearbyVenues(
    names=t_data['Neighborhood'],
    latitudes=t_data['Latitude'],
    longitudes=t_data['Longitude'],
)
t_venues.head()
# The trailing no-op keeps the cell from rendering an output.
pass

Already exist in cache, Processing: The Beaches
Already exist in cache, Processing: The Danforth West, Riverdale
Already exist in cache, Processing: The Beaches West, India Bazaar
Already exist in cache, Processing: Studio District
Already exist in cache, Processing: Lawrence Park
Already exist in cache, Processing: Davisville North
Already exist in cache, Processing: North Toronto West
Already exist in cache, Processing: Davisville
Already exist in cache, Processing: Moore Park, Summerhill East
Already exist in cache, Processing: Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Already exist in cache, Processing: Rosedale
Already exist in cache, Processing: Cabbagetown, St. James Town
Already exist in cache, Processing: Church and Wellesley
Already exist in cache, Processing: Harbourfront, Regent Park
Already exist in cache, Processing: Ryerson, Garden District
Already exist in cache, Processing: St. James Town
Already exist in cache, Processing: Berczy Park
Already e

#### NOTE: One of the 'Venue Category' values is literally 'Neighborhood', so a one-hot column with that name will exist

In [15]:
# Size of the venue table.
print(t_venues.shape)

# Position of the first venue whose category is the literal 'Neighborhood'.
t_venues['Venue Category'].tolist().index('Neighborhood')

(1719, 7)


4

Find out how many unique categories there are

In [16]:
# Count the distinct venue categories.
category_count = len(t_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 237 uniques categories.


In [17]:
# Single-column frame holding just the venue categories.
category_column = t_venues[['Venue Category']]
category_column.head()
# The trailing no-op keeps the cell from rendering an output.
pass

Convert categorical variable into dummy/indicator variable

In [18]:
# One indicator column per venue category, named after the category itself
# (no prefix).
t_onehot = pd.get_dummies(t_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach each venue's neighbourhood under the name 'Nghbhood' to
# differentiate it from the 'Neighborhood' category column.
t_onehot['Nghbhood'] = t_venues['Neighborhood']

# Reorder so that the last column (the one just added) comes first.
*category_columns, last_column = t_onehot.columns
fixed_columns = [last_column, *category_columns]
toronto_oh = t_onehot[fixed_columns]

We calculate the share of each venue category in a neighborhood by dividing its venue count by the total number of venues in that neighborhood

In [19]:
# Per neighbourhood: occurrences of each category divided by the number of
# venues, with the neighbourhood name restored as an ordinary column.
grouped_by_neighborhood = toronto_oh.groupby(['Nghbhood'])
toronto_percent = (
    grouped_by_neighborhood.sum() / grouped_by_neighborhood.count()
).reset_index()

# Second name for the same frame.
toronto_group = toronto_percent
print(toronto_percent.shape)

(38, 238)


### Cluster Neighborhoods

In [20]:
# number of clusters
kclusters = 5

# drop Nghbhood from column so that only the numeric category shares remain
toronto_clustering = toronto_percent.drop('Nghbhood', axis=1)

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it out makes the clustering depend on the installed
# version even though random_state is fixed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_clustering)

# Check cluster labels
kmeans.labels_[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [21]:
# add cluster labels
print(len(kmeans.labels_))
print(toronto_group.shape)

38
(38, 238)


In [22]:
# kmeans.labels_ follows the row order of `toronto_percent` (one row per
# neighbourhood, in groupby order), which is not the row order of `t_data`.
# The labels are therefore attached by neighbourhood name, not by position.
label_by_neighborhood = dict(zip(toronto_percent['Nghbhood'], kmeans.labels_))

# Work on a fresh frame so that re-running this cell is safe; a
# 'Cluster Labels' column left behind by an earlier run is discarded first.
toronto_data = t_data.drop(columns='Cluster Labels', errors='ignore')
toronto_data.insert(0, 'Cluster Labels',
                    toronto_data['Neighborhood'].map(label_by_neighborhood))

# Neighbourhoods that were not clustered (e.g. no venues were found for
# them) have no label and are left out.
toronto_data = toronto_data.dropna(subset=['Cluster Labels']).reset_index(drop=True)
toronto_data['Cluster Labels'] = toronto_data['Cluster Labels'].astype(int)

toronto_data.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,0,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,0,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,0,M4M,East Toronto,Studio District,43.659526,-79.340923
4,0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


### Plot Map

Get Toronto geographical coordinates

In [26]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Question 3

In [30]:
# Create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# Set color scheme for the clusters
x =np.arange(kclusters)
ys = [i + x + (i**2)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
# print(colors_array)
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighborhood'], toronto_data['Cluster Labels']):
    label = folium.Popup(str(poi) + 'Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Thank you for reviewing