In [188]:
import requests
import bs4
import pandas as pd
import numpy as np

#### Scrape a Wikipedia page and put values from the table into a DataFrame

In [17]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url)
soup = bs4.BeautifulSoup(response.text, 'html.parser')
data = []
for tr in soup.table.find_all('tr')[1:]:
    data.append( list(map(lambda x: x.text.rstrip(), tr.find_all('td'))) )
df = pd.DataFrame(data, columns=['PostalCode', 'Burough', 'Neighborhood'])
df

Unnamed: 0,PostalCode,Burough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### Requirement: Remove all rows that have a Burough value of 'Not assigned'

In [18]:
df = df.drop(df[df.Burough == 'Not assigned'].index)
df.Burough.value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Mississauga          1
Name: Burough, dtype: int64

#### Requirement: Combine rows that have the same PostalCode
There are no rows that share a postal code, so no action is needed

In [4]:
v = df.PostalCode.value_counts()
df[df.PostalCode.isin(v.index[v.gt(5)])]

Unnamed: 0,PostalCode,Burough,Neighborhood


#### Requirement: If a Neighborhood is 'Not assigned', give it the value in Burough
There are no Neighborhoods with the value 'Not assigned', so no action is needed

In [5]:
df[df.Neighborhood == 'Not assigned'].count()

PostalCode      0
Burough         0
Neighborhood    0
dtype: int64

#### After processing the data, 103 rows remain in the DataFrame

In [19]:
df.shape

(103, 3)

#### Use geo data to get latitude and longitude coordinates for PostalCodes

In [15]:
geo_df = pd.read_csv('Geospatial_Coordinates.csv')
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [20]:
df = df.merge(geo_df, left_on='PostalCode', right_on='Postal Code').drop('Postal Code', axis=1)
df

Unnamed: 0,PostalCode,Burough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [98]:
df = df[df['Burough'].str.contains('[Tt]oronto')]

In [21]:
import folium

In [107]:
df.reset_index(drop=True, inplace=True)


In [111]:
map_toronto = folium.Map(location=[df.Latitude[0], df.Longitude[0]], zoom_start=11)
for lat, lng, burough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Burough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, burough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
map_toronto

In [193]:
# Here a request was made to the FOURSQUARE API, but I have omitted it so as not to display my access parameters to the API
# Just in case someone can run this and get values stored in variables...
CLIENT_ID = 'NOPE'
CLIENT_SECRET = 'NICE TRY'

In [33]:
response = requests.get(url).json()
response

{'meta': {'code': 200, 'requestId': '602eb2b72b8f4854bd1f3d3f'},
 'response': {'venues': [{'id': '4e42684718a8627fce453c01',
    'name': 'TTC stop #8380',
    'location': {'address': 'Underhill Dr',
     'crossStreet': 'At Cassandra N',
     'lat': 43.752672,
     'lng': -79.326351,
     'labeledLatLngs': [{'label': 'display',
       'lat': 43.752672,
       'lng': -79.326351}],
     'distance': 273,
     'cc': 'CA',
     'city': 'Toronto',
     'state': 'ON',
     'country': 'Canada',
     'formattedAddress': ['Underhill Dr (At Cassandra N)',
      'Toronto ON',
      'Canada']},
    'categories': [{'id': '52f2ab2ebcbc57f1066b8b4f',
      'name': 'Bus Stop',
      'pluralName': 'Bus Stops',
      'shortName': 'Bus Stop',
      'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/travel/busstation_',
       'suffix': '.png'},
      'primary': True}],
    'referralId': 'v-1613673143',
    'hasPerk': False},
   {'id': '4e8d9dcdd5fbbbb6b3003c7b',
    'name': 'Brookbanks Park',
    '

In [41]:
from pandas.io.json import json_normalize

In [43]:
venues = response['response']['venues']
nearby_venues = json_normalize(venues)

  nearby_venues = json_normalize(venues)


In [46]:
nearby_venues = nearby_venues.loc[:, ['name', 'categories', 'location.lat', 'location.lng']]
nearby_venues

Unnamed: 0,name,categories,location.lat,location.lng
0,TTC stop #8380,"[{'id': '52f2ab2ebcbc57f1066b8b4f', 'name': 'B...",43.752672,-79.326351
1,Brookbanks Park,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",43.751976,-79.332140
2,Subway,"[{'id': '4bf58dd8d48988d1c5941735', 'name': 'S...",43.760334,-79.326906
3,Dollarama,"[{'id': '52dea92d3cf9994f4e043dbb', 'name': 'D...",43.760341,-79.325519
4,Allwyn's Bakery,"[{'id': '4bf58dd8d48988d144941735', 'name': 'C...",43.759840,-79.324719
...,...,...,...,...
87,Rachdale Public School,"[{'id': '4f4533804b9074f6e4fb0105', 'name': 'E...",43.751937,-79.322195
88,Valu Mart Garden Centre,"[{'id': '4eb1c0253b7b52c0e1adc2e9', 'name': 'G...",43.746040,-79.325238
89,TTC Stop 9083,"[{'id': '52f2ab2ebcbc57f1066b8b4f', 'name': 'B...",43.759251,-79.334000
90,Ranchdale Park,"[{'id': '4bf58dd8d48988d163941735', 'name': 'P...",43.751388,-79.322138


In [80]:
def extract_category(category):
    if len(category) == 0:
        return None
    else:
        return category[0]['name']

In [83]:
nearby_venues['categories'] = nearby_venues['categories'].apply(extract_category)
nearby_venues

Unnamed: 0,name,categories,location.lat,location.lng
0,TTC stop #8380,Bus Stop,43.752672,-79.326351
1,Brookbanks Park,Park,43.751976,-79.332140
2,Subway,Sandwich Place,43.760334,-79.326906
3,Dollarama,Discount Store,43.760341,-79.325519
4,Allwyn's Bakery,Caribbean Restaurant,43.759840,-79.324719
...,...,...,...,...
87,Rachdale Public School,Elementary School,43.751937,-79.322195
88,Valu Mart Garden Centre,Garden Center,43.746040,-79.325238
89,TTC Stop 9083,Bus Stop,43.759251,-79.334000
90,Ranchdale Park,Park,43.751388,-79.322138


In [86]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [112]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'], latitudes=df['Latitude'], longitudes=df['Longitude'])
toronto_venues

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.654260,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.654260,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.654260,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.654260,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.654260,-79.360636,Impact Kitchen,43.656369,-79.356980,Restaurant
...,...,...,...,...,...,...,...
1596,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park
1597,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Toronto Yoga Mamas,43.664824,-79.324335,Yoga Studio
1598,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Olliffe On Queen,43.664503,-79.324768,Butcher
1599,"Business reply mail Processing Centre, South C...",43.662744,-79.321558,Greenwood Cigar & Variety,43.664538,-79.325379,Smoke Shop


#### One-hot encode our venues (because they are categorical)

In [174]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix='', prefix_sep='')
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

# make last column (neighborhood) the first column
ordered_columns = toronto_onehot.columns.tolist()
ordered_columns.insert(0, ordered_columns.pop(ordered_columns.index('Neighborhood')))
toronto_onehot = toronto_onehot[ordered_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0625,0.0625,0.0625,0.125,0.125,0.0625,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015873,0.0,0.0,0.015873,0.0,0.015873


In [182]:
from sklearn.cluster import KMeans
k = 5

toronto_clustering = toronto_grouped.drop('Neighborhood', axis=1)

kmeans = KMeans(n_clusters=k, random_state=53)
kmeans.fit(toronto_clustering)
kmeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 3, 1, 0, 1,
       1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [185]:
df['ClusterLabel'] = kmeans.labels_
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ClusterLabel'] = kmeans.labels_


Unnamed: 0,PostalCode,Burough,Neighborhood,Latitude,Longitude,ClusterLabel
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,1
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564,1
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,1
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259,1


In [192]:
import matplotlib.cm as cm
import matplotlib.colors as colors
map_clusters = folium.Map(location=[df.Latitude[0], df.Longitude[0]], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['ClusterLabel']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Map meaning
Here we can see different neighborhoods clustered according to their venues. This did not create a particularly interesting result, as almost all neighborhoods were put in a single cluster.

##### Improvement
One suggestion is to not use *all* of the venues, but rather the most common ones (this was done in a previous lab). Also, perhaps also the clustering algorithm has hyperparameters that could be tweaked, but I don't know how to do that. 