# 1) Web Scraping

In [65]:
#main libraries
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
#import folium
import requests
from geopy.geocoders import Nominatim

In [2]:
#prepare webscraper 
!pip install beautifulsoup4
!pip install lxml
from bs4 import BeautifulSoup
import csv
import json
import lxml



In [3]:
#get raw html file from url page
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
results = requests.get(url).text
mySoup = BeautifulSoup(results, 'lxml')

In [4]:
scrapTable = mySoup.body.find('table')

In [5]:
table_rows = scrapTable.findAll('tr')

In [6]:
output_rows = []
for table_row in table_rows:
    columns = table_row.findAll('td')
    output_row = []
    for column in columns:
        if(column.text != 'Not assigned'):
            output_row.append(column.text)
        else:
            output_row.append('discard')
    output_rows.append(output_row)


In [7]:
#initialize .csv file on which we will write the data from the webpage and that we will later transform in a dataframe
csv_file = open('TorontoData.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerows(output_rows)

In [8]:
csv_file.close()

# 2) Create dataframe

In [9]:
df = pd.read_csv('TorontoData.csv')

In [10]:
df.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [11]:
df = df[df.Borough != 'discard']

In [12]:
df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M3A,North York,Parkwoods\n
2,M4A,North York,Victoria Village\n
3,M5A,Downtown Toronto,Harbourfront\n
4,M5A,Downtown Toronto,Regent Park\n
5,M6A,North York,Lawrence Heights\n
6,M6A,North York,Lawrence Manor\n
7,M7A,Queen's Park,Not assigned\n
9,M9A,Etobicoke,Islington Avenue\n
10,M1B,Scarborough,Rouge\n
11,M1B,Scarborough,Malvern\n


In [13]:
TorontoDF = pd.DataFrame(columns= ['PostalCode', 'Borough', 'Neighborhood'])

In [14]:
for index, row in df.iterrows():
    if(row['Neighborhood'] == 'Not assigned\n'):
        row['Neighborhood'] = row['Borough']
    postCode = row['PostalCode']
    borough = row['Borough']
    neighborhood = row['Neighborhood']
    
    TorontoDF = TorontoDF.append({'PostalCode': postCode, 'Borough': borough, 'Neighborhood': neighborhood}, ignore_index=True)

In [15]:
TorontoDF['Neighborhood'] = TorontoDF['Neighborhood'].str.replace('\n','')

In [32]:
TorontoDF.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [61]:
#TorontoDF.groupby(['PostalCode']).groups.keys()
#TorAggregated = TorontoDF.groupby(['PostalCode'])[['Neighborhood'], ['Borough']].apply(lambda x: ','.join(x)).reset_index()
TorAggregated = TorontoDF.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(list)

In [93]:
TorontoDF_Final= pd.DataFrame(TorAggregated)

In [94]:
TorontoDF_Final.reset_index(inplace=True)

In [95]:
TorontoDF_Final

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
5,M1J,Scarborough,[Scarborough Village]
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]"
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]"
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]"


In [79]:
TorontoDF_Final.shape

(103, 3)

# 3) Get PostCodes Coordinates

### *Note:* I had some troubles because geocoders could not have been imported, therefore I used the .csv with the coordinates

In [85]:
!wget -q -O 'toronto_data.csv' https://cocl.us/Geospatial_data

In [87]:
latDF = pd.read_csv('toronto_data.csv')

In [101]:
TorontoDF_wCoord = TorontoDF_Final.assign(Latitude = latDF['Latitude'],  Longitude= latDF['Longitude'] )

In [102]:
TorontoDF_wCoord

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476
5,M1J,Scarborough,[Scarborough Village],43.744734,-79.239476
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]",43.727929,-79.262029
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]",43.711112,-79.284577
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]",43.716316,-79.239476
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]",43.692657,-79.264848


# 4) Toronto Neighbor Clustering

In [103]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [105]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, postcode in zip(TorontoDF_wCoord['Latitude'], TorontoDF_wCoord['Longitude'], TorontoDF_wCoord['Borough'], TorontoDF_wCoord['PostalCode']):
    label = '{}, {}'.format(postCode, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## I do not know too much of Toronto, so I decided to narrow down my exploration to the area around its university, located in the downtown (precisely M1B, Downtown Toronto)

In [115]:
downtown_data = TorontoDF_wCoord[TorontoDF_wCoord['PostalCode'] == 'M1B'].reset_index(drop=True)
downtown_data['Latitude'][0]

43.806686299999996

In [106]:
m1b_address = 'M1B, Toronto, Ontario'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(m1b_address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the downtown are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of the downtown are 43.653963, -79.387207.


In [113]:
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=14)

# add markers to map
for lat, lng, label in zip(downtown_data['Latitude'], downtown_data['Longitude'], downtown_data['Neighborhood']):
    label = '{}, {}'.format(postCode, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

# 5) use FS API 

In [116]:
CLIENT_ID = 'K0AX2YIXDJUOQYQTF5CBAL5VZSPVVFJKFV3QOI0RZIPPPC21' # your Foursquare ID
CLIENT_SECRET = 'RGWNPXTQE1JTAQ2IGB1D5HS1NGT54CUJCUTXTVC4TQL2DTZB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: K0AX2YIXDJUOQYQTF5CBAL5VZSPVVFJKFV3QOI0RZIPPPC21
CLIENT_SECRET:RGWNPXTQE1JTAQ2IGB1D5HS1NGT54CUJCUTXTVC4TQL2DTZB


In [122]:

LIMIT = 100 

radius = 1500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    downtown_data['Latitude'][0], 
    downtown_data['Longitude'][0], 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=K0AX2YIXDJUOQYQTF5CBAL5VZSPVVFJKFV3QOI0RZIPPPC21&client_secret=RGWNPXTQE1JTAQ2IGB1D5HS1NGT54CUJCUTXTVC4TQL2DTZB&v=20180605&ll=43.806686299999996,-79.19435340000001&radius=1500&limit=100'

In [123]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d0aa47c2b274a0039e83996'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 31,
  'suggestedBounds': {'ne': {'lat': 43.82018631350001,
    'lng': -79.17568194219122},
   'sw': {'lat': 43.79318628649998, 'lng': -79.2130248578088}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4c97975582b56dcb8320ebaa',
       'name': 'Canadiana exhibit',
       'location': {'lat': 43.81796218928876,
        'lng': -79.19337359666939,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.81796218928876,
          'lng': -79.193373

In [124]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [131]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Canadiana exhibit,Zoo Exhibit,43.817962,-79.193374
1,Images Salon & Spa,Spa,43.802283,-79.198565
2,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777
3,Staples Morningside,Paper / Office Supplies Store,43.800285,-79.196607
4,LCBO,Liquor Store,43.796671,-79.204586
5,Wendy's,Fast Food Restaurant,43.802008,-79.19808
6,Harvey's,Fast Food Restaurant,43.800106,-79.198258
7,Wendy's,Fast Food Restaurant,43.807448,-79.199056
8,Tim Hortons,Coffee Shop,43.802,-79.198169
9,Pizza Pizza,Pizza Place,43.806613,-79.178445


In [126]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

31 venues were returned by Foursquare.


## I will now check how many categories are present among the results

In [127]:
print('There are {} uniques categories.'.format(len(nearby_venues['categories'].unique())))

There are 18 uniques categories.


In [133]:
# one hot encoding
downtown_onehot = pd.get_dummies(nearby_venues[['categories']], prefix="", prefix_sep="")


downtown_onehot

Unnamed: 0,American Restaurant,Bakery,Big Box Store,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Cosmetics Shop,Fast Food Restaurant,Fruit & Vegetable Store,Hobby Shop,Liquor Store,Movie Theater,Paper / Office Supplies Store,Pizza Place,Spa,Video Game Store,Women's Store,Zoo Exhibit
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
