#### Before we get the data and start exploring it, let's import all the dependencies that we will need.

In [2]:
import json # library to handle JSON files
import os  # read API credentials from the environment
import warnings  # report missing configuration

import numpy as np  # library to handle data in a vectorized manner

import pandas as pd  # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests  # library to handle requests

from bs4 import BeautifulSoup  # library used for scraping

import texttable as tt  # library to print data as a table

from sklearn.cluster import KMeans  # import k-means from clustering stage

import folium  # map rendering library

from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values


#### Request then parse the contents of the website

In [3]:
# Download the Wikipedia page listing the "M" postal codes.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() aborts on an HTTP error instead of parsing an error page.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()

# Parse the returned HTML text with Python's built-in "html.parser" backend.
soup = BeautifulSoup(response.text, "html.parser")

#### Scrape the data from the website and print the table to confirm that the required data was obtained

In [4]:
# Collect every <td> element found in the parsed page as a single iterator.
table_data = iter(soup.find_all('td'))

# Passing the same iterator three times to zip() hands out the cells in
# consecutive groups of three (postal code, borough, neighborhood); zip() stops
# as soon as a full group can no longer be formed, so a trailing partial group
# is dropped.
data = [
    [cell.text for cell in cell_group]
    for cell_group in zip(table_data, table_data, table_data)
]

# Render the scraped rows as a text table for a visual check.
table = tt.Texttable()  # create texttable object
table.add_rows([(None, None, None)] + data)  # placeholder first row is consumed as the header slot
table.set_cols_align(('c', 'c', 'c'))  # center-align all three columns
table.header((' Postal Code ', ' Borough ', ' Neighborhood '))  # set the real headers
print(table.draw())

+---------------+------------------+-------------------------------------------+
|  Postal Code  |      Borough     |                Neighborhood               |
|      M1A      |   Not assigned   |               Not assigned                |
|               |                  |                                           |
+---------------+------------------+-------------------------------------------+
|      M2A      |   Not assigned   |               Not assigned                |
|               |                  |                                           |
+---------------+------------------+-------------------------------------------+
|      M3A      |    North York    |                 Parkwoods                 |
|               |                  |                                           |
+---------------+------------------+-------------------------------------------+
|      M4A      |    North York    |             Victoria Village              |
|               |           

#### Convert the data to a pandas data frame 

In [6]:
# Load the scraped rows and label the three columns.
cols_to_check = ['Postal Code', 'Borough', 'Neighborhood']
df = pd.DataFrame(data).set_axis(["Postal Code", "Borough", "Neighborhood"], axis=1)

# Strip newline ("\n") characters from every cell.
df[cols_to_check] = df[cols_to_check].replace(to_replace='\n', value='', regex=True)

# Keep only the first 180 rows.
# NOTE(review): 180 is a hard-coded count; presumably it matches the number of
# rows in the postal-code table on the scraped page - confirm.
df = df.head(180)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned

In [7]:

# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head()


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Group neighborhoods with like postal codes 
#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [8]:

# Rows with an assigned borough.
df_condition = df.loc[df['Borough'].ne('Not assigned')]

# One row per (postal code, borough) pair in first-seen order; the remaining
# column values of each group are joined with ', '.
df_final = (
    df_condition
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# Where the neighborhood is 'Not assigned', fall back to the borough name.
neighborhood_missing = df_final['Neighborhood'].eq("Not assigned")
df_final.loc[neighborhood_missing, 'Neighborhood'] = df_final.loc[neighborhood_missing, 'Borough']

df_final.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [9]:
# Show the (rows, columns) size of `df`.
# NOTE(review): this inspects `df`, not the grouped frame `df_final` created in
# the previous cell - confirm which frame is meant to be checked here.
df.shape

(103, 3)

#### Read the geographical coordinates of each postal code

In [10]:
# Load the coordinates table from the remote CSV and preview the first five rows.
geo_coord = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
geo_coord.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge the two data frames

In [11]:
# Inner-join the grouped neighborhoods with their coordinates on the shared
# 'Postal Code' column (rows without a match on both sides are dropped).
df_merged = df_final.merge(geo_coord, how='inner', on='Postal Code')
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### Filter the data to obtain boroughs that contain the word "Toronto"

In [12]:
# Keep the rows whose borough name contains the literal substring "Toronto"
# (case-sensitive, no regular-expression interpretation).
is_toronto_borough = df_merged['Borough'].str.contains('Toronto', regex=False)
df_filtered = df_merged[is_toronto_borough]
df_filtered

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


#### Use geopy library to get the latitude and longitude values of Toronto

In [13]:
address = 'Toronto, ON'

# Look the address up through the Nominatim geocoding service.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message here
# rather than with an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


#### Create a map of Toronto with neighborhoods superimposed on top

In [14]:
# Base map centered on the coordinates obtained for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row, with a "<neighborhood>, <borough>" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in df_filtered[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

#### Define Foursquare Credentials and Version

In [14]:
# Foursquare credentials are read from the environment so that secrets are never
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Any key that was previously committed in this
# cell should be treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Point at the real cause now; otherwise the first symptom is a failing API call
# in a later cell.
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'Foursquare credentials are not set: define the FOURSQUARE_CLIENT_ID and '
        'FOURSQUARE_CLIENT_SECRET environment variables before calling the API.'
    )


#### Explore a neighborhood in our data frame

In [15]:
# Name of the neighborhood stored under index label 2 (a label, not a position).
df_filtered.at[2, 'Neighborhood']

'Regent Park, Harbourfront'

#### Get the neighborhood's latitude and longitude values

In [16]:
# Read name and coordinates from the row with index label 2.
neighborhood_name = df_filtered.at[2, 'Neighborhood']
neighborhood_latitude = df_filtered.at[2, 'Latitude']
neighborhood_longitude = df_filtered.at[2, 'Longitude']

print(
    'Latitude and longitude values of {} are {}, {}.'.format(
        neighborhood_name,
        neighborhood_latitude,
        neighborhood_longitude,
    )
)

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


#### Now, let's get the top 100 venues that are in Regent Park, Harbourfront within a radius of 500 meters.

In [17]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # search radius passed to the API
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

# Call the API and decode the JSON body into Python objects (nothing is written
# to disk). The timeout stops the cell from hanging on a stalled connection,
# and raise_for_status() reports a rejected request (e.g. bad credentials)
# here instead of as a missing key when the payload is unpacked later.
venues_response = requests.get(url, timeout=30)
venues_response.raise_for_status()
results = venues_response.json()

In [18]:
def get_category_type(row):
    """Return the name of the first category of a venue row.

    The categories are read from ``row['categories']``; if that key is absent,
    ``row['venue.categories']`` is used instead.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a pandas Series)
        Must provide a sequence of category dicts, each with a ``'name'`` key,
        under one of the two keys above.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the category list
    is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error (including
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

#### Clean the json and structure it into a pandas dataframe

In [19]:
# Venue items of the first group in the API response.
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into columns and keep only the fields of interest.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = pd.json_normalize(venues).loc[:, filtered_columns]

# Replace each row's category list with the name of its first category.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Shorten the column labels to the part after the last dot.
nearby_venues.columns = [label.rsplit(".", 1)[-1] for label in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


In [20]:
# Report the number of rows (venues) in the cleaned frame.
venue_count = nearby_venues.shape[0]
print(f'{venue_count} venues were returned by Foursquare.')

44 venues were returned by Foursquare.
