In [3]:
#only needed for installation
#!conda install -c anaconda beautifulsoup4 --yes
#!conda install -c anaconda lxml --yes
#!conda install -c anaconda html5lib --yes
#!conda install -c anaconda requests --yes

import numpy as np 
import pandas as pd 
from bs4 import BeautifulSoup
import requests

# loading a page at a specific url

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_page = requests.get(url).content

# Scrapping a page 
### there a table that have all the data in the page which have the class set as wikitable sortable

In [5]:
soup = BeautifulSoup(html_page,'lxml')
table_postalcode=soup.find('table', class_='wikitable sortable')

postalCodeList=[]
for tableRow in table_postalcode.find_all('tr'):
    entryList=[]
    for tableCol in tableRow.find_all('td'):
        entryList.append(tableCol.text.replace('\n', ''))
    if len(entryList)==3: # only picking with all three enrties
        if entryList[1]!='Not assigned': # leaving the enrties with Borough is set as Not assigned
            if entryList[2]=='Not assigned': #if Neighbourhood is Not assigned then copy the value of Borough 
                entryList[2]=entryList[1]
                postalCodeList.append(entryList)
            else:
                postalCodeList.append(entryList)
headerList=[]
for tableRow in table_postalcode.find_all('tr'):
    for tableheading in tableRow.find_all('th'):
        headerList.append(tableheading.text.replace('\n', ''))

df = pd.DataFrame(postalCodeList, columns = headerList)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


# Joining all the Neighbourhood with same Borough

In [7]:
df1=df.groupby('Postcode')['Neighbourhood'].apply(', '.join)
df=df.drop('Neighbourhood', 1)
df=pd.merge(df, df1, on='Postcode')
df=df.drop_duplicates(keep='last').reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [8]:
df.shape

(103, 3)

# Part 2

### downloading coordinates from a csv file at http://cocl.us/Geospatial_data and loading into a different dataframe

In [9]:
df_coordinates=pd.read_csv('https://cocl.us/Geospatial_data')
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Renaming the columns names and merging

In [10]:
df_coordinates=df_coordinates.rename(columns = {"Postal Code": "PostalCode"}) 
df=df.rename(columns = {"Postcode": "PostalCode"})

#Merging two dataframes bases on same column i.e. PostalCode
df=pd.merge(df, df_coordinates, on='PostalCode')

In [11]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


# Part 3

In [14]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

### Getting Coordinates of Toronto city

In [15]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [16]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add Borough as markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Map is not showing at github so i have added a screen shot of the map as following
<img src="toronto.jpg"/>

## Looking into Neighbourhoods
#### I like this area seems crowdy

In [19]:
df.loc[79]

PostalCode                   M4S
Borough          Central Toronto
Neighbourhood         Davisville
Latitude                 43.7043
Longitude               -79.3888
Name: 79, dtype: object

### Getting ready with foursquare API

In [20]:
CLIENT_ID = 'TQJ4YICT53P0H1NVREHKCL1OE4NNK12MI1WAWTAJFMF3TSM2' # your Foursquare ID
CLIENT_SECRET = 'OZ1IXQ4FNXCI00WH5ROPT3EBWWTSJB5ISKCZRAXVDC3VQGAV' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TQJ4YICT53P0H1NVREHKCL1OE4NNK12MI1WAWTAJFMF3TSM2
CLIENT_SECRET:OZ1IXQ4FNXCI00WH5ROPT3EBWWTSJB5ISKCZRAXVDC3VQGAV


In [21]:
lat = df.loc[79, 'Latitude'] 
long = df.loc[79, 'Longitude'] 
name = df.loc[79, 'Neighbourhood'] 

### let us explore the area on the 500 meter radius and get only 100 results

In [22]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    long, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=TQJ4YICT53P0H1NVREHKCL1OE4NNK12MI1WAWTAJFMF3TSM2&client_secret=OZ1IXQ4FNXCI00WH5ROPT3EBWWTSJB5ISKCZRAXVDC3VQGAV&v=20180604&ll=43.7043244,-79.3887901&radius=500&limit=100'

In [23]:
json_result = requests.get(url).json()
json_result

{'meta': {'code': 200, 'requestId': '5db398e26001bc00383bc7e7'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Davisville',
  'headerFullLocation': 'Davisville, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 36,
  'suggestedBounds': {'ne': {'lat': 43.7088244045, 'lng': -79.38257691798016},
   'sw': {'lat': 43.699824395499995, 'lng': -79.39500328201983}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4ae6ea6ef964a52082a721e3',
       'name': 'Jules Cafe Patisserie',
       'location': {'address': '617 Mt Pleasant Ave',
        'crossStreet': 'at Manor Rd E',
        'lat': 43.70413799694304,
        'lng': -79.38841260442167,
        'labeledLatLngs':

### Lets see what we have find so far

In [28]:
venues = json_result['response']['groups'][0]['items']
dataframe_filtered = json_normalize(venues) # flatten JSON

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
dataframe_filtered =dataframe_filtered.loc[:, filtered_columns]

# filter the category for each row
dataframe_filtered['venue.categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean columns
dataframe_filtered.columns = [col.split(".")[-1] for col in dataframe_filtered.columns]

dataframe_filtered

Unnamed: 0,name,categories,lat,lng
0,Jules Cafe Patisserie,Dessert Shop,43.704138,-79.388413
1,Thobors Boulangerie Patisserie Café,Café,43.704514,-79.388616
2,Marigold Indian Bistro,Indian Restaurant,43.702881,-79.388008
3,XO Gelato,Dessert Shop,43.705177,-79.388793
4,Viva Napoli,Pizza Place,43.705752,-79.389125
5,Zee Grill,Seafood Restaurant,43.704985,-79.388476
6,Starbucks,Coffee Shop,43.70594,-79.38941
7,June Rowlands Park,Park,43.700517,-79.389189
8,Florentia Ristorante,Italian Restaurant,43.703594,-79.387985
9,Sakae Sushi,Sushi Restaurant,43.704944,-79.388704


### at first look at the dataframe above we can easily see that there are lots of eatable places are here so there is a good wa