In [1]:
import numpy as np 
import pandas as pd
import requests # a package to send http request
from bs4 import BeautifulSoup #BeautifulSoup is a package to parse and work with html file

In [2]:
# Data extraction from wikipedia

wiki_data_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text #return the data of the page in html format
soup = BeautifulSoup(wiki_data_url, 'lxml') #assign the url address to a variable in order to parse it
#print(soup.prettify())
data_table = soup.find('table',{'class':'wikitable sortable'}) #taking the part of the html file that we need
first_column = []
second_column = []
third_column = []
for row in data_table.findAll('tr'): # we need everything between 'tr' and 'td'
    cells = row.findAll('td')
    if len(cells)==3: #3 is the number of columns that we want (Neighborhood, Postal Code, Borough)
        first_column.append(cells[0].find(text=True)) 
        second_column.append(cells[1].find(text=True))
        third_column.append(cells[2].find(text=True))
                                                      #find is a function that extract the data not in a html format in the specific location where cells is in 
                                                      #if we not use find the data in our data frame will be in a html format

df = pd.DataFrame() 
df['Postal Code'] = first_column
df['Borough'] = second_column
df['Neighborhood'] = third_column
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Data cleaning 1 - drop 'not assigned' in 'borough' column
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace = True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [4]:
# Data cleaning 3 - replace neighborhoods with 'not assigned' to corresponds borough
for i,j in zip(df['Borough'], df['Neighborhood']):
    if j == 'Not assigned':
        df.replace(j, i,  inplace = True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [5]:
# Data cleaning 2 - same postal code analysis
df_groupby = df.groupby(['Postal Code', 'Borough'])['Neighborhood'].apply(','.join)
df_groupby_DF = pd.DataFrame(df_groupby)
df_groupby_DF = df_groupby_DF.reset_index()
df_groupby_DF.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood\n,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae\n


In [6]:
# Number of rows of the data frame after 'cleaning'
df_groupby_DF.shape[0]

103

In [7]:
#extract data and display its header
data_lat_long = pd.read_csv('https://cocl.us/Geospatial_data')
data_lat_long.head()
#adding long and lat coordinates to our grouped data frame
df_groupby_DF['Latitude'] = data_lat_long['Latitude']
df_groupby_DF['Longitude'] = data_lat_long['Longitude']
df_groupby_DF.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476


In [77]:
# In this part I will explore Toronto's neighborhoods and will find 50 places per neighborhood (using Foursquare API) and then cluster the results

In [8]:
# import relevant libaries
!conda install -c conda-forge folium=0.5.0 --yes
import folium
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    altair-3.2.0               |           py36_0         770 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.3 MB

The following NEW packages will be 

In [9]:
df_groupby_DF['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

In [10]:
df_groupby_2 = df.groupby('Borough')
West_Toronto_neighborhoods = set(df_groupby_2.get_group('West Toronto')['Neighborhood'])
east_Toronto_neighborhoods = set(df_groupby_2.get_group('East Toronto')['Neighborhood'])
downtown_Toronto_neighborhoods = set(df_groupby_2.get_group('Downtown Toronto')['Neighborhood'])
central_Toronto_neighborhoods = set(df_groupby_2.get_group('Central Toronto')['Neighborhood'])

In [11]:
hoods_Toronto_set = West_Toronto_neighborhoods | east_Toronto_neighborhoods | downtown_Toronto_neighborhoods | central_Toronto_neighborhoods
hoods_Toronto_set.remove('Business Reply Mail Processing Centre 969 Eastern\n')
hoods_Toronto_set.remove('Stn A PO Boxes 25 The Esplanade\n')
hoods_Toronto_set

{'Adelaide\n',
 'Bathurst Quay\n',
 'Berczy Park',
 'Brockton\n',
 'CN Tower',
 'Cabbagetown',
 'Central Bay Street\n',
 'Chinatown',
 'Christie\n',
 'Church and Wellesley',
 'Commerce Court',
 'Davisville\n',
 'Davisville North\n',
 'Deer Park',
 'Design Exchange',
 'Dovercourt Village',
 'Dufferin\n',
 'Exhibition Place',
 'First Canadian Place',
 'Forest Hill North',
 'Forest Hill SE\n',
 'Forest Hill West\n',
 'Garden District\n',
 'Grange Park',
 'Harbord\n',
 'Harbourfront',
 'Harbourfront East\n',
 'Harbourfront West\n',
 'High Park',
 'India Bazaar',
 'Island airport\n',
 'Kensington Market',
 'King\n',
 'King and Spadina',
 'Lawrence Park',
 'Little Portugal',
 'Moore Park',
 'North Midtown\n',
 'North Toronto West\n',
 'Parkdale',
 'Parkdale Village',
 'Railway Lands',
 'Rathnelly',
 'Regent Park',
 'Richmond\n',
 'Riverdale',
 'Roncesvalles',
 'Rosedale',
 'Roselawn\n',
 'Runnymede',
 'Ryerson\n',
 'South Hill',
 'South Niagara',
 'St. James Town',
 'Studio District\n',
 'Su

In [12]:
hoods_Toronto = pd.DataFrame(hoods_Toronto_set)
hoods_Toronto.columns = ['Neighborhood']
hoods_Toronto.head()

Unnamed: 0,Neighborhood
0,Forest Hill North
1,Little Portugal
2,The Beaches West
3,Island airport
4,Christie


In [None]:
# Toronto neighborhoods coordinates
lat = []
long = []
for hood in hoods_Toronto_set:
    address = hood + ',Toronto'
    geolocator = Nominatim(user_agent="Toronto_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    lat.append(latitude)
    long.append(latitude)
hoods_Toronto['Latitude'] = lat
hoods_Toronto['Longitude'] =long
hoods_Toronto

In [65]:
# finding the coordinates of Toronto
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
Toronto_latitude = latitude
Toronto_longitude = longitude

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [None]:
# using foursquare API
import json # library to handle JSON files
CLIENT_ID = 'T0GMVPN0M5BBZJJX5EFNCWNY53A3ASN4GLHPL5NHAPYHXBEG' # your Foursquare ID
CLIENT_SECRET = '1ZNOAZE0ZWE3A1NWOBWUUZBKVTOBUEGVMC1VGMQ5XZNWYYZB' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# API query - Dovercourt Village
radius = 500
LIMIT = 50
url_Dovercourt_Village = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, hoods_Toronto.iloc[0,1], hoods_Toronto.iloc[0,2], radius, LIMIT)
results_Dovercourt_Village = requests.get(url_Dovercourt_Village).json()

# API query - The Beaches
radius = 500
LIMIT = 50
url_The_Beaches = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, hoods_Toronto.iloc[1,1], hoods_Toronto.iloc[1,2], radius, LIMIT)
results_Beaches = requests.get(url_The_Beaches).json()

# API query - Harbourfront
radius = 500
LIMIT = 50
url_Harbourfront = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, hoods_Toronto.iloc[2,1], hoods_Toronto.iloc[2,2], radius, LIMIT)
results_Harbourfront = requests.get(url_Harbourfront).json()

# API query - Lawrence Park
radius = 500
LIMIT = 50
url_Lawrence_Park = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, hoods_Toronto.iloc[3,1], hoods_Toronto.iloc[3,2], radius, LIMIT)
results_Lawrence_Park = requests.get(url_Lawrence_Park).json()

In [67]:
#  get_category_type function from the previuos module

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    


In [68]:
from pandas.io.json import json_normalize #tranform JSON file into a pandas dataframe

In [None]:
#Dovercourt Village
nearby_venues = json_normalize(venues) # flatten JSON
venues_Dovercourt_Village = results_Dovercourt_Village['response']['groups'][0]['items']

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()
df_explore_Dovercourt_Village = nearby_venues.iloc[:,:]
df_explore_Dovercourt_Village

In [None]:
#The Beaches
nearby_venues = json_normalize(venues) # flatten JSON
venues_Dovercourt_Village = results_Beaches['response']['groups'][0]['items']

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
df_explore_Beaches = nearby_venues.iloc[:,:]
df_explore_Beaches

In [None]:
#Harbourfront
nearby_venues = json_normalize(venues) # flatten JSON
venues_Dovercourt_Village = results_Harbourfront['response']['groups'][0]['items']

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
df_explore_Harbourfront = nearby_venues.iloc[:,:]
df_explore_Harbourfront

In [None]:
#Lawrence Park
nearby_venues = json_normalize(venues) # flatten JSON
venues_Dovercourt_Village = results_Lawrence_Park['response']['groups'][0]['items']

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
df_Lawrence_Park = nearby_venues.iloc[:,:]
df_Lawrence_Park

In [None]:
df_explore = pd.concat([df_explore_Dovercourt_Village, df_explore_Beaches, df_explore_Harbourfront, df_Lawrence_Park], axis = 0)
df_explore = df_explore.reset_index(drop = True)
df_explore.columns = ['name', 'categories', 'lat', 'lng']

In [75]:
# cleaning data for kmeans clustering
df_explore.drop(['name', 'categories'] , axis = 1, inplace = True)
df_explore

In [76]:
# clustring
kmeans = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 300).fit(df_explore)

In [None]:
print('The labels are:', kmeans.labels_)
print('The centroids are:', kmeans.cluster_centers_)