In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import json
import xml
import pandas as pd
import lxml

In [2]:
website_url = requests.get('https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050').text

from bs4 import BeautifulSoup

soup = BeautifulSoup(website_url,'lxml') # make Soup using URL

print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XpBJTApAAD0AACWpshkAAAAJ","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":949497198,"wgRevisionId":945633050,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canada","Toronto","Ontario

In [3]:
neighborhood = soup.find('table', class_ = 'wikitable')
neighborhood_rows = neighborhood.find_all('tr')

In [4]:
information = []
for row in neighborhood_rows:
    info = row.text.split('\n')[1:-1] # remove empty str - the first and last items
    information.append(info)
    
information[0:20] #see the first 20 rows


[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn']]

In [7]:
neighborhood_df = pd.DataFrame(information[1:], columns=information[0])

neighborhood_df = neighborhood_df.rename(columns={neighborhood_df.columns[2]: "Neighborhood" })


neighborhood_df.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [8]:
#getting rid of Not Assigned
neighborhood_df = neighborhood_df[neighborhood_df.Borough != "Not assigned"]
neighborhood_df.reset_index(drop = True, inplace = True)
neighborhood_df.head(20)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [10]:
grouped = neighborhood_df.groupby(['Postcode']) # group by Postcode


# combine the neighborhoods grouped by postcode and into a new df
neighborhood_grouped = grouped['Neighborhood'].apply(lambda x: x.sum()) 

# adds spaces and commas between neighborhoods
neighborhood_grouped = grouped['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
borough_grouped = grouped['Borough'].apply(lambda x: set(x).pop())

# turn borough_grouped and neighborhood_grouped into dataframes
borough = borough_grouped.to_frame()

neighborhood = neighborhood_grouped.to_frame()

grouped_final = borough.merge(neighborhood, on="Postcode")

grouped_final



Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [11]:
print('The numer of rows and columns in this final grouped dataframe is', grouped_final.shape)

The numer of rows and columns in this final grouped dataframe is (103, 2)


In [12]:
#attach longitude and latitude to neighborhoods and post code table

geospatial_data = pd.read_csv(r'C:\Users\Croni\Downloads\Geospatial_Coordinates.csv')

geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
geospatial_data = geospatial_data.rename(columns = {geospatial_data.columns[0]: "Postcode"})

In [14]:
full_table = grouped_final.merge(geospatial_data, on = 'Postcode')

full_table.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
#Clustering the neighborhoods of Toronto 

#convert address to latitude and longitude
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

#display images
from IPython.display import Image
from IPython.core.display import HTML 

!conda install -c conda-forge folium = 0.5.0 --yes


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.




CondaValueError: invalid package specification: =



In [16]:
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json
import requests
from pandas.io.json import json_normalize #transforms JSON file into pandas dataframe

In [17]:
print('This dataframe has {} boroughs and {} neighborhoods.'.format(len(full_table['Borough'].unique()),full_table['Neighborhood'].shape[0]))

This dataframe has 10 boroughs and 103 neighborhoods.


In [18]:
#Get longitude and latitude using geolocator 

address = 'Toronto, Ontario, Canada'

geolocator = Nominatim(user_agent = "TO_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}'.format(latitude,longitude))

The geographical coordinates of Toronto are 43.6534817, -79.3839347


In [20]:
#Creating a map
import folium
map_toronto = folium.Map (location = [latitude,longitude], zoom_start = 10)

#add neighborhoods
for lat,long,bor,neigh in zip (full_table['Latitude'], full_table['Longitude'], full_table['Borough'], full_table['Neighborhood']): 
    label = '{}, {}'.format(neigh,bor)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(
        [lat,long],
        radius = 7,
        popup = label,
        color = 'red',
        fill = True,
        fill_color = 'white',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_toronto)
    
map_toronto

In [21]:
#cluster the boroughs

list_boroughs = full_table['Borough'].unique()
list_boroughs

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [23]:
def borough_loc(list_of_places):
    for place in list_of_places:
        address = (place + ", Ontario, Canada")
        geolocator = Nominatim(user_agent="TO_explorer")
        location = geolocator.geocode(address)
        latitude = location.latitude
        longitude = location.longitude
        print('{''}, {}, {},'.format(place,latitude,longitude))

borough_loc(list_boroughs)

Scarborough, 43.773077, -79.257774,
North York, 43.7543263, -79.44911696639593,
East York, 43.699971000000005, -79.33251996261595,
East Toronto, 43.6247901, -79.3934918,
Central Toronto, 43.6534817, -79.3839347,
Downtown Toronto, 43.6563221, -79.3809161,
York, 44.0007518, -79.4372217,
West Toronto, 43.6534817, -79.3839347,
Mississauga, 43.590338, -79.645729,
Etobicoke, 43.671459150000004, -79.55249206611668,


In [26]:
import numpy as np

boroughs = ['Scarborough', 43.773077, -79.257774,
'North York', 43.7708175, -79.4132998,
'East York', 43.6913391, -79.3278212,
'East Toronto', 43.653963, -79.387207,
'Central Toronto', 43.653963, -79.387207,
'Downtown Toronto', 43.655115, -79.380219,
'York', 44.0007518, -79.4372217,
'West Toronto', 43.653963, -79.387207,
"Queen's Park", 43.6599803, -79.3903686,
'Mississauga', 43.590338, -79.645729,
'Etobicoke', 43.6435559, -79.5656326]

boroughs_df = pd.DataFrame(np.array(boroughs).reshape(11,3), columns = ["Borough","Latitude","Longitude"])

boroughs_df

Unnamed: 0,Borough,Latitude,Longitude
0,Scarborough,43.773077,-79.257774
1,North York,43.7708175,-79.4132998
2,East York,43.6913391,-79.3278212
3,East Toronto,43.653963,-79.387207
4,Central Toronto,43.653963,-79.387207
5,Downtown Toronto,43.655115,-79.380219
6,York,44.0007518,-79.4372217
7,West Toronto,43.653963,-79.387207
8,Queen's Park,43.6599803,-79.3903686
9,Mississauga,43.590338,-79.645729
