In [1]:
import numpy as np
import pandas as pd
import urllib.request # import the library we use to open URLs
from bs4 import BeautifulSoup # import the BeautifulSoup library so we can parse HTML and XML documents

In [2]:
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" # specify which URL/web page we are going to be scraping
page=urllib.request.urlopen(url) # open the url using urllib.request and put the HTML into the page variable
soup= BeautifulSoup(page, "lxml")# parse the HTML from our URL into the BeautifulSoup parse tree format

In [19]:
#print(soup.prettify()) #print results of the web page scrape

In [4]:
soup.title #Bring back page 'title' and data between the start and end of 'title' tags
soup.title.string #Bring back just titles that are strings

'List of postal codes of Canada: M - Wikipedia'

In [5]:
all_tables=soup.find_all("table")# use the 'find_all' function to bring back all instances of the 'table' tag in the HTML and store in 'all_tables' variable

In [6]:
right_table=soup.find('table', class_='wikitable sortable') #place the right table "wikitable sortable" with all the needed information into right_table

In [7]:
A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

df=pd.DataFrame(A,columns=['Postal Code'])
df['Borough']=B
df['Neighborhood']=C

In [20]:
dfToronto=df[~df.Borough.str.contains("Not assigned")]
dfToronto.drop_duplicates()
dfToronto.groupby(['Postal Code','Borough'], sort = False).agg(lambda x: ','.join(x))


Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
Postal Code,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"
M3B,North York,Don Mills
M4B,East York,"Parkview Hill, Woodbine Gardens"
M5B,Downtown Toronto,"Garden District, Ryerson"


In [22]:
#question 2 of Week 3 Applied Data Science - Segmenting and Clustering Neighborhoods in Toronto
#Add locations to list of Postal Codes, Boroughs, and Neighborhoods
dfGeoloc = pd.read_csv ('http://cocl.us/Geospatial_data')
dfGeoloc

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [23]:
dfToronto = dfToronto.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
dfToronto['Borough']=df['Borough'].str.join('').str.replace(r"\n", "")
dfToronto['Neighborhood']=df['Neighborhood'].str.join('').str.replace(r"\n", "")

In [24]:
dfMerge=pd.concat([dfToronto, dfGeoloc],axis=1,join="inner")
dfMerge.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Postal Code.1,Latitude,Longitude
2,M3A,North York,Parkwoods,M1E,43.763573,-79.188711
3,M4A,North York,Victoria Village,M1G,43.770992,-79.216917
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",M1H,43.773136,-79.239476
5,M6A,North York,"Lawrence Manor, Lawrence Heights",M1J,43.744734,-79.239476
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",M1K,43.727929,-79.262029


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [None]:
#question 3 of Week 3 Applied Data Science - Segmenting and Clustering Neighborhoods in Toronto
#Map generation and exploration

In [2]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-2.0.0                |     pyh9f0ad1d_0          63 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    openssl-1.1.1g             |       h516909a_1         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0          conda-forge
    geopy:           

In [3]:
#using Geopy library to get the latitude and longitude values of Toronto
#name agent tn_explorer
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tn_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto 43.6534817, -79.3839347.


In [None]:
#load Folium 
import folium # map rendering library

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)