In [1]:
!pip install beautifulsoup4
!pip install lxml
!pip install html5lib
!pip install requests



## Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe like the one shown below:

In [2]:
from bs4 import BeautifulSoup
import requests
url= 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content,'lxml')
#print(soup.prettify()) # print the parsed data of html

# create the  dataframe

## The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [3]:
target_table = soup.find("table", attrs={"class": "wikitable sortable"})
target_table_data = target_table.tbody.find_all("tr")  
# Get all the headings of Lists
headings = []
for td in target_table_data[0].find_all("th"):
    # remove any newlines and extra spaces from left and right
    headings.append(td.text.replace('\n', ' ').strip())

print(headings)

['Postcode', 'Borough', 'Neighbourhood']


## read data from html file

In [9]:
target_table_data = target_table.tbody.find_all("tr")  
table_data=[]
for i in range(1,len(target_table_data)):
    data=[]
    for td in target_table_data[i].find_all("td"):
    # remove any newlines and extra spaces from left and right
        data.append(td.text.replace('\n', ' ').strip())
    table_data.append(data)

In [31]:
import pandas as pd
data = pd.DataFrame(table_data, columns = headings)
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [33]:
# Get names of indexes for which borough  not assigned
indexNames = data[ data['Borough'] == 'Not assigned' ].index
# Delete these row indexes from dataFrame
data.drop(indexNames , inplace=True)
data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

In [24]:
data.set_index('Postcode', inplace=True)

In [26]:
#data.groupby('Postcode')['Neighbourhood'].apply(' ,'.join).reset_index()
new_data= pd.DataFrame(data.groupby(['Postcode','Borough'])['Neighbourhood'].apply(' ,'.join).reset_index())

In [114]:
new_data

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge ,Malvern"
1,M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union"
2,M1E,Scarborough,"Guildwood ,Morningside ,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park ,Ionview ,Kennedy Park"
7,M1L,Scarborough,"Clairlea ,Golden Mile ,Oakridge"
8,M1M,Scarborough,"Cliffcrest ,Cliffside ,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff ,Cliffside West"


##  If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [29]:
indexNames = data[ data['Neighbourhood'] == 'Not assigned' ].index
print(indexNames)

Index([], dtype='object', name='Postcode')


no cell has a borough but not assigned neighbothood

## print the number of rows of your dataframe.

In [14]:
print(new_data.shape)

(103, 3)


## Import data from Geo

In [15]:
!wget -O geo_data.csv http://cocl.us/Geospatial_data

--2020-03-15 15:02:57--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 158.85.108.86, 158.85.108.83, 169.48.113.194
Connecting to cocl.us (cocl.us)|158.85.108.86|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2020-03-15 15:03:00--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|158.85.108.86|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-15 15:03:00--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-03-15 15:03:01--  https://ibm.box.com/public

In [16]:
df = pd.read_csv('geo_data.csv')
df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## use csv file to create new data frame with latitude and longitude

In [40]:
neighborhoods= new_data.set_index('Postcode').join(df.set_index('Postal Code'))
neighborhoods

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Rouge ,Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek ,Rouge Hill ,Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood ,Morningside ,West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
M1J,Scarborough,Scarborough Village,43.744734,-79.239476
M1K,Scarborough,"East Birchmount Park ,Ionview ,Kennedy Park",43.727929,-79.262029
M1L,Scarborough,"Clairlea ,Golden Mile ,Oakridge",43.711112,-79.284577
M1M,Scarborough,"Cliffcrest ,Cliffside ,Scarborough Village West",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff ,Cliffside West",43.692657,-79.264848


## Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto 

### get all neighborhoods of toronto

In [42]:
toronto_data = neighborhoods[neighborhoods['Borough'].str.contains("Toronto")].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West ,Riverdale",43.679557,-79.352188
2,East Toronto,"The Beaches West ,India Bazaar",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


### get address of Toronto

In [45]:
address = 'Toronto'
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create map

In [49]:
!conda install -c conda-forge folium=0.5.0 --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |   py36h9f0ad1d_1         149 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    branca-0.4.0               |             py_0          26 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    altair-4.0.1               |             py_0         575 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                       

In [51]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
# create map of New York using latitude and longitude values
map_tor = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Borough'], toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor