# Part 1 - Creating a dataframe containing the PostalCode, Borough and Neighborhood columns

In [1]:
# downloading and importing required libraries for web scraping
!pip install requests
!pip install bs4
from bs4 import BeautifulSoup 
import requests
import pandas as pd

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


### To explore and cluster the neighborhoods in Toronto, we scrape the following Wikipedia page, read it into a pandas dataframe, and clean it as follows:

In [2]:

# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
wikipedia_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(wikipedia_link, timeout=30)
response.raise_for_status()
page = response.text

# Parse the HTML with the lxml parser and keep the first <table> element found on the page.
page_lxml = BeautifulSoup(page, 'lxml')
table = page_lxml.find('table')

In [3]:
# Let pandas parse the tables on the Wikipedia page; the first one is used as the working dataframe.
tables = pd.read_html(wikipedia_link)
dataframe = tables[0]

# Standardise the headers (this renames the PostCode column to PostalCode).
column_names = ['PostalCode', 'Borough', 'Neighborhood']
dataframe.columns = column_names

dataframe.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# Keep only the rows that have an assigned Borough, i.e. drop every row whose Borough is 'Not assigned'.
has_borough = dataframe['Borough'] != 'Not assigned'
df = dataframe[has_borough]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [5]:
# combining multiple neighborhood rows that have similar PostalCode into one row with the neighborhoods separated with a comma.
# Combine the rows that share a PostalCode into a single row: the distinct values of
# every other column (the repeated borough, the individual neighborhoods) are joined
# with commas.
# Series.unique() returns the values in order of first appearance, so the joined string
# is identical on every run. Joining a set() produced an arbitrary order that could change
# between kernel restarts because string hashing is randomised per process.
cleaned_df = (
    df.groupby("PostalCode")
    .agg(lambda values: ','.join(values.unique()))
    .reset_index()
)
cleaned_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Port Union,Highland Creek,Rouge Hill"
2,M1E,Scarborough,"Guildwood,West Hill,Morningside"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# A row that has a borough but a 'Not assigned' neighborhood takes the borough name as its neighborhood.
unassigned_mask = cleaned_df['Neighborhood'] == "Not assigned"
cleaned_df.loc[unassigned_mask, 'Neighborhood'] = cleaned_df.loc[unassigned_mask, 'Borough']

# Count the rows that are still 'Not assigned' after the fill.
len(cleaned_df[cleaned_df['Neighborhood'] == 'Not assigned'])

0

In [7]:
# Dimensions of the cleaned dataframe as a (rows, columns) tuple.
cleaned_df.shape

(103, 3)

# Part 2 - Getting the latitude and longitude coordinates

In [8]:

# Load the CSV file that holds the latitude and longitude of each postal code.
geospatial_csv_url = "http://cocl.us/Geospatial_data"
lat_lon_df = pd.read_csv(geospatial_csv_url)

# Use the same key column name (PostalCode) as the neighborhood dataframe.
coordinate_columns = ['PostalCode', 'Latitude', 'Longitude']
lat_lon_df.columns = coordinate_columns
lat_lon_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:

# Attach the coordinates to the cleaned neighborhood data: an inner join on PostalCode
# keeps only the postal codes present in both cleaned_df and lat_lon_df.
df_merged = cleaned_df.merge(lat_lon_df, how='inner', on='PostalCode')
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union,Highland Creek,Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,West Hill,Morningside",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part 3 - Exploring and clustering the neighborhoods in Toronto
Before we start working on this part, let's download and import all the required libraries.

In [10]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift the pandas display limits so that every column and every row of a dataframe is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# Install geopy from the conda-forge channel before importing it.
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as pandas.json_normalize -
# confirm that this import path still works with the installed pandas version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Install folium (pinned to version 0.5.0) from the conda-forge channel before importing it.
!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In [11]:
# Quick summary of the merged dataframe: the number of distinct boroughs and the number of rows.
# nunique(dropna=False) counts a missing value as one category, exactly like len(unique()).
borough_count = df_merged['Borough'].nunique(dropna=False)
row_count = len(df_merged)

summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, row_count))

The dataframe has 11 boroughs and 103 neighborhoods.
