<h1>Segmenting and Clustering Neighborhoods in Toronto</h1>

<h2>Part 1: Scraping the Wikipedia page for Toronto Boroughs</h2>

Import all necessary libraries. (For this part I am going to build the dataframe twice: once with BeautifulSoup4 and once with Pandas.)

In [2]:
#!conda install -c conda-forge bs4 --yes
#!conda install -c conda-forge lxml --yes
#!conda install -c conda-forge requests --yes
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

<h3>Creating the dataframe with BeautifulSoup4</h3>

In [3]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
result = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
result.raise_for_status()

# Raw bytes of the response body
src = result.content

# Parse the HTML into a BeautifulSoup tree
soup = BeautifulSoup(src, 'html.parser')

# The data we want sits in an HTML <table> (inspect the page to confirm);
# find() returns the first <table> element in the document.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found on the page; the page layout may have changed')

# Table rows are the <tr> elements
table_rows = table.find_all('tr')

# Collect the stripped text of every <td> cell, one list per row.
# Rows that contain no <td> at all (presumably the <th> header row) are skipped.
tor_list = []
for tr in table_rows:
    td = tr.find_all('td')
    if td:
        row = [i.text.strip() for i in td]
        tor_list.append(row)

# Build the dataframe; this requires every kept row to have exactly three cells
tor_df = pd.DataFrame(data=tor_list, columns=['Postal Code', 'Borough', 'Neighborhood(s)'])

# Remove newline characters left inside the cell text
tor_df = tor_df.replace(r'\n', '', regex=True)

# Turn empty / whitespace-only cells into NaN
# (np.nan, because the np.NaN alias was removed in NumPy 2.0)
tor_df = tor_df.replace(r'^\s*$', np.nan, regex=True)

# Drop every row that has a NaN in any column, then renumber the rows from zero
tor_df = tor_df.dropna(axis=0, how='any').reset_index(drop=True)

# Show the first rows of the dataframe
tor_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood(s)
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
# Report the (rows, columns) dimensions of the BeautifulSoup-built dataframe
tor_df.shape

(103, 3)

<h3>Creating the dataframe with Pandas</h3>

In [5]:
# pd.read_html returns a list with one dataframe per HTML table on the page.
# header=0 takes each table's first row as its column names, and
# na_values='NAN' additionally treats cells containing 'NAN' as missing.
# Keep the first table, discard every row with a missing value, and renumber
# the remaining rows from zero.
tor_df_pd = (
    pd.read_html(
        'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
        header=0,
        na_values='NAN',
    )[0]
    .dropna(axis=0)
    .reset_index(drop=True)
)

# Preview the first rows of the dataframe
tor_df_pd.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Report the (rows, columns) dimensions of the dataframe read with pd.read_html
tor_df_pd.shape

(103, 3)

<h2>Part 2: Adding Longitude and Latitude to the Dataframe</h2>

In [12]:
# Load the coordinates table from the local CSV file
coords = pd.read_csv('Geospatial_Coordinates.csv')

# Attach the coordinates to the scraped dataframe by matching on the shared
# 'Postal Code' column (pandas' default inner join, so only postal codes
# present in both frames are kept)
merge_tor = tor_df.merge(coords, on='Postal Code')

# Preview the first rows of the merged dataframe
merge_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood(s),Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
