# Segmenting and Clustering Neighborhoods in Toronto

Import relevant libraries:

In [1]:
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import numpy as np
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 

Use urllib and BeautifulSoup to download the Wikipedia page as HTML and locate its wikitable. Then initialise an empty list for each column, loop through the rows of the table, and append each cell's text to the matching list:

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The context manager closes the HTTP response as soon as the HTML is parsed,
# instead of leaving the connection open for the lifetime of the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')

# One list per output column; entries are appended in table-row order.
postal_code = []
borough = []
neighbourhood = []

# Fail with a clear message if the page no longer contains a wikitable.
table = soup.find('table', class_='wikitable')
if table is None:
    raise ValueError("No table with class 'wikitable' found at " + url)

# The first <tr> is skipped (presumably the header row).
for items in table.find_all('tr')[1:]:
    data = items.find_all(['th', 'td'])
    if len(data) < 3:
        # Row lacks one of the three expected cells; skip it so the three
        # lists stay aligned.
        continue
    # Keep only the first text node of each cell, in column order:
    # postal code, borough, neighbourhood. `string=True` is the current
    # spelling of the deprecated `text=True` filter.
    postal_code.append(data[0].find(string=True))
    borough.append(data[1].find(string=True))
    neighbourhood.append(data[2].find(string=True))

Combine the lists into one dataframe:

In [5]:
# Assemble the three scraped lists into one frame, one column per list,
# in the order postal_code, borough, neighbourhood.
df = pd.DataFrame(
    {
        'postal_code': postal_code,
        'borough': borough,
        'neighbourhood': neighbourhood,
    }
)
df

Unnamed: 0,postal_code,borough,neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
...,...,...,...
175,M5Z\n,Not assigned\n,Not assigned\n
176,M6Z\n,Not assigned\n,Not assigned\n
177,M7Z\n,Not assigned\n,Not assigned\n
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


Remove the unwanted newline character '\n' at the end of each string:

In [6]:
# Regex replacement applied to the whole frame: every '\n' found in a string
# cell is replaced with the empty string, and df itself is modified.
df.replace(to_replace='\n', value='', regex=True, inplace=True)

Remove rows where the borough is 'Not assigned'. Note: no row with an assigned borough has a neighbourhood of 'Not assigned', so no extra handling is needed for that case at this stage.

In [7]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df['borough'].ne('Not assigned')
df = df.loc[has_borough]

In [8]:
# Display the frame's dimensions as (rows, columns) after the
# 'Not assigned' boroughs have been filtered out.
df.shape

(103, 3)