## Segmenting and Clustering Neighborhoods in Toronto 

In [2]:
pip install bs4
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request

Read URL

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Read the whole body inside a `with` block so the HTTP response is closed as
# soon as the download finishes. `page` then holds the raw HTML bytes, which
# can be parsed any number of times (an open response can only be read once).
with urllib.request.urlopen(url) as response:
    page = response.read()

Parse the HTML and look at its elements (tags)

In [4]:
# Build the parse tree from the downloaded page using the 'html.parser' backend.
soup = BeautifulSoup(markup=page, features='html.parser')
# To inspect the markup, display soup.prettify() here.

Find the postal table

In [5]:
# Collect every <table> element in the parsed document.
all_tables = soup.find_all(name="table")

In [6]:
# First <table> element carrying the CSS class 'wikitable'.
table = soup.find(name='table', attrs={'class': 'wikitable'})

Create columns and find the data, then append data to those columns

In [7]:
# One list per output column: A = first cell, B = second cell, C = third cell.
A = []
B = []
C = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows that do not hold exactly three <td> cells are skipped.
    if len(cells) == 3:
        # get_text() joins every text node inside the cell, so a cell that
        # mixes plain text with nested tags is captured in full rather than
        # cut off after its first text node. It also returns a plain str
        # instead of a NavigableString bound to the parse tree.
        A.append(cells[0].get_text())
        B.append(cells[1].get_text())
        C.append(cells[2].get_text())

Build the DataFrame, name its columns, remove "\n" from every column, and replace "/" with "," in the neighborhood names

In [8]:
# Assemble the three scraped lists into one frame, keeping the column order
# Postal code, Borough, Neighborhood.
df = pd.DataFrame({'Postal code': A, 'Borough': B, 'Neighborhood': C})
# Remove newline characters from every column in a single pass.
df = df.replace('\n', '', regex=True)
# Neighborhood names are separated by "/" in the source; switch to ",".
df['Neighborhood'] = df['Neighborhood'].replace('/', ',', regex=True)

Keep only the rows that have an assigned borough

In [9]:
# Select the rows whose Borough is anything other than 'Not assigned';
# the original row labels are kept.
df2 = df.loc[df['Borough'].ne('Not assigned')]
df2

Unnamed: 0,Postal code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing CentrE
169,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,..."


Show the shape (number of rows, number of columns) of df2

In [10]:
# .shape is a (row count, column count) tuple.
df2.shape

(103, 3)

In [None]:
pip install geocoder

The query took too long to run, so I will use a CSV file to get the coordinates instead

In [12]:
import geocoder # import geocoder

# The notebook text notes that this query took too long to run and that the
# coordinates are taken from a CSV file instead, so the lookup is off by
# default. Set to True to run it. The CSV merge further down yields columns
# with the same names, so use one source or the other.
RUN_GEOCODER = False

postal_code = df2['Postal code']


def get_geocoder(postal_code, max_attempts=5):
    """Look up the coordinates of a Toronto postal code with geocoder.google.

    Parameters
    ----------
    postal_code : str
        Postal code, formatted into the query '<code>, Toronto, Ontario'.
    max_attempts : int, default 5
        Number of requests to make before giving up. The retry loop is
        bounded so that a service which never answers cannot hang the cell.

    Returns
    -------
    tuple
        (latitude, longitude) from the first non-empty `latlng` result, or
        (None, None) when every attempt came back empty.
    """
    for _ in range(max_attempts):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # An empty or missing result means the lookup failed; try again.
        if lat_lng_coords:
            return lat_lng_coords[0], lat_lng_coords[1]
    return None, None


if RUN_GEOCODER:
    # Query once per row, in row order.
    coordinates = [get_geocoder(code) for code in postal_code]
    # assign() adds both columns positionally and returns a new frame. This
    # avoids chained-index writes into columns that do not exist yet, and
    # does not depend on df2's row labels, which have gaps after filtering.
    df2 = df2.assign(
        Latitude=[latitude for latitude, _ in coordinates],
        Longitude=[longitude for _, longitude in coordinates],
    )

Import the csv file and change a column name

In [82]:
# Load the coordinates table and rename 'Postal Code' to 'Postal_Code'
# in the same expression.
df = pd.read_csv("http://cocl.us/Geospatial_data").rename(
    columns={'Postal Code': 'Postal_Code'}
)

Join two dataframes

In [81]:
# Match the scraped rows to their coordinates on the postal code, then drop
# the duplicate key column that came from the coordinates table.
Geocode = pd.merge(
    df2,
    df,
    left_on='Postal code',
    right_on='Postal_Code',
).drop(columns='Postal_Code')
Geocode

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509
