### Segmenting and Clustering Neighborhoods in Toronto 

Import required libraries

In [38]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

The code below allows us to get list of postal codes of Canada stats data of the HTML table.

In [39]:
web='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

site = requests.get(web)

html_doc = lh.fromstring(site.content)

parse_tr = html_doc.xpath('//tr')

Ensuring that all the rows have the same width. 

In [40]:
[len(T) for T in parse_tr[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

Next, let’s parse the first row as our header.

In [41]:
parse_tr = html_doc.xpath('//tr')

col=[]
i=0

for t in parse_tr[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))
    

1:"Postal code
"
2:"Borough
"
3:"Neighborhood
"


Each header is appended to a tuple along with an empty list.

In [42]:
#Since out first row is the header, data is stored on the second row onwards
for j in range(1,len(parse_tr)):
    #T is our j'th row
    T=parse_tr[j]
    
    #If row is not of size 3, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

Check the length of each column. They should all be the same.

In [43]:
[len(C) for (title,C) in col]

[181, 181, 181]

Create the DataFrame

In [44]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [45]:
df.head()

Unnamed: 0,Postal code\n,Borough\n,Neighborhood\n
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


In [46]:
df.dtypes

Postal code\n     object
Borough\n         object
Neighborhood\n    object
dtype: object

Rename the columns

In [47]:
df.columns = ['Postcode','Borough','Neighbourhood']
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


Replacing undesirable simbols from de dataframe

In [48]:
df = df.replace('\n',' ', regex=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Using numpy to replace 'Not assigned' categoric values

In [49]:
df = df.replace('Not assigned',np.nan, regex=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Droping NaN values from dataframe

In [50]:
df.dropna(subset=["Borough"], axis=0, inplace=True)
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Replacing '/' by a coma ','

In [14]:
df = df.replace('/',',', regex=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Showing the final shape of the dataframe, first number is rows and second columns

In [51]:
df.shape

(104, 3)