# Segmenting and Clustering Neighborhoods in Toronto

#### Importing the necessary modules

In [17]:
import pandas as pd
import requests
import lxml.html as lh


In [87]:
import numpy as np  # useful for many scientific computing in Python

#### Specifying the url as a parameter

In [18]:
# Address of the Wikipedia page to scrape (list of Canadian postal codes starting with "M").
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#### Defining some parameters to handle the html-contents

In [26]:
# Download the page stored in `url`. (`df_link` is not defined anywhere in this
# notebook, so requesting it raises NameError on a fresh kernel.)
# The timeout keeps a stalled connection from hanging the notebook forever.
handle = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the table.
handle.raise_for_status()

In [28]:
# Parse the raw body of the HTTP response into an lxml HTML element tree.
handle_contents = lh.fromstring(handle.content)

In [94]:
# Select every <tr> element in the document, regardless of which table it belongs to.
tr_elements = handle_contents.xpath('//tr')

#### Checking the number of columns for the first 15 rows of our data

In [125]:
# Number of cells in each of the first 15 <tr> rows.
[len(row) for row in tr_elements[:15]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [127]:
#### Fetching the table header first

In [140]:
# One (column_name, values) pair per header cell; the value lists are filled later.
col = []

# Iterating the first <tr> yields its header cells. enumerate() supplies the
# 1-based position that is printed next to each column name.
for i, t in enumerate(tr_elements[0], start=1):
    name = t.text_content().replace('\n', '')  # drop the newlines around the header text
    print('%d:"%s"' % (i, name))
    col.append((name, []))

1:"Postcode"
2:"Borough"
3:"Neighbourhood"


#### Neato! We got our header. Next, let us cycle through to get our data in each subsequent row.

In [141]:
# Start from empty columns so that re-running this cell does not append every
# row a second time.
for _, values in col:
    values.clear()

# Since our first row is the header, data is stored from the second row onwards.
for T in tr_elements[1:]:
    # A row that does not have exactly 3 cells is not from our table; stop at
    # the first such row.
    if len(T) != 3:
        break

    # i is the index of the column that the current cell belongs to.
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content().replace('\n', '')  # Removing any newlines
        # For every column except the first, store purely numeric text as an int.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number - keep the text unchanged.
                pass
        # Append the value to the list of the i'th column.
        col[i][1].append(data)

In [None]:
#### It is always good to check that we got a well-defined table, where the number of rows (or length) is the same per column.

In [142]:
# Number of values collected for each column; all entries should be equal.
[len(values) for _, values in col]

[287, 287, 287]

#### Creating a dataframe from the scraping above

In [143]:
# Turn the (title, values) pairs into a title -> values mapping, then into a DataFrame.
scraped_html = dict(col)
tor_df = pd.DataFrame(data=scraped_html)

In [146]:
# Show the first 13 rows of the scraped table.
tor_df.head(13)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


#### Verifying that the headers and rows are well-defined (no additional characters, newlines or whatever)

In [145]:
# Plain-text print of the first two rows, to spot stray characters in headers or values.
print(tor_df.head(2))

  Postcode       Borough Neighbourhood
0      M1A  Not assigned  Not assigned
1      M2A  Not assigned  Not assigned


#### Before combining the neighbourhoods, we remove all the rows whose borough is 'Not assigned'.

In [162]:
# Boolean mask: True for every row whose Borough is something other than 'Not assigned'.
filter_1 = tor_df['Borough'].ne('Not assigned')
print(filter_1)

0      False
1      False
2       True
3       True
4       True
5       True
6       True
7       True
8      False
9       True
10      True
11      True
12     False
13      True
14      True
15      True
16      True
17      True
18      True
19     False
20     False
21      True
22      True
23      True
24      True
25      True
26      True
27      True
28      True
29     False
       ...  
257    False
258    False
259    False
260    False
261     True
262    False
263    False
264     True
265     True
266     True
267     True
268     True
269     True
270     True
271     True
272     True
273    False
274    False
275    False
276    False
277    False
278    False
279    False
280    False
281     True
282     True
283     True
284     True
285     True
286    False
Name: Borough, Length: 287, dtype: bool


In [166]:
# Keep only the rows selected by filter_1. Note that this rebinds tor_df, so the
# unfiltered scrape is no longer reachable under this name afterwards.
tor_df = tor_df[filter_1]

In [None]:
#### "M5A is listed twice" - not in this dataset

In [170]:
# Not quite sure what the point about "M5A is listed twice" refers to - on Wikipedia,
# there is no neighbourhood listed as Regent Park.
print(tor_df[tor_df['Postcode']=='M5A'])

  Postcode           Borough Neighbourhood
4      M5A  Downtown Toronto  Harbourfront


In [172]:
# Print all rows whose Borough is 'Downtown Toronto'.
print(tor_df.loc[tor_df['Borough'].eq('Downtown Toronto')])

    Postcode           Borough                    Neighbourhood
4        M5A  Downtown Toronto                     Harbourfront
7        M7A  Downtown Toronto                     Queen's Park
16       M5B  Downtown Toronto                          Ryerson
17       M5B  Downtown Toronto                  Garden District
33       M5C  Downtown Toronto                   St. James Town
47       M5E  Downtown Toronto                      Berczy Park
56       M5G  Downtown Toronto               Central Bay Street
57       M6G  Downtown Toronto                         Christie
67       M5H  Downtown Toronto                         Adelaide
68       M5H  Downtown Toronto                             King
69       M5H  Downtown Toronto                         Richmond
82       M5J  Downtown Toronto                Harbourfront East
83       M5J  Downtown Toronto                  Toronto Islands
84       M5J  Downtown Toronto                    Union Station
98       M5K  Downtown Toronto          

In [222]:
#### Combining the neighbourhoods that share a postcode and borough into one comma-separated row - and creating a new dataset
tor_df_grp = (
    tor_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

In [223]:
# Show a bounded preview instead of dumping the whole grouped frame.
tor_df_grp.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


#### Filling in for Neighbourhoods with value 'Not assigned'
#### (Not the prettiest solution)

In [235]:
# Print the grouped rows whose Neighbourhood is still 'Not assigned'.
print(tor_df_grp.loc[tor_df_grp['Neighbourhood'].eq('Not assigned')])

   Postcode       Borough Neighbourhood          Test
93      M9A  Queen's Park  Not assigned  Queen's Park


In [226]:
# Preview (without modifying tor_df_grp) the Neighbourhood column with each
# 'Not assigned' entry swapped for the Borough of the same row.
# mask() aligns the replacement Series row by row; replace() is meant for
# value-to-value substitution and is not a reliable way to do that.
tor_df_grp['Neighbourhood'].mask(
    tor_df_grp['Neighbourhood'] == 'Not assigned',
    tor_df_grp['Borough'],
)

0                                          Rouge,Malvern
1                   Highland Creek,Rouge Hill,Port Union
2                        Guildwood,Morningside,West Hill
3                                                 Woburn
4                                              Cedarbrae
5                                    Scarborough Village
6              East Birchmount Park,Ionview,Kennedy Park
7                          Clairlea,Golden Mile,Oakridge
8          Cliffcrest,Cliffside,Scarborough Village West
9                             Birch Cliff,Cliffside West
10     Dorset Park,Scarborough Town Centre,Wexford He...
11                                      Maryvale,Wexford
12                                             Agincourt
13                 Clarks Corners,Sullivan,Tam O'Shanter
14     Agincourt North,L'Amoreaux East,Milliken,Steel...
15                                       L'Amoreaux West
16                                           Upper Rouge
17                             

In [236]:
#### Not the prettiest solution for replacing 'Not assigned'

In [238]:
# Where the Neighbourhood is 'Not assigned', fall back to the Borough of the same row.
# The result is assigned back to the column: mask(..., inplace=True) on a selected
# column is chained in-place mutation, which leaves the frame unchanged when
# pandas Copy-on-Write is active.
tor_df_grp['Neighbourhood'] = tor_df_grp['Neighbourhood'].mask(
    tor_df_grp['Neighbourhood'] == 'Not assigned',
    tor_df_grp['Borough'],
)

In [242]:
# Show a bounded preview instead of dumping the whole grouped frame.
tor_df_grp.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [243]:
# Print the grouped rows whose Borough is "Queen's Park".
print(tor_df_grp.loc[tor_df_grp['Borough'].eq("Queen's Park")])

   Postcode       Borough Neighbourhood
93      M9A  Queen's Park  Queen's Park


#### Lastly renaming Postcode to PostalCode and Neighbourhood to Neighborhood


In [247]:
# Rename Postcode -> PostalCode and Neighbourhood -> Neighborhood.
# rename() returns a new frame that is bound back to the same name, rather than
# mutating the existing object with inplace=True.
tor_df_grp = tor_df_grp.rename(
    columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}
)

In [248]:
# Show the first rows to confirm the renamed column headers.
tor_df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [244]:
# (rows, columns) of the grouped frame.
tor_df_grp.shape

(103, 3)