# Segmenting and Clustering Neighbourhoods in Toronto

### Web Scraping

In [83]:
import pandas as pd
import numpy as np
site = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(site)[0]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Cleaning Dataframe

In [84]:
# drop the items without assigned boroughs
df = df[df['Borough'] != 'Not assigned']
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [85]:
# combining duplicate postcodes

# create an empty dataframe to fill up to replace with the clean one
df2 = pd.DataFrame(columns = ['Postcode', 'Borough', 'Neighbourhood'])

# loop through postcode by poscode
for postcode in df['Postcode']:
    tmp = df[df['Postcode'] == postcode].reset_index(drop=True)
    
    # concatanate neighbourhoods for those where a postcode has multiple rows
    if tmp.shape[0] != 1:
        tmpstr = ''
        for nhood in tmp['Neighbourhood']:            
            if tmpstr:
                tmpstr = tmpstr + ', ' + nhood
            else:
                tmpstr = nhood
        
        # create single row tmp with combined neighbourhoods for each postcode and append to df2
        tmp['Neighbourhood'] = tmpstr
        tmp.drop_duplicates(inplace=True)
        df2 = df2.append(tmp, ignore_index=True)
    
    # also need to append those rows where there is only one neoghbourhood per postcode
    else:
        df2 = df2.append(tmp, ignore_index=True)

In [86]:
# Replacing the unassigned neighbourhoods with their boroughs in the neighbourhood column
df2.drop_duplicates(inplace=True)
df2.replace('Not assigned', np.nan, inplace=True)
df2.fillna(axis=1, method='ffill', inplace=True)

In [87]:
df2.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,"Rouge, Malvern"
10,M3B,North York,Don Mills North
11,M4B,East York,"Woodbine Gardens, Parkview Hill"
13,M5B,Downtown Toronto,"Ryerson, Garden District"


<p>
I have first scraped the data from wikipedia using the read_html method. 
</p>
<p>
I have then used nested loops to produce a new dataframe with combined neighbourhoods that share postcodes
</p>
<p>
I finally have used replace to change the 'not assigned' neighbourhoods to nan and then the fillna method to duplicate the boroughs 
</p>

In [88]:
df = df2
df.shape

(103, 3)