# Segmenting and Clustering Neighborhoods in Toronto

## Import the relevant libraries

In [1]:
import pandas as pd
import numpy as np
import requests

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

#### Get the URL of the required data

In [2]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

### Let's fetch the table from the URL

In [3]:
# pandas.read_html returns a list with every table on the page whose text
# matches `match`; at this point `df` is that list, and its first entry is
# the postal-code table previewed below.
df = pd.read_html(io=url, match="Postcode")
df[0].head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# Keep an independent copy of the first matched table. Note that `df` is
# rebound here from a list of tables to a single DataFrame, so this cell
# must run exactly once after the fetch cell.
df = df[0].copy(deep=True)

## Step -1

### Ignore cells with a borough that is Not assigned.

In [5]:
# Keep only the rows whose borough has actually been assigned.
df = df.loc[df['Borough'].ne('Not assigned')]
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


## Step -2

### Rows that share a postal code are combined into one row, with the neighbourhoods separated by commas

In [6]:
# Collapse rows that share a postal code and borough into a single row,
# joining their neighbourhood names with ", ". reset_index() turns the
# group keys back into ordinary columns.
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

In [7]:
# Preview the first five combined rows.
df.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Step -3 

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [8]:
# Show any rows whose neighbourhood is still the 'Not assigned' placeholder.
# A substring-based alternative that yields a boolean mask instead of rows:
#   df['Neighbourhood'].str.contains("Not assigned")
df.loc[df['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


###### There is one such row, so by the rule its neighbourhood is replaced with the Borough value

In [9]:
# Rule: a row with a borough but a 'Not assigned' neighbourhood takes the
# borough name as its neighbourhood.
# - The value is read from the Borough column instead of being hard-coded
#   as "Queen's Park", so the rule still holds if the source table changes.
# - The result is assigned back to the column rather than calling
#   Series.replace(inplace=True) on a column selection, which does not
#   update `df` when pandas Copy-on-Write is active.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned', df['Borough']
)

In [10]:
df.shape  # (number of rows, number of columns) of the cleaned table

(103, 3)