# Segmenting and Clustering (Toronto) Part I
### by Francisco J. O'Meany

In [241]:
from bs4 import BeautifulSoup
import pandas as pd

### 1.- Download the HTML file taken from the Wikipedia website

In [242]:
# Shell escape: download the saved page and store it locally as toronto.html
# (-O sets the output file name)
!wget -O toronto.html https://francisco.omeany.net/toronto.html

--2019-12-31 23:40:40--  https://francisco.omeany.net/toronto.html
Resolving francisco.omeany.net (francisco.omeany.net)... 192.254.157.172
Connecting to francisco.omeany.net (francisco.omeany.net)|192.254.157.172|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41959 (41K) [text/html]
Saving to: ‘toronto.html’


2019-12-31 23:40:40 (344 KB/s) - ‘toronto.html’ saved [41959/41959]



### 2.- Read toronto.html file

In [243]:
# Read the HTML file taken from Wikipedia and parse it into a BeautifulSoup tree.
# The file is opened in binary mode so that BeautifulSoup detects the document's
# own encoding, instead of decoding with the platform's default text encoding,
# which differs between systems and can corrupt or reject non-ASCII characters.
with open("toronto.html", "rb") as html_file:
    fileSrc = BeautifulSoup(html_file, 'html.parser')


### 3.- Parse the file and:
###### (a) - Exclude 'Not assigned' Borough
###### (b) - Remove tabs, carriage returns and newlines
###### (c) - Assign the borough name to 'Not assigned' neighborhoods

In [244]:
# Column names; stored as the first entry of tdata so they become the CSV header row
column_names = ['PostalCode', 'Borough', 'Neighborhood']
tdata = [column_names]

# Parse the HTML table rows and collect those whose Borough is assigned
for fLine in fileSrc.find_all('tr'):
    children = fLine.findChildren("td", recursive=True)
    if not children:
        # Rows without <td> cells (e.g. header rows) carry no data
        continue

    # Remove tabs, carriage returns and newlines from every cell's text
    childf = [
        child.text.replace("\t", "").replace("\r", "").replace("\n", "")
        for child in children
    ]

    # Exclude 'Not assigned' Boroughs. The comparison uses the cleaned text so
    # that stray whitespace control characters in the cell cannot defeat the
    # filter (the neighborhood check below is made on cleaned text as well).
    if len(childf) > 1 and childf[1] == 'Not assigned':
        continue

    # If a row has a borough but a 'Not assigned' neighborhood,
    # the neighborhood takes the borough's name
    if len(childf) > 2 and childf[2] == 'Not assigned':
        childf[2] = childf[1]

    tdata.append(childf)

# Save the table; the header row is already the first entry of tdata,
# so pandas must not emit its own header or index
pd.DataFrame(tdata).to_csv('toronto_seg.csv', index=False, header=False)


### 4.- Read dataframe

In [245]:
# Load the table saved in the previous step and preview its first ten rows
toronto_df = pd.read_csv(filepath_or_buffer='toronto_seg.csv')
toronto_df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


### 5.- Merge neighborhood rows with the same postal code

In [246]:
# Collapse rows that share a postal code: keep the first borough of each group
# and join the group's neighborhood names with ', '
t_df = (
    toronto_df
    .groupby(['PostalCode'])
    .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

# Persist the merged table (PostalCode is the index and is written as a column)
t_df.to_csv('toronto_cluster.csv')
t_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


### 6.- Read the merged dataframe

In [247]:
# Load the merged table from disk, keep the three columns in a fixed order,
# and preview the first ten rows
t_cluster = pd.read_csv('toronto_cluster.csv').loc[:, ['PostalCode', 'Borough', 'Neighborhood']]
t_cluster.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


### 7.- Show the number of rows in the dataframe

In [248]:
# shape is an attribute holding a (rows, columns) tuple; displaying it shows
# the number of rows in the merged dataframe as its first element
t_cluster.shape

(103, 3)