# Peer-graded Assignment
### Segmenting and Clustering Neighborhoods in Toronto

In [2]:
import numpy as np
import pandas as pd
import urllib.request

## Function: souptable_to_dataframe()
Parses an HTML table contained in a Beautiful Soup `bs4.element.Tag` object into a pandas DataFrame.  
It is intended to be generic, i.e. not tied to the Wikipedia table used below.

In [29]:
import bs4
from bs4 import BeautifulSoup

def _first_cell_text(cell):
    """Return the first text node inside *cell* with newlines removed.

    An element without any text (e.g. an empty ``<td></td>``) yields an empty
    string instead of raising on ``None``.
    """
    node = cell.find(text=True)
    return '' if node is None else str(node).replace('\n', '')


def souptable_to_dataframe(souptable):
    """Convert an HTML table held in a BeautifulSoup tag into a DataFrame.

    Column names are taken from the ``<th>`` cells of the first ``<tr>``.
    If the first row has no ``<th>`` cells, its ``<td>`` cells are counted
    instead and the columns are labelled ``0 .. n-1``; in that case the first
    row is kept as data.

    Only rows with at least as many ``<td>`` cells as there are columns are
    kept; surplus cells in a row are ignored. Each value is the first text
    node of its cell with newlines removed.

    Parameters
    ----------
    souptable : bs4.element.Tag
        Tag that contains the ``<tr>`` rows of the table.

    Returns
    -------
    pandas.DataFrame
        One row per accepted table row. An empty DataFrame is returned (and a
        message is printed) when *souptable* is not a Tag, when fewer than two
        rows are found, or when no columns can be determined.
    """
    if not isinstance(souptable, bs4.element.Tag):
        print("souptable_to_dataframe: Wrong param type!")
        return pd.DataFrame()

    rows = souptable.find_all("tr")

    if len(rows) < 2:
        print("souptable_to_dataframe: No Table found!")
        return pd.DataFrame()

    # Determine the number of columns and their labels from the first row.
    headers = rows[0].find_all('th')
    if headers:
        columns = [_first_cell_text(header) for header in headers]
    else:
        # No header cells: count the data cells ('td') of the first row and
        # fall back to numeric column labels.
        print("souptable_to_dataframe: No Header found, using first row for determining no. of cols")
        columns = list(range(len(rows[0].find_all('td'))))

    colcount = len(columns)
    if colcount == 0:
        print("souptable_to_dataframe: No Table found!")
        return pd.DataFrame()

    # Collect one list per accepted row. A pure header row has no 'td' cells
    # and is therefore skipped by the length check.
    records = []
    for row in rows:
        cells = row.find_all("td")
        if len(cells) >= colcount:
            records.append([_first_cell_text(cell) for cell in cells[:colcount]])

    return pd.DataFrame(records, columns=columns)

# Url retrieval
The page content is read into a BeautifulSoup object.

In [4]:
wikiurl = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# BeautifulSoup reads the file-like response while parsing; the context
# manager then closes the HTTP connection instead of leaving it open.
with urllib.request.urlopen(wikiurl) as response:
    wikisoup = BeautifulSoup(response, 'lxml')

## Table scraping and processing
Since there's only one matching table on the page, find_all() is used to retrieve it and the first result is taken.  
It's passed to the souptable_to_dataframe() function, and all 'Not assigned' boroughs are then removed from the resulting dataframe.

In [58]:
# find_all() returns a list of every <table> whose class attribute matches;
# the first match is handed to the parser defined above.
torontotable=wikisoup.find_all('table', class_='wikitable sortable')
df_tor = souptable_to_dataframe(torontotable[0])
# Preview the first rows of the parsed table.
df_tor.head()

3


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### remove all boroughs which are 'Not assigned' from the dataframe 

In [59]:
# Keep only rows with an assigned borough. .copy() makes the result an
# independent frame, so later assignments into df_tor do not trigger pandas'
# SettingWithCopyWarning. The original index labels are kept.
df_tor = df_tor[df_tor['Borough'] != 'Not assigned'].copy()
df_tor.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### replace 'Not assigned' neighborhoods with name of borough
**(!)** This step is only needed to satisfy the assignment **(!)**: the table currently has no rows where only the neighbourhood is 'Not assigned'.

In [60]:
# Vectorized replacement: keep the neighbourhood where it is assigned and
# take the borough name otherwise. assign() returns a new frame, so nothing
# is written into a filtered slice row by row.
df_tor = df_tor.assign(
    Neighbourhood=df_tor['Neighbourhood'].where(
        df_tor['Neighbourhood'] != "Not assigned", df_tor['Borough']
    )
)

df_tor.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### Group by postcode and join Neighbourhoods with ','
Borough is the same for each group and 'max' selects one 

In [215]:
# Collapse to one row per postcode: 'max' picks a single borough value for
# the group and the neighbourhood names are joined with ','.
df_tor = (
    df_tor
    .groupby('Postcode')
    .agg({'Borough': 'max', 'Neighbourhood': ",".join})
    .reset_index()
)
df_tor.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [214]:
# (rows, columns) of df_tor; after the groupby above there is one row per postcode.
df_tor.shape

(103, 3)