# Segmenting and Clustering Neighborhoods in Toronto

# Step 1 -------------------------------

#### Importing the necessary libraries for extracting the Wikipedia table

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Enable IPython's greedy completer so TAB completion also works on the results
# of expressions, not only on plain names.
%config IPCompleter.greedy = True

# Display the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to show several previews (head/tail/shape) at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Simple confirmation that the setup cell ran to completion.
print('Libraries imported')

Libraries imported


## Customize Notebook

**table_from_top.** If the Wikipedia page has only one table, use `table_from_top = 1`. Otherwise, count the tables from the top of the page and set this value to the position of the table you want.

**wikipedia_page.** Specify the wikipedia page name from where to source dataset. The CSV file will be saved with the same name.

**trace.** Set `trace = True` to trace how feature values are extracted: each extracted value is prefixed with a code for the parsing/extraction rule that produced it. The extracted dataset is not saved in this mode.

In [2]:
# 1-based position (counted from the top of the page) of the `wikitable` to extract.
table_from_top = 1
# Title of the Wikipedia article to scrape; it is formatted into the article URL.
wikipedia_page = 'List of postal codes of Canada: M'
# When True, extracted values are prefixed with a code naming the rule that produced them.
trace = False

## Load and Parse

This section loads the Wikipedia page and parses the table data we are interested in converting to a dataset.

In [3]:
# Build the article URL from the configured page title and download the page.
wikipedia_url = 'https://en.wikipedia.org/wiki/{}'.format(wikipedia_page)
# A timeout keeps the notebook from hanging forever on a stalled connection.
page = requests.get(wikipedia_url, timeout=30)
# Fail loudly on 404/5xx instead of silently parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, 'lxml')
tables = soup.find_all('table', {'class': 'wikitable'})

# `table_from_top` is 1-based. Without this check a value of 0 would silently
# select the LAST table through negative indexing.
if not 1 <= table_from_top <= len(tables):
    raise IndexError(
        'table_from_top = {} is out of range: found {} wikitable(s) on {}'.format(
            table_from_top, len(tables), wikipedia_url))
table = tables[table_from_top - 1]

## Quick Preview

This section extracts the table header with feature or column names.

Use this section for a quick preview to check that you are processing the right table.

In [4]:
# Column names of the dataset, read from the <th> cells of the table's first row.
feature_names = []

header_row = table.find('tr')
for header in header_row.find_all('th'):
    # Join all text fragments of the header cell (it may contain nested tags).
    feature_name = ' '.join(header.find_all(text=True))
    # Strings are immutable: the cleaned value must be assigned back, otherwise
    # the newline that precedes </th> stays in the column name.
    feature_name = feature_name.replace('\n', '').strip()
    feature_names.append(feature_name)

# Show the extracted header so the reader can confirm the right table was picked.
feature_names

'Postcode'

'Borough'

'Neighbourhood'

## Data Wrangling

This section applies data wrangling rules based on exceptions found when parsing Wikipedia tables.

- If a feature value contains a link then extract text from the link.
- Ignore text which starts with `[` square brackets.
- Ignore image links (e.g. flag icons) that precede the link text.
- Ignore hidden text used for IDs.

In [5]:
def has_coords(tag):
    """Tell whether ``tag`` is a latitude or longitude element.

    Used as a filter function for ``find_all``. A tag matches when its first
    CSS class is exactly ``'latitude'`` or ``'longitude'``.

    Args:
        tag: An element exposing ``has_attr`` and item access for attributes.

    Returns:
        bool: True for a coordinate element, False otherwise (including tags
        with no ``class`` attribute or an empty one).
    """
    if not tag.has_attr('class'):
        return False
    classes = tag['class']
    # An empty class attribute yields an empty list; guard before indexing so
    # such tags are treated as "not a coordinate" instead of raising IndexError.
    return bool(classes) and classes[0] in ('latitude', 'longitude')

def get_coords(child):
    """Collect the coordinate texts found below ``child``.

    Every descendant accepted by ``has_coords`` contributes its text; the
    texts are joined with single spaces in document order.

    Args:
        child: A parsed element to search.

    Returns:
        str: The joined coordinate text, prefixed with ``'C = '`` when the
        notebook-level ``trace`` flag is set, or ``''`` when no coordinate
        element is found.
    """
    coords = []
    for coord in child.find_all(has_coords):
        # `.string` is None when the element does not wrap exactly one text
        # node; fall back to the full text so the join below cannot fail.
        text = coord.string
        if text is None:
            text = coord.get_text()
        coords.append(text)

    if not coords:
        return ''

    joined = ' '.join(coords)
    return 'C = {}'.format(joined) if trace else joined

def _is_hidden(tag):
    """Return True when ``tag`` is hidden through an inline ``display:none`` style."""
    # Hidden text is marked in the `style` attribute, not in `class`; comparing
    # the class list to the string 'display:none' could never match.
    style = tag.get('style') or ''
    return 'display:none' in style.replace(' ', '').lower()


def extract_feature_value(feature_col):
    """Return the value of one ``<td>`` cell after applying the wrangling rules.

    Rules, in order:
      * a cell holding a single text node yields that text (trace code ``T``);
      * otherwise the cell's direct children are scanned and the first usable
        one wins: coordinates inside a visible ``<span>`` (``C``), the text of
        an ``<a>`` that is not a ``[...]`` reference or an image-only link
        (``A``), the text of a ``<font>`` (``F``), or a bare non-blank text
        node (``E``);
      * hidden spans, ``<sup>`` tags and every other tag are skipped.

    Values are returned as found in the page (no whitespace stripping). When
    the notebook-level ``trace`` flag is set, each value is prefixed with the
    code of the rule that produced it. Returns ``''`` if nothing matches.
    """
    text = feature_col.string
    if text:
        return 'T = {}'.format(text) if trace else text

    for child in feature_col.children:
        # Text nodes are str subclasses; blank ones are just markup whitespace.
        if isinstance(child, str):
            if child.isspace():
                continue
            return 'E = {}'.format(child) if trace else child

        if child.name == 'span':
            if _is_hidden(child):
                continue
            if child.find_all(has_coords):
                coords = get_coords(child)
                if coords:
                    return coords
            # Spans without usable coordinates carry no value.
            continue

        if child.name == 'a':
            # `.string` is None for links that do not wrap a single text node
            # (e.g. image links); fall back to the combined text.
            link_text = child.string
            if link_text is None:
                link_text = child.get_text()
            # Skip image-only links and footnote-style "[1]" references.
            if not link_text or link_text.startswith('['):
                continue
            return 'A = {}'.format(link_text) if trace else link_text

        if child.name == 'font':
            return 'F = {}'.format(child.string) if trace else child.string

        # <sup> and any other tag: ignored, keep scanning the siblings.

    return ''


# One dict per table row (header row excluded), keyed by the header names.
samples = []
sample_rows = table.find_all('tr')[1:]
for sample_row in sample_rows:
    features = [extract_feature_value(feature_col)
                for feature_col in sample_row.find_all('td')]
    samples.append(dict(zip(feature_names, features)))

## Preview Dataset

This section enables you to preview the parsed dataset.

In [6]:
# Raw dataset: one row per parsed table row.
df = pd.DataFrame(samples)
df.head()

# Put the columns in the same order as the table header. Selecting by name
# (instead of fixed positions) gives the same result whether pandas sorts the
# dict-derived columns alphabetically or keeps them in insertion order.
df_canada = df.reindex(columns=feature_names)
df_canada.head()


Unnamed: 0,Borough,Neighbourhood,Postcode
0,Not assigned,Not assigned,M1A
1,Not assigned,Not assigned,M2A
2,North York,Parkwoods,M3A
3,North York,Victoria Village,M4A
4,Downtown Toronto,Harbourfront,M5A


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Save Dataset

We can now save the dataset using the same Wikipedia page name we used earlier to extract the dataset.

In [8]:
#dataset_file_name = '../datasets/wikipedia/{}.csv'.format(wikipedia_page)
#if not trace:
 #   df.to_csv(dataset_file_name, index=False)

### Deleting all rows for which the borough is not assigned

In [7]:
# Keep only rows whose borough is assigned. Whitespace is stripped before the
# comparison so a stray newline from the page markup cannot hide a match.
borough_unassigned = df_canada.Borough.str.strip() == 'Not assigned'
df_canada = df_canada[~borough_unassigned]

# Rows with an unassigned *neighbourhood* are deliberately kept: a later cell
# fills that neighbourhood with the borough name, so they must not be dropped here.
df_canada.head(5)
df_canada.tail(5)
df_canada.shape

# Number of rows left after filtering.
length = df_canada.shape[0]





Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Unnamed: 0,Postcode,Borough,Neighbourhood
283,M8Z,Etobicoke,Kingsway Park South West
284,M8Z,Etobicoke,Mimico NW
285,M8Z,Etobicoke,The Queensway West
286,M8Z,Etobicoke,Royal York South West
287,M8Z,Etobicoke,South of Bloor


(212, 3)

#### Making a new dataframe 'Df_new' with a reset index

In [8]:
# Copy the filtered rows into a new frame with a fresh 0..n-1 index. This works
# for any number of rows and keeps the string dtype, unlike pre-allocating a
# float frame of a fixed size and copying it cell by cell.
Df_new = df_canada.reset_index(drop=True)
# Flat column labels (a nested list would create MultiIndex columns).
Df_new.columns = ['Postcode', 'Borough', 'Neighbourhood']

Df_new.head(5)
Df_new.tail(5)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Unnamed: 0,Postcode,Borough,Neighbourhood
207,M8Z,Etobicoke,Kingsway Park South West
208,M8Z,Etobicoke,Mimico NW
209,M8Z,Etobicoke,The Queensway West
210,M8Z,Etobicoke,Royal York South West
211,M8Z,Etobicoke,South of Bloor


### Writing all neighbourhoods that belong to the same postcode

In [9]:
# Columns are addressed by position: 0 = postcode, 2 = neighbourhood.
postcodes = Df_new.iloc[:, 0]
# Trailing whitespace/newlines from the page markup are removed before joining.
neighbourhoods = Df_new.iloc[:, 2].str.rstrip()

# One comma-separated string per postcode, in order of first appearance.
merged_neighbourhoods = (
    neighbourhoods
    .groupby(postcodes.values, sort=False)
    .agg(', '.join)
)

# Keep the first row of every postcode (its borough and index label survive)
# and store the merged neighbourhood list on it. Nothing depends on a fixed row
# count, rows are never compared across the ends of the frame, and re-running
# the cell is harmless because no duplicate postcodes remain.
is_first_row = ~postcodes.duplicated().values
Df_new = Df_new.loc[is_first_row].copy()
Df_new.iloc[:, 2] = merged_neighbourhoods.values
        

In [10]:
# Scan the neighbourhood column (third column) and print the position of every
# row whose value, ignoring trailing whitespace, is still 'Not assigned'.
for row_position, neighbourhood in enumerate(Df_new.iloc[:, 2]):
    if neighbourhood.rstrip() == 'Not assigned':
        print(row_position)
        

4


As we can see, the fifth row (position 4) contains such a value, so in the next cell I'll replace it with the name of its borough.

In [11]:
# Give every row whose neighbourhood is 'Not assigned' the name of its borough.
# The rows are located by value rather than by a position copied from an
# earlier output, so the cell stays correct if the source table changes.
for row_position in range(Df_new.shape[0]):
    if Df_new.iloc[row_position, 2].rstrip() == 'Not assigned':
        Df_new.iloc[row_position, 2] = Df_new.iloc[row_position, 1]


In [13]:
# Preview the first rows of the final frame.
Df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park
