# Toronto Neighborhood Clustering: Scrape Wiki

In [1]:
!pip install bs4

Collecting bs4
  Downloading https://files.pythonhosted.org/packages/10/ed/7e8b97591f6f456174139ec089c769f89a94a1a4025fe967691de971f314/bs4-0.0.1.tar.gz
Collecting beautifulsoup4 (from bs4)
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 31.6MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4->bs4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/jupyterlab/.cache/pip/wheels/a0/b0/b2/4f80b9456b87abedbc0bf2d52235414c3467d8889be38dd472
Successfully built bs4
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.8.2 bs4-0.0.1 soupsieve-1.9.5

In [48]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

### Reading the Wikipedia page

In [49]:
# Download Wikipedia's "List of postal codes of Canada: M" page and parse it
# so the table rows can be extracted in the next step.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)  # bound the wait instead of hanging indefinitely
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
wiki_html = response.text
soup = BeautifulSoup(wiki_html, 'html.parser')

### Wrangling and Cleanup

In [50]:
# One list of stripped cell texts per <tr> of the first <tbody> in the page.
# A row without any <td> cells (such as a header row built from <th>) yields an empty list.
data = [
    [cell.get_text().strip() for cell in row.find_all('td')]
    for row in soup.tbody.find_all('tr')
]

In [51]:
# Load the scraped rows into a DataFrame, naming the three table fields.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
df = pd.DataFrame(data=data, columns=column_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [52]:
# Rows where both Borough and Neighborhood read 'Not assigned' are discarded.
not_assigned_row = df[(df.Borough == 'Not assigned') & (df.Neighborhood == 'Not assigned')]
# Rebind instead of mutating in place so the cleanup is a single explicit step;
# dropna() additionally removes any row that contains a missing value.
df = df.drop(not_assigned_row.index).dropna()
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor


In [53]:
# (rows, columns) of the frame after the cleanup step above.
df.shape

(210, 3)

In [57]:
def combine_neighborhoods(grp_val):
    """Build the neighborhood label for one group of rows.

    Parameters
    ----------
    grp_val : pandas.DataFrame
        The rows of a single group; must provide the 'Borough' and
        'Neighborhood' columns.

    Returns
    -------
    str
        The distinct neighborhood names of the group, sorted and joined
        with ', '. A neighborhood recorded as 'Not assigned' is replaced by
        the borough of the same row, whatever the size of the group. An
        empty group yields an empty string.
    """
    # A set collapses repeated names, so a label never lists the same place twice.
    names = {
        borough if neighborhood == 'Not assigned' else neighborhood
        for borough, neighborhood in zip(grp_val['Borough'], grp_val['Neighborhood'])
    }
    return ', '.join(sorted(names))
# Collapse the rows of each (PostalCode, Borough) pair into a single row whose
# 'Neighborhood' value is produced by combine_neighborhoods. The columns the
# helper reads are selected explicitly so that it still receives 'Borough' on
# pandas versions that no longer hand grouping columns to apply().
df2 = (
    df.groupby(['PostalCode', 'Borough'])[['Borough', 'Neighborhood']]
    .apply(combine_neighborhoods)
    .reset_index(name='Neighborhood')
)

In [58]:
# Display the combined table; the rendered output elides the middle rows.
df2

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [None]:
# Write the combined table to Wiki_Scrape.csv, leaving out the index column.
df2.to_csv('Wiki_Scrape.csv', index=False)

In [59]:
# (rows, columns) of the combined table.
df2.shape

(103, 3)