# Scraping from Wikipedia

In [1]:
# Uncomment if the scraping dependencies (Beautiful Soup and the lxml parser it is used with below) are not installed
#!conda install beautifulsoup4 lxml --yes

In [2]:
#Import dependencies
from io import StringIO

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

#Scrape the table with BeautifulSoup and requests
# A timeout stops the cell from hanging forever, and raise_for_status() makes an
# HTTP error fail here instead of an error page being parsed as if it were the table
website = requests.get("http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
website.raise_for_status()
soup = BeautifulSoup(website.content,'lxml')
# The postal code table is taken to be the first <table> element on the page
zip_table = soup.find_all('table')[0]

#use pandas to read the table markup; read_html returns a list of dataframes
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in newer pandas releases
zip_tables = pd.read_html(StringIO(str(zip_table)))
#keep the single dataframe in the list as zip_df
zip_df = zip_tables[0]
zip_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


The cells below fulfill the following assignment requirements:

1) Only process the cells that have an assigned borough. Ignore cells with a borough that is **Not assigned**.


2) More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that **M5A** is listed twice and has two neighborhoods: **Harbourfront** and **Regent Park**. These two rows will be combined into one row with the neighborhoods separated with a comma.


3) If a cell has a borough but a **Not assigned** neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the **Borough** and the **Neighborhood** columns will be **Queen's Park**.


4) Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.


5) In the last cell of your notebook, use the **.shape** attribute to print the number of rows of your dataframe.


In [3]:
# Keep only the rows whose borough is something other than "Not assigned"

has_borough = zip_df['Borough'].ne('Not assigned')
zip_df = zip_df.loc[has_borough]
zip_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [4]:
# A postal code can cover several neighbourhoods: collapse every (Postcode, Borough)
# pair into one row whose Neighbourhood is the comma-joined list of its names

zip_df = (
    zip_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()  # turn the group keys back into ordinary columns
)
zip_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [5]:
# A neighbourhood that is still "Not assigned" takes the name of its borough
# Recall: rows whose borough is "Not assigned" were already removed in an earlier cell

unnamed = zip_df['Neighbourhood'] == 'Not assigned'
zip_df['Neighbourhood'] = zip_df['Neighbourhood'].mask(unnamed, zip_df['Borough'])
zip_df.head(100) #look for Queen's Park Borough (index 93), where Neighbourhood was 'Not assigned'


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [6]:
# Dimensions of the cleaned dataframe as a (rows, columns) tuple
zip_df.shape

(103, 3)