# Capstone Course
## Week 03 Assignment
### Part 1 - Data scraping of Canada postal codes


In [1]:
# Include the BeautifulSoup library
from bs4 import BeautifulSoup


In [2]:
# Download the Wikipedia page directly from its URL and parse it into a
# BeautifulSoup tree (`bs`) that the following cells search for the table.
import urllib3
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
http = urllib3.PoolManager()
page = http.request('GET', url)

# Stop here on a failed download instead of silently parsing an error page.
if page.status != 200:
    raise RuntimeError('GET ' + url + ' returned HTTP status ' + str(page.status))

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree could differ per machine.
bs = BeautifulSoup(page.data, 'html.parser')

# print(bs.prettify()) #to verify that we get the html file correctly




In [3]:
# Grab the first <tbody> element of the parsed page; it holds the postal code grid

table = bs.find('tbody')


In [24]:
# Parse the scraped HTML table into a three-column DataFrame.
# The page is laid out as a grid instead of a regular 3 column table, so each
# 'td' cell carries one postal code followed by its borough and, optionally,
# its neighbourhoods. Every cell becomes one row of `canada_codes`.

import pandas as pd


def _parse_postal_cell(cell_text):
    """Split the text of one grid cell into its three fields.

    Parameters
    ----------
    cell_text : str
        Raw text of a 'td' cell, as returned by ``get_text()``.

    Returns
    -------
    tuple of (str, str, str)
        ``(postal_code, borough, neighbourhood)``. The postal code is the
        first three characters. ``neighbourhood`` is the literal string
        ``'NaN'`` when the cell has neither a '(' nor a '/' after the code.
    """
    text = cell_text.strip('\n')
    postal_code = text[:3]

    # Usual layout: the borough runs up to the first '(' and the
    # neighbourhoods follow inside the parentheses, separated by '/'.
    paren_at = text.find('(', 3)
    if paren_at != -1:
        borough = text[3:paren_at]
        neighbourhood = (
            text[paren_at:]
            .replace('/', ', ')
            .replace('(', '')
            .replace(')', '')
        )
        return postal_code, borough, neighbourhood

    # Exception to the '(' rule: some cells separate borough and
    # neighbourhood with a '/' instead.
    # NOTE(review): every '/' is removed here, so several neighbourhoods
    # would run together without a separator - confirm this is intended.
    slash_at = text.find('/', 3)
    if slash_at != -1:
        return postal_code, text[3:slash_at], text[slash_at:].replace('/', '')

    # Only a borough is present. 'NaN' is stored as a plain string placeholder,
    # not a real missing value.
    return postal_code, text[3:], 'NaN'


# Walk the grid once and collect one record per cell. Building the frame from
# the finished list avoids a separate counting pass and cell-by-cell writes.
records = [
    _parse_postal_cell(cell.get_text())
    for row in table.find_all('tr')
    for cell in row.find_all('td')
]

# dtype=object keeps the columns as generic object columns of Python strings.
canada_codes = pd.DataFrame(
    records,
    columns=['PostalCode', 'Borough', 'Neighbourhood'],
    dtype=object,
)

# Review the inserted data
canada_codes.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"


In [25]:
# Keep only the rows whose borough is not 'Not assigned', working on an
# independent copy so that canada_codes itself stays untouched
has_borough = canada_codes['Borough'] != 'Not assigned'
df_copy = canada_codes.loc[has_borough].copy()

df_copy.head()


Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


In [26]:
# Confirm that no remaining row is missing its neighbourhood.
# The parsing cell stores the literal string 'NaN' when a grid cell carries no
# neighbourhood text, so that placeholder is checked together with the
# 'Not assigned' label. An empty result means every kept row has a neighbourhood.
df_copy_2 = df_copy[df_copy.Neighbourhood.isin(['Not assigned', 'NaN'])].copy()
df_copy_2.head(5)


Unnamed: 0,PostalCode,Borough,Neighbourhood


In [29]:
# Review the shape of the filtered DataFrame as (rows, columns);
# the saved output below shows 103 rows and 3 columns
df_copy.shape

(103, 3)

### Week 03 Assignment - Results
### Part 1 - Data scraping of Canada postal codes
## 103 rows of Postal Codes with 3 columns (PostalCode, Borough, Neighbourhood)