# Data Scraping

#### importing libraries

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

#### scraping the web

In [4]:
# Download the HTML of the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead of letting
# an error page be parsed further down.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
# Despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = response.text
print('Data downloaded!')

Data downloaded!


In [5]:
# Parse the downloaded HTML text into a navigable tree using the html5lib parser.
soup = BeautifulSoup(markup=website_url, features='html5lib')

#### initialize row and column for table

In [6]:
# `rows` collects one [postcode, borough, neighbourhood] record per scraped table row;
# `columns` holds the matching header names used when the DataFrame is built.
rows = list()
columns = [
    'Postcode',
    'Borough',
    'Neighbourhood',
]

#### filling the rows

In [7]:
# Locate the postal-code table; the lookup matches the exact class string
# "wikitable sortable" on a <table> element.
table = soup.find("table", {"class": "wikitable sortable"})
if table is None:
    # Fail with a clear message rather than an AttributeError on the next statement.
    raise ValueError('No table with class "wikitable sortable" was found in the page')


def _first_text(cell):
    """Return the first text node inside `cell` as a plain str ('' when the cell has no text)."""
    node = cell.find(string=True)
    # str() detaches the value from the parse tree; '' avoids None values in the records.
    return '' if node is None else str(node)


# Rebuild `rows` in place (slice assignment) so that re-running this cell does not
# append the same records a second time.
rows[:] = [
    [_first_text(cell) for cell in cells]
    for cells in (tr.find_all("td") for tr in table.find_all("tr"))
    # Rows without exactly three <td> cells (e.g. a header row built from <th>) are skipped.
    if len(cells) == 3
]
 

#### create the table (DataFrame) from the scraped rows

In [8]:
# Build the DataFrame from the scraped records, labelling the fields with `columns`.
table = pd.DataFrame(data=rows, columns=columns)

#### dropping rows that contain 'Not assigned' and reset table's index

In [9]:
# Drop every row whose Borough or Neighbourhood text contains 'Not assigned'.
# regex=False makes the match a plain substring test instead of a regular expression.
# The second mask is computed on the already-filtered frame so it lines up with its index.
table = table[~table['Borough'].str.contains("Not assigned", regex=False)]
table = table[~table['Neighbourhood'].str.contains("Not assigned", regex=False)]
# drop=True discards the old index directly, so no temporary 'index' column has to be
# created and then removed in place.
table = table.reset_index(drop=True)
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Merging rows that share a postcode and borough, and removing '\n'

In [10]:
# Strip newline characters first, so that grouping keys which differ only by a stray
# '\n' end up in the same group.
table = table.replace('\n', '', regex=True)
# Collapse each (Postcode, Borough) pair to one row whose Neighbourhood is the
# comma-separated list of all neighbourhoods in that group.
table = (
    table.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### showing the dimensions of the table

In [11]:
# Display the table's dimensions as a (number of rows, number of columns) tuple.
table.shape

(102, 3)

#### Exporting the table into csv

In [12]:
# Write the final table to 'output.csv' in the current working directory.
# With the default settings the DataFrame index is written as an unnamed first column.
table.to_csv(path_or_buf='output.csv')