#### Author: Ernie Sumoso

## Web scraping addresses from websites and one-hot encoding cities

In [1]:
from bs4 import BeautifulSoup
import requests
import re

def scrap_addresses(url, timeout=10):
    """Download a web page and pull Canadian-style addresses out of its text.

    The visible text of the page is flattened into one comma-separated
    string, which is then searched for up to three alphabetic words followed
    by a Canadian postal code (letter-digit-letter, optional space,
    digit-letter-digit, e.g. ``N7S 6K4``).

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Defaults to 10.

    Returns:
        A list with every matched address string, in the order found in the
        page text. Empty when nothing matches.

    Raises:
        Any exception raised by ``requests.get`` (connection failure,
        timeout, ...) propagates to the caller unchanged.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive host.
    response = requests.get(url, timeout=timeout)

    # Name the parser explicitly: letting BeautifulSoup pick one emits a
    # warning and can select different parsers on different machines.
    soup = BeautifulSoup(response.content, "html.parser")

    # Line breaks and tabs separate address parts on the page, so turn them
    # into commas before the remaining whitespace is collapsed.
    text = re.sub(r'[\n\t\r]', ',', soup.get_text())
    text = ' '.join(text.split())

    # Drop the empty fields left behind by consecutive separators.
    text = ','.join(chunk for chunk in text.split(',') if chunk != '')

    sep = "[, ]*"
    word = "[a-zA-Z]*"
    postal_code = "[A-Z][0-9][A-Z] ?[0-9][A-Z][0-9]"
    return re.findall(f"{word}{sep}{word}{sep}{word}{sep}{postal_code}", text)

def scrap_address_websites(websites):
    """Scrape every page in ``websites`` and pool the addresses found.

    Args:
        websites: Iterable of page URLs; each one is handed to
            ``scrap_addresses``.

    Returns:
        One flat list holding the address strings of all pages, in the
        order the pages were given.
    """
    return [
        found
        for page_url in websites
        for found in scrap_addresses(page_url)
    ]

# Pages to scrape for addresses.
websites = [
    "https://www.lambtoncollege.ca/",
    "https://www.torontomu.ca/",
    "https://www.senecapolytechnic.ca/home.html",
    "https://www.uwo.ca/index.html",
    "https://uwaterloo.ca/",
    "https://www.utoronto.ca/contacts",
]

# Download each page and pool the matched address strings.
addresses = scrap_address_websites(websites)
# Bare name as the cell's last expression so the notebook shows the list.
addresses

['Road, Sarnia, ON, N7S 6K4',
 'Street, Toronto, ON M5B 2K3',
 'Toronto, Ontario, Canada M2J 2X5',
 'London, Ontario, Canada, ,N6A 3K7',
 'Waterloo, ON, Canada,N2L 3G1',
 'Circle,Toronto, Ontario M5S 1A1',
 'Road,Mississauga, Ontario L5L 1C6',
 'Trail,Toronto, Ontario M1C 1A4']

In [2]:
import pandas as pd

def get_dataframe(addresses):
    """Split scraped address strings into a four-column DataFrame.

    Each address is cut into comma-separated fields, with the trailing
    postal code always treated as its own field. Fields are stripped of
    surrounding whitespace and empty ones are discarded. The fields are
    assigned to the columns positionally, so every address is expected to
    yield four fields.

    Args:
        addresses: Iterable of address strings ending in a Canadian postal
            code, written with or without the inner space
            (``N7S 6K4`` / ``N7S6K4``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``Street``, ``City``,
        ``Province`` and ``Zip Code``, one row per address.
    """
    # The postal code may or may not contain a space, so its length is 6 or
    # 7 characters; locate it by pattern instead of slicing a fixed width.
    postal_code = re.compile(r"[A-Z][0-9][A-Z] ?[0-9][A-Z][0-9]$")

    rows = []
    for web in addresses:
        match = postal_code.search(web)
        if match:
            head, code = web[:match.start()], web[match.start():]
        else:
            # No recognisable postal code: keep the fixed-width split.
            head, code = web[:-7], web[-7:]

        # Force a separator in front of the postal code, which is often
        # joined to the province by a space only.
        address = head + ',' + code

        # Strip before filtering so whitespace-only fields are dropped too.
        fields = [chunk.strip() for chunk in address.split(',')]
        rows.append([field for field in fields if field])

    return pd.DataFrame(rows, columns=['Street', 'City', 'Province', 'Zip Code'])

# Build the address table from the scraped strings; the bare name on the
# last line makes the notebook display it.
df = get_dataframe(addresses)
df

Unnamed: 0,Street,City,Province,Zip Code
0,Road,Sarnia,ON,N7S 6K4
1,Street,Toronto,ON,M5B 2K3
2,Toronto,Ontario,Canada,M2J 2X5
3,London,Ontario,Canada,N6A 3K7
4,Waterloo,ON,Canada,N2L 3G1
5,Circle,Toronto,Ontario,M5S 1A1
6,Road,Mississauga,Ontario,L5L 1C6
7,Trail,Toronto,Ontario,M1C 1A4


In [3]:
def apply_onehot_encoding(df, columns):
    """Replace the given columns with integer 0/1 indicator columns.

    Args:
        df: DataFrame to encode. It is not modified.
        columns: List of column names to one-hot encode. Each is replaced by
            one ``<column>_<value>`` indicator column per distinct value.

    Returns:
        A new DataFrame in which the remaining columns are left untouched
        and the indicator columns hold integers (0 or 1).
    """
    # Ask get_dummies for integer indicators directly. Multiplying the whole
    # result by 1 would also hit the columns that were not encoded, which
    # fails for types without multiplication (e.g. datetimes).
    return pd.get_dummies(df, columns=columns, dtype=int)

# One-hot encode the City column only; the result is displayed, not stored.
apply_onehot_encoding(df, ['City'])

Unnamed: 0,Street,Province,Zip Code,City_Mississauga,City_ON,City_Ontario,City_Sarnia,City_Toronto
0,Road,ON,N7S 6K4,0,0,0,1,0
1,Street,ON,M5B 2K3,0,0,0,0,1
2,Toronto,Canada,M2J 2X5,0,0,1,0,0
3,London,Canada,N6A 3K7,0,0,1,0,0
4,Waterloo,Canada,N2L 3G1,0,1,0,0,0
5,Circle,Ontario,M5S 1A1,0,0,0,0,1
6,Road,Ontario,L5L 1C6,1,0,0,0,0
7,Trail,Ontario,M1C 1A4,0,0,0,0,1


In [4]:
# One-hot encode both City and Province; the result is displayed, not stored.
apply_onehot_encoding(df, ['City', 'Province'])

Unnamed: 0,Street,Zip Code,City_Mississauga,City_ON,City_Ontario,City_Sarnia,City_Toronto,Province_Canada,Province_ON,Province_Ontario
0,Road,N7S 6K4,0,0,0,1,0,0,1,0
1,Street,M5B 2K3,0,0,0,0,1,0,1,0
2,Toronto,M2J 2X5,0,0,1,0,0,1,0,0
3,London,N6A 3K7,0,0,1,0,0,1,0,0
4,Waterloo,N2L 3G1,0,1,0,0,0,1,0,0
5,Circle,M5S 1A1,0,0,0,0,1,0,0,1
6,Road,L5L 1C6,1,0,0,0,0,0,0,1
7,Trail,M1C 1A4,0,0,0,0,1,0,0,1
