<h3>Importing packages</h3>

In [4]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np 
import pandas as pd

<h3>Importing data</h3>

In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fail fast on a hung connection or an HTTP error status instead of handing
# an error page to the HTML parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Page HTML as text. The name `request` is kept because the next cell reads it.
request = response.text


In [6]:
# Parse the downloaded HTML with the lxml parser.
# NOTE(review): `soup` is not referenced by any later cell visible here;
# Scrap_wiki.parse_url downloads and parses the page again on its own.
soup = BeautifulSoup(markup=request, features='lxml')

<h3>Define the scraping class</h3>

In [7]:
class Scrap_wiki:
    """Turn the ``wikitable sortable`` tables of a web page into DataFrames."""

    def parse_url(self, url, timeout=30):
        """Download ``url`` and parse every matching table on the page.

        Parameters
        ----------
        url : str
            Address of the page to download.
        timeout : float, optional
            Seconds to wait for the server before giving up (default 30).

        Returns
        -------
        list of pandas.DataFrame
            One frame per ``<table>`` whose class attribute is
            ``"wikitable sortable"``, in the order ``find_all`` yields them.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status (4xx/5xx).
        """
        response = requests.get(url, timeout=timeout)
        # Surface HTTP errors here rather than parsing an error page and
        # returning an empty list.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')
        tables = soup.find_all('table', class_="wikitable sortable")
        return [self.parse_html_table(table) for table in tables]

    def parse_html_table(self, table):
        """Convert one parsed ``<table>`` element into a DataFrame.

        Each ``<tr>`` that contains at least one ``<td>`` becomes a row. The
        ``<th>`` texts of the first row that has any become the column labels;
        without headers the columns are numbered from 0. Cell and header text
        is taken verbatim from ``get_text()`` (no whitespace stripping).

        The frame is as wide as the widest data row; shorter rows are padded
        with NaN. Columns whose values all convert to ``float`` are returned
        as float columns, all others are left as objects.

        Parameters
        ----------
        table : object
            Element exposing ``find_all('tr')``, whose rows expose
            ``find_all('td')`` / ``find_all('th')`` returning cells with
            ``get_text()`` (e.g. a BeautifulSoup ``Tag``).

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        Exception
            If header cells exist and their count differs from the number of
            data columns.
        """
        column_names = []
        rows = []
        # Single pass: collect the text of every data row and the first header row.
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                rows.append([td.get_text() for td in td_tags])

            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                column_names = [th.get_text() for th in th_tags]

        # Size the frame to the widest row so that a row with extra cells
        # cannot index past the last column.
        n_columns = max((len(cells) for cells in rows), default=0)

        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        missing = float("nan")
        padded = [cells + [missing] * (n_columns - len(cells)) for cells in rows]
        # dtype=object keeps the raw strings untouched until the explicit
        # float conversion below.
        df = pd.DataFrame(padded, columns=columns, dtype=object)

        # Best-effort numeric conversion: columns with any non-numeric text
        # are deliberately left as they are.
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                pass

        return df

<h3>Create the DataFrame and clean the data</h3>

In [37]:
scrap = Scrap_wiki()
df_initial = scrap.parse_url(url)[0]  # first matching table on the page

# The scraper returns header and cell text with embedded newlines.
# Remove them from the column labels and from every value.
text_columns = ["Postal Code", "Borough", "Neighborhood"]
df_initial = df_initial.rename(columns={name + "\n": name for name in text_columns})
for name in text_columns:
    # regex=False: "\n" is a literal, and the default for `regex` differs
    # between pandas versions.
    df_initial[name] = df_initial[name].str.replace("\n", "", regex=False)
df_initial.head(20)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,Islington Avenue
9,M1B,Scarborough,"Malvern, Rouge"


In [30]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
df_without_NA = df_initial.loc[df_initial['Borough'].ne('Not assigned')]
df_without_NA.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [35]:
# Merge rows that share a postal code and borough into one row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
df = (
    df_without_NA
    .groupby(['Postal Code', 'Borough'])['Neighborhood']
    .apply(lambda x: ", ".join(x.astype(str)))
    .reset_index()
)
# Shuffle the grouped frame itself; sampling from df_without_NA here would
# discard the grouping above. random_state pins the row order so the cell
# gives the same result on every run.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores"
1,M7R,Mississauga,Canada Post Gateway Processing Centre
2,M6G,Downtown Toronto,Christie
3,M1W,Scarborough,"Steeles West, L'Amoreaux West"
4,M9B,Etobicoke,"West Deane Park, Princess Gardens, Martin Grov..."


In [36]:
# (number of rows, number of columns) of the final frame
df.shape

(103, 3)