In [6]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
findtables = soup.find_all('table')

# scrape all the tables found on the webpage
tables = pd.read_html(str(findtables))

# check the total number of tables found (4 in this case)
print(len(tables))

4


In [7]:
# figure out which table out of the four is the one of interest (luckily the first one it is :D)
tables[0]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [8]:
# read the table into a dataframe, remove the space in the column header 'Postal Code' 

df = tables[0]
df.columns = ['PostalCode', 'Borough', 'Neighbourhood']

# check the shape of the dataframe (180 rows incl. header and 3 columns)
df.shape

(180, 3)

In [9]:
# filter out rows where borough has a valid value and store the filtered data into a new dataframe df_filtered 

df_filtered = df[df['Borough']!='Not assigned']

# check how many rows remain (103) and print out unique borough names to check if all 'not assigned' has been removed
print(df_filtered.shape)
print(df_filtered['Borough'].unique())

(103, 3)
['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


In [11]:
# add a column 'check_neighbourhood' that checks if any neighbourhood is 'Not assigned'
df_filtered['check_neighbourhood'] = df_filtered['Neighbourhood']!='Not assigned'
df_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,PostalCode,Borough,Neighbourhood,check_neighbourhood
2,M3A,North York,Parkwoods,True
3,M4A,North York,Victoria Village,True
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",True
5,M6A,North York,"Lawrence Manor, Lawrence Heights",True
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",True
...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",True
165,M4Y,Downtown Toronto,Church and Wellesley,True
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",True
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",True


In [12]:
# count the number of neighbours that have a valid value. The number equals row number indicating all neighbourhood is assigned a value
df_filtered['check_neighbourhood'].value_counts()

True    103
Name: check_neighbourhood, dtype: int64

In [13]:
# drop the checking column and store the cleaned data into df_clean

df_clean = df_filtered.drop(['check_neighbourhood'], axis=1)
df_clean

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [14]:
df_clean.shape

(103, 3)