### The following notebook scrapes a table of Canadian postal codes from Wikipedia and cleans the resulting data frame
#### The first step is to import pandas and read the dataframe

In [1]:
# scrape webpage into pandas df

import pandas as pd

# read_html returns one DataFrame per HTML table on the page;
# the postal-code listing is the first of them.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(url)
df = tables[0]
df.head()

Unnamed: 0,Postal Code,District,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Step 2 is to remove the rows whose District is "Not assigned"

In [2]:
# Flag the rows whose District is the placeholder 'Not assigned',
# then preview the frame with those rows left out.
mask = df['District'] == 'Not assigned'
df.loc[~mask].head()

Unnamed: 0,Postal Code,District,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Step 3 is to find rows that share a postal code and combine them into a single row, with their neighbourhoods separated by a comma

In [3]:
# Combine duplicate postal codes into one row each, joining the distinct values of
# every remaining column with a comma.
# Series.unique() keeps values in order of first appearance, so the joined text is the
# same on every run; iterating a set() of strings can change order between kernels.
df1 = df[~mask].groupby("Postal Code").agg(lambda col: ','.join(col.unique()))
df1.head()

Unnamed: 0_level_0,District,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


#### Step 4: if a postal code has a district but its neighbourhood is "Not assigned", the neighbourhood is set to the district name

In [5]:
# A row with a district but a "Not assigned" neighbourhood takes the district name
# as its neighbourhood.
needs_fill = df1['Neighbourhood'] == "Not assigned"
df1.loc[needs_fill, 'Neighbourhood'] = df1.loc[needs_fill, 'District']
df1.head()

Unnamed: 0_level_0,District,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


#### Finally, print the number of rows (shape) of the dataframe

In [6]:
# shape is a (number of rows, number of columns) tuple for the cleaned frame
df1.shape

(103, 2)