# Obtaining the data from "Wikipedia", cleaning and indexing

### 1. Reading the table from the webpage into a pandas DataFrame

In [276]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [277]:
import pandas as pd    
# read_html returns a list of DataFrames, one per matching table; only tables
# with class "wikitable" are parsed and their first column becomes the index.
wiki_data = pd.read_html(url, attrs={"class": "wikitable"}, index_col=0)

### 2. Setting column names and resetting the index

In [278]:
# Move the index (the table's first column) back into a regular column.
df = wiki_data[0].reset_index()
# The first row holds the header text: promote it to the column names,
# then remove that row from the data.
df.columns = df.iloc[0].tolist()
df = df.drop(index=0)


In [279]:
# Preview the first rows to check that the header row became the column names.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### 3. Dealing with "Not assigned" values and repeated Postcodes

All rows that have no Borough ("Not assigned") will be discarded.  
Any Neighbourhood that is "Not assigned" will be set to the same value as its Borough.  
Neighbourhoods that share the same Postcode will be merged into one row, separated by commas (",").  

In [280]:
# Keep only the rows whose Borough is something other than "Not assigned".
df = df.loc[df["Borough"] != "Not assigned"]

In [281]:
# Wherever Neighbourhood is "Not assigned", use the row's Borough instead.
# This is done with a vectorised Series.mask rather than a row-wise
# apply(cleaner, axis=1): `cleaner` is only defined in a later cell, so the
# apply call raised NameError when the notebook was run top to bottom on a
# fresh kernel.
df = df.assign(
    Neighbourhood=df["Neighbourhood"].mask(
        df["Neighbourhood"] == "Not assigned", df["Borough"]
    )
)

In [282]:

def cleaner(cell):
    """Replace an unassigned neighbourhood with the row's borough.

    Parameters
    ----------
    cell : pandas.Series
        One row of the table, read by position: position 1 is used as the
        replacement value and position 2 is the value that is checked
        against "Not assigned".

    Returns
    -------
    pandas.Series
        A copy of the row in which position 2 equals position 1 whenever it
        was "Not assigned"; otherwise an unchanged copy.
    """
    # Work on a copy: functions that mutate the object handed to them by
    # DataFrame.apply are not supported by pandas.
    row = cell.copy()
    # .iloc makes the positional access explicit. Plain row[2] on a
    # string-labelled Series relies on a deprecated integer-key fallback.
    if row.iloc[2] == "Not assigned":
        row.iloc[2] = row.iloc[1]
    return row
    
    

In [283]:
# Order the rows by Postcode and renumber them from 0, discarding the old index.
df = df.sort_values(by="Postcode").reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,Rouge
1,M1B,Scarborough,Malvern
2,M1C,Scarborough,Port Union
3,M1C,Scarborough,Rouge Hill
4,M1C,Scarborough,Highland Creek


In [284]:
def merger(cell):
    """Join a collection of strings into one comma-separated string.

    Parameters
    ----------
    cell : iterable of str
        The strings to combine, e.g. the neighbourhood names of one group.

    Returns
    -------
    str
        The items joined with ", "; an empty string for an empty input.
    """
    # The previous body called `cell.c.join()`, which raised AttributeError
    # on any input and never returned the joined text.
    return ', '.join(cell)

In [285]:
# One row per (Postcode, Borough) pair; the strings of every remaining column
# are concatenated with ", " within each group.
a = (
    df.groupby(["Postcode", "Borough"])
    .agg(lambda names: ", ".join(names))
)

In [286]:
# Turn the Postcode/Borough group keys back into ordinary columns.
a = a.reset_index()
a.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Confirming that this dataframe is the one requested by the assignment. 

In [287]:
# Spot check: M5G should appear as a single row.
a.loc[a["Postcode"] == "M5G"]

Unnamed: 0,Postcode,Borough,Neighbourhood
57,M5G,Downtown Toronto,Central Bay Street


In [288]:
# Spot check: M4B should show its neighbourhoods merged into one row.
a.loc[a["Postcode"] == "M4B"]

Unnamed: 0,Postcode,Borough,Neighbourhood
35,M4B,East York,"Parkview Hill, Woodbine Gardens"


In [289]:
# Spot check: M1B should show its neighbourhoods merged into one row.
a.loc[a["Postcode"] == "M1B"]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"


In [290]:
# (rows, columns) of the merged table.
a.shape

(103, 3)

In [291]:
# Save the merged table to a CSV file in the working directory.
# NOTE(review): to_csv writes the index by default, so the file gets a leading
# unnamed index column — confirm that downstream readers expect it.
a.to_csv("neigh_data.csv")