## Relevant libraries to be used + web-scraping:

In [1]:
#Libraries
import pandas as pd
import requests as req
from bs4 import BeautifulSoup


#Collecting the data
source = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Bound the wait and fail loudly on an HTTP error, rather than parsing an
# error page and hitting an opaque AttributeError on `.table` further down.
response = req.get(source, timeout=30)
response.raise_for_status()
web_text = response.text
data = BeautifulSoup(web_text, 'lxml')
# The first <table> inside the article body is taken as the postal-code table.
table_body = data.find('div', class_='mw-parser-output').table.tbody

## Function for getting our dataframe:

In [2]:
#Function:
def Toronto_NH(t):
    """Convert the ``<tr>`` rows found under ``t`` into a dataframe.

    Parameters
    ----------
    t : object exposing ``find_all``
        Parsed table element (e.g. a BeautifulSoup tag). Each ``<tr>`` it
        contains is expected to hold ``<td>`` cells in the order postal code,
        borough, neighborhood.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>``, with columns ``PostalCode``, ``Borough`` and
        ``Neighborhood``.

    Notes
    -----
    A ``<tr>`` with fewer than three ``<td>`` cells keeps the values left over
    from the previous row for the missing fields; before any row is read these
    are ``0``. A row with no ``<td>`` cells at the very start therefore yields
    a row of 0's, which callers are expected to filter out.
    """
    header = ['PostalCode','Borough','Neighborhood']

    # Initial placeholders; they survive into the output for rows without <td> cells.
    PostalCode, Borough, Neighborhood = 0, 0, 0

    # Collect plain dict records and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    records = []
    for row in t.find_all('tr'):
        for position, cell in enumerate(row.find_all('td')):
            if position == 0:
                PostalCode = cell.text
            elif position == 1:
                Borough = cell.text
            else:
                # Third cell (and any later one): drop the surrounding newlines
                # and any stray ']' characters.
                Neighborhood = cell.text.strip('\n').replace(']','')

        records.append({header[0]: PostalCode, header[1]: Borough, header[2]: Neighborhood})

    return pd.DataFrame(records, columns=header)

#Output: parse the scraped table into a dataframe and preview its first rows
df = Toronto_NH(t=table_body)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,0,0,0
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


## Dropping rows whose Borough is "Not assigned" and the placeholder row of 0's

In [3]:
# Keep only rows with a real borough: neither the "Not assigned" marker
# nor the 0 placeholder, then renumber the index from 0.
borough = df['Borough']
has_borough = borough.ne('Not assigned') & borough.ne(0)
df = df.loc[has_borough].reset_index(drop=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


## Joining neighborhoods with the same postal code

In [4]:
def _join_unique(values):
    """Return the distinct values of a group, sorted and joined with ', '."""
    return ', '.join(sorted(set(values)))

# One row per postal code; every other column is collapsed with _join_unique.
df = df.groupby('PostalCode', as_index=False).agg(_join_unique)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Replacing "Not assigned" Neighborhoods with the name of their Borough

In [5]:
NH = df['Neighborhood']
# Rows whose neighborhood is missing take the borough name instead.
unassigned = NH == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

df.head(16)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Number of rows in our dataframe:

In [6]:
# The length of the index is the number of rows in the dataframe.
df_rows = len(df.index)
print("No# of rows:", df_rows)

No# of rows: 103


## Adding the geographical coordinates of each postal code to the dataframe

In [7]:
# Load the coordinates table from the provided source and align its key
# column name with the one used in df.
coords = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'PostalCode'})
)

# Attach the coordinates to each postal code (default inner join on the shared key).
df_with_coords = df.merge(coords, on='PostalCode')
df_with_coords.head(16)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
