# Notebook to create the desired pandas dataframe (Postal Code, Borough, Neighborhood) 

In [1]:
# --- standard library ---
import json  # library to handle JSON files
import urllib.request  # fetch the web page to be scraped

# --- third party ---
import numpy as np
import pandas as pd

import requests  # library to handle requests
from bs4 import BeautifulSoup  # HTML parsing
from geopy.geocoders import Nominatim  # convert an address into latitude and longitude values

# json_normalize transforms nested JSON into a pandas dataframe.
# It is exposed at the pandas top level since pandas 1.0; the old
# pandas.io.json location is deprecated, so it is only used as a fallback.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium  # map rendering library

# Display every column and every row when a dataframe is rendered
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print('Import done')

Import done


### URL to get the postal codes from Wikipedia

In [2]:
# Wikipedia page to be scraped for the Canadian postal codes (page "..._of_Canada:_M")
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the url with urllib.request and parse the HTML into a BeautifulSoup tree.
# The context manager closes the HTTP response once the page has been parsed.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

# First table on the page carrying the 'wikitable sortable' class attribute
right_table = soup.find('table', class_='wikitable sortable')

# soup.find returns None when nothing matches; fail here with a clear message
# instead of with an AttributeError in the cell that iterates over the table.
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' found at " + url)

### Extract postal codes and corresponding boroughs and neighborhoods and create the desired pandas dataframe

In [3]:
# One list per output column, filled row by row from the scraped table
Postcode = []
Borough = []
Neighborhood = []

# find_all / string=True are the current BeautifulSoup spellings of the
# deprecated findAll / text=True.
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are data rows; anything else
    # (e.g. a row without <td> cells) is skipped.
    if len(cells) != 3:
        continue
    # First text node of each cell: postal code, borough, neighborhood
    code_text, borough_text, neighborhood_text = (cell.find(string=True) for cell in cells)
    Postcode.append(code_text)
    Borough.append(borough_text)
    Neighborhood.append(neighborhood_text)

CA_postcodes = pd.DataFrame({'PostalCode': Postcode, 'Borough': Borough, 'Neighborhood': Neighborhood})
CA_postcodes.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Drop entries whose borough is 'Not assigned' (and strip the trailing '\n' characters)

In [4]:
# Strip newline characters first, so the borough comparison below also matches
# values that were scraped as 'Not assigned\n'.
CA_postcodes = CA_postcodes.replace('\n', '', regex=True)

# Index labels of all rows without an assigned borough
index_Notassigned = CA_postcodes[CA_postcodes['Borough'] == 'Not assigned'].index

# Rebind instead of inplace=True. The remaining index labels are deliberately
# left as they are (no reset_index), so the index keeps its gaps.
CA_postcodes = CA_postcodes.drop(index_Notassigned)
CA_postcodes.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Identify not assigned neighborhoods and replace 'Not assigned' with the corresponding borough

In [5]:
# Rows whose neighborhood still holds the 'Not assigned' placeholder.
# A boolean mask is aligned on the index labels, so it stays correct even though
# the index is not a contiguous 0..n-1 range (labels are missing where rows were
# dropped). A running position counter used as a label would hit the wrong row,
# or a missing label, once it passes the first gap.
neighborhood_missing = CA_postcodes['Neighborhood'] == 'Not assigned'

# One .loc assignment instead of chained indexing (df[col][label] = ...), which
# pandas does not guarantee to write back into the original dataframe.
CA_postcodes.loc[neighborhood_missing, 'Neighborhood'] = CA_postcodes.loc[neighborhood_missing, 'Borough']

CA_postcodes.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Verify that no 'Not assigned' entries remain

In [6]:
# Placeholder spellings to look for: with and without a trailing newline
not_assigned_markers = ['Not assigned', 'Not assigned\n']

# Boolean frame, True wherever a cell equals one of the markers
chk_notassigned = CA_postcodes.isin(not_assigned_markers)

# Summary per column: a single unique value of False means no marker is left
chk_notassigned.describe()

Unnamed: 0,PostalCode,Borough,Neighborhood
count,211,211,211
unique,1,1,1
top,False,False,False
freq,211,211,211


### Group the neighborhoods that share a postal code into a single row

In [7]:
# One row per (PostalCode, Borough) pair, with the neighborhoods of that pair
# joined into a comma-separated string. sort=False keeps the groups in order of
# first appearance.
final_df = (
    CA_postcodes
    .groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood']
    .apply(', '.join)
    .to_frame(name='Neighborhood')
    .reset_index()
)
final_df.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Use the `.shape` attribute to print the dimensions of the dataframe

In [8]:
# .shape is a (rows, columns) tuple
n_rows, n_cols = final_df.shape
print('The dataframe has', n_rows, 'rows and', n_cols, 'columns')

The dataframe has 103 rows and 3 columns
