## Part 1 
### Canada borough data preprocessing

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Upload html file with post codes [data source](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

In [2]:
with open('ZIP_list_Canada.html') as html_file:
    soup = BeautifulSoup(html_file, 'lxml')
    
table  = soup.table
# print(table)

### Extract the data from the html source

In [3]:
arr = []
for rows in table.find_all('tr'):
    text = rows.text
    arr.append(text)
code = []
borough = []
neighborhood = []

for line in arr:
    elem = line.split('\n')
#     print(elem[1:4])
    code.append(elem[1])
    borough.append(elem[2])
    neighborhood.append(elem[3])
    
column_names = [code[0], borough[0], neighborhood[0]]
code = code[1:]
borough = borough[1:]
neighborhood = neighborhood[1:]
print(column_names)

['Postcode', 'Borough', 'Neighborhood']


In [4]:
code[:5]

['M1A', 'M2A', 'M3A', 'M4A', 'M5A']

In [5]:
borough[:5]

['Not assigned',
 'Not assigned',
 'North York',
 'North York',
 'Downtown Toronto']

In [6]:
neighborhood[:5]

['Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Harbourfront']

### Creating dataframe from Wiki data

In [7]:
df = pd.DataFrame([code, borough, neighborhood]).T
df.columns = column_names
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning the dataset

In [8]:
#Removing the boroughs that are "Not assigned"

df = df[df['Borough'] != 'Not assigned']
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [9]:
#Combining Neighborhoods with the same Postcode and Borough 
df = df.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [10]:
with pd.option_context('display.max_rows', 500, 'display.max_columns', 50):
    display(df)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [11]:
# Replacing not assigned neighborhoods with the borough values:

df.loc[df['Neighborhood'].str.find('Not assigned') != -1]

Unnamed: 0,Postcode,Borough,Neighborhood
93,M9A,Queen's Park,Not assigned


In [12]:
df.iloc[93]['Neighborhood'] = str(df.iloc[93]['Borough'])
df.iloc[93]

Postcode                 M9A
Borough         Queen's Park
Neighborhood    Queen's Park
Name: 93, dtype: object

In [13]:
df.shape

(103, 3)

## Part 2
### Adding latitude and longitude to each neighborhood

In [14]:
position = pd.read_csv('Geospatial_Coordinates.csv')
position.columns = ['Postcode', 'Latitude', 'Longitude']
position

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Merging dataframes

In [15]:
df = df.merge(position, how='left', on='Postcode')

In [16]:
with pd.option_context('display.max_rows', 500, 'display.max_columns', 50):
    display(df)

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Part 3 
### Clustering

In [18]:
from sklearn.cluster import KMeans
import folium

In [25]:
lat_mean = df['Latitude'].mean()
lon_mean = df['Longitude'].mean()
# create map of New York using latitude and longitude values
toronto_map = folium.Map(location=[lat_mean, lon_mean], zoom_start=10)

# add markers to map
for lat, lon, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

### Define Foursquare Credentials and Version

In [None]:
# CLIENT_ID = 'your-client-ID' # your Foursquare ID
# CLIENT_SECRET = 'your-client-secret' # your Foursquare Secret
# VERSION = '20180605' # Foursquare API version

In [None]:
# print('Your credentails:')
# print('CLIENT_ID: ' + CLIENT_ID)
# print('CLIENT_SECRET:' + CLIENT_SECRET)

In [None]:
# toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# kmeans = KMeans(n_clusters=5, random_state=0).fit(toronto_grouped_clustering)

