<h1>Scraping the Wikipedia page "List of postal codes of Canada: M" into a dataframe</h1><p>
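1. The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood (named <code>postcode</code>, <code>borough</code>, and <code>neighborhood</code> in the code below).<p>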
2. Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.<p>
3. More than one neighborhood can exist in one postal code area.<p>
4. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.<p>

In [47]:
import bs4 as bs
import urllib.request
import pandas as pd

# Download the page; the context manager closes the HTTP response once the body is read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    source = response.read()
soup = bs.BeautifulSoup(source,'lxml')

table = soup.find('table', attrs={'class':"wikitable sortable"})
if table is None:
    # Fail with a readable message instead of an AttributeError on None below.
    raise ValueError("No 'wikitable sortable' table found - the page layout may have changed.")
table_rows = table.find_all('tr')

# One record per postal code, kept in order of first appearance:
# postcode -> [postcode, borough, neighborhood(s)].
rows_by_postcode = {}
for tr in table_rows:
    # Keep empty cells so that column positions stay aligned.
    cells = [cell.text.strip() for cell in tr.find_all('td')]
    if len(cells) < 3:
        continue  # header row (no <td> cells) or an incomplete row
    postcode, borough, neighborhood = cells[:3]

    # Requirement 2: ignore cells without an assigned borough.
    if not postcode or not borough or borough == 'Not assigned':
        continue
    # Requirement 4: an unassigned (or blank) neighborhood takes the borough's name.
    if not neighborhood or neighborhood == 'Not assigned':
        neighborhood = borough

    # Requirement 3: several neighborhoods may share one postal code;
    # join them into a single comma-separated entry.
    if postcode in rows_by_postcode:
        rows_by_postcode[postcode][2] += ', ' + neighborhood
    else:
        rows_by_postcode[postcode] = [postcode, borough, neighborhood]

res = list(rows_by_postcode.values())


df_neigh = pd.DataFrame(res, columns=["postcode", "borough", "neighborhood"])
print(df_neigh.head(11)) 
print('Totally imported:', df_neigh.shape) 

   postcode           borough                      neighborhood
0       M3A        North York                         Parkwoods
1       M4A        North York                  Victoria Village
2       M5A  Downtown Toronto                      Harbourfront
3       M6A        North York  Lawrence Heights, Lawrence Manor
4       M7A  Downtown Toronto                      Queen's Park
5       M9A      Queen's Park                      Queen's Park
6       M1B       Scarborough                    Rouge, Malvern
7       M3B        North York                   Don Mills North
8       M4B         East York   Woodbine Gardens, Parkview Hill
9       M5B  Downtown Toronto          Ryerson, Garden District
10      M6B        North York                         Glencairn
Totally imported: (103, 3)


<h1>Getting the latitude and the longitude coordinates of each neighborhood</h1>

In [48]:
!wget -q -O 'Geospatial_Coordinates.csv' http://cocl.us/Geospatial_data

df_coordinates = pd.read_csv('Geospatial_Coordinates.csv')

df_neigh = pd.merge(left=df_neigh,right=df_coordinates, left_on='postcode', right_on='Postal Code')
df_neigh = df_neigh.drop(columns='Postal Code')
print(df_neigh.head(11))
print('Totally imported:', df_neigh.shape) 

   postcode           borough                      neighborhood   Latitude  \
0       M3A        North York                         Parkwoods  43.753259   
1       M4A        North York                  Victoria Village  43.725882   
2       M5A  Downtown Toronto                      Harbourfront  43.654260   
3       M6A        North York  Lawrence Heights, Lawrence Manor  43.718518   
4       M7A  Downtown Toronto                      Queen's Park  43.662301   
5       M9A      Queen's Park                      Queen's Park  43.667856   
6       M1B       Scarborough                    Rouge, Malvern  43.806686   
7       M3B        North York                   Don Mills North  43.745906   
8       M4B         East York   Woodbine Gardens, Parkview Hill  43.706397   
9       M5B  Downtown Toronto          Ryerson, Garden District  43.657162   
10      M6B        North York                         Glencairn  43.709577   

    Longitude  
0  -79.329656  
1  -79.315572  
2  -79.360636  

<h1>Exploring and clustering the neighborhoods in and around Toronto</h1>

### Clustering process

In [45]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two columns explicitly (rather than
# dropping the text columns) keeps 'Cluster Labels' from a previous run out of the
# feature matrix and avoids passing `axis` positionally to DataFrame.drop, which
# newer pandas releases reject.
grouped_clustering = df_neigh[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is pinned to the historical default of 10 so the result does not depend on
# the installed scikit-learn version.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(grouped_clustering)

# Attach the label of each row as the first column. Drop any labels from an earlier
# run first: DataFrame.insert raises ValueError when the column already exists.
if 'Cluster Labels' in df_neigh.columns:
    df_neigh = df_neigh.drop(columns='Cluster Labels')
df_neigh.insert(0, 'Cluster Labels', kmeans.labels_)
df_neigh.head()

Unnamed: 0,Cluster Labels,postcode,borough,neighborhood,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,2,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


### Visualize cluster groups

In [46]:
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


# Centre of the initial map view.
latitude = 43.761539
longitude = -79.411079

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster,
# converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_neigh['Latitude'], df_neigh['Longitude'], df_neigh['neighborhood'], df_neigh['Cluster Labels']):
    # Cluster labels are 0-based, so they index the palette directly.
    # (The previous `cluster-1` made cluster 0 wrap around to the last colour.)
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters