# Segmenting and Clustering Neighborhoods in Toronto

In [27]:
import pandas as pd
import numpy as np

!pip install folium
import folium

import requests
from bs4 import BeautifulSoup

from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


In [3]:
# Capture the wiki webpage and parse it.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text
soup = BeautifulSoup(page, 'lxml')

In [4]:
# stopIndex is a helper function for the string processing that isolates the Borough.
# This cell extracts the postcodes, boroughs and neighbourhoods from the page.

def stopIndex(s):
    """Return the index of the first uppercase character preceded by a lowercase one.

    The character at index 0 is never considered, since nothing precedes it.
    If no such position exists (including for an empty string), len(s) is
    returned, so that s[:stopIndex(s)] yields the whole string.
    """
    for pos in range(1, len(s)):
        if s[pos].isupper() and s[pos - 1].islower():
            return pos
    return len(s)
    
    
# Walk every <td> of the first table and split its text into
# postcode, borough and neighbourhood(s), collected in three parallel lists.
postcodes, boroughs, neighbourhoods = [], [], []
for cell in soup.table.find_all('td'):
    text = cell.text.strip()

    # The first three characters are taken as the postcode; the borough is the
    # start of the remaining text up to the first '(', cut off by stopIndex.
    head = text[3:].split('(')[0]
    postcodes.append(text[:3])
    boroughs.append(head[:stopIndex(head)])

    if 'Not assigned' in text:
        neighbourhoods.append('Not assigned')
    else:
        # Everything after the last '(' minus the final character
        # (presumably the closing ')'), with ' / ' separators turned into ', '.
        inside = text.split('(')[-1][:-1]
        neighbourhoods.append(', '.join(inside.split(' / ')))

In [5]:
# Assemble the three parallel lists into one table, then report its size and preview it.
table_columns = {
    'Postcode': postcodes,
    'Borough': boroughs,
    'Neighbourhood': neighbourhoods,
}
df = pd.DataFrame(table_columns)
print(df.shape)
df.head()

(180, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Preprocessing Data

In [6]:
# Dropping the rows where Borough is 'Not assigned'.
# .copy() makes the filtered result an independent frame, so the column
# assignment below does not write into a slice of the original frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['Borough'] != 'Not assigned'].copy()

# Replacing the name of the neighbourhoods which are 'Not assigned' with names of Borough
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])

In [7]:
# Report the (rows, columns) shape of df at this point in the notebook.
print(df.shape)

(103, 3)


# Code Segment 2

In [21]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
# Keep only the rows whose Borough contains the literal text 'Toronto'.
is_toronto = df['Borough'].str.contains('Toronto', regex=False)
df_toronto = df[is_toronto]
# NOTE(review): this previews the unfiltered `df`, not `df_toronto` - confirm which was intended.
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


## Combining geo_data and df_toronto to get latitudes and longitudes

In [22]:
# Give the coordinate table the same key column name as df_toronto,
# then join the two frames on that shared 'Postcode' column.
geo_data.rename(columns={'Postal Code': 'Postcode'}, inplace=True)
df_combined = df_toronto.merge(geo_data, on='Postcode')
df_combined.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


# Code segment 3
## Clustering

### Visualizing the data using folium

In [23]:
# Base map at the fixed centre coordinates used throughout this notebook.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One circle marker per row of df_combined, with a "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    df_combined['Latitude'],
    df_combined['Longitude'],
    df_combined['Borough'],
    df_combined['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    label = folium.Popup(f'{neighbourhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color='red',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)
map_toronto

# Clustering the data using Kmeans

In [24]:
k = 5

# Features for clustering: everything except the identifying text columns.
# - `columns=` replaces the positional axis argument (`drop([...], 1)`), which
#   pandas 2.0 no longer accepts.
# - 'Cluster Labels' is dropped too (ignored when absent) so that re-running
#   this cell does not feed the previous run's labels back in as a feature.
toronto_clustering = df_combined.drop(
    columns=['Postcode', 'Borough', 'Neighbourhood', 'Cluster Labels'],
    errors='ignore',
)

# n_init is pinned to 10 (the long-standing default) so results do not change
# with scikit-learn versions whose default is 'auto'.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(toronto_clustering)

# Attach the labels as the first column. DataFrame.insert raises if the column
# already exists, so overwrite it in place when the cell is run again.
if 'Cluster Labels' in df_combined.columns:
    df_combined['Cluster Labels'] = kmeans.labels_
else:
    df_combined.insert(0, 'Cluster Labels', kmeans.labels_)

In [28]:
# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# set color scheme for the clusters: k hex colours sampled evenly from the
# rainbow colormap, one per cluster label
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map
for lat, lon, cluster in zip(df_combined['Latitude'], df_combined['Longitude'], df_combined['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels run from 0 to k-1, so they index the palette directly;
    # the former `cluster-1` only worked for label 0 via negative-index wrap-around.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters