# Assignment: Segmenting and Clustering Neighborhoods in Toronto

## Project Requirement: To explore and cluster the neighborhoods in Toronto.

### Importing required libraries


In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from geopy.geocoders import Nominatim

import warnings
warnings.filterwarnings('ignore')

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library


### Loading the web page and parsing its HTML

In [2]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes as HTML text.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error page.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_response = requests.get(WIKI_URL, timeout=30)
wiki_response.raise_for_status()
wiki_data = wiki_response.text

# Parse the downloaded HTML with Beautiful Soup, using the lxml parser.
soup = BeautifulSoup(wiki_data, 'lxml')

### Processing: (Step 1) Locating and Extracting raw table from the webpage


In [3]:
# Take the first <table> element in the parsed page and collect all of its <td> cells.
table = soup.find('table')
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the next line.
    raise ValueError('No <table> element was found in the downloaded page.')
columns = table.find_all('td')
no_elements = len(columns)

# Accumulators for the three values that are read out of the table cells.
Postcode = []
Borough = []
Neighborhood = []

In [4]:
# The flat list of <td> cells is read as consecutive (postcode, borough, neighborhood)
# triples, so the number of cells must be a multiple of three.
if no_elements % 3 != 0:
    raise ValueError(
        'Expected the table cells to come in groups of 3, found {} cells.'.format(no_elements)
    )

# Rebuild the three lists from scratch (instead of appending to lists created in
# another cell) so that re-running this cell does not duplicate every row.
cell_text = [cell.text.strip() for cell in columns]
Postcode = cell_text[0::3]
Borough = cell_text[1::3]
Neighborhood = cell_text[2::3]

### Transforming raw data into an appropriate Pandas Dataframe

In [5]:
# Assemble one row per (postcode, borough, neighborhood) triple and name the columns.
raw_records = list(zip(Postcode, Borough, Neighborhood))
df_raw = pd.DataFrame(raw_records, columns=['Postcode', 'Borough', 'Neighborhood'])

In [6]:
# Preview the first rows of the raw scraped table, before any cleaning is applied.
df_raw.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning data in the dataframe to drop values that are Not assigned

In [7]:
# Remove every row whose borough is 'Not assigned' (df_raw is modified in place).
unassigned_rows = df_raw.index[df_raw['Borough'] == 'Not assigned']
df_raw.drop(unassigned_rows, inplace=True)

# Where only the neighborhood is 'Not assigned', reuse the row's borough name for it.
missing_name = df_raw['Neighborhood'] == 'Not assigned'
df_raw.loc[missing_name, 'Neighborhood'] = df_raw.loc[missing_name, 'Borough']

### Grouping the data by Postcode & Borough

In [8]:
# Collapse rows that share a postcode and borough into a single row whose
# Neighborhood value is the comma-separated list of that group's neighborhoods.
df_1 = (
    df_raw
    .groupby(['Postcode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
df_1.columns = ['Postcode', 'Borough', 'Neighborhood']

In [9]:
# Preview the grouped table: one row per postcode/borough pair.
df_1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# Report the number of distinct boroughs and the number of rows in df_1.
# Note: the second figure is the row count of df_1, which the message labels
# as "neighborhoods".
borough_count = df_1['Borough'].nunique(dropna=False)
row_count = len(df_1)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods.


### Number of rows and columns in dataframe

In [11]:
# Show (rows, columns) of the grouped table.
df_1.shape

(103, 3)

### Read the Geospatial csv file and inner join it with dataframe


In [12]:
# Load the geospatial CSV and relabel its columns so the key column is called
# 'Postcode', matching the column name used in df_1.
# NOTE(review): the rename is positional and assumes the file has exactly three
# columns in postcode / latitude / longitude order - confirm against the file.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'
LOCATION_COLUMNS = ['Postcode', 'Latitude', 'Longitude']

df_loc = pd.read_csv(GEOSPATIAL_CSV_URL)
df_loc.columns = LOCATION_COLUMNS

In [13]:
# Attach coordinates to each postcode; the inner join keeps only postcodes
# that are present in both frames.
df_2 = df_1.merge(df_loc, how='inner', on='Postcode')


In [14]:
# Preview the merged table, which now carries Latitude and Longitude per postcode.
df_2.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
# Build an independent copy holding only the columns used for mapping:
# borough, neighborhood name(s) and the coordinates.
map_columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
neighborhoods = df_2.loc[:, map_columns].copy()
neighborhoods.head(5)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


### Clustering neighborhoods in Toronto

In [16]:
# Geocode the city itself; its coordinates are used to centre the map.
address = 'Toronto, Canada'

# Nominatim requires an identifying user agent: recent geopy releases raise
# ConfigurationError when the default one is used.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match for the address.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Visualizing the neighborhoods and clusters

In [17]:
# Create the base map centred on the geocoded Toronto latitude/longitude.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of `neighborhoods`; clicking a marker shows
# "<neighborhood>, <borough>" in a popup.
for lat, lng, borough, neighborhood in zip(
    neighborhoods['Latitude'],
    neighborhoods['Longitude'],
    neighborhoods['Borough'],
    neighborhoods['Neighborhood'],
):
    # parse_html is an option of Popup (it is set here); it is not a CircleMarker
    # option, so it is no longer passed to the marker below.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Keep the map as the cell's last expression so the notebook renders it.
map_toronto