# Assignment: Segmenting and Clustering Neighborhoods in Toronto (Week 3)

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.2 MB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0         conda-forge
    geopy:         1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    openssl:       1.1.1

## SCRAPING Wikipedia Page to extract NEIGHBORHOODS in TORONTO

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL; the
# name is kept because the BeautifulSoup cell reads it.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not let a stalled connection hang the notebook indefinitely
)
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
website_url = response.text

In [3]:
# Install BeautifulSoup from the notebook, then import it.
!pip install beautifulsoup4
from bs4 import BeautifulSoup
# Parse the downloaded page text (`website_url`) with Python's built-in HTML parser.
soup = BeautifulSoup(website_url,'html.parser')


Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/3b/c8/a55eb6ea11cd7e5ac4bacdf92bac4693b90d3ba79268be16527555e186f0/beautifulsoup4-4.8.1-py3-none-any.whl (101kB)
[K     |████████████████████████████████| 102kB 2.7MB/s ta 0:00:011
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.1 soupsieve-1.9.5


In [5]:
# Collect every <table> element on the page that carries the "sortable" CSS class.
My_table = soup.find_all('table', attrs={'class': 'sortable'})

In [6]:
# Select the postal-code table: the sortable table whose first three header cells
# read postcode / borough / neighbourhood. Header text is lower-cased before the
# comparison, because a lowercase literal can never equal capitalised headings
# (presumably what the page uses -- TODO confirm against the live page).
expected_headings = ['postcode', 'borough', 'neighbourhood']
for table in My_table:
    headings = [th.text.strip().lower() for th in table.find_all('th')]
    if headings[:3] == expected_headings:
        break
else:
    # Reached when no table matched, including when My_table is empty. Failing
    # loudly is better than silently scraping whichever table happened to be last.
    raise ValueError(
        'No sortable table with headings {} was found on the page.'.format(expected_headings)
    )

# Extract the columns we want and write to a semicolon-delimited text file.
# The "; " separator (semicolon + space) is kept on purpose: the file is read back
# with sep=";", so the 2nd and 3rd fields carry a leading space that later cells
# rely on (e.g. the comparison with ' Not assigned').
# The file is written as UTF-8 because pd.read_csv decodes as UTF-8 by default.
with open('Toronto_neighbourhoods.txt', 'w', encoding='utf-8') as fo:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) < 3:
            # Header rows contain no <td> cells; rows with fewer than three cells
            # cannot be unpacked into the three fields, so they are skipped too.
            continue
        postcode, borough, neighbourhood = [td.text.strip() for td in tds[:3]]
        print('; '.join([postcode, borough, neighbourhood]), file=fo)


In [7]:
#Create a dataframe from the extracted text file
df = pd.read_csv('Toronto_neighbourhoods.txt', sep=";", header=None)
df.columns=['Postal Code', 'Borough', 'Neighborhood']
df.shape

(288, 3)

## Removing the Rows whose Borough is NOT ASSIGNED

In [8]:
# Keep only the rows whose borough is assigned. The leading space in the literal is
# intentional: the file was written with "; " and read with sep=";", so every value
# in this column starts with a space.
has_borough = df['Borough'] != ' Not assigned'
df = df[has_borough]
df.shape

(211, 3)

## Merging Multiple Neighborhoods that Share the Same Postal Code

In [9]:
# One row per postal code: keep the first borough seen and join all of that code's
# neighbourhood names into a single comma-separated string.
df1 = df.groupby('Postal Code').agg({'Borough':'first','Neighborhood': ', '.join})
# Postal codes in their original order of appearance. Columns are selected with a
# list: indexing with a set (df[{'Postal Code'}]) is deprecated in pandas 1.5 and
# raises TypeError in pandas 2.x.
df3 = df[['Postal Code']]
df4 = df3.drop_duplicates()
# Inner-join the ordered codes (left) with the aggregated rows (right, indexed by
# 'Postal Code'); the result follows the order of the left frame's keys.
df_inner = pd.merge(df4, df1, on='Postal Code', how='inner')


## Setting the Neighborhood Name to the Borough Name when the Neighborhood is NOT ASSIGNED

In [10]:
# A postal code whose neighbourhood is "Not assigned" takes the name of its own
# borough. The borough is read from the same row rather than hard-coded, so this
# also works if more than one borough has an unassigned neighbourhood.
# Values are stripped because fields read from the "; "-separated file carry a
# leading space; for the Queen's Park row this yields "Queen's Park", as before.
unassigned = df_inner['Neighborhood'].str.strip() == 'Not assigned'
df_inner = df_inner.assign(
    Neighborhood=df_inner['Neighborhood'].mask(unassigned, df_inner['Borough'].str.strip())
)

In [11]:
# Sanity check: (rows, columns) of the one-row-per-postal-code table.
df_inner.shape

(103, 3)

## Adding Geographical Co-Ordinates of the Neighborhoods

In [12]:
# Read the coordinates table straight from the URL (requires network access).
# It is joined to df_inner on 'Postal Code' further down and supplies the
# Latitude / Longitude columns seen in the merged frame.
df_loc = pd.read_csv('https://cocl.us/Geospatial_data')

In [13]:
# Sanity check: (rows, columns) of the coordinates table, to compare with df_inner.shape.
df_loc.shape

(103, 3)

## Combining THE TWO DATAFRAMES

In [14]:
# Attach the coordinates to each postal code: inner join of the neighbourhood table
# with the coordinates table on their shared 'Postal Code' column.
df_final = df_inner.merge(df_loc, how='inner', on='Postal Code')

df_final.shape


(103, 5)

In [15]:
# Show a preview only: display.max_rows is set to None in the first cell, so a bare
# `df_final` would render every row of the table into the notebook.
df_final.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


### Clustering the Neighborhoods in Toronto with only Boroughs that Contain the Word TORONTO

In [16]:
# Select the boroughs whose name contains the literal text ' Toronto' (plain
# substring test, not a regular expression) and renumber the rows from zero.
in_toronto = df_final['Borough'].str.contains(' Toronto', regex=False)
toronto_data = df_final[in_toronto].reset_index(drop=True)
toronto_data.shape

(38, 5)

In [17]:
# Show a preview only: display.max_rows is set to None in the first cell, so a bare
# `toronto_data` would render every row of the table into the notebook.
toronto_data.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
8,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
9,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union S...",43.640816,-79.381752


## GETTING THE GEOGRAPHICAL CO-ORDINATES of TORONTO

In [18]:
# Look up the coordinates of Toronto with the Nominatim geocoder (network call);
# they are used to centre the map.
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; raise a clear error
    # instead of failing with an AttributeError on `location.latitude`.
    raise ValueError('Could not geocode address {!r}.'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


## VISUALIZING TORONTO and ITS NEIGHBORHOODS

In [19]:
# Build a map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Drop one blue circle marker per row of toronto_data; clicking a marker shows the
# neighbourhood name(s) for that postal code.
marker_rows = zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Neighborhood'],
)
for lat, lng, name in marker_rows:
    popup = folium.Popup(name, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Last expression of the cell: renders the map inline.
map_toronto