<a href="https://colab.research.google.com/github/1jlal/Coursera_Capstone/blob/master/Segmenting_and_Clustering_Toronto_City_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Segmenting and Clustering Toronto Neighborhoods Data**
---



# _ Part 1 _



In [1]:
import requests
from bs4 import BeautifulSoup as bs
import numpy as np

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium --yes
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Scraping Toronto postal codes table from wikipedia page using BeautifulSoup library and converting it into a pandas DataFrame

In [4]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
webpage = bs(r.content)

table = webpage.select('table.wikitable')[0]
# print(table)
columns = table.find_all('th')
column_names = [str(c.string).strip() for c in columns]
# print(column_names)

l = []
table_rows = table.find('tbody').find_all('tr')

for tr in table_rows:
    td = tr.find_all('td')
    row = [str(tr.string).strip() for tr in td]
    l.append(row)
# print(l[0:10])

df = pd.DataFrame(l, columns=column_names)

df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [5]:
df = df.drop(index=[0], axis=1)
df.reset_index(drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Removing all Boroughs from dataframe with no labels

In [8]:
df = df[df['Borough'] != 'Not assigned']

df[df['Neighbourhood'] == 'Not assigned'].count()

df.head()
df.shape

(103, 3)

# _ Part 2 _


Using geocoder to obtain coordinates of the postal code doesn't function accurately. So we will skip this code and use the csv file instead

In [12]:
# '''
# import geocoder
# latitude=[]
# longitude=[]
# for code in df_new['Postal Code']:
#     g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
#     print(code, g.latlng)
#     while (g.latlng is None):
#         g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
#         print(code, g.latlng)
#     latlng = g.latlng
#     latitude.append(latlng[0])
#     longitude.append(latlng[1])
    
    
# coord_data = [latitude, longitude] 
# coord_labels = ['Latitude', 'Longitude']
# coord_df = pd.DataFrame(coord_data).T
# coord_df.columns = coord_labels
# coord_df.head()


# df_cnd = pd.concat([df_new, coord_df], axis=1)
# df_cnd

# '''

Extracting the geo coordinates from the csv file into a dataframe

In [13]:
# reading csv file into dataframe

url='http://cocl.us/Geospatial_data'

df_coord =pd.read_csv(url)
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Sorting data by postal code to merge the two dataframes together

In [15]:
df = df.sort_values('Postal Code')
hoods = pd.merge(df, df_coord, how='right', on='Postal Code')
hoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [16]:
hoods.shape

(103, 5)

# _ Part 3 _

Checking the number of Boroughs and neighborhoods

In [17]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(hoods['Borough'].unique()),
        hoods.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


Use geopy library to get the latitude and longitude values of Toronto, Canada.


In [18]:
address = 'Toronto, CN'

geolocator = Nominatim(user_agent="CN_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6425637, -79.38708718320467.


Creating a map of Toronto with neighborhoods superimposed on top using Folium

In [23]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

#adding markers to map
for lat, lng, borough, neighbourhood in zip(hoods['Latitude'], hoods['Longitude'], hoods['Borough'], hoods['Neighbourhood']):
     label = f'{neighbourhood}, {borough}'
     label = folium.Popup(label, parse_html=True)
     folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto