# Segmenting and Clustering Neighborhoods in Toronto (Part 1, Part 2, and Part 3)
## In this assignment, I explore, segment, and cluster the neighborhoods in the city of Toronto.

# PART 1 (Dataframe 1)

In [28]:
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
from bs4 import BeautifulSoup # library for pulling data out of HTML and XML files
import requests # library to handle requests

# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of parsing an error page.
wikipedia_link = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
wikipedia_link.raise_for_status()

# creating soup object
soup = BeautifulSoup(wikipedia_link.text, 'lxml')

# Locate the element carrying the 'wikitable' class and fail loudly if the
# page layout no longer contains one.
table = soup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' found on the page")

# Turn every table row into a list of its cell texts (trailing whitespace
# stripped). The first row holds the column headers; the rest are data rows.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]
columns = rows[0] if rows else []
data = rows[1:]

# transforming the data into a pandas dataframe
pd.DataFrame(data=data, columns=columns).head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Queen's Park,Not assigned


In [29]:
# Build the full frame from the scraped rows, then keep only the rows whose
# Borough is something other than 'Not assigned'
table = pd.DataFrame(data=data, columns=columns)
has_borough = table['Borough'].ne('Not assigned')
table = table.loc[has_borough]
table.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [30]:
# Drop every row in which either the Borough or the Neighbourhood is 'Not assigned'
unassigned = table[['Borough', 'Neighbourhood']].eq('Not assigned').any(axis=1)
table = table.loc[~unassigned]
table.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North
14,M4B,East York,Woodbine Gardens


In [31]:
# Collapse rows that share a Postcode and Borough into a single row whose
# Neighbourhood is the comma-separated list of every neighbourhood in the group
df = (
    table
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ", ".join(names.astype(str)))
    .reset_index()
)

# Shuffle the rows. The fixed seed makes the row order (and therefore every
# preview shown afterwards) identical on each run of the notebook.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M4J,East York,East Toronto
1,M1W,Scarborough,L'Amoreaux West
2,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
3,M5M,North York,"Bedford Park, Lawrence Manor East"
4,M1V,Scarborough,"Agincourt North, L'Amoreaux East, Milliken, St..."
5,M2J,North York,"Fairview, Henry Farm, Oriole"
6,M6L,North York,"Downsview, North Park, Upwood Park"
7,M4L,East Toronto,"The Beaches West, India Bazaar"
8,M4P,Central Toronto,Davisville North
9,M6N,York,"The Junction North, Runnymede"


In [32]:
# (rows, columns) of the grouped postcode frame
df.shape

(102, 3)

# PART 2 (Dataframe 2)

In [27]:
# Attach latitude/longitude to every postcode by joining against the
# geospatial CSV, rather than pre-filling string '0' placeholders and
# overwriting them cell by cell in a nested loop.
link2 = pd.read_csv('https://cocl.us/Geospatial_data')

# The CSV is addressed by position, so give its first three columns explicit
# names. If a postcode appears more than once, keep its last row.
# NOTE(review): assumes the column order is postcode, latitude, longitude — confirm.
coordinates = link2.iloc[:, :3].copy()
coordinates.columns = ['Postcode', 'Latitude', 'Longitude']
coordinates = coordinates.drop_duplicates(subset='Postcode', keep='last')

# The left join keeps every row of df in its current order; a postcode that is
# missing from the CSV gets NaN coordinates. Dropping any existing coordinate
# columns first makes this cell safe to re-run.
df = (
    df
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(coordinates, on='Postcode', how='left')
)

# checking the dataframe
df.head(12)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M2N,North York,Willowdale South,43.7701,-79.4085
1,M7A,Downtown Toronto,Queen's Park,43.6623,-79.3895
2,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.7574,-79.2733
3,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol...",43.6435,-79.5772
4,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.6289,-79.3944
5,M3C,North York,"Flemingdon Park, Don Mills South",43.7259,-79.3409
6,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111,-79.2846
7,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7185,-79.4648
8,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
9,M4N,Central Toronto,Lawrence Park,43.728,-79.3888


# PART 3 (Explore and cluster the neighborhoods in Toronto)

In [43]:
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# getting latitude and longitude coordinates for Toronto
address = 'Toronto'

# Nominatim requires an application-specific user agent; constructing it
# without one is rejected by current geopy releases.
geolocator = Nominatim(user_agent='toronto_neighborhood_explorer')
location = geolocator.geocode(address)

# geocode() yields None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [None]:
import folium  # map rendering library; not imported by any earlier cell

# creating map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# adding one marker per row of df (the postcode frame that carries the
# Latitude/Longitude columns), with the neighbourhood name(s) as the popup
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)

map_Toronto