## This is Part 1 of the Capstone project

#### Importing libraries

In [1]:
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

#### Download the page HTML from the Wikipedia URL

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# NOTE: despite its name, `url` holds the page's HTML text, not the address;
# the name is kept because the parsing cell below reads it.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # without a timeout a stalled connection would hang the notebook
)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
url = response.text


#### Extract data from url with BeautifulSoup

In [3]:
# Parse the downloaded HTML (held in `url`) with the lxml parser.
# The parsed tree is not printed because the page is very large;
# run print(soup.prettify()) manually when it needs to be inspected.
soup = BeautifulSoup(markup=url, features='lxml')

#### Use BeautifulSoup to find the sortable table that holds the postal codes

In [5]:
# Look up the <table> whose class attribute is "wikitable sortable".
# It is not printed because it is large; run print(my_table) to verify it.
my_table = soup.find(name='table', attrs={'class': 'wikitable sortable'})

#### Create the dataframe columns 

In [6]:
# Start from an empty frame that only defines the three target columns.
neighbourhoods = pd.DataFrame(
    columns=[
        "PostalCode",
        "Borough",
        "Neighbourhood",
    ]
)

# Display the (still empty) frame to confirm the column layout.
neighbourhoods

Unnamed: 0,PostalCode,Borough,Neighbourhood


#### Fill dataframe with data from url, one row at a time

In [7]:
# Turn every table row after the header into a {column: cell text} record and
# build the DataFrame once at the end. The previous row-by-row
# DataFrame.append was removed in pandas 2.0, copied the whole frame on every
# iteration, and added duplicate rows whenever this cell was re-run.
column_names = ["PostalCode", "Borough", "Neighbourhood"]
records = []
for row in my_table.find_all('tr')[1:]:
    temp = [cell.text for cell in row.find_all('td')]
    print(temp)  # raw cell text, trailing "\n" included
    records.append(dict(zip(column_names, temp)))

# Passing `columns` keeps the column order fixed and still yields an empty,
# correctly-shaped frame when the table has no data rows.
neighbourhoods = pd.DataFrame(records, columns=column_names)


['M1A\n', 'Not assigned\n', 'Not assigned\n']
['M2A\n', 'Not assigned\n', 'Not assigned\n']
['M3A\n', 'North York\n', 'Parkwoods\n']
['M4A\n', 'North York\n', 'Victoria Village\n']
['M5A\n', 'Downtown Toronto\n', 'Regent Park, Harbourfront\n']
['M6A\n', 'North York\n', 'Lawrence Manor, Lawrence Heights\n']
['M7A\n', 'Downtown Toronto\n', "Queen's Park, Ontario Provincial Government\n"]
['M8A\n', 'Not assigned\n', 'Not assigned\n']
['M9A\n', 'Etobicoke\n', 'Islington Avenue, Humber Valley Village\n']
['M1B\n', 'Scarborough\n', 'Malvern, Rouge\n']
['M2B\n', 'Not assigned\n', 'Not assigned\n']
['M3B\n', 'North York\n', 'Don Mills\n']
['M4B\n', 'East York\n', 'Parkview Hill, Woodbine Gardens\n']
['M5B\n', 'Downtown Toronto\n', 'Garden District, Ryerson\n']
['M6B\n', 'North York\n', 'Glencairn\n']
['M7B\n', 'Not assigned\n', 'Not assigned\n']
['M8B\n', 'Not assigned\n', 'Not assigned\n']
['M9B\n', 'Etobicoke\n', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale\n']
['M

['M5X\n', 'Downtown Toronto\n', 'First Canadian Place, Underground city\n']
['M6X\n', 'Not assigned\n', 'Not assigned\n']
['M7X\n', 'Not assigned\n', 'Not assigned\n']
['M8X\n', 'Etobicoke\n', 'The Kingsway, Montgomery Road, Old Mill North\n']
['M9X\n', 'Not assigned\n', 'Not assigned\n']
['M1Y\n', 'Not assigned\n', 'Not assigned\n']
['M2Y\n', 'Not assigned\n', 'Not assigned\n']
['M3Y\n', 'Not assigned\n', 'Not assigned\n']
['M4Y\n', 'Downtown Toronto\n', 'Church and Wellesley\n']
['M5Y\n', 'Not assigned\n', 'Not assigned\n']
['M6Y\n', 'Not assigned\n', 'Not assigned\n']
['M7Y\n', 'East Toronto\n', 'Business reply mail Processing Centre, South Central Letter Processing Plant Toronto\n']
['M8Y\n', 'Etobicoke\n', "Old Mill South, King's Mill Park, Sunnylea, Humber Bay, Mimico NE, The Queensway East, Royal York South East, Kingsway Park South East\n"]
['M9Y\n', 'Not assigned\n', 'Not assigned\n']
['M1Z\n', 'Not assigned\n', 'Not assigned\n']
['M2Z\n', 'Not assigned\n', 'Not assigned\n']
[

#### Check the data in the DataFrame: first 5 rows

In [8]:
# Preview the first five scraped rows.
neighbourhoods.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


#### Data cleaning: drop rows whose Borough is "Not assigned", strip the trailing "\n" from every column, and combine neighbourhoods that share a postal code

In [9]:
text_columns = ["PostalCode", "Borough", "Neighbourhood"]

# Keep only postal codes that have an assigned borough. The explicit .copy()
# makes the result an independent frame, so the column assignments below do
# not write into a slice of the scraped data (SettingWithCopyWarning).
assigned = neighbourhoods.loc[neighbourhoods["Borough"] != "Not assigned\n"].copy()

# Remove the newline characters from every text column.
for column in text_columns:
    assigned[column] = assigned[column].replace({'\n': ''}, regex=True)
neighbourhoods = assigned

# One row per (PostalCode, Borough): neighbourhoods that share a postal code
# are joined with ", ". sort=False keeps the groups in first-seen order.
newDF = (
    neighbourhoods
    .groupby(['PostalCode', 'Borough'], sort=False)['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

# A postal code with a borough but no assigned neighbourhood takes the
# borough name as its neighbourhood.
newDF['Neighbourhood'] = np.where(
    newDF['Neighbourhood'] == 'Not assigned',
    newDF['Borough'],
    newDF['Neighbourhood'],
)


In [10]:
# Preview the first ten cleaned rows.
newDF.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### That's much better <br> Now check the data shape 

In [11]:
# (rows, columns) of the cleaned frame.
newDF.shape  

(103, 3)

In [12]:
# Report the number of distinct boroughs and the number of rows in newDF.
# dropna=False counts a missing value as its own entry, the same way
# len(Series.unique()) does.
print(
    'The dataframe has {} boroughs and {} neighborhoods.'.format(
        newDF['Borough'].nunique(dropna=False),
        len(newDF),
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


## This is Part 2 of the Capstone project

#### Importing the CSV file with latitudes and longitudes for the postal codes <br> Unfortunately the `geocoder.google` function didn't work, so the coordinates are read from this CSV instead

In [13]:
# Load the postal-code coordinates (Postal Code, Latitude, Longitude).
LatLong = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')

# Preview the first five rows.
LatLong.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merging our Dataframe with LatLong dataframe just created 

In [14]:
# Give the key column the same name as in newDF so both frames share it.
LatLong.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)

# Inner join on the postal code (inner is also pandas' default for merge).
NewDF1 = newDF.merge(LatLong, how='inner', on='PostalCode')

# Preview the merged frame.
NewDF1.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


#### Creating a DataFrame with only the boroughs whose name contains "Toronto"

In [15]:
# Keep the rows whose borough name contains the literal text "Toronto"
# (regex=False: plain substring match, no pattern interpretation).
NewDF2 = NewDF1.loc[
    NewDF1['Borough'].str.contains('Toronto', regex=False)
]

# Display the filtered frame.
NewDF2

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


#### Define Toronto coordinates 

In [16]:
from geopy.geocoders import Nominatim

# Address whose coordinates are used to centre the map.
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() can return None when the service finds no match; fail with a
# clear message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### With Folium visualize Toronto neighbourhoods on map

In [17]:
# folium must be installed in the kernel's environment (e.g. %pip install folium).
import folium

# Base map centred on the geocoded Toronto coordinates.
TorontoMap = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of NewDF2, with a "neighbourhood, borough" popup.
for lat, lng, borough, neighbourhood in zip(
    NewDF2['Latitude'], NewDF2['Longitude'], NewDF2['Borough'], NewDF2['Neighbourhood']
):
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html=True makes folium treat the label as text to escape, so
    # apostrophes in neighbourhood names do not break the generated page.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup argument, not a CircleMarker one, so the stray
    # parse_html=False that used to be passed here has been dropped.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(TorontoMap)

TorontoMap

## This is Part 3 of the Capstone project

#### Let's cluster the neighbourhoods

In [18]:
# importing Kmeans
from sklearn.cluster import KMeans

# Set number of clusters
clusters = 5

# Cluster on the coordinates only. The columns are selected by name because
# (a) DataFrame.drop no longer accepts the axis as a positional argument in
# pandas 2.0, and (b) dropping by exclusion would feed the 'Cluster Labels'
# column into k-means if this cell were re-run after the labels are inserted.
grouped_clustering = NewDF2[['Latitude', 'Longitude']]

# run k-means clustering; n_init=10 pins the number of restarts that older
# scikit-learn releases used by default, random_state=0 makes the run repeatable
kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=0).fit(grouped_clustering)

# check cluster labels
kmeans.labels_[0:10]

array([0, 0, 0, 0, 4, 0, 0, 3, 0, 1], dtype=int32)

In [19]:
# Attach the k-means label of each row as the first column.
# DataFrame.insert raises ValueError when the column already exists, so a
# previous 'Cluster Labels' column is removed first; errors='ignore' makes the
# drop a no-op on the first run. This keeps the cell safe to re-run.
NewDF2 = NewDF2.drop(columns='Cluster Labels', errors='ignore')
NewDF2.insert(0, 'Cluster Labels', kmeans.labels_)
NewDF2


Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighbourhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [20]:
import matplotlib.cm as cm
import matplotlib.colors as colors

#### Create new map to display clusters

In [21]:

# Centre the cluster map on the same geocoded coordinates as the first map
# instead of repeating them as hard-coded numbers.
MapClusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# One hex colour per cluster, evenly spaced over matplotlib's rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One circle marker per row of NewDF2, coloured by its cluster label.
for lat, lon, poi, cluster in zip(
    NewDF2['Latitude'], NewDF2['Longitude'], NewDF2['Neighbourhood'], NewDF2["Cluster Labels"]
):
    # Spaces around "Cluster" keep the neighbourhood name and the label apart.
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to clusters - 1, so they index the palette directly;
    # the former rainbow[cluster-1] only worked for cluster 0 through
    # negative-index wrap-around.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7,
    ).add_to(MapClusters)

MapClusters