# Capstone project
## Week 3 assignment: Segmenting and clustring neighberhoods in Toronto

## 1: Creating the dataframe from the wikipedia page for data using BeautifulSoup library

In [1]:
#Importing libraries to create our dataframe
import pandas as pd
import requests
from bs4 import BeautifulSoup

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_data=requests.get(url).text
soup=BeautifulSoup(html_data,'html5lib')

tables=soup.find_all('table')

for i,table in enumerate(tables):
    if ('Borough' in str(table)):
        table_index=i
    
toronto_data = pd.DataFrame(columns=["PostalCode", "Borough", "Neighborhood"])

for row in tables[table_index].tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        PostalCode =col[0].text.strip()
        Borough =col[1].text.strip()
        Neighborhood =col[2].text.strip()

        toronto_data = toronto_data.append({"PostalCode":PostalCode, "Borough":Borough, "Neighborhood":Neighborhood}, ignore_index=True)
        
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


we will examine the column 'Borough' and see how many 'Not assigned' values are present in the dataframe

In [2]:
toronto_data['Borough'].value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 4
Toronto/York         1
Mississauga          1
Name: Borough, dtype: int64

We have 77 value of 'Not assigned', we will drop the rows containing this value and re-examine our column 'Borough'

In [3]:
for i,line in enumerate(toronto_data['Borough']):
    if (line=='Not assigned'):
        toronto_data.drop(i,axis=0, inplace=True)
        
toronto_data['Borough'].value_counts()


North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
East York            5
York                 4
Toronto/York         1
Mississauga          1
Name: Borough, dtype: int64

We can see that we droped all rows containing 'Not assigned' value for the 'Borough' column

In [4]:
toronto_data.reset_index(drop=True, inplace=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Let's verify if there are any duplicated PostalCode in the column 'PostalCode' 

In [5]:
toronto_data.duplicated(keep=False, subset=['PostalCode']).value_counts()

False    103
dtype: int64

I didn't find any duplicated PostalCode

Let's verify if there are any 'Not Assigned' value in the 'Neighborhood' column

In [6]:
toronto_data['Neighborhood'].value_counts(ascending=False)

Downsview                                                                                                        4
Don Mills                                                                                                        2
The Danforth West, Riverdale                                                                                     1
Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West                   1
St. James Town, Cabbagetown                                                                                      1
                                                                                                                ..
The Annex, North Midtown, Yorkville                                                                              1
South Steeles, Silverstone, Humbergate, Jamestown, Mount Olive, Beaumond Heights, Thistletown, Albion Gardens    1
Bedford Park, Lawrence Manor East                                               

There is no 'Not assigned' value in the 'Neighberhood' column

In [7]:
toronto_data.shape

(103, 3)

Our dataframe 'Toronto_data' is ready for the next steps

## 2: Creating the dataframe using the latitude and longitude from the given csv file and merging it with the first dataframe

we will begin by reading the csv file from the given link and transform it into a dataframe

In [8]:
toronto_info = pd.read_csv("https://cocl.us/Geospatial_data")
toronto_info.shape

(103, 3)

In [9]:
toronto_info.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


we will merge the two dataframes by using the key=PostalCode. But we will first rename the column 'Postal Code' to 'PostalCode' in order to use this column as key for both dataframes

In [10]:
toronto_info.rename(columns={'Postal Code': 'PostalCode'}, inplace= True)
toronto_df = pd.merge(toronto_data, toronto_info, how="inner", on="PostalCode")

In [11]:
toronto_df.shape

(103, 5)

In [12]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


Now we have our dataframe with the 5 columns

## 3: Exploring and clustring neighberhoods in Toronto

we will use the geopy library to get the latitude and longitude of Toronto and then create a map with neighberhoods superimposed on top using Folium library

In [41]:
import json 
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium 
import numpy as np


In [27]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Toronto location: ', latitude, longitude)

Toronto location:  43.6534817 -79.3839347


In [30]:
# create map of Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10.5)

# add markers to map of neighberhoods

for lat, lng, label in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

we will explore neighberhoods using Foursquare. We first define our credentials and the version 

In [31]:
CLIENT_ID = '3C5MYXZLLSDM0NUINSM5VCD3UDR1FT4SH33HY05G1JH2DCLC' 
CLIENT_SECRET = 'NI1JABVHIRKNLLOD3AH3XAXRUYXC5B4XQV3SYORIWVRZLQVA' 
VERSION = '20180605' 
LIMIT = 100 

we will use the next function in order to explore all neigherhoods in Toronto and get the nearby venues

In [36]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
                                   latitudes=toronto_df['Latitude'],
                                   longitudes=toronto_df['Longitude']
                                  )

Let's see how many venues were returned for each neighberhood


In [37]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Alderwood, Long Branch",10,10,10,10,10,10
"Bathurst Manor, Wilson Heights, Downsview North",22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
"Bedford Park, Lawrence Manor East",23,23,23,23,23,23
...,...,...,...,...,...,...
"Willowdale, Willowdale East",34,34,34,34,34,34
"Willowdale, Willowdale West",5,5,5,5,5,5
Woburn,5,5,5,5,5,5
Woodbine Heights,5,5,5,5,5,5


we will group by Neighberhood and take the mean of frequency of occurence of each category and call the dataframe toronto_grouped that we will use for clustering

In [38]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Truck Stop,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


we will use the K-means to cluster our data in the folium map

In [55]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)
labels = kmeans.labels_
toronto_grouped_clustering["Labels"] = labels
toronto_grouped_clustering.head()

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Neighborhood'], toronto_grouped_clustering['Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
