# Toronto Neighborhood Web Scrapping

### Checking for the new libraries needed

- beautifulsoup4
- lxml
- request

In [1]:

!pip install beautifulsoup4
!pip install lxml
!pip install request

Collecting request
  Downloading https://files.pythonhosted.org/packages/f1/27/7cbde262d854aedf217061a97020d66a63163c5c04e0ec02ff98c5d8f44e/request-2019.4.13.tar.gz
Collecting get (from request)
  Downloading https://files.pythonhosted.org/packages/3f/ef/bb46f77f7220ac1b7edba0c76d810c89fddb24ddd8c08f337b9b4a618db7/get-2019.4.13.tar.gz
Collecting post (from request)
  Downloading https://files.pythonhosted.org/packages/0f/05/bd79da5849ea6a92485ed7029ef97b1b75e55c26bc0ed3a7ec769af666f3/post-2019.4.13.tar.gz
Collecting query_string (from get->request)
  Downloading https://files.pythonhosted.org/packages/12/3c/412a45daf5bea9b1d06d7de41787ec4168001dfa418db7ec8723356b119f/query-string-2019.4.13.tar.gz
Collecting public (from query_string->get->request)
  Downloading https://files.pythonhosted.org/packages/54/4d/b40004cc6c07665e48af22cfe1e631f219bf4282e15fa76a5b6364f6885c/public-2019.4.13.tar.gz
Building wheels for collected packages: request, get, post, query-string, public
  Building wheel

### Importing the libraries

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### Web Scrap phase

In [3]:
#getting the source code from the url
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [5]:
#set the html object
soup = BeautifulSoup(source,'lxml')


In [6]:
#Extracting the table from the source code
table=soup.find('table',class_='wikitable sortable')


In [7]:
#Creating a list to contain the text from the rows of the HTML table
row_test=[]
for row in table.find_all('tr'):
    row_test.append(row.text)


In [8]:
#Removing /n text 
row_test=[value.split('\n') for value in row_test]


In [9]:
#Creating the first df
df=pd.DataFrame(row_test[1:],columns=row_test[0])
#Reshapping the df
df=df[['Postcode','Borough','Neighborhood']]

### Data Wrangling phase 

In [10]:
#Setting the non assigned values as nan
df.replace("Not assigned",np.nan,inplace=True)

In [11]:
#Removing the nan values
df.dropna(inplace=True)

In [217]:
#Checking the amount of unique postcodes
postcode=list(df['Postcode'].unique())
len(postcode)

102

In [13]:
#Checking the amount of unique borough
borough=list(df['Borough'].unique())
len(borough)

10

In [27]:
#Taking in separate list all values from each postcode 
o_post=[]
o_boro=[]
o_neig=[]
for i in postcode:
    o_post.append(str(i))
    o_boro.append(list(df[df['Postcode']==str(i)].Borough))
    o_neig.append(list(df[df['Postcode']==str(i)].Neighborhood))

In [42]:
#Check if there are two borough in the same postcode
[set(sublist) for sublist in o_boro]

[{'North York'},
 {'North York'},
 {'Downtown Toronto'},
 {'North York'},
 {'Downtown Toronto'},
 {'Scarborough'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'North York'},
 {'Etobicoke'},
 {'Scarborough'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'York'},
 {'Etobicoke'},
 {'Scarborough'},
 {'East Toronto'},
 {'Downtown Toronto'},
 {'York'},
 {'Scarborough'},
 {'East York'},
 {'Downtown Toronto'},
 {'Downtown Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'West Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East York'},
 {'Downtown Toronto'},
 {'West Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East Toronto'},
 {'Downtown Toronto'},
 {'West Toronto'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East Toronto'},
 {'Downtown Toronto'},
 {'North York'},
 {'North York'},
 {'Scarborough'},
 {'North York'},
 {'North York'},
 {'East Toronto'},
 {'North Yor

In [43]:
#Saving the unique values of the borough instead of having a list of lists.
o_boro = [sublist[0] for sublist in o_boro]

In [106]:
#Removing the characters like '',[] and saving it in o_neig as a list
o_neig=[str(value).translate(str.maketrans("'"," ")).strip("[ ]") for value in o_neig]

In [209]:
#Building the final DF and removing the characters ""
df_final=pd.DataFrame({"Postal Code" : o_post, "Borough" : o_boro, "Neighborhood": o_neig})
df_final["Neighborhood"].replace('\"',"",regex=True,inplace=True)

In [191]:
df_final

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights , Lawrence Manor"
4,M7A,Downtown Toronto,Queen s Park
5,M1B,Scarborough,"Rouge , Malvern"
6,M3B,North York,Don Mills North
7,M4B,East York,"Woodbine Gardens , Parkview Hill"
8,M5B,Downtown Toronto,"Ryerson , Garden District"
9,M6B,North York,Glencairn


In [142]:
df_final.shape

(102, 3)

# Clustering the Toronto Neighborhoods

### Importing the libraries

In [143]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!pip install geopy 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!pip install folium
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 6.4MB/s eta 0:00:011
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.10.1
Folium installed
Libraries imported.


### Define Foursquare Credentials and Version

In [144]:
CLIENT_ID = 'QBNEH0A5SL5FIN0SD4MRT5NEIDQPSOTNWUWDACM0COSEIZWY' # your Foursquare ID
CLIENT_SECRET = 'GEHTZ4ZQJP3JLMCNKZDNC3DAYMYHD41L1SLM3QU10EW2GVJR' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: QBNEH0A5SL5FIN0SD4MRT5NEIDQPSOTNWUWDACM0COSEIZWY
CLIENT_SECRET:GEHTZ4ZQJP3JLMCNKZDNC3DAYMYHD41L1SLM3QU10EW2GVJR


In [183]:
address='M3A, Toronto, Ontario'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.653963 -79.387207


In [182]:
df_coord=pd.read_csv('http://cocl.us/Geospatial_data')
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [210]:
df_final_wcoord=pd.merge(df_final, df_coord, how = 'inner', on= "Postal Code")
df_final_wcoord.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen s Park,43.662301,-79.389494


### Cluster Neighborhoods

In [203]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(borough)

LabelEncoder()

In [212]:
df_final_wcoord['#Borough']=le.transform(df_final_wcoord['Borough'])
df_final_wcoord.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,#Borough
0,M3A,North York,Parkwoods,43.753259,-79.329656,6
1,M4A,North York,Victoria Village,43.725882,-79.315572,6
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1
3,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763,6
4,M7A,Downtown Toronto,Queen s Park,43.662301,-79.389494,1


In [214]:
le2 = preprocessing.LabelEncoder()
le2.fit(postcode)

LabelEncoder()

In [218]:
df_final_wcoord['#Postal Code']=le2.transform(df_final_wcoord['Postal Code'])
df_final_wcoord.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,#Borough,#Postal Code
0,M3A,North York,Parkwoods,43.753259,-79.329656,6,25
1,M4A,North York,Victoria Village,43.725882,-79.315572,6,34
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,53
3,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763,6,71
4,M7A,Downtown Toronto,Queen s Park,43.662301,-79.389494,1,85


In [219]:
#Import the model
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

df_final_wcoord_clustering = df_final_wcoord.drop(['Postal Code','Neighborhood','Borough'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_final_wcoord_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 4, 2, 1, 3, 0, 4, 4, 2, 1], dtype=int32)

In [223]:
df_final_wcoord.insert(0, 'Cluster Labels', kmeans.labels_)
df_final_wcoord.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude,#Borough,#Postal Code
0,4,M3A,North York,Parkwoods,43.753259,-79.329656,6,25
1,4,M4A,North York,Victoria Village,43.725882,-79.315572,6,34
2,2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,1,53
3,1,M6A,North York,"Lawrence Heights , Lawrence Manor",43.718518,-79.464763,6,71
4,3,M7A,Downtown Toronto,Queen s Park,43.662301,-79.389494,1,85


In [225]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_final_wcoord['Latitude'], df_final_wcoord['Longitude'], df_final_wcoord['Neighborhood'], df_final_wcoord['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters