# Segmenting and Clustering Neighborhoods in Toronto
## Website scraping exercise

### Installing and importing all the needed libraries

In [1]:
#Install the Beautiful Soup library for pulling data out of HTML and XML files
!conda install -c conda-forge beautifulsoup4 --yes

#Install the lxml’s HTML parser
!conda install -c conda-forge lxml --yes

#Install geopy
!conda install -c conda-forge geopy --yes 

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Solving environment: | ^C
failed

CondaError: KeyboardInterrupt



In [116]:
import time #import the time module (used to pause between geocoding retries)

import numpy as np #import the numpy library
import pandas as pd #import the pandas library
import requests #import the requests library
from bs4 import BeautifulSoup #import the Beautiful Soup library
from geopy.geocoders import Nominatim
from geopy.geocoders import Photon

### Scrape the information from the website using the BeautifulSoup library

In [3]:
#Specify the url of the website that will be explored
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Get the webpage's source code
#The timeout keeps the cell from hanging forever on a stalled connection, and
#raise_for_status() stops here on an HTTP error instead of parsing an error page
wiki_response = requests.get(wiki_url, timeout=30)
wiki_response.raise_for_status()
wiki_source = wiki_response.text

#Convert the webpage's source code into a BeautifulSoup object
wiki_soup = BeautifulSoup(wiki_source, 'lxml')

#Extract the table with the list of postal codes in Canada
wiki_table = wiki_soup.find('table', class_='wikitable sortable')

#find() returns None when nothing matches; fail now with a clear message rather
#than with an AttributeError in a later cell
if wiki_table is None:
    raise ValueError("No 'wikitable sortable' table found at " + wiki_url)

### Organize the information retrieved from the web in a DataFrame

In [4]:
#Build one list per table row (<tr>): split the row text on newlines, skip the
#empty '' pieces, and swap every "Not assigned" cell for NaN
table = [
    [np.nan if cell == "Not assigned" else cell
     for cell in table_row.text.split("\n")
     if cell != '']
    for table_row in wiki_table.find_all("tr")
]

#The first parsed row holds the column names; the remaining rows are the data
header = table[0]
records = table[1:]

#Turn the parsed rows into a Pandas Dataframe
PC_Canada_df = pd.DataFrame(records, columns=header)
PC_Canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### DataFrame conditioning (1/3) - Remove the "Not assigned" Boroughs

In [5]:
#Keep only the rows that have a Borough assigned, then renumber the index from 0
PC_Canada_df = (
    PC_Canada_df
    .dropna(subset=["Borough"])
    .reset_index(drop=True)
)
PC_Canada_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### DataFrame conditioning (2/3) - Replace the "Not assigned" Neighbourhoods with their Borough's name

In [6]:
#Identify the entries where there are Neighbourhoods not assigned
missing_neighbourhood = PC_Canada_df["Neighbourhood"].isna()
index_nan = PC_Canada_df.index[missing_neighbourhood]

#Assign its Borough name as Neighbourhood name
#Label-based .loc is used because index_nan holds index labels and the columns
#are addressed by name, so this does not depend on the index being 0..n-1 or on
#the column order
PC_Canada_df.loc[index_nan, "Neighbourhood"] = PC_Canada_df.loc[index_nan, "Borough"]
PC_Canada_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


### DataFrame conditioning (3/3) - Group the dataframe by Postcode

In [7]:
#Group the dataframe by Postcode (and its Borough), joining the Neighbourhood
#names of each group into one comma-separated string.
#A single groupby builds the keys and the joined names together, so the result
#does not rely on two separately derived frames lining up row by row.
#sort=False keeps the groups in order of first appearance.
PC_Canada_grouped_df = (
    PC_Canada_df
    .groupby(['Postcode', 'Borough'], sort=False)['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
PC_Canada_grouped_df.head(18)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Number of entries (rows) in the final DataFrame

In [8]:
#Report how many rows the grouped DataFrame has
n_entries = PC_Canada_grouped_df.shape[0]
print("Number of entries: ", n_entries, " datapoints", sep="")

Number of entries: 103 datapoints


### Getting the geographical data

In [115]:
#Geocode every Postcode with the Photon service
#(Nominatim(user_agent="toronto_explorer") was the alternative tried earlier)
geolocator = Photon()

#Retry policy: a lookup that returns None is retried a limited number of times,
#pausing between attempts, so an unresolvable postcode cannot loop forever or
#flood the service with requests
MAX_GEOCODE_ATTEMPTS = 5
GEOCODE_RETRY_DELAY_S = 1

lat_list = []
long_list = []
for postal_code in PC_Canada_grouped_df["Postcode"]:
    #Free-text query: "<postcode>, Toronto, Canada"
    query = postal_code+", Toronto, Canada"
    location = None
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        location = geolocator.geocode(query)
        if location is not None:
            break
        time.sleep(GEOCODE_RETRY_DELAY_S)
    if location is None:
        #Keep the lists aligned with the DataFrame rows: store NaN coordinates
        #for a postcode that could not be geocoded
        lat_list.append(np.nan)
        long_list.append(np.nan)
    else:
        lat_list.append(location.latitude)
        long_list.append(location.longitude)
    

In [117]:
#Attach the geocoded coordinates as two new columns (Latitude first, then
#Longitude); each list holds one value per row of the grouped DataFrame
for column_name, column_values in (('Latitude', lat_list), ('Longitude', long_list)):
    PC_Canada_grouped_df[column_name] = column_values
PC_Canada_grouped_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.740375,-79.321746
1,M4A,North York,Victoria Village,43.732658,-79.311189
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6514,-79.365837
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.722778,-79.450933
4,M7A,Queen's Park,Queen's Park,43.774391,-79.504811
