# Segmenting and Clustering Neighborhoods in Toronto
In this assignment, we will be exploring, segmenting, and clustering the neighborhoods in the city of Toronto.

## Part 1: Scraping data from Wikipedia webpage

First we will be installing and importing the required libraries

In [None]:
# Install the third-party packages this notebook imports. `--prefix {sys.prefix}`
# points conda at the environment of the running kernel's interpreter, so the
# packages land where the import cell below will look for them.
import sys
# HTML parsing stack: BeautifulSoup plus the lxml and html5lib parser packages.
!conda install --yes --prefix {sys.prefix} beautifulsoup4
!conda install --yes --prefix {sys.prefix} lxml
!conda install --yes --prefix {sys.prefix} html5lib
# HTTP client used to download the Wikipedia page.
!conda install --yes --prefix {sys.prefix} requests
# folium (pinned to 0.5.0) and geopy are requested from the conda-forge channel.
# NOTE(review): `--yes` is passed twice on the next two lines; the second one is redundant.
!conda install --yes --prefix {sys.prefix} -c conda-forge folium=0.5.0 --yes
!conda install --yes --prefix {sys.prefix} -c conda-forge geopy --yes
print("everything is installed now...")


In [1]:
from bs4 import BeautifulSoup
import requests
import lxml
import pandas as pd
import numpy as np
import folium
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

# to enable autocomplete in the notebook
%config IPCompleter.greedy=True 

Getting the webpage source from Wikipedia

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
html_page = requests.get(url, timeout=30)
# Stop here with a clear HTTP error instead of silently parsing an error page
# (4xx/5xx responses) in the cells that follow.
html_page.raise_for_status()

Parse the HTML content of the webpage using BeautifulSoup. <br>
Then extract the `<table>` tag that contains the neighbourhood data.

In [3]:
# Build the parse tree from the raw response bytes with the lxml parser.
soup = BeautifulSoup(html_page.content, features='lxml')
# Take the first <table> element found in the document.
table = soup.find('table')
# Print the table markup, indented one tag per line, for inspection.
print(table.prettify())

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postcode
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighbourhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Parkwoods" title="Parkwoods">
     Parkwoods
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    <a href="/wiki/North_York" title="North York">
     North York
    </a>
   </td>
   <td>
    <a href="/wiki/Victoria_Village" title="Victoria Village">
     Victoria Village
    </a>
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    <a href="/wiki/Downtown_Toronto" title="Downtown Toronto">
     Downtown Toronto
    </a>
   </td>
   <td>
    <a href="

Extract table headers from the table and put them in a list

In [4]:
# Collect the text of every <th> cell of the table as the column names.
headers = table.find_all('th')
# Clean every header the same way (remove newlines, trim surrounding
# whitespace) instead of patching only the third one by position, so the cell
# no longer depends on the table having exactly three headers.
headers_list = [th.text.replace('\n', '').strip() for th in headers]
print(headers_list)

['Postcode', 'Borough', 'Neighbourhood']


Extract the rows from the table (without the headers) by putting them in a list (every row as a list element): <br>
- first, we convert the table data to a list that contains every row in the table as an element
- then we extract the cells of each row one by one and append them, as a list, to an initially empty list
- finally, we convert the resulting list of lists to a dataframe

In [None]:
# All <tr> rows of the table. The first row holds the <th> header cells, so it
# is skipped; slicing (unlike `del content[0]`) also tolerates an empty table.
content = table.find_all('tr')[1:]

# list of neighbourhoods: one list of cleaned cell texts per table row
l = []

for tr in content:
    # text of every <td> cell of the row, without newlines or surrounding whitespace
    cells = [td.text.replace('\n', '').strip() for td in tr.find_all('td')]
    # Skip rows whose number of cells does not match the header (e.g. rows
    # without <td> cells); they would otherwise break the DataFrame construction.
    if len(cells) != len(headers_list):
        continue
    l.append(cells)

# The per-row debug prints were removed: printing the whole accumulated list on
# every iteration produced output that grew quadratically with the table size.
df_nbrs = pd.DataFrame(l, columns=headers_list)
print(df_nbrs.shape)
df_nbrs.head(30)

Ignore cells with a borough that is <b>"Not assigned"</b>

In [None]:
# Index labels of the rows whose Borough is the placeholder "Not assigned".
unassigned_rows = df_nbrs.loc[df_nbrs['Borough'].eq("Not assigned")].index
# Remove those rows in place, then renumber the remaining rows from 0 without
# keeping the old index as a column.
df_nbrs.drop(index=unassigned_rows, inplace=True)
df_nbrs.reset_index(drop=True, inplace=True)
print(df_nbrs.shape)
df_nbrs.head(30)

In the next cell, I am grouping the dataframe by the Postcode column to join the Neighbourhoods that have the same Postcode

In [None]:
# For every Postcode, join the values of each remaining column into one
# comma-separated string.
grouped = df_nbrs.groupby("Postcode").agg([','.join])
# Aggregating with a list of functions yields two-level column labels; drop the
# inner level first, then turn the Postcode index back into a regular column.
final_df = grouped.droplevel(1, axis=1).reset_index()
final_df.head(20)


In the next cell I am removing the duplicates in every row in Borough column in the previous result

In [None]:
# After the join above, Borough holds the same name repeated once per
# neighbourhood ("X,X,X"); keep only the part before the first comma.
# This is done as a single whole-column assignment: the previous chained form
# `final_df["Borough"].iloc[i] = ...` triggers SettingWithCopyWarning and does
# not modify `final_df` at all when pandas Copy-on-Write is active.
final_df["Borough"] = final_df["Borough"].str.split(',').str[0]
final_df.head(20)

Handling cases where Neighbourhood column is Not assigned

In [None]:
# Rows whose Neighbourhood is the placeholder "Not assigned" take the name of
# their Borough instead.
# A boolean mask with `.loc` updates `final_df` directly; the previous chained
# `final_df["Neighbourhood"].iloc[i] = ...` assignment is not guaranteed to write
# back to the frame and mixed index labels with positional `.iloc` access.
unassigned = final_df["Neighbourhood"] == "Not assigned"
final_df.loc[unassigned, "Neighbourhood"] = final_df.loc[unassigned, "Borough"]
# showing the example mentioned (9th row in wikipedia page)
final_df[final_df["Postcode"]=="M7A"]

Showing that there are no rows with a "Not assigned" Borough

In [None]:
# Select the rows whose Borough is still the "Not assigned" placeholder.
final_df.loc[final_df["Borough"].eq("Not assigned")]

Showing that there are no duplicates in the Postcode column, meaning that all Neighbourhoods with the same Postcode were combined into one row

In [None]:
# Select the rows whose Postcode already appeared in an earlier row.
final_df[final_df.duplicated(subset="Postcode")]

Showing that there are no rows with a "Not assigned" Neighbourhood

In [None]:
# Select the rows whose Neighbourhood is still the "Not assigned" placeholder.
final_df.query('Neighbourhood == "Not assigned"')

In [None]:
# Display the (rows, columns) dimensions of the cleaned dataframe.
final_df.shape

## Part 2: Adding geographical coordinates


Reading the CSV file that contains the coordinates of the neighbourhoods

In [None]:
# Load the coordinates table straight from the remote CSV file.
coord_df = pd.read_csv(filepath_or_buffer="https://cocl.us/Geospatial_data")
# Preview the first five rows.
coord_df.head(5)

Rename the "Postal Code" column to "Postcode" so that we could merge the coordinates data with the dataframe that contains the neighbourhoods extracted in Part 1

In [None]:
# Give the key column the same name ("Postcode") as in the neighbourhood
# dataframe so the two tables can be merged on it.
coord_df = coord_df.rename(columns={'Postal Code': 'Postcode'})
coord_df.head(5)

Merging the coordinates in the dataframe that contains the neighbourhoods extracted in Part 1

In [None]:
# Left join on Postcode: every row of final_df is kept and receives the
# columns of the matching coord_df row.
combined_df = final_df.merge(coord_df, how='left', on='Postcode')
combined_df.head(5)


The next cell checks that the merge was done correctly by testing one of the examples mentioned in the assignment.

In [None]:
# Show the merged row(s) for postcode M5A.
combined_df.loc[combined_df["Postcode"].eq("M5A")]

## Part 3: Explore and Cluster the data

Preview the combined dataframe that contains neighbourhood data with coordinates data 

In [None]:
# First five rows of the merged neighbourhood + coordinates dataframe.
combined_df.head(5)

Get the geographical coordinates (latitude and longitude) of Toronto

In [None]:
# Look up the latitude/longitude of the city; they are used to centre the map.
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
# An explicit timeout avoids spurious failures when the geocoding service is slow.
location = geolocator.geocode(address, timeout=10)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute accesses below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

Display the map of Toronto with the neighbourhoods circled on it <br>
Neighbourhoods that belong to the same borough have the same color

In [None]:
# Base map centred on the coordinates obtained from the geocoder.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One colour per borough, sampled evenly from the rainbow colormap.
unique_borough = combined_df.Borough.unique()
kclusters = len(unique_borough)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Borough -> colour lookup, built once instead of searching `unique_borough`
# twice for every marker. `pos - 1` keeps the colour assignment this cell has
# always produced (position 0 wraps around to the last colour); every borough
# still gets its own distinct colour.
borough_colors = {borough: rainbow[pos - 1] for pos, borough in enumerate(unique_borough)}

# The coordinates come from a left merge, so a postcode without a match has NaN
# latitude/longitude; such rows cannot be placed on the map and are skipped.
located_df = combined_df.dropna(subset=['Latitude', 'Longitude'])

# add markers to map
for lat, lng, borough, neighborhood in zip(located_df['Latitude'], located_df['Longitude'], located_df['Borough'], located_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    marker_color = borough_colors[borough]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)

map_Toronto