# **Segmenting and Clustering Neighborhoods in Toronto**

**Installing and importing the required dependencies:**

In [5]:
import numpy as np
import pandas as pd
!pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests
import urllib.request
from bs4 import SoupStrainer



**Parsing the HTML document with BeautifulSoup:**

In [6]:
# Fetch the Wikipedia page that lists the Canadian postal codes starting with "M".
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(URL, timeout=30)
# Stop on an HTTP error (4xx/5xx) instead of parsing an error page that has no table.
r.raise_for_status()
html_content = r.text
# Restrict parsing to <table> elements; nothing else on the page is used below.
only_table_tags = SoupStrainer("table")
tables = BeautifulSoup(html_content, "html.parser", parse_only=only_table_tags).find_all("table")
if not tables:
    # Give a readable error rather than the IndexError that tables[0] would raise.
    raise ValueError(f"No <table> element found at {URL}")
soup = tables[0] # keeps only the first table of the page
print(soup.prettify()) # prints the organized tree

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postal Code
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighborhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    North York
   </td>
   <td>
    Parkwoods
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    North York
   </td>
   <td>
    Victoria Village
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Regent Park, Harbourfront
   </td>
  </tr>
  <tr>
   <td>
    M6A
   </td>
   <td>
    North York
   </td>
   <td>
    Lawrence Manor, Lawrence Heights
   </td>
  </tr>
  <tr>
   <td>
    M7A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Queen's Park, Ontario Provincial Government
   </td>
  </tr>
  <tr>
   <td>
    M8A
   </td>
   <td>
    Not assigned

**Creating a pandas DataFrame:**

In [7]:
# Start from an empty frame that only declares the three expected columns.
column_names = ['Postal Code', 'Borough', 'Neighborhood']
new_table = pd.DataFrame(columns=column_names)
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood


**Filling the DataFrame with the data from the BeautifulSoup object:**

In [8]:
# Collect every data row of the table first and build the frame in a single step.
# DataFrame.append was removed in pandas 2.0, and appending one row at a time
# copies the whole frame on every iteration.
column_names = ['Postal Code', 'Borough', 'Neighborhood']
records = []
for row in soup.find_all('tr'): # loops through all rows of the parsed table
    cells = row.find_all('td')
    if not cells: # the header row holds only <th> cells, so it carries no data
        continue
    # Cell text with newlines and surrounding whitespace removed.
    values = [cell.get_text().replace('\n', '').strip() for cell in cells]
    if len(values) > len(column_names):
        raise ValueError(
            f"Row has {len(values)} cells, expected at most {len(column_names)}: {values}"
        )
    # Pad a short row with empty strings so every record has one value per column.
    values.extend([''] * (len(column_names) - len(values)))
    records.append(values)
# One row per <tr> that contained data cells, indexed 0..n-1.
new_table = pd.DataFrame(records, columns=column_names)
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


**Removing rows which contain a "Not assigned" borough value:**

In [9]:
# Keep only the rows whose borough is something other than "Not assigned".
has_borough = new_table['Borough'].ne('Not assigned')
new_table = new_table.loc[has_borough]
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Business reply mail Processing Centre
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


**Counting the distinct postal codes to make sure none is repeated in different rows (the count should equal the number of rows):**

In [10]:
# Number of distinct postal codes; dropna=False counts missing values too,
# matching what len(Series.unique()) reports.
new_table['Postal Code'].nunique(dropna=False)

103

**Making sure there aren't neighborhoods with "Not assigned" or "" values:**

In [11]:
# Select the rows whose neighborhood is the literal "Not assigned";
# an empty result means there are none left.
not_assigned_mask = new_table['Neighborhood'].eq('Not assigned')
new_table.loc[not_assigned_mask]

Unnamed: 0,Postal Code,Borough,Neighborhood


In [12]:
# Select the rows whose neighborhood is an empty string;
# an empty result means every remaining row names a neighborhood.
empty_name_mask = new_table['Neighborhood'].eq('')
new_table.loc[empty_name_mask]

Unnamed: 0,Postal Code,Borough,Neighborhood


**The organized dataframe looks like:**

In [13]:
# Renumber the rows from 0 and discard the old index, which has gaps
# where rows were filtered out.
new_table = new_table.reset_index(drop=True)
new_table

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing Centre
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
