# Segmenting and Clustering Neighborhoods in Toronto

#### By Devin Arrants

## 1. Access data 

In [1]:
import numpy as np 
import pandas as pd
from urllib.request import urlopen

!conda install -c conda-forge bs4 --yes 
from bs4 import BeautifulSoup ##used to extract data from HTML files
print("Import Complete")

Solving environment: / ^C
failed

CondaError: KeyboardInterrupt

Import Complete


In [2]:
# Wikipedia page holding the table of Canadian postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Read the full response body inside a context manager so the HTTP connection is
# always closed. Keeping the raw bytes (instead of the one-shot response stream)
# also means the parsing cell can be re-run without fetching the page again.
with urlopen(url) as response:
    html = response.read()

In [3]:
# Parse the downloaded page with Python's built-in HTML parser
soup = BeautifulSoup(markup=html, features="html.parser")

## 2. Format the Data
In this step we extract the text of every table cell from the parsed page so that it can be cleaned and loaded into the data table in the following steps.

In [4]:
# Collect every <td> element found in the page. Despite the variable name,
# each entry is a single table cell, not a whole table row.
rows = soup.find_all(name="td")

In [5]:
# Turn the list of <td> tags into one string, then parse that string again so
# get_text() can drop the markup and leave only the text content.
cells_markup = str(rows)
cleantext = BeautifulSoup(cells_markup, "html.parser").get_text()

## 3. Clean and Format the Data
In this step we clean and reshape the text until it is in the correct format to be put into a pandas DataFrame.

In [6]:
 #slice the text to where the postal codes begin and where the postal codes end
# str.rfind returns -1 when a marker is missing, which would silently produce a
# wrong slice (for example when this cell is re-run on text that has already
# been trimmed). Validate both positions before slicing so the failure is loud.
table_start = cleantext.rfind("M1A")
table_end = cleantext.rfind(", \n, \n, \n")
if table_start == -1 or table_end == -1 or table_end < table_start:
    raise ValueError(
        "Could not locate the postal code table in the scraped text; "
        "re-run the notebook from the download step."
    )
cleantext = cleantext[table_start:table_end]

In [7]:
# Break the trimmed text into a flat list of values, cutting at every
# newline-followed-by-comma sequence
field_separator = "\n,"
liststr = cleantext.split(field_separator)

In [8]:
# Declare the three output columns and start from an empty frame with that schema
column_names = [
    'Postal Code',
    'Borough',
    'Neighborhood',
]
neighborhoods_df = pd.DataFrame(data=None, columns=column_names)

## Populate the data frame
This loop traverses the list three elements at a time until every element has been consumed.
In each group, the first element is always the postal code, the second element is always the borough name, and the third element is always the neighborhood name.
If no neighborhood is assigned to a particular borough, the list still contains a placeholder element at that position, so the groups of three stay aligned.
After a group has been read, the loop moves past those three elements so that no postal code is processed twice.

In [9]:
# Walk the flat list in groups of three (postal code, borough, neighborhood),
# collect one record per group, and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copied the whole frame on every iteration. The source list is left intact,
# so this cell gives the same result when it is re-run.
FIELDS_PER_ROW = 3

if len(liststr) % FIELDS_PER_ROW != 0:
    # A ragged list means the scraped text no longer has the expected layout;
    # fail with a clear message instead of an IndexError part-way through.
    raise ValueError(
        "Expected the scraped values to come in groups of {}, but got {} values.".format(
            FIELDS_PER_ROW, len(liststr)
        )
    )

records = []
for start in range(0, len(liststr), FIELDS_PER_ROW):
    postal, borough_name, neighborhood_name = liststr[start:start + FIELDS_PER_ROW]
    records.append({'Postal Code': postal,
                    'Borough': borough_name,
                    'Neighborhood': neighborhood_name})

# Passing the columns explicitly keeps the schema even when there are no records
neighborhoods_df = pd.DataFrame(records, columns=['Postal Code', 'Borough', 'Neighborhood'])
                                                

In [10]:
# Mark every cell that matches the 'Not assigned' pattern as missing. The
# replacement is applied to the whole frame, so it covers all columns, not
# only the borough column.
neighborhoods_df.replace(to_replace='Not assigned', value=np.nan, regex=True, inplace=True)
# Discard each row that now holds at least one missing value
neighborhoods_df.dropna(axis='index', inplace=True)

In [11]:
# Count the distinct boroughs and postal codes that remain after cleaning
borough_count = len(neighborhoods_df['Borough'].unique())
postal_code_count = len(neighborhoods_df['Postal Code'].unique())
summary_template = 'The dataframe has {} boroughs and {} postal codes.'
print(summary_template.format(borough_count, postal_code_count))

# Renumber the rows from zero; the old index values are discarded
neighborhoods_df.reset_index(drop=True, inplace=True)

# Author's note: rows sharing a postal code do not need to be merged because
# every postal code is unique (the Wikipedia table appears to have been updated).
# Likewise, no borough is left with a 'Not assigned' neighborhood, so that
# clean-up step is unnecessary.

The dataframe has 10 boroughs and 103 postal codes.


In [12]:
# Format the neighborhoods with commas: every "/" separator becomes ",".
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on neighborhoods_df['Neighborhood']: under pandas
# Copy-on-Write that in-place call modifies a temporary object and leaves the
# DataFrame unchanged.
neighborhoods_df['Neighborhood'] = neighborhoods_df['Neighborhood'].replace("/", ",", regex=True)
neighborhoods_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [13]:
# One row remains per postal code that has a borough, so the row count is the answer
assigned_postal_code_count = len(neighborhoods_df.index)
print("This is the number of postal codes that have a borough associated with it:", assigned_postal_code_count)

This is the number of postal codes that have a borough associated with it: 103
