# Week 3 Peer Graded Assignment:
# Segmenting and Clustering Neighborhoods in Toronto
### Aaron Armour

## Part 1 - scraping Wikipedia page to build a dataframe with postal code, borough and neighborhood

Import modules we will use

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup


Request the Wikipedia page, then patch the raw HTML so that one awkwardly formatted data item (the neighborhood cell in the row for postal code M5V) is processed correctly in a later step.

In [2]:
# URL of Wikipedia page with the table of data we will use
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use a timeout so this cell cannot hang forever on a stalled connection, and
# fail loudly on an HTTP error instead of parsing an error page as if it were the table.
response = requests.get(url, timeout=30)
response.raise_for_status()
webdata = response.content  # raw bytes, hence the bytes literals below

# Making replacements in the raw html to fix up the neighborhood data in the row for postal code M5V
# (removes the <pre> ... </pre> wrapper together with its adjacent newlines)
webdata = webdata.replace(b'\n<pre>', b'')
webdata = webdata.replace(b'</pre>\n', b'')


Use BeautifulSoup to parse the raw HTML and extract the table body (`<tbody>`) that holds the data.

In [3]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the parsed tree could differ
# from one machine to another. 'html.parser' ships with Python.
soup = BeautifulSoup(webdata, 'html.parser')

# Take the first <tbody> in the document
# NOTE(review): assumes the postal code table is the first table on the page - confirm.
table = soup.find('tbody')


With our BeautifulSoup object we can find the rows of the table and process these as described in the assignment instructions.

In [4]:
def process_row(row):
    """Pull the three cell values out of one table row.

    Parameters:
        row: an object whose find_all('td') returns the row's cells, each cell
             exposing its children through a `contents` list.

    Returns:
        A tuple (postal_code, borough, neighborhoods). Each value has trailing
        whitespace removed, and every ' / ' separator in the neighborhoods
        text is turned into ', '.

    Raises:
        AssertionError: if the row does not have exactly three cells, or if
        any cell does not hold exactly one item.
    """
    cells = []
    for cell in row.find_all('td'):
        cells.append(cell.contents)

    # Expect 3 cells (some might just be a '\n'), each holding a single item
    assert len(cells) == 3
    assert all(len(cell) == 1 for cell in cells)

    postal_code = cells[0][0].rstrip()
    borough = cells[1][0].rstrip()
    neighborhoods = cells[2][0].rstrip().replace(' / ', ', ')
    return (postal_code, borough, neighborhoods)

# Build the list of (postal code, borough, neighborhood) records.
data = []

# Look only at the direct <tr> children of the table. Iterating over
# table.children would also count text nodes (e.g. whitespace between rows),
# so "skip index 0" would not reliably skip the heading row.
for row in table.find_all('tr', recursive=False)[1:]:  # [1:] skips the row with the table headings
    postalCode, borough, neighborhood = process_row(row)

    if borough == 'Not assigned':
        # Ignore postal codes that have no borough
        continue

    if neighborhood == 'Not assigned':
        # A borough without a listed neighborhood uses the borough name
        neighborhood = borough

    data.append((postalCode, borough, neighborhood))


Create a Pandas DataFrame from the list of data created above.

In [5]:
# One row per record collected above; columns follow the order of each tuple
df = pd.DataFrame(
    data=data,
    columns=['PostalCode', 'Borough', 'Neighborhood'],
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Use the shape attribute to find out the number of rows and columns in our DataFrame df.

In [6]:
# (number of rows, number of columns) of df
df.shape

(103, 3)

## Part 2 - obtaining geographic coordinates for the neighborhoods

In [7]:
#!pip install geocoder
import geocoder

In [8]:
# Probe the geocoder with a single postal code and print whatever coordinates come back
g = geocoder.google('m3a, toronto, ontario')
print(g.latlng)

None


In [None]:
# Example given in assignment description, with the retry loop capped so that
# this cell cannot spin forever when the geocoder never returns coordinates.

postal_code = 'M5G'
# initialize your variable to None
lat_lng_coords = None

# try a bounded number of times to get the coordinates
max_attempts = 10
for attempt in range(max_attempts):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    if lat_lng_coords is not None:
        break

# only index into the result when a lookup actually succeeded
if lat_lng_coords is not None:
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
else:
    latitude = longitude = None

In [9]:
from time import sleep

def get_lat_long(postal_code, max_attempts=50, pause_time=0.05):
    """Look up coordinates for a postal code, retrying a limited number of times.

    Queries geocoder.google with '<postal_code>, Toronto, Ontario' until the
    result's `latlng` is not None or `max_attempts` lookups have been made.
    The pause happens only between attempts, so a successful lookup returns
    immediately and no time is wasted after the final failed attempt.

    Parameters:
        postal_code: value substituted into the query string.
        max_attempts: maximum number of lookups to make.
        pause_time: seconds to wait before each retry.

    Returns:
        A tuple of the first two items of `latlng` (presumably latitude then
        longitude - the order is decided by the geocoder), or None if no
        attempt produced coordinates.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    attempt = 0

    while attempt < max_attempts:
        if attempt > 0:
            # Back off only before a retry, never after a success
            sleep(pause_time)

        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return (lat_lng_coords[0], lat_lng_coords[1])

        attempt += 1

    return None


In [10]:
# Try the retrying helper for postal code M5G, using its default max_attempts and pause_time
latlong = get_lat_long('M5G')

In [11]:
# True here means that no coordinates were obtained for M5G
latlong is None

True

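The geocoder approach is not working at all: every lookup comes back as `None`, even with retries. Instead, we download a CSV file of postal code coordinates and use that.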

In [None]:
# Download the postal code coordinates and save them locally as geospatial_data.csv
!wget -O geospatial_data.csv https://cocl.us/Geospatial_data

In [12]:
# Load the local coordinates file into a DataFrame
geo_df = pd.read_csv('geospatial_data.csv')

In [13]:
# Preview the first rows of the coordinates table
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Give the postal code column the same name as in df so the two frames can be merged on it.
# Rebinding the result (rather than inplace=True) keeps this cell an explicit assignment.
geo_df = geo_df.rename(columns={'Postal Code': 'PostalCode'})

In [15]:
# Preview again to confirm the column is now named PostalCode
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# TODO: Might want to just assign this to the DataFrame df

# Attach Latitude/Longitude to each postal code. This is an inner join (the
# default, spelled out here), so only postal codes present in both frames are kept.
new_df = df.merge(geo_df, how='inner', on='PostalCode')
new_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [17]:
# (number of rows, number of columns) of the merged DataFrame
new_df.shape

(103, 5)

## Part 3 - clustering and analysis of neighborhoods in Toronto