# Week 3 Assignment
## Segmenting and Clustering Neighborhoods in Toronto

#### Scrape neighbourhoods from Wikipedia 

In [1]:
import pandas as pd # library for data analysis
import numpy as np
import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML documents

In [2]:
# Scrape the Toronto ("M") postal-code table from Wikipedia into the dataframe `df`.
from io import StringIO  # local import: wraps the HTML markup for pd.read_html

wikiurl="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# NOTE(review): table_class is not used by the lookup below (which matches on
# "wikitable" only); it is kept in case another cell refers to it.
table_class="wikitable sortable jquery-tablesorter"
# a timeout keeps the cell from hanging indefinitely on a stalled connection
response=requests.get(wikiurl, timeout=30)
print("Wikipedia response status code: ", response.status_code)
# stop here on an HTTP error instead of trying to parse an error page below
response.raise_for_status()

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
TorontoTable=soup.find('table',{'class':"wikitable"})
# soup.find returns None when nothing matches; without this guard the string
# "None" would be handed to pd.read_html and fail with an unrelated error
if TorontoTable is None:
    raise ValueError("No table with class 'wikitable' found at " + wikiurl)

# pd.read_html returns a list of dataframes; only one table is passed in, so
# the wanted frame is element 0. The markup is wrapped in StringIO because
# passing a literal HTML string is deprecated in newer pandas releases.
tables=pd.read_html(StringIO(str(TorontoTable)))
df=tables[0].rename(columns = {'Postal Code':'PostalCode'})
print(df.head())

Wikipedia response status code:  200
  PostalCode           Borough              Neighbourhood
0        M1A      Not assigned               Not assigned
1        M2A      Not assigned               Not assigned
2        M3A        North York                  Parkwoods
3        M4A        North York           Victoria Village
4        M5A  Downtown Toronto  Regent Park, Harbourfront


In [3]:
# Keep only the rows whose borough has actually been assigned.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough]

# Sanity checks: report whether any placeholder neighbourhoods or repeated
# postal codes remain after the filter.
unassigned_left = bool((df['Neighbourhood'] == 'Not assigned').any())
duplicates_left = bool(df['PostalCode'].duplicated().any())
print("Any unassigned Neighbourhoods left? ", unassigned_left)
print("Any duplicated PostalCodes left? ", duplicates_left)

# Renumber the rows from zero, discarding the old index.
df = df.reset_index(drop=True)

print(df.head())

Any unassigned Neighbourhoods left?  False
Any duplicated PostalCodes left?  False
  PostalCode           Borough                                Neighbourhood
0        M3A        North York                                    Parkwoods
1        M4A        North York                             Victoria Village
2        M5A  Downtown Toronto                    Regent Park, Harbourfront
3        M6A        North York             Lawrence Manor, Lawrence Heights
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government


In [4]:
# Report the frame's dimensions as a (rows, columns) tuple.
print("Dataframe rows and columns:", (len(df.index), len(df.columns)))

Dataframe rows and columns: (103, 3)


#### Add latitudes and longitudes to the neighbourhoods

In [5]:
### Attempted to get lat & long using geocoder, but it returned no results; the attempt is kept below, commented out, for reference

##conda install -c conda-forge geocoder
#import geocoder # import geocoder
#
## define function that gets lat&long for a postal code
#def get_latlong(postalcode):
#    # initialize your variable to None
#    lat_lng_coords = None
#    
#    # loop until you get the coordinates
#    print ("Working on post code: ", postalcode)
#    while(lat_lng_coords is None):
#        print ("Trying geocoder")
#        g = geocoder.google('{}, Toronto, Ontario'.format(postalcode))
#        lat_lng_coords = g.latlng
#    latitude = lat_lng_coords[0]
#    longitude = lat_lng_coords[1]
#    return(latitude, longitude)
#df['Latitude'], df['Longitude'] = zip(*df['PostalCode'].apply(get_latlong))
#df.head()

In [6]:
# Load the per-postal-code coordinates from the separately provided CSV and
# rename the key column to 'PostalCode' in the same step.
latlong = (
    pd.read_csv("https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv")
    .rename(columns={'Postal Code': 'PostalCode'})
)
print(latlong.head())

  PostalCode   Latitude  Longitude
0        M1B  43.806686 -79.194353
1        M1C  43.784535 -79.160497
2        M1E  43.763573 -79.188711
3        M1G  43.770992 -79.216917
4        M1H  43.773136 -79.239476


In [7]:
# add lat&long to the neighbourhoods
# Drop coordinate columns left over from a previous run of this cell, so that
# re-running it does not produce suffixed duplicates (Latitude_x, Latitude_y, ...).
df = df.drop(columns=['Latitude', 'Longitude'], errors='ignore')
# Inner join on the shared key: postal codes without coordinates are dropped.
# validate='one_to_one' makes pandas raise a MergeError if either side repeats
# a PostalCode, instead of silently multiplying rows.
df = pd.merge(df, latlong, how='inner', on='PostalCode', validate='one_to_one')
print(df.head())

  PostalCode           Borough                                Neighbourhood  \
0        M3A        North York                                    Parkwoods   
1        M4A        North York                             Victoria Village   
2        M5A  Downtown Toronto                    Regent Park, Harbourfront   
3        M6A        North York             Lawrence Manor, Lawrence Heights   
4        M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government   

    Latitude  Longitude  
0  43.753259 -79.329656  
1  43.725882 -79.315572  
2  43.654260 -79.360636  
3  43.718518 -79.464763  
4  43.662301 -79.389494  


In [8]:
# Report the merged frame's dimensions as a (rows, columns) tuple.
print("Dataframe rows and columns:", (len(df.index), len(df.columns)))

Dataframe rows and columns: (103, 5)
