# Data Science Capstone Project

## Part 1 (Week 3: Segmenting and Clustering Neighborhoods in Toronto)

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift pandas' display limits so no columns/rows are elided when a frame is shown.
# With max_rows=None a bare DataFrame expression prints every row, so the cells
# below use .head() to keep their output short.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
from io import StringIO

import requests
from bs4 import BeautifulSoup

In [3]:
#Scraping postal code table from Wikipedia page and creating boroughs_df dataframe
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
frompage = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
frompage.raise_for_status()
soup = BeautifulSoup(frompage.content,'lxml')
# Take the first <table> element found on the page.
table = soup.find_all('table')[0]
# pandas >= 2.1 deprecates passing literal HTML to read_html, so the markup is
# wrapped in a file-like object; read_html returns a list of frames, one per table.
boroughs_df = pd.read_html(StringIO(str(table)))[0]
boroughs_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
#Dropping rows whose 'Borough' is "Not assigned"
#Note: these rows happen to coincide with Neighborhoods that are also "Not assigned"
# Filter into a new frame instead of drop(..., inplace=True): the same rows are
# removed and the surviving rows keep their original index labels, but the frame
# produced by the scraping step is no longer mutated behind the reader's back.
is_assigned = boroughs_df['Borough'] != 'Not assigned'
boroughs_df = boroughs_df.loc[is_assigned].copy()
boroughs_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
# Collapse the table to one row per (Postal Code, Borough) pair, concatenating
# the Neighborhood values of each pair with a comma and a space.
# Note: as of 7/7/2020 the Wikipedia page appears to list the Neighborhoods
# already grouped by postal code, which is exactly what this step produces.
neighborhoods_by_code = boroughs_df.groupby(['Postal Code','Borough'])['Neighborhood']
boroughs_df = neighborhoods_by_code.agg(', '.join).reset_index()
boroughs_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
#Using the .shape attribute to show the dataframe's dimensions as (rows, columns); the recorded run shows 103 rows
boroughs_df.shape

(103, 3)

## Part 1B - Retrieving Latitude and Longitude Coordinates

In [7]:
url='https://cocl.us/Geospatial_data/Geospatial_coordinates.csv'
df_coords=pd.read_csv(url)

# Attach coordinates by joining on the postal code rather than copying columns
# by row position: positional assignment is only correct when both frames hold
# exactly the same postal codes in exactly the same order, and silently assigns
# the wrong coordinates otherwise.
# NOTE(review): assumes the CSV names its key column 'Postal Code' - confirm.
coord_columns = ['Latitude', 'Longitude']
boroughs_df = (
    boroughs_df
    # Drop coordinates left over from a previous run so re-running the cell
    # does not produce Latitude_x / Latitude_y duplicates.
    .drop(columns=coord_columns, errors='ignore')
    .merge(
        df_coords[['Postal Code'] + coord_columns],
        on='Postal Code',
        how='left',                # keep every borough row, in its current order
        validate='one_to_one',     # fail loudly on duplicated postal codes
    )
)
# A postal code with no match would otherwise surface later as NaN coordinates.
assert boroughs_df[coord_columns].notna().all().all(), "some postal codes have no coordinates"
boroughs_df.head(12)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
