# Capstone - Segmenting and Clustering Neighbourhoods in Toronto

## Part 1.  Getting data from Wikipedia, creating and formatting data frame.

In [1]:
#import needed libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

print('Libraries imported.')

Libraries imported.


In [2]:
# download data from Wikipedia, use Beautiful Soup to scrape table
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
soup= BeautifulSoup(source, 'xml')

In [3]:
#make soup object into a table
table=soup.find('table')

In [4]:
#make the dataframe with 3 columns
column_names=['Postal Code','Borough','Neighborhood']
df=pd.DataFrame(columns=column_names)

In [5]:
# find the needed postcode, borough, neighborhood 
for tr_cell in table.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        df.loc[len(df)] = row_data

In [6]:
#examine the dataframe result
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
#size of dataframe
df.shape

(180, 3)

In [8]:
#if there are any NaN, drop them
df.dropna()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [9]:
#reset the index
df.reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [10]:
#eliminate cells where no boroughs assigned
df2=df[df['Borough']!='Not assigned']

In [11]:
#examine new df
df2.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
#look at size of new df (should be smaller than previous)
df2.shape

(103, 3)

In [13]:
#if a borough has a 'Not assigned' neighborhood, set neighborhood = to borough
df3=df2
df3.loc[df3['Neighborhood']=="Not assigned",'Neighborhood']=df3.loc[df3['Neighborhood']=="Not assigned",'Borough']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [14]:
#examine new df
df3

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [15]:
#size of new df, should be same size as previous
df3.shape

(103, 3)

In [16]:
#some df tidying, group by postal code
temp_df=df3.groupby('Postal Code')['Neighborhood'].apply(lambda x: "%s" % ', '.join(x))
temp_df=temp_df.reset_index(drop=False)
temp_df.rename(columns={'Neighborhood':'Neighborhood_joined'},inplace=True)

In [17]:
#then merge df, key is Postal Code
df_merge = pd.merge(df3, temp_df, on='Postal Code')

In [18]:
#drop extra column and duplicate values
df_merge.drop(['Neighborhood'],axis=1,inplace=True)
df_merge.drop_duplicates(inplace=True)

In [19]:
#rename columns in merged set for consistency with previous df
df_merge.rename(columns={'Neighborhood_joined':'Neighborhood'},inplace=True)

In [20]:
#look at merged df
df_merge.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [21]:
#check size of merged df.  should be same
df_merge.shape

(103, 3)

## Part 2.  Add latitude and longitude of each neighbourhood to data frame

In [22]:
#rename df
df4=df_merge

In [23]:
#importing the lat long from the file provided by Coursera
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [24]:
#view new df
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [25]:
#add latitude and longitude columns to the dataframe created in the previous excercise
df4['Latitude'] = geo_df['Latitude']
df4['Longitude'] = geo_df['Longitude']
df4.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.763573,-79.188711
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.770992,-79.216917
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.773136,-79.239476
