# WEEK 3: SEGMENTING AND CLUSTERING 

In [1]:
import pandas as pd
import numpy as np

In [2]:
#!pip install requests
#!pip install beautifulsoup4
from bs4 import BeautifulSoup
import requests

### Scrape the Table on the given Wikipedia Page with Beautiful Soup and Display the head of the DataFrame

In [49]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)

In [52]:
soup = BeautifulSoup(page.text, "html.parser")
table = soup.find('table')
table_rows = table.find_all('tr')

l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
df=pd.DataFrame(l, columns=["Postcode", "Borough", "Neighbourhood"])
df=df[df['Borough']!='Not assigned'].dropna()
df['Neighbourhood']=df['Neighbourhood'].str.replace('\n','')

In [53]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


### Group together rows with the same postalcode as required

In [44]:
df=df.groupby(['Postcode', 'Borough']).agg(lambda col: ', '.join(col)).reset_index()

In [45]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


### Fill 'Not- assigned'-values in Neighbourhood-Column with Borough-Name and display the changed rows

In [46]:
na_idx=df[df.Neighbourhood.str.contains('Not assigned')].index
df.loc[na_idx,'Neighbourhood']=df.loc[na_idx,'Borough']
df.iloc[na_idx]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Queen's Park


### Display Shape of Dataframe

In [138]:
df.shape

(103, 1)

### Filter on Toronto Boroughs only

In [58]:
toronto_df=df[df.Borough.str.contains('Toronto')]

In [61]:
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
18,M5B,Downtown Toronto,Ryerson
19,M5B,Downtown Toronto,Garden District
35,M5C,Downtown Toronto,St. James Town
...,...,...,...
251,M4X,Downtown Toronto,St. James Town
252,M5X,Downtown Toronto,First Canadian Place
253,M5X,Downtown Toronto,Underground city
263,M4Y,Downtown Toronto,Church and Wellesley


### Test geocoder

In [62]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
    g = geocoder.google('{}, Toronto, Ontario'.format('M5A'))
    lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

KeyboardInterrupt: 

### Use CSV since Geocoder does not reply

In [64]:
coordinates=pd.read_csv('Geospatial_Coordinates.csv')

In [71]:
toronto_df=pd.merge(toronto_df, coordinates, left_on='Postcode', right_on='Postal Code').drop('Postal Code', axis=1)

In [72]:
toronto_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
1,M5A,Downtown Toronto,Regent Park,43.654260,-79.360636
2,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
3,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
4,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
...,...,...,...,...,...
69,M4X,Downtown Toronto,St. James Town,43.667967,-79.367675
70,M5X,Downtown Toronto,First Canadian Place,43.648429,-79.382280
71,M5X,Downtown Toronto,Underground city,43.648429,-79.382280
72,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
