### Segmenting and Clustering Neighbourhoods in Toronto

In [152]:
#Install beautifulsoup for web-scrapping
!pip install beautifulsoup4

print('Installation complete!')

Installation complete!


In [172]:
# Third-party libraries used by the scraping and tabulation cells below
import pandas as pd
import requests
from bs4 import BeautifulSoup

print('python modules imported')

python modules imported


In [173]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)
# Stop here on an HTTP error instead of parsing an error page in the next cells.
response.raise_for_status()
html_text = response.text

# Name the parser explicitly so the parsed tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
bs = BeautifulSoup(html_text, 'html.parser')

In [174]:
# Locate the postal-code table in the parsed page.
table = bs.find('table', class_='wikitable sortable')
if table is None:
    # Without this check the loop below fails with an opaque AttributeError on None.
    raise ValueError("No table with class 'wikitable sortable' was found in the page")

column_names = []
data = []

# Walk the table rows: <th> cells supply the column names,
# <td> cells supply one record (list of stripped cell texts) per row.
for row in table.find_all('tr'):
    if row.th is not None:
        column_names.extend(th.text.strip() for th in row.find_all('th'))
    if row.td is not None:
        data.append([td.text.strip() for td in row.find_all('td')])



In [175]:
# Assemble the scraped rows into a DataFrame, labelled with the scraped headers
df = pd.DataFrame(data=data, columns=column_names)

In [176]:
# Clean the data: keep only rows whose Borough is assigned.
# An exact comparison is used instead of str.contains, which performs a regex
# substring search and returns NaN (breaking the `~` negation) for missing values.
df = df[df['Borough'] != 'Not assigned']

### Reindex the data; missing values will be assigned NaN

In [177]:
# Drop rows whose Borough is actually missing (a real NaN, not the text "NaN").
cleaned = df.dropna(subset=['Borough'])

# Where a neighbourhood is 'Not assigned', fall back to the row's borough name.
# This is done before grouping so the placeholder can never end up inside a
# joined, comma-separated list of neighbourhoods.
cleaned = cleaned.assign(
    Neighbourhood=cleaned['Neighbourhood'].mask(
        cleaned['Neighbourhood'] == 'Not assigned', cleaned['Borough']
    )
)

# One row per (Postcode, Borough), with its neighbourhoods joined by commas.
df = (
    cleaned.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)

df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."


In [178]:
# Dimensions of the cleaned frame as (rows, columns)
df.shape

(103, 3)

In [182]:
## Section 2

#install geocode
!pip install geocoder

#download geo data
!wget -q -O 'toronto_data.csv' https://cocl.us/Geospatial_data

#import geocoder
import geocoder

#read the data
geo_df = pd.read_csv('toronto_data.csv')


#insert columns to store geo coordinates
for i in df.index:
    df.at[i, 'Latitude'] = float(geo_df[geo_df['Postal Code'] == df.at[i, 'Postcode']]['Latitude'])
    df.at[i, 'Longitude'] = float(geo_df[geo_df['Postal Code'] == df.at[i, 'Postcode']]['Longitude'])  



In [184]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437
