In [4]:
from io import StringIO

import wikipedia, re
from bs4 import BeautifulSoup
import pandas as pd
import wikipedia as wp
import numpy as np

### Scrape the Wikipedia page of Toronto postal codes

In [5]:
# Fetch the rendered HTML of the Wikipedia page that lists the "M" postal codes.
html = wp.page("List_of_postal_codes_of_Canada:_M").html()

# Parse from a file-like object rather than a literal HTML payload: newer
# pandas releases deprecate passing raw HTML to read_html, while a StringIO
# is accepted by every version. Table 0 is the postal-code table in the run
# shown below (columns Postcode / Borough / Neighbourhood).
df = pd.read_html(StringIO(html))[0]

# Keep a local snapshot of the raw table. The column names are written as the
# first line; the previous `header=0` was falsy and silently dropped them.
df.to_csv('beautifulsoup_pandas.csv', index=False)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# (rows, columns) of the scraped table before any cleaning.
df.shape

(288, 3)

In [7]:
# Re-index the table by Borough so rows can be removed by borough label.
df_filter = df.set_index(keys='Borough')
df_filter.head()

Unnamed: 0_level_0,Postcode,Neighbourhood
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1
Not assigned,M1A,Not assigned
Not assigned,M2A,Not assigned
North York,M3A,Parkwoods
North York,M4A,Victoria Village
Downtown Toronto,M5A,Harbourfront


##### Drop the 'Not assigned' Boroughs

In [8]:
# Remove every row whose Borough (the index of df_filter) is 'Not assigned',
# then turn Borough back into an ordinary column.
# reset_index() is chained instead of being applied with inplace=True, so `df`
# is a fresh DataFrame rather than the object returned by drop() mutated in
# place; that in-place pattern is what can make later column assignments on
# `df` raise pandas' SettingWithCopyWarning.
df = df_filter.drop(index=['Not assigned']).reset_index()
df.shape

(211, 3)

##### Test if 'Not assigned' exists in Neighbourhood

In [9]:
# Collect the neighbourhood names and check whether the placeholder is still present.
test = df['Neighbourhood'].tolist()
'Not assigned' in test

True

##### If a row has a borough but a 'Not assigned' neighbourhood, the neighbourhood is set to the borough name

In [10]:
# Wherever the neighbourhood is the placeholder 'Not assigned', use the
# borough name instead; all other neighbourhood values are left untouched.
df['Neighbourhood'] = df['Neighbourhood'].mask(
    df['Neighbourhood'] == 'Not assigned',
    df['Borough'],
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


##### For rows that share a Postcode, combine their neighbourhoods into a single row, separated by commas

In [11]:
# One row per (Postcode, Borough) pair; the neighbourhood names of each pair
# are concatenated into a single ', '-separated string.
df_grouped = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
# Re-check the grouped table: no neighbourhood entry should equal the placeholder.
test = df_grouped['Neighbourhood'].tolist()
'Not assigned' in test

False

##### The final number of rows is 103

In [13]:
# (rows, columns) after grouping: one row per (Postcode, Borough) pair.
df_grouped.shape

(103, 3)

##### Import the geospatial coordinates from http://cocl.us/Geospatial_data

In [14]:
# Load the postcode -> coordinates table.
# A copy next to the notebook is preferred; if it is missing, the data is read
# straight from the URL named in the heading above. This replaces an absolute
# path inside one user's home directory, which could not be reproduced on any
# other machine.
path = 'Geospatial_Coordinates.csv'
try:
    df_geo = pd.read_csv(path)
except FileNotFoundError:
    path = 'http://cocl.us/Geospatial_data'
    df_geo = pd.read_csv(path)

In [15]:
# Preview the first rows of the coordinates table.
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# (rows, columns) of the coordinates table, for comparison with df_grouped.
df_geo.shape

(103, 3)

##### Geospatial df has the exact same number of rows as my df_grouped

In [17]:
# Rename the first column to 'Postcode' so it matches the key column of df_grouped.
df_geo = df_geo.rename(columns={df_geo.columns[0]: 'Postcode'})
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


##### Merge the two dfs so each Postal Code has a unique Latitude and Longitude

In [18]:
# Attach coordinates to every grouped postcode row.
# validate='one_to_one' makes pandas raise if 'Postcode' is duplicated on
# either side, so each postcode is guaranteed a single Latitude/Longitude.
df_merge = pd.merge(
    df_grouped,
    df_geo,
    on='Postcode',
    how='inner',
    validate='one_to_one',
)

# An inner join silently drops postcodes that have no coordinates; fail loudly
# instead of carrying an incomplete table into the maps below.
assert len(df_merge) == len(df_grouped), 'some postcodes have no coordinates'
df_merge.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


##### Toronto latitude and longitude are: 

In [19]:
# Coordinates of Toronto; used below as the centre point of the folium maps.
latitude = 43.6532
longitude = -79.383184

In [20]:
import folium
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

##### Create the map of Toronto based on the entire dataset

In [22]:
# Base map centred on the Toronto coordinates defined earlier.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per row of df_merge; the popup text reads
# "<neighbourhood(s)>, <borough>".
marker_rows = zip(
    df_merge['Latitude'],
    df_merge['Longitude'],
    df_merge['Borough'],
    df_merge['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

# Display the map as the cell output.
map_toronto

#### Understand how many Boroughs exist

In [23]:
# List the distinct borough names present in the merged table.
df_merge['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       "Queen's Park", 'Mississauga', 'Etobicoke'], dtype=object)

#### Cluster all Toronto boroughs together and keep the remaining boroughs individually

In [24]:
# Collapse every borough whose name matches "Toronto" or "Queen's Park" into a
# single 'City of Toronto' label; every other borough keeps its own name.
df_merge['Neighbourhood_new'] = df_merge['Borough'].mask(
    df_merge['Borough'].str.contains("Toronto|Queen's Park"),
    'City of Toronto',
)


In [25]:
# List the distinct cluster labels produced by the previous cell.
df_merge['Neighbourhood_new'].unique()

array(['Scarborough', 'North York', 'East York', 'City of Toronto',
       'York', 'Mississauga', 'Etobicoke'], dtype=object)

In [26]:
# Rows labelled 'City of Toronto', renumbered from 0.
toronto_data = (
    df_merge.loc[df_merge['Neighbourhood_new'].eq('City of Toronto')]
    .reset_index(drop=True)
)
toronto_data.shape

(39, 6)

In [27]:
# Rows labelled 'Scarborough', renumbered from 0.
scarborough = (
    df_merge.loc[df_merge['Neighbourhood_new'].eq('Scarborough')]
    .reset_index(drop=True)
)
scarborough.shape

(17, 6)

In [28]:
# Rows labelled 'North York', renumbered from 0.
north_york = (
    df_merge.loc[df_merge['Neighbourhood_new'].eq('North York')]
    .reset_index(drop=True)
)
north_york.shape

(24, 6)

In [29]:
# Rows labelled 'East York', renumbered from 0.
east_york = (
    df_merge.loc[df_merge['Neighbourhood_new'].eq('East York')]
    .reset_index(drop=True)
)
east_york.shape

(5, 6)

In [30]:
# Rows labelled exactly 'York', renumbered from 0.
york = (
    df_merge.loc[df_merge['Neighbourhood_new'].eq('York')]
    .reset_index(drop=True)
)
york.shape

(5, 6)

In [31]:
# Rows labelled 'Mississauga', renumbered from 0.
mississauga = (
    df_merge.loc[df_merge['Neighbourhood_new'].eq('Mississauga')]
    .reset_index(drop=True)
)
mississauga.shape

(1, 6)

In [32]:
# Rows labelled 'Etobicoke', renumbered from 0.
etobicoke = (
    df_merge.loc[df_merge['Neighbourhood_new'].eq('Etobicoke')]
    .reset_index(drop=True)
)
etobicoke.shape

(12, 6)

### Map final clustered boroughs identified by different colors

In [36]:
def add_cluster_markers(target_map, cluster_df, color):
    """Add one circle marker per row of ``cluster_df`` to ``target_map``.

    Parameters
    ----------
    target_map : folium.Map
        Map that receives the markers; it is modified in place.
    cluster_df : pandas.DataFrame
        Frame providing the 'Latitude', 'Longitude', 'Borough' and
        'Neighbourhood_new' columns.
    color : str
        Outline colour handed to ``folium.CircleMarker``.
    """
    rows = zip(
        cluster_df['Latitude'],
        cluster_df['Longitude'],
        cluster_df['Borough'],
        cluster_df['Neighbourhood_new'],
    )
    for lat, lng, borough, cluster in rows:
        # Popup text is "<cluster label>, <original borough>".
        popup = folium.Popup('{}, {}'.format(cluster, borough), parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=popup,
            color=color,
            fill=True,
            fill_opacity=0.7,
            # NOTE(review): parse_html is carried over unchanged from the
            # original calls; confirm CircleMarker actually uses it.
            parse_html=False,
        ).add_to(target_map)


# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# (frame, colour) pairs, in the order the markers are drawn. The order is kept
# from the original cell so overlapping markers stack the same way.
cluster_colors = [
    (etobicoke, 'blue'),
    (york, 'red'),
    (mississauga, 'yellow'),
    (east_york, 'green'),
    (north_york, 'orange'),
    (scarborough, 'purple'),
    (toronto_data, 'black'),
]

# add markers to map, one colour per cluster
for cluster_df, color in cluster_colors:
    add_cluster_markers(map_toronto, cluster_df, color)

# Display the map as the cell output.
map_toronto