# Capstone Project - Segmenting and Clustering Neighborhoods in Toronto

### Section 1 - Creating the Dataframe

#### Let's start with importing the required libraries for this section.

In [None]:
!pip install folium

from io import StringIO

import bs4
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import pandas as pd
import requests
from sklearn.cluster import KMeans



You should consider upgrading via the 'python -m pip install --upgrade pip' command.


##### Next, let's pull the data from the given Wikipedia page.
##### After this is done, we will first exclude forward sortation areas (FSAs) that don't have an assigned borough.

In [2]:
# Wikipedia page listing the postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fail fast on network trouble: the timeout keeps the cell from hanging
# forever, and raise_for_status() stops an HTTP error page from being parsed
# as if it were the article.
req = requests.get(url, timeout=30)
req.raise_for_status()
soup = bs4.BeautifulSoup(req.content, 'lxml')

# Assumes the postal-code table is the first <table> element on the page.
table = soup.find_all('table')[0]
# read_html is given a file-like object because passing literal HTML strings
# is deprecated in recent pandas releases.
df = pd.read_html(StringIO(str(table)))[0]

# Keep only the rows whose borough has actually been assigned.
fsa_df = df[df['Borough'] != 'Not assigned']
fsa_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


##### Next, we combine Forward Sortation Areas (FSAs) that contain multiple neighbourhoods into one row per FSA.

##### The neighbourhoods that share an FSA are joined into a single comma-separated string on that row.

In [3]:
# Collapse to one row per (Postcode, Borough) pair, joining the
# neighbourhood names of each pair into one comma-separated string.
postal_df = (
    fsa_df
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()
)
postal_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##### For rows that have a borough but no assigned neighbourhood name, let's use the borough name as the neighbourhood name.

In [4]:
# Wherever the neighbourhood is the placeholder "Not assigned", substitute
# the borough name from the same row; all other rows are left untouched.
postal_df['Neighbourhood'] = postal_df['Neighbourhood'].mask(
    postal_df['Neighbourhood'] == "Not assigned",
    postal_df['Borough'],
)
postal_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##### What's the shape of this new dataframe?

In [5]:
# Display the (rows, columns) shape of the grouped postal-code dataframe.
postal_df.shape
# Optional: uncomment the line below to save this dataframe to 'postal_df.csv'.
#postal_df.to_csv('postal_df.csv')

(103, 3)

### Section 2 - Importing Coordinate Data

#### Next up - importing the geospatial data to the notebook.

In [6]:
# Load the postal-code coordinate table (Postal Code, Latitude, Longitude)
# straight from its hosted CSV.
geo_df = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Now, let's merge our two dataframes into one. We will also drop the redundant `Postal Code` column, which duplicates `Postcode` after the merge.

In [7]:
# Left-join the coordinates onto the postal-code frame (every row of
# postal_df is kept), then discard the right-hand key column, which only
# repeats the value already held in 'Postcode'.
merged_postal_df = (
    postal_df
    .merge(geo_df, how='left', left_on='Postcode', right_on='Postal Code')
    .drop(columns='Postal Code')
)
merged_postal_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### What's the shape of our finalized dataframe?

In [8]:
# Display the (rows, columns) shape of the merged dataframe.
merged_postal_df.shape

# Optional: uncomment the line below to save the merged dataframe to 'merged_postal_df.csv'.
#merged_postal_df.to_csv('merged_postal_df.csv')

(103, 5)

### Section 3 - Visualizing Toronto's Neighbourhoods

#### Lastly, we'll make a labeled map of Toronto's neighbourhoods in Folium.

In [9]:
# Initial centre point of the map view.
T_lat = 43.653963
T_long = -79.387207

map_toronto = folium.Map(location=[T_lat, T_long],
                         zoom_start=10)

# The coordinates were attached with a left merge, so a postal code missing
# from the geospatial table would carry NaN here. folium refuses NaN
# locations, so such rows are skipped rather than aborting the whole map.
plottable_df = merged_postal_df.dropna(subset=['Latitude', 'Longitude'])

# Add one circle marker per postal code, labelled "<neighbourhoods>, <borough>".
for lat, lng, borough, neighborhood in zip(plottable_df['Latitude'],
                                           plottable_df['Longitude'],
                                           plottable_df['Borough'],
                                           plottable_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html is a Popup option, so it is set here only; CircleMarker has
    # no such parameter and the stray parse_html=False kwarg was removed.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#59c123',
        fill_opacity=0.5).add_to(map_toronto)

map_toronto