# Week 3 Assignment - Segmenting & Clustering Neighbourhood in City of Toronto

## 1. Create Postal Code of Toronto Dataframe

### Import Python Libraries

In [91]:
import pandas as pd
from pandas import DataFrame
import numpy as np

### Import data from wikipedia 

In [95]:
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
tables = pd.read_html(URL,match="Postal Code")
print("There are : ",len(tables)," tables")
print("Take look at table 0")
tables[0]

There are :  1  tables
Take look at table 0


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Turn the data into data frame

In [96]:
TON_Pos_df = DataFrame(tables[0], columns=['Postal Code', 'Borough', 'Neighbourhood'])
TON_Pos_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


### Ignore cells with a borough that is not assigned

In [97]:
TON_Pos_df = TON_Pos_df[TON_Pos_df['Borough'] != 'Not assigned'] #exclude not assigned boroughs 
TON_Pos_df.reset_index(drop = True, inplace = True)
TON_Pos_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [98]:
TON_Pos_df['Neighbourhood'] = np.where(TON_Pos_df['Neighbourhood'] == 'Not assigned', TON_Pos_df['Borough'], TON_Pos_df['Neighbourhood'])

TON_Pos_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### To print the numbers of rows and columns of the data frame

In [99]:
TON_Pos_df.shape

(103, 3)

## 2. Create Geographical Coordinates Dataframe

### Import Geo Data (.CSV)

In [102]:
geo_url = 'http://cocl.us/Geospatial_data'

geo_tables = pd.read_csv(geo_url)
geo_tables.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge Geo Data into Toronto Postal Code Dataframe

In [104]:
df = TON_Pos_df.merge(geo_tables, on="Postal Code", how = 'inner')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# 3. Explore and cluster the neighborhoods in Toronto

### Import Libraries

In [106]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Folium installed and imported!


### Create map of Toronto

In [124]:
Ton_lat = 43.651070
Ton_long = -79.347015
Toronto_map = folium.Map(location=[Ton_lat, Ton_long], zoom_start=12)
Toronto_map

### Explore/Display Borough Contains the Word 'Toronto'

In [113]:
df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [114]:
df_Ton = df[df['Borough'].str.contains('Toronto')]
df_Ton

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [125]:
# add markers to Toronto map
for lat, lng, borough, neighborhood in zip(df_Ton['Latitude'], df_Ton['Longitude'], df_Ton['Borough'], df_Ton['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(Toronto_map)  
    
Toronto_map

### Define FourSquare credentials

In [126]:
CLIENT_ID = 'MZX3H40NH0YUQ0VUSKUEC5GYC0L3ICUX2FCQPVYUOQ5H41AW' # your Foursquare ID
CLIENT_SECRET = 'QSRK0IFWHK2QGVRQ03U3NPTRSM4AUW1GIWSSVOGXDE5H2QBD' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MZX3H40NH0YUQ0VUSKUEC5GYC0L3ICUX2FCQPVYUOQ5H41AW
CLIENT_SECRET:QSRK0IFWHK2QGVRQ03U3NPTRSM4AUW1GIWSSVOGXDE5H2QBD


### Cluster Neighbourhoods in Toronto

##### Run k-means to cluster the neighborhood into 5 clusters.

In [139]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(np.stack((df_Ton['Latitude'], df_Ton['Longitude']),axis = 1))

clusters = kmeans.labels_
colors = ['red', 'green', 'blue', 'yellow']
df_Ton['Cluster'] = clusters

for latitude, longitude, borough, cluster in zip(df_Ton['Latitude'], df_Ton['Longitude'], df_Ton['Borough'], df_Ton['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker([latitude, longitude],
                        radius=5,
                        popup=label,
                        color=colors,
                        fill=True,
                        fill_color=colors[cluster-1],
                        fill_opacity=0.7).add_to(Toronto_map)  

Toronto_map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
