# Part 1 of Peer-graded Assignment

#### The necessary libraries are imported. Pandas is needed to create the required dataframe.

In [1]:
import pandas as pd
import numpy as np

#### The url for the table was stored and the data was converted into a dataframe using pandas.

In [51]:
# Wikipedia page "List of postal codes of Canada: M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html returns a list containing one DataFrame per HTML table it finds on the page.
tables = pd.read_html(url)
# NOTE(review): assumes the postal-code table is the first table on the page — confirm if the page layout changes.
postalCodeTable = tables[0]

#### All rows where the Borough was not assigned were removed.

In [52]:
# Keep only the rows whose Borough holds something other than the 'Not assigned' marker.
hasBorough = postalCodeTable['Borough'].ne('Not assigned')
postalCodeTable = postalCodeTable.loc[hasBorough]

#### A join function was applied to all rows that were grouped together by the same postcode to remove redundancies.

In [53]:
# Collapse all rows that share a postcode and borough into a single row whose
# neighbourhoods are listed in one comma-separated string.
neighbourhoodsByCode = (
    postalCodeTable
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(lambda group: ', '.join(group))
)
# Turn the grouped Series back into a frame with Postcode and Borough as ordinary columns.
postalCodeTable = neighbourhoodsByCode.to_frame().reset_index()

#### For every row where the Neighbourhood is not assigned, the Borough value is used as the Neighbourhood.

In [54]:
# Rows whose neighbourhood is the 'Not assigned' marker take their borough name instead.
maskForUnassigned = postalCodeTable['Neighbourhood'].eq('Not assigned')
postalCodeTable.loc[maskForUnassigned, 'Neighbourhood'] = postalCodeTable.loc[maskForUnassigned, 'Borough']
postalCodeTable.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### The shape of the final dataframe was given from the shape property of the dataframe.

In [6]:
postalCodeTable.shape

(103, 3)

## Part 2 of Peer-graded Assignment

#### Geocoder was imported to find the longitude and latitude of the different postcode areas

In [7]:
import geocoder

#### This is a function that will help find the latitude and longitude given a particular postcode

In [8]:
def findLatLng(postcode, max_attempts=10):
    """Look up the coordinates of a Toronto postcode with the ArcGIS geocoder.

    The lookup is retried while the geocoder reports no coordinates, but only
    up to ``max_attempts`` times so that a postcode that cannot be resolved
    (or an unreachable service) cannot hang the notebook forever.

    Args:
        postcode: Postcode to look up; it is queried as
            ``'<postcode>, Toronto, Ontario'``.
        max_attempts: Maximum number of geocoder calls before giving up.

    Returns:
        The geocoder result's ``latlng`` value (used elsewhere in this
        notebook as ``[latitude, longitude]``).

    Raises:
        RuntimeError: If no attempt produced coordinates.
    """
    for _ in range(max_attempts):
        g = geocoder.arcgis(location=f'{postcode}, Toronto, Ontario')
        if g.latlng is not None:
            return g.latlng

    raise RuntimeError(
        f'Could not geocode postcode {postcode!r} after {max_attempts} attempts'
    )

#### This finds the latitudes and longitudes for all the postcodes and stores them in lists

In [9]:
# Geocode each postcode in table order, then split the resulting pairs into
# two parallel lists (index 0 -> latitude, index 1 -> longitude).
postCodeList = postalCodeTable['Postcode'].tolist()
latLongList = list(map(findLatLng, postCodeList))
latList = [pair[0] for pair in latLongList]
longList = [pair[1] for pair in latLongList]

#### This creates and adds the latitude and longitude columns into the dataframe.

In [10]:
# The lists are assigned positionally; they line up with the rows because they
# were built by iterating postalCodeTable['Postcode'] in order.
postalCodeTable['Latitude'] = latList
postalCodeTable['Longitude'] = longList

#### The dataframe now contains the latitude and longitude of all postcodes

In [49]:
postalCodeTable.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78573,-79.15875
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726245,-79.26367
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713133,-79.285055
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.723575,-79.234976
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.696665,-79.260163


## Part 3 of Peer-graded Assignment

#### These libraries will be used for plotting data on a map and for accessing the FourSquare API

In [12]:
import folium
import requests

#### I will only be dealing with data that contains the word 'Toronto' in the Borough to simplify the data visualization

In [13]:
# Select the rows whose Borough name contains the word 'Toronto'.
isTorontoBorough = postalCodeTable['Borough'].str.contains('Toronto')
torontoTable = postalCodeTable.loc[isTorontoBorough]

#### Reset the index after removing data that does not contain 'Toronto' in the Borough column

In [14]:
# drop=True discards the old row labels instead of keeping them as an extra column.
torontoTable = torontoTable.reset_index(drop=True)
torontoTable.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676845,-79.295225
1,M4K,East Toronto,"The Danforth West, Riverdale",43.683262,-79.35512
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.667965,-79.314673
3,M4M,East Toronto,Studio District,43.662766,-79.33483
4,M4N,Central Toronto,Lawrence Park,43.72816,-79.387085


#### Find the location of Toronto, Canada to center the folium map around

In [15]:
# The result is indexed below as [latitude, longitude] when centring the maps.
# NOTE(review): presumably this is None when the lookup fails (findLatLng retries on None) — not handled here.
locationOfToronto = geocoder.arcgis(location='Toronto, Ontario').latlng
locationOfToronto

[43.648690000000045, -79.38543999999996]

#### Plot all of the postcodes based on their longitudes and latitudes on a map of Toronto

In [16]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[locationOfToronto[0], locationOfToronto[1]], zoom_start=11)

# One circle marker per postcode area; its popup shows the neighbourhood name(s).
for lat, lng, label in zip(torontoTable['Latitude'], torontoTable['Longitude'], torontoTable['Neighbourhood']):
    # parse_html=True makes folium treat the label as text to escape rather than raw HTML.
    label = folium.Popup(label, parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#414141',  # hex colours need the leading '#'; a bare '414141' is not a valid colour
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

#### Set Credentials on FourSquare account

In [55]:
import os
import warnings

# Foursquare credentials are read from the environment so that secrets are
# never stored in, or published with, the notebook. Set FOURSQUARE_CLIENT_ID
# and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: the key pair that used to be hard-coded in this cell has been exposed
# and should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will not be able to authenticate.'
    )

VERSION = '20180605'  # sent as the `v` query parameter of each Foursquare request
LIMIT = 30  # sent as the `limit` query parameter (venues requested per location)

#### Function to get venues in a specified radius given a coordinate

In [56]:
def getCloseVenues(names, latitudes, longitudes, radius=1000, timeout=30):
    """Fetch venues around each location from the Foursquare explore endpoint.

    One request is made per (name, latitude, longitude) triple, using the
    module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings. The
    result frame is built once after all requests have finished, so empty
    input (or locations with no venues) yields an empty frame with the
    expected columns instead of an error.

    Args:
        names: Iterable of location names, copied into the 'Neighborhood' column.
        latitudes: Iterable of latitudes, parallel to ``names``.
        longitudes: Iterable of longitudes, parallel to ``names``.
        radius: Value sent as the ``radius`` query parameter.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        'Venue Category' is None for a venue that reports no categories.

    Raises:
        requests.HTTPError: If Foursquare answers with an error status.
        KeyError / IndexError: If the response body lacks the expected
            ``response -> groups[0] -> items`` structure.
    """
    column_names = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': f'{lat},{lng}',
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Fail with the HTTP error itself rather than a confusing KeyError on the body.
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            # A venue may come back without any category; keep the row anyway.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    return pd.DataFrame(rows, columns=column_names)

#### Retrieve nearby venues from the fourSquare api for the Toronto Neighborhood Data

In [57]:
# Query venues around every Toronto postcode area, using the default search radius.
torontoVenues = getCloseVenues(
    torontoTable['Neighbourhood'],
    torontoTable['Latitude'],
    torontoTable['Longitude'],
)

#### Look at the returned dataFrame for the venues that are nearby each neighborhood

In [40]:
torontoVenues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676845,-79.295225,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676845,-79.295225,Glen Manor Ravine,43.676821,-79.293942,Trail
2,The Beaches,43.676845,-79.295225,Tori's Bakeshop,43.672114,-79.290331,Vegetarian / Vegan Restaurant
3,The Beaches,43.676845,-79.295225,The Beech Tree,43.680493,-79.288846,Gastropub
4,The Beaches,43.676845,-79.295225,Beaches Bake Shop,43.680363,-79.289692,Bakery


#### Convert each category into a dummy variable to make future data processing and machine learning easier

In [41]:
# One indicator column per venue category; empty prefix keeps the bare category names.
toronto_onehot = pd.get_dummies(torontoVenues[['Venue Category']], prefix='', prefix_sep='')
# Append the neighbourhood label, then rotate that last column to the front.
toronto_onehot['Neighbourhood'] = torontoVenues['Neighborhood']
orderedColumns = toronto_onehot.columns.tolist()
toronto_onehot = toronto_onehot[orderedColumns[-1:] + orderedColumns[:-1]]
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Take the mean of the venue Category when grouping by neighborhood to find which venues are most prevalent by each neighborhood

In [42]:
# Averaging the 0/1 indicators per neighbourhood gives each category's share of that neighbourhood's venues.
toronto_grouped = toronto_onehot.groupby('Neighbourhood', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Art Gallery,...,Theme Restaurant,Thrift / Vintage Store,Toy / Game Store,Track,Trail,Train Station,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333


#### Function that returns the n most popular venues for a particular row.

In [43]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped (it holds the neighbourhood name in
    this notebook); the remaining entries are ranked from largest to smallest.

    Args:
        row: pandas Series whose entries after the first are comparable values.
        num_top_venues: How many labels to return.

    Returns:
        Array of at most ``num_top_venues`` index labels, best first.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#### Create a dataframe that specifies the top 10 venues for each neighborhood

In [44]:
num_top_venues = 10


def _ordinal(n):
    """Return n followed by its English ordinal suffix (1 -> '1st', 12 -> '12th', 21 -> '21st')."""
    if 11 <= n % 100 <= 13:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f'{n}{suffix}'


# Column headers: the neighbourhood name followed by one column per rank.
columns = ['Neighbourhood'] + [
    f'{_ordinal(rank)} Most Common Venue' for rank in range(1, num_top_venues + 1)
]

neighbors_venues_sorted = pd.DataFrame(columns=columns)
neighbors_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill the rank columns row by row; row i here corresponds to row i of toronto_grouped.
for ind in np.arange(toronto_grouped.shape[0]):
    neighbors_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbors_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Coffee Shop,Steakhouse,American Restaurant,Gym,Pizza Place,Speakeasy,Bar,Concert Hall,Restaurant
1,Berczy Park,Café,Farmers Market,Cocktail Bar,Seafood Restaurant,Bistro,Japanese Restaurant,Bakery,Basketball Stadium,Beer Bar,Jazz Club
2,"Brockton, Exhibition Place, Parkdale Village",Café,Coffee Shop,Hotel,Vegetarian / Vegan Restaurant,Furniture / Home Store,Sandwich Place,Soup Place,Beer Bar,Cocktail Bar,Seafood Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Café,Theater,Steakhouse,Concert Hall,Sushi Restaurant,Restaurant,Brazilian Restaurant,Burger Joint,Burrito Place
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Italian Restaurant,Coffee Shop,Spa,Caribbean Restaurant,Restaurant,Yoga Studio,Bakery,Sandwich Place,Ramen Restaurant,Seafood Restaurant


#### Import required KMeans library so that a k-means clustering algorithm can be applied on the Toronto venue data

In [45]:
from sklearn.cluster import KMeans
kclusters = 5
# Cluster on the category frequencies only; the name column is not a feature.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')
# n_init is pinned so the result does not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

#### Merge the top 10 venue dataframe with the dataframe that contains the postcode and latitude/longitude for each neighborhood

In [46]:
# Remove a label column left over from a previous run of this cell, so the
# insert below does not fail with "already exists" when the cell is re-run.
neighbors_venues_sorted = neighbors_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
# kmeans was fitted on toronto_grouped, whose row order neighbors_venues_sorted shares,
# so the labels can be attached positionally.
neighbors_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Left join: every Toronto postcode row is kept and gains its neighbourhood's
# cluster label and top venues.
toronto_merged = torontoTable.join(neighbors_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

#### Look at the newly merged dataframe

In [47]:
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676845,-79.295225,1,Park,Bakery,Pub,BBQ Joint,Ice Cream Shop,Japanese Restaurant,Breakfast Spot,Indie Movie Theater,Juice Bar,Bookstore
1,M4K,East Toronto,"The Danforth West, Riverdale",43.683262,-79.35512,1,Greek Restaurant,Bakery,Ice Cream Shop,Italian Restaurant,Café,Yoga Studio,Pizza Place,Brewery,Restaurant,Pub
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.667965,-79.314673,1,Beach,Brewery,Coffee Shop,Indian Restaurant,Park,Italian Restaurant,Sushi Restaurant,Pub,Café,Liquor Store
3,M4M,East Toronto,Studio District,43.662766,-79.33483,1,Coffee Shop,Brewery,Italian Restaurant,Chinese Restaurant,Café,Snack Place,Ice Cream Shop,Boutique,Sandwich Place,Pizza Place
4,M4N,Central Toronto,Lawrence Park,43.72816,-79.387085,2,Café,Bookstore,College Quad,Pharmacy,Bus Line,Gym / Fitness Center,Trail,Park,College Gym,Coffee Shop


#### Import modules required to generate colors for plotting on map. All of the neighborhood groups were plotted and colored based on their k-means cluster number 

In [48]:
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[locationOfToronto[0], locationOfToronto[1]], zoom_start=12)

# One evenly spaced rainbow colour per cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows that found no match in the join have a missing cluster label and cannot
# be coloured, so they are left off the map.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighbourhood'], clustered['Cluster Labels']):
    # Missing values elsewhere in the column would have turned the labels into floats.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 keeps the original colour assignment (label 0 wraps round to the last colour).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters