### Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import io

import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0
import folium

print('Libraries imported.')

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py35_1 conda-forge
    branca:  0.3.1-py_0   conda-forge
    folium:  0.5.0-py_0   conda-forge
    vincent: 0.4.4-py_1   conda-forge

altair-2.2.2-p 100% |################################| Time: 0:00:00  52.40 MB/s
branca-0.3.1-p 100% |################################| Time: 0:00:00  35.50 MB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00  37.88 MB/s
folium-0.5.0-p 100% |################################| Time: 0:00:00  45.81 MB/s
Libraries imported.


### Import & skim the melbourne postcodes dataset
*Only areas within 10KM of the city centre are selected*

In [2]:
url="https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv"
s=requests.get(url).content
df=pd.read_csv(io.StringIO(s.decode('utf-8')))

In [3]:
array = [3000,3002,3003,3004,3005,3006,3008,3010,3011,3012,3013,3015,3016,3019,3025,3031,3032,3039,3040,3041,3044,3050,3051,3052,3053,3054,3055,3056,3057,3058,3060,3065,3066,3067,3068,3070,3071,3072,3078,3079,3081,3101,3102,3104,3121,3122,3123,3124,3126,3141,3142,3143,3144,3145,3161,3162,3181,3182,3183,3184,3185,3188,3205,3206,3207]

In [75]:
df_vic = df.loc[df['postcode'].isin(array)] #Select only the postcodes which are within 10km of the city
df_vic = df_vic[['postcode','locality','long','lat']] #Select the relevant columns of the dataset
df_vic = df_vic.dropna() #Drop rows whith incomplete data
df_vic.reset_index(inplace = True, drop=True)
df_vic.shape

(165, 4)

### Generate map of relevant areas

In [5]:
map_vic = folium.Map(location=[-37.8136, 144.9631], zoom_start=12) #Generate map of melbourne

for lat, lng, label in zip(df_vic['lat'], df_vic['long'], df_vic['locality']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_vic)  
    
map_vic

#### Foursquare client details

In [36]:
CLIENT_ID = 'PFIKFUM4W2WNJQVFGNKQRGIHKJAB21KYRNMYQI3L53MLYRE5'
CLIENT_SECRET = 'G2E5ESZYRMS31XSDVCNRR4TU2AGAWD5JDJR5M4GNP32D2GB5'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PFIKFUM4W2WNJQVFGNKQRGIHKJAB21KYRNMYQI3L53MLYRE5
CLIENT_SECRET:G2E5ESZYRMS31XSDVCNRR4TU2AGAWD5JDJR5M4GNP32D2GB5


#### Function to retrieve gyms for each suburb

In [54]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        LIMIT = 40
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&query=gym&radius={}&limit={}'.format( #query parameter added to url
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Call the function

In [55]:
melbourne_venues = getNearbyVenues(names = df_vic['locality'],
                                  latitudes = df_vic['lat'],
                                  longitudes = df_vic['long']
                                  )

PRAHRAN
PRAHRAN EAST
WINDSOR
ST KILDA
ST KILDA SOUTH
ST KILDA WEST
BALACLAVA
ST KILDA EAST
BRIGHTON ROAD
ELWOOD
ELSTERNWICK
GARDENVALE
RIPPONLEA
HAMPTON
HAMPTON EAST
HAMPTON NORTH
SOUTH MELBOURNE
SOUTH MELBOURNE DC
ALBERT PARK
MIDDLE PARK
GARDEN CITY
PORT MELBOURNE
CAMBERWELL WEST
HARTWELL
MIDDLE CAMBERWELL
CAMBERWELL EAST
CANTERBURY
CHAPEL STREET NORTH
DOMAIN ROAD PO
SOUTH YARRA
HAWKSBURN
TOORAK
ARMADALE
ARMADALE NORTH
KOOYONG
MALVERN
MALVERN NORTH
CAULFIELD EAST
CENTRAL PARK
DARLING
DARLING SOUTH
MALVERN EAST
WATTLETREE ROAD PO
CAULFIELD JUNCTION
CAULFIELD NORTH
CAULFIELD
CAULFIELD SOUTH
HOPETOUN GARDENS
BRUNSWICK WEST
MOONEE VALE
MORELAND WEST
BRUNSWICK
BRUNSWICK LOWER
BRUNSWICK NORTH
BRUNSWICK EAST
LYGON STREET NORTH
BATMAN
COBURG
COBURG NORTH
MERLYNSTON
MORELAND
FAWKNER
FAWKNER EAST
FAWKNER NORTH
FITZROY
COLLINGWOOD
COLLINGWOOD NORTH
ABBOTSFORD
CLIFTON HILL
FITZROY NORTH
NORTHCOTE
NORTHCOTE SOUTH
THORNBURY
GILBERTON
NORTHLAND CENTRE
PRESTON
PRESTON LOWER
PRESTON SOUTH
PRESTON WEST

In [56]:
mv = melbourne_venues[melbourne_venues['Venue Category'].str.contains('Gym')] #only selects venues with relevant categories
mv = mv.drop_duplicates(subset=('Venue Latitude','Venue Longitude')).reset_index(drop=True) #Removes duplicates
mv.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,PRAHRAN,-37.854683,144.995504,Brophy's Body and Mind,-37.855571,144.99333,Gym
1,PRAHRAN,-37.854683,144.995504,Train 24/7 Fitness,-37.852502,144.998695,Gym
2,PRAHRAN,-37.854683,144.995504,National Institute Of Circus Arts,-37.852965,144.991219,College Gym
3,PRAHRAN,-37.854683,144.995504,Juggernaut Personal Training,-37.858179,144.99263,Gym
4,PRAHRAN,-37.854683,144.995504,Chapel Fitness,-37.858164,144.99242,Gym / Fitness Center


In [76]:
mv.shape

(73, 8)

#### Create map of gyms

In [57]:
map_mv = folium.Map(location=[-37.8136, 144.9631], zoom_start=12.4)

for lat, lng, label in zip(mv['Venue Latitude'], mv['Venue Longitude'], mv["Venue"]):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.2,
        parse_html=False).add_to(map_mv)  
    
map_mv

### KMeans analysis

In [58]:
X = mv[['Venue Longitude', 'Venue Latitude']].values #Converts venue location columns to array

The following finds 10 distinct clusters from the gym coordinates & the mean location of each

In [59]:
k_means = KMeans(init="k-means++", n_clusters=10, n_init=12)
k_means.fit(X)
k_means_labels = k_means.labels_
k_centers = k_means.cluster_centers_
k_centers #centres of each of the clusters

array([[ 144.95733176,  -37.80382487],
       [ 144.99432929,  -37.86159437],
       [ 144.92286058,  -37.77027509],
       [ 145.03646975,  -37.81918751],
       [ 144.97113505,  -37.75228577],
       [ 144.95606054,  -37.82125193],
       [ 144.99682351,  -37.83178127],
       [ 145.03539381,  -37.88462052],
       [ 144.98293832,  -37.80455244],
       [ 144.90438774,  -37.80822124]])

In [60]:
k_means_labels

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5, 5, 5, 5, 5, 6, 6, 6, 7, 7, 4, 4, 4,
       4, 8, 8, 8, 8, 8, 8, 8, 8, 4, 3, 6, 6, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 0, 5, 5, 5, 5, 9, 2, 2, 2, 2, 2, 0, 0,
       0, 0, 0, 0], dtype=int32)

In [61]:
mv.insert(0, 'Cluster Labels', k_means_labels) #inserts a column into the gym dataset which specifies which cluster each belongs to

In [77]:
mv = mv[['Neighborhood', 'Neighborhood Latitude','Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category', 'Cluster Labels']]
mv.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Cluster Labels
0,PRAHRAN,-37.854683,144.995504,Brophy's Body and Mind,-37.855571,144.99333,Gym,1
1,PRAHRAN,-37.854683,144.995504,Train 24/7 Fitness,-37.852502,144.998695,Gym,1
2,PRAHRAN,-37.854683,144.995504,National Institute Of Circus Arts,-37.852965,144.991219,College Gym,1
3,PRAHRAN,-37.854683,144.995504,Juggernaut Personal Training,-37.858179,144.99263,Gym,1
4,PRAHRAN,-37.854683,144.995504,Chapel Fitness,-37.858164,144.99242,Gym / Fitness Center,1


#### Cluster visualisation

In [63]:
map_clusters = folium.Map(location=[-37.8136, 144.9631], zoom_start=12.4)

kclusters=10
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []

for lat, lng, label, cluster in zip(mv['Venue Latitude'], mv['Venue Longitude'], mv["Venue"], mv['Cluster Labels']):
    label = folium.Popup(str(label) + 'Cluster' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=1).add_to(map_clusters)  
    
map_clusters

#### List of kmeans centres

In [64]:
df_centers = pd.DataFrame({'long':k_centers[:,0],'lat':k_centers[:,1]})
df_centers

Unnamed: 0,lat,long
0,-37.803825,144.957332
1,-37.861594,144.994329
2,-37.770275,144.922861
3,-37.819188,145.03647
4,-37.752286,144.971135
5,-37.821252,144.956061
6,-37.831781,144.996824
7,-37.884621,145.035394
8,-37.804552,144.982938
9,-37.808221,144.904388


#### Map of KMeans centres

In [65]:
map_gyms = folium.Map(location=[-37.8136, 144.9631], zoom_start=12)

for lat, lng in zip(df_centers['lat'], df_centers['long']):
    label = folium.Popup(parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='blue',
        fill=True,
        fill_color='red',
        fill_opacity=0.2,
        parse_html=False).add_to(map_gyms)  
    
map_gyms

### Finding the 10 suburbs which contains the most gyms

In [72]:
mv_grouped = mv.groupby(['Neighborhood']).count() #Groups dataset by neighborhood
mv_grouped = mv_grouped.sort_values(by='Venue', ascending=False) #Sorts dataset by 
mv_grouped = mv_grouped['Venue'].head(n=10)
mv_grouped

Neighborhood
MELBOURNE              14
SOUTH MELBOURNE         6
PRAHRAN                 5
WORLD TRADE CENTRE      4
COLLINGWOOD             4
BALACLAVA               4
MOONEE PONDS            4
CARLTON                 4
FITZROY                 3
CHAPEL STREET NORTH     3
Name: Venue, dtype: int64

In [73]:
gyms = df_vic.loc[df_vic['locality'].isin(mv_grouped.index)]
gyms.drop_duplicates(subset=('long','lat')).reset_index(drop=True)
gyms = gyms.head(n=10)

In [74]:
map_gyms = folium.Map(location=[-37.8136, 144.9631], zoom_start=12)

for lat, lng, label in zip(gyms['lat'], gyms['long'], gyms['locality']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=14,
        popup=label,
        color='blue',
        fill=True,
        fill_color='red',
        fill_opacity=0.2,
        parse_html=False).add_to(map_gyms)  
    
map_gyms