In [1]:
import io
import os

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from sklearn.cluster import KMeans

In [2]:
# Download the Australian postcode list; later cells narrow it to Melbourne only.
au_po_list_url="https://www.matthewproctor.com/Content/postcodes/australian_postcodes.csv"
# A timeout stops the cell hanging forever on a stalled connection.
url_request = requests.get(au_po_list_url, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page as CSV.
url_request.raise_for_status()
df_mel = pd.read_csv(io.StringIO(url_request.text))


In [4]:
# Narrow the postcode list to Melbourne delivery areas and discard the columns that
# are only needed for that filtering.
# NOTE: this rebinds df_mel without the filter columns, so the cell cannot be re-run
# on its own; re-run the download cell first.
df_mel = (
    df_mel
    .dropna(subset=['sa3'])
    .loc[lambda frame: (frame['type'] == "Delivery Area")
         & (frame['sa4name'].str.match("Melbourne"))]
    .drop(columns=['dc','type','status','sa3','sa4','sa4name','region'])
)

df_mel.head()

Unnamed: 0,id,postcode,locality,state,long,lat,sa3name
6100,4746,3000,MELBOURNE,VIC,144.956776,-37.817403,Melbourne City
6102,4748,3002,EAST MELBOURNE,VIC,144.982207,-37.818517,Melbourne City
6103,4749,3003,WEST MELBOURNE,VIC,144.949592,-37.810871,Melbourne City
6104,4750,3004,MELBOURNE,VIC,144.970161,-37.844246,Port Phillip
6107,4752,3005,WORLD TRADE CENTRE,VIC,144.950858,-37.824608,Melbourne City


In [5]:
# Map the current Suburbs

!pip install folium #Full install as Watson Studio doesnt like the other method
import folium


Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 10.5MB/s ta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0


In [8]:
#Display the initial Map
# Map centre for Melbourne. The two names previously held each other's value
# (latitude stored in mel_long and vice versa); folium takes [latitude, longitude].
mel_lat=-37.813611
mel_long=144.963056

map_initial = folium.Map(location=[mel_lat, mel_long], zoom_start=10)

# One green marker per postcode row, with the postcode shown in the popup.
for lat, lon, pco in zip(df_mel['lat'], df_mel['long'], df_mel['postcode']):
    label = folium.Popup('{}'.format(pco), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='green',
        fill_opacity=1).add_to(map_initial)

# Last expression of the cell, so the notebook renders the map.
map_initial

In [12]:
#Build out the data source using Foursquare on the suburb data we have

# Credentials are read from the environment so they are never stored in the notebook.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running this cell.
# NOTE(review): the key pair that used to be hard-coded here has been exposed and
# should be revoked.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180605'  # value sent as the API "v" parameter
LIMIT = 50            # value sent as the API "limit" parameter
venues = []
radius = 500          # value sent as the API "radius" parameter (presumably metres - confirm)

EXPLORE_URL = "https://api.foursquare.com/v2/venues/explore"

for lat, long, postcode, suburb, region in zip(df_mel['lat'], df_mel['long'], df_mel['postcode'], df_mel['locality'], df_mel['sa3name']):
    # Let requests build and escape the query string instead of formatting it by hand.
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Surface quota/authentication failures as an HTTP error rather than a KeyError
    # while digging through an unexpected payload.
    response.raise_for_status()

    groups = response.json().get("response", {}).get('groups', [])
    results = groups[0]['items'] if groups else []
    for venue in results:
        details = venue['venue']
        # A venue without any category is kept with a missing category instead of
        # aborting the whole harvest with an IndexError.
        categories = details.get('categories') or []
        category_name = categories[0]['name'] if categories else None
        venues.append((
            postcode,
            suburb,
            region,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            category_name))



In [13]:
# Turn the collected venue tuples into a DataFrame. The column names are passed to
# the constructor rather than assigned afterwards, so an empty `venues` list gives an
# empty frame with the expected columns instead of a length-mismatch error.
df_fsq_raw = pd.DataFrame(
    venues,
    columns=['postcode', 'suburb', 'region', 'regionLatitude', 'regionLongitude',
             'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'],
)

In [14]:
# Foursquare category names that count as a drinking venue for this analysis.
arr_venues_drink = [
    'Cocktail Bar', 'Bar', 'Wine Bar', 'Pub', 'Hotel Bar', 'Sports Bar',
    'Gastropub', 'Dive Bar', 'Whisky Bar', 'Karaoke Bar', 'Sake Bar',
    'Rooftop Bar', 'Beer Bar', 'Hotel', 'Beer Garden', 'Casino',
    'Social Club', 'Sports Club', 'Comedy Club', 'Nightclub', 'Jazz Club',
    'Rock Club', 'Brewery',
]
# Keep only the venues whose category is in that list.
df_fsq_drinks = df_fsq_raw.loc[df_fsq_raw['VenueCategory'].isin(arr_venues_drink)]

In [15]:
# Preview the first rows of the drinking-venue subset.
df_fsq_drinks.head()

Unnamed: 0,postcode,suburb,region,regionLatitude,regionLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
4,3000,MELBOURNE,Melbourne City,-37.817403,144.956776,The Lui Bar,-37.819067,144.957739,Cocktail Bar
24,3000,MELBOURNE,Melbourne City,-37.817403,144.956776,Dikstein's Corner Bar,-37.816189,144.960353,Bar
28,3000,MELBOURNE,Melbourne City,-37.817403,144.956776,RACV Club,-37.81846,144.957365,Social Club
41,3000,MELBOURNE,Melbourne City,-37.817403,144.956776,The Irish Times,-37.816135,144.960563,Bar
51,3002,EAST MELBOURNE,Melbourne City,-37.818517,144.982207,Frank Grey Smith Bar,-37.819601,144.983194,Bar


In [16]:
# Build the one hot encoding for the discovered Foursquare data

# One indicator column per venue category, named after the category itself.
mel_1H = pd.get_dummies(df_fsq_drinks[['VenueCategory']], prefix="", prefix_sep="")
mel_1H['postcode'] = df_fsq_drinks['postcode']
# Move the postcode identifier to the front. Only this one column was appended, so
# it is selected by name; the previous fixed offset of 3 also dragged two category
# columns to the front.
fixed_columns = ['postcode'] + [col for col in mel_1H.columns if col != 'postcode']
mel_1H = mel_1H[fixed_columns]

In [17]:
# Sum the one-hot indicators per postcode, giving one row per postcode with a count
# for each venue category; postcode stays an ordinary column.
mel_grouped = mel_1H.groupby("postcode", as_index=False).sum()

In [93]:
# Rank the venue categories of every postcode from most to least frequent and keep
# the first `num_top_venues` category names.

num_top_venues = 15
indicators = ['st', 'nd', 'rd']


# Column headers: 'postcode', then '1st Most Common Venue', '2nd ...', '3rd ...',
# '4th ...' and so on.
areaColumns = ['postcode']
freqColumns = []
for ind in np.arange(num_top_venues):
    # An explicit bounds check replaces the former bare `except:`, which would have
    # hidden any unrelated error as well.
    if ind < len(indicators):
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

region_venues_sorted = pd.DataFrame(columns=columns)
region_venues_sorted['postcode'] = mel_grouped['postcode']

for ind in np.arange(mel_grouped.shape[0]):
    # Drop the identifier by name so every category column takes part in the
    # ranking; the previous positional slice `.iloc[3:]` left two real category
    # columns out.
    row_categories = mel_grouped.iloc[ind].drop('postcode')
    row_categories_sorted = row_categories.sort_values(ascending=False)
    region_venues_sorted.iloc[ind, 1:] = row_categories_sorted.index.values[0:num_top_venues]

In [98]:
# Perform clusters analysis
kclusters = 5

# Cluster on the per-postcode category counts only; postcode is an identifier, not a
# feature. `columns=` replaces the positional axis argument, which pandas 2.0 no
# longer accepts.
mel_clustering = mel_grouped.drop(columns=["postcode"])
# n_init is pinned so the result does not change with the scikit-learn default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(mel_clustering)

In [99]:
# Attach the k-means cluster label to every postcode row and add the ranked
# "Nth Most Common Venue" columns from region_venues_sorted.
mel_merged = mel_grouped.copy()
# kmeans was fitted on mel_grouped without its postcode column, so labels_ has one
# entry per mel_grouped row.
mel_merged["Cluster Labels"] = kmeans.labels_

# NOTE: the bare head() call below is not the last expression of the cell, so the
# notebook does not display it.
mel_merged.head()
# The left frame carries postcode as its index and the right frame as a column;
# both are matched through on="postcode".
mel_merged = mel_merged.set_index("postcode").merge(region_venues_sorted, on="postcode")


In [100]:
# Independent copy of the postcode table, used as the lookup side of the join.
df_mel_ = df_mel.copy(deep=True)

# Add the locality/coordinate columns by postcode, order the rows by cluster label
# and drop every row that still has a missing value.
mel_merged_final = (
    mel_merged
    .join(df_mel_.set_index("postcode"), on="postcode")
    .sort_values(["Cluster Labels"])
    .dropna(axis=0, how='any')
)


In [101]:
# Generate Cluster map: one marker per locality, coloured by its k-means cluster.
# Map centre for Melbourne. The two names previously held each other's value
# (latitude stored in mel_long and vice versa); folium takes [latitude, longitude].
mel_lat=-37.813611
mel_long=144.963056

map_clusters = folium.Map(location=[mel_lat, mel_long], zoom_start=10)

# Marker colour palette, indexed by cluster label below.
rainbow = ['violet','indigo','blue','green','yellow','orange','red']

for lat, lon, sub, loc, cluster in zip(mel_merged_final['lat'], mel_merged_final['long'], mel_merged_final['postcode'],  mel_merged_final['locality'], mel_merged_final['Cluster Labels']):
    label = folium.Popup('{}({})'.format(loc,sub), parse_html=True)
    # Same colour assignment as before (label 0 -> last entry, label 1 -> first
    # entry, ...), with the wrap-around made explicit so a cluster count larger
    # than the palette reuses colours instead of raising IndexError.
    marker_colour = rainbow[(cluster - 1) % len(rainbow)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=1).add_to(map_clusters)

# Last expression of the cell, so the notebook renders the map.
map_clusters