<a href="https://colab.research.google.com/github/Enell261/Capstone_project/blob/main/Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [159]:
#import the necessary modules

import json
import os
from getpass import getpass

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

We want to get a list of all the suburbs in Pretoria

---



In [160]:
#Get the html from the wikipedia page containing the list
url = 'https://en.wikipedia.org/wiki/List_of_Pretoria_suburbs'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the list.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [161]:
# Look up every <ul> element of the page once, then keep the ones at
# positions 1 to 4 as the four suburb lists (position 0 is not used).
all_lists = soup.find_all('ul')
pta_east, pta_ne, pta_nw, pta_cw = (all_lists[position] for position in (1, 2, 3, 4))

In [162]:
#Collect the <a> tags of each suburb list; the result holds one inner list per <ul>
burbs = [pta_east, pta_ne, pta_nw, pta_cw]
links = [region.find_all('a') for region in burbs]

In [163]:
#links is a nested list (one inner list per <ul>); flatten it one level with itertools
from itertools import chain
links = list(chain.from_iterable(links))

In [164]:
# Total number of anchor tags collected from the four lists
len(links)

102

In [165]:
#Take the child nodes (link.contents) of every anchor tag; each entry is itself a list
suburbs = [link.contents for link in links]

In [166]:
#suburbs is a list of lists; flatten it one level (relies on `chain` imported in an earlier cell)
suburbs = list(chain.from_iterable(suburbs))

In [167]:
# Suffix every name with ', Pretoria' so the geocoder query names the city as well
suburbs = list(map(lambda name: name + ', Pretoria', suburbs))

In [168]:
# Wrap the list of names in a one-column DataFrame
suburbs = pd.DataFrame(suburbs, columns=['Suburb'])

In [169]:
# Preview the first rows of the suburb table
suburbs.head()

Unnamed: 0,Suburb
0,"Alphen Park, Pretoria"
1,"Arcadia, Pretoria"
2,"Ashlea Gardens, Pretoria"
3,"Brooklyn, Pretoria"
4,"Brummeria, Pretoria"


We need to manually geocode each suburb, as no such dataset is readily available.

In [None]:
pip install geocoder

In [None]:
pip install geopandas

In [None]:
pip install geopy

Get coordinates for each suburb


In [173]:
from geopy.extra.rate_limiter import RateLimiter
#create a Nominatim instance
locator = Nominatim(user_agent="myGeocoder")
#wrap the geocoder so that at least one second passes between consecutive calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
#create location column
suburbs['location'] = suburbs['Suburb'].apply(geocode)
#create (latitude, longitude, altitude) tuple from location column; None when nothing was found
suburbs['point'] = suburbs['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# split point column into latitude, longitude and altitude columns.
# Missing points are padded with NaN first: a None entry (in particular as the
# first element, or when every lookup failed) would otherwise keep pandas from
# expanding the list into exactly three columns.
coordinate_columns = ['latitude', 'longitude', 'altitude']
padded_points = [pt if pt is not None else (np.nan,) * 3 for pt in suburbs['point']]
suburbs[coordinate_columns] = pd.DataFrame(padded_points, columns=coordinate_columns, index=suburbs.index)

In [174]:
# keep only the suburb name with its latitude and longitude
unused_columns = ['location', 'point', 'altitude']
suburbs.drop(columns=unused_columns, inplace=True)

In [175]:
#remove, in place, every suburb row that still contains a missing value
suburbs.dropna(axis=0, how='any', inplace=True)

In [176]:
# Save the geocoded suburbs to a CSV file in the working directory (row index not written)
suburbs.to_csv('Pretoria Geospatial data.csv', index=False)

In [177]:
# Preview the suburbs together with their coordinates
suburbs.head()

Unnamed: 0,Suburb,latitude,longitude
0,"Alphen Park, Pretoria",-25.78289,28.263953
1,"Arcadia, Pretoria",-25.745627,28.210221
2,"Ashlea Gardens, Pretoria",-25.785833,28.264167
3,"Brooklyn, Pretoria",-25.765556,28.239722
4,"Brummeria, Pretoria",-25.744224,28.283561


**Mapping Pretoria and its suburbs**<br>
We now use folium to create a map of Pretoria with markers

In [178]:
# Declare the address as Pretoria

address = 'Pretoria, South Africa'

# Get the coordinates for Pretoria using the Nominatim locator created earlier
locat = locator.geocode(address)
# geocode() gives None when nothing matches; report that explicitly rather than
# failing below with an AttributeError on None.
if locat is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = locat.latitude
longitude = locat.longitude

In [179]:
# Report the city-centre coordinates that were just looked up
coordinate_message = 'The geograpical coordinate of Pretoria are {}, {}.'
print(coordinate_message.format(latitude, longitude))

The geograpical coordinate of Pretoria are -25.7459374, 28.1879444.


In [180]:
# Base map centred on the Pretoria coordinates
pta_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle marker per suburb; its popup shows the suburb name
marker_rows = zip(suburbs['latitude'], suburbs['longitude'], suburbs['Suburb'])
for lat, lng, suburb_name in marker_rows:
  suburb_marker = folium.CircleMarker(
      [lat, lng],
      radius=5,
      popup=folium.Popup(suburb_name, parse_html=True),
      color='blue',
      fill=True,
      fill_color='#3186cc',
      fill_opacity=0.7,
      # NOTE(review): parse_html looks like a Popup option rather than a
      # CircleMarker one; kept as in the original - confirm it is needed.
      parse_html=False)
  suburb_marker.add_to(pta_map)

In [181]:
# Render the suburb map as the cell output
pta_map

We access Foursquare for Pretoria data

In [182]:
# Foursquare credentials must not be stored in the notebook. They are read from
# the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables,
# with an interactive hidden prompt as fallback when a variable is not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [183]:
# Build the Foursquare "explore" request URL around the Pretoria coordinates

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 3000 # define radius as 3km

# URL template; the placeholders are filled positionally in the order of query_values
explore_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
query_values = (CLIENT_ID, CLIENT_SECRET, VERSION, latitude, longitude, radius, LIMIT)
url = explore_template.format(*query_values)

# NOTE(review): displaying the URL also puts the client secret into the cell output
url

'https://api.foursquare.com/v2/venues/explore?&client_id=PJX3PJ4FIED23NYXJEMYAHHJH0S4UHKZRL214FEKVLMT13FW&client_secret=HHUGJ5DXTGCJ35ORN5DJXXCM5ZPS42ESFGJ4TX2ICJULGLAX&v=20180605&ll=-25.7459374,28.1879444&radius=3000&limit=100'

In [184]:
# Request the Pretoria venues from Foursquare and decode the JSON body.
# The timeout avoids an indefinite hang; raise_for_status surfaces an HTTP
# error here instead of as a confusing KeyError in a later cell.
explore_response = requests.get(url, timeout=30)
explore_response.raise_for_status()
results = explore_response.json()

In [185]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    The category list is read from row['categories']; when that key is
    missing, row['venue.categories'] is used instead.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or dict)
        Must provide one of the two keys above; the value is expected to be
        a list of dicts that each carry a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the list is empty or None.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates.
        categories_list = row['venue.categories']

    if not categories_list:
        return None
    return categories_list[0]['name']

In [186]:
# Venue items of the first result group in the Foursquare response
venues = results['response']['groups'][0]['items']

# flatten JSON (pd.json_normalize replaces the deprecated pandas.io.json import)
nearby_venues = pd.json_normalize(venues)

# filter columns; .copy() gives an independent frame so the column assignment
# below does not write into a slice of the flattened frame
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# replace each row's category list by the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dot-separated part of each column name
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Café Riche,Café,-25.746579,28.187304
1,Pretoria Zoo,Exhibit,-25.740416,28.188903
2,State Theatre,Performing Arts Venue,-25.746167,28.195084
3,National Zoological Gardens (Pretoria Zoo),Zoo,-25.734267,28.191047
4,Manhattan Hotel Pretoria,Hotel,-25.756855,28.191166


In [187]:
#Report how many venue rows the explore request produced
venue_count = nearby_venues.shape[0]
print('{} venues were returned by Foursquare.'.format(venue_count))

53 venues were returned by Foursquare.


In [188]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query Foursquare's explore endpoint around each location and tabulate the venues.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT and
    prints each name as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in lockstep; one API request is made per (name, lat, lng).
    radius : number, default 1000
        Passed to the API as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        'Venue Category' is None for a venue without categories. The frame
        is empty (but has these columns) when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If a request is answered with an HTTP error status.
    """
    column_names = ['Neighbourhood',
                    'Neighbourhood Latitude',
                    'Neighbourhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; fail clearly on an HTTP error instead of
        # hitting a KeyError while digging into an error payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come without categories; indexing [0] would fail then
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for zero rows,
    # where assigning seven names to an empty frame afterwards would fail
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [189]:
#Collect the venues around every geocoded suburb into one dataframe
pta_venues = getNearbyVenues(suburbs['Suburb'], suburbs['latitude'], suburbs['longitude'])

Alphen Park, Pretoria
Arcadia, Pretoria
Ashlea Gardens, Pretoria
Brooklyn, Pretoria
Brummeria, Pretoria
Bryntirion, Pretoria
Colbyn, Pretoria
Constantiapark, Pretoria
Die Wilgers, Pretoria
Eastwood, Pretoria
Elarduspark, Pretoria
Erasmuskloof, Pretoria
Erasmusrand, Pretoria
Faerie Glen, Pretoria
Garsfontein, Pretoria
Groenkloof, Pretoria
Hatfield, Pretoria
Hazeldene, Pretoria
Hazelwood, Pretoria
Hillcrest, Pretoria
La Montagne, Pretoria
Lukasrand, Pretoria
Lydiana, Pretoria
Lynnwood, Pretoria
Lynnwood Glen, Pretoria
Lynnwood Manor, Pretoria
Lynnwood Park, Pretoria
Lynnwood Ridge, Pretoria
Menlo Park, Pretoria
Menlyn, Pretoria
Meyerspark, Pretoria
Monument Park, Pretoria
Moreleta Park, Pretoria
Muckleneuk, Pretoria
Murrayfield, Pretoria
New Muckleneuk, Pretoria
Newlands, Pretoria
Olympus, Pretoria
Rietvalleirand, Pretoria
Sterrewag, Pretoria
Sunnyside, Pretoria
Trevenna, Pretoria
Val de Grace, Pretoria
Wapadrand, Pretoria
Waterkloof, Pretoria
Waterkloof Glen, Pretoria
Waterkloof Park, P

In [190]:
# Check the number of venues per neighbourhood (non-null count of each column)
pta_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Alphen Park, Pretoria",12,12,12,12,12,12
"Amandasig, Pretoria",2,2,2,2,2,2
"Arcadia, Pretoria",22,22,22,22,22,22
"Ashlea Gardens, Pretoria",5,5,5,5,5,5
"Atteridgeville, Pretoria",3,3,3,3,3,3
...,...,...,...,...,...,...
"Wingate Park, Pretoria",12,12,12,12,12,12
"Wolmer, Pretoria",6,6,6,6,6,6
"Wonderboom South, Pretoria",5,5,5,5,5,5
"Woodhill, Pretoria",37,37,37,37,37,37


In [191]:
# one indicator column per venue category, named after the category itself
pta_coded = pd.get_dummies(pta_venues[['Venue Category']], prefix="", prefix_sep="")

# attach the neighbourhood of every venue row; it becomes the last column
pta_coded['Neighbourhood'] = pta_venues['Neighbourhood']

# reorder so that the last column (the neighbourhood) comes first
leading_column = pta_coded.columns[-1:].tolist()
category_columns = pta_coded.columns[:-1].tolist()
pta_coded = pta_coded[leading_column + category_columns]

pta_coded.head()

Unnamed: 0,Neighbourhood,ATM,African Restaurant,American Restaurant,Art Museum,Asian Restaurant,Athletics & Sports,Auto Workshop,Automotive Shop,BBQ Joint,Bakery,Bar,Bed & Breakfast,Bike Rental / Bike Share,Bistro,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Burger Joint,Bus Station,Butcher,Cafeteria,Café,Casino,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Soccer Field,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cricket Ground,Deli / Bodega,Department Store,Dessert Shop,Diner,...,Paper / Office Supplies Store,Park,Performing Arts Venue,Pharmacy,Pizza Place,Platform,Playground,Plaza,Portuguese Restaurant,Pub,Rental Car Location,Rental Service,Residential Building (Apartment / Condo),Resort,Restaurant,Road,Salad Place,Seafood Restaurant,Shop & Service,Shopping Mall,Snack Place,Soccer Field,Spa,Spanish Restaurant,Sporting Goods Shop,Sports Bar,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,University,Video Store,Wine Shop,Zoo
0,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [192]:
# Mean of every indicator column per neighbourhood, i.e. the share of each
# venue category; 'Neighbourhood' stays an ordinary column
pta_grouped = pta_coded.groupby('Neighbourhood', as_index=False).mean()
pta_grouped

Unnamed: 0,Neighbourhood,ATM,African Restaurant,American Restaurant,Art Museum,Asian Restaurant,Athletics & Sports,Auto Workshop,Automotive Shop,BBQ Joint,Bakery,Bar,Bed & Breakfast,Bike Rental / Bike Share,Bistro,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Burger Joint,Bus Station,Butcher,Cafeteria,Café,Casino,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Soccer Field,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cricket Ground,Deli / Bodega,Department Store,Dessert Shop,Diner,...,Paper / Office Supplies Store,Park,Performing Arts Venue,Pharmacy,Pizza Place,Platform,Playground,Plaza,Portuguese Restaurant,Pub,Rental Car Location,Rental Service,Residential Building (Apartment / Condo),Resort,Restaurant,Road,Salad Place,Seafood Restaurant,Shop & Service,Shopping Mall,Snack Place,Soccer Field,Spa,Spanish Restaurant,Sporting Goods Shop,Sports Bar,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,University,Video Store,Wine Shop,Zoo
0,"Alphen Park, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.083333,0.0,0.0,0.083333,0.000000,0.000000,0.0,0.000000,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Amandasig, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Arcadia, Pretoria",0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.090909,0.000000,0.0,0.0,0.0,0.090909,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.045455,0.000000,0.0,0.00,0.045455,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.00,0.000000,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Ashlea Gardens, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.400000,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Atteridgeville, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.333333,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,"Wingate Park, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.083333,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.166667,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
83,"Wolmer, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.166667,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.166667,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.166667,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,"Wonderboom South, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.200000,0.0,0.0,0.0,0.000000,0.000000,0.200000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.2,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,"Woodhill, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.027027,0.000000,0.0,0.0,0.000000,0.027027,0.027027,0.0,0.054054,0.054054,0.0,0.0,0.0,0.081081,0.0,0.0,0.054054,0.027027,0.054054,0.0,0.0,0.027027,0.0,0.000000,0.0,0.0,0.0,0.027027,0.027027,0.054054,...,0.027027,0.0,0.00,0.00,0.027027,0.000000,0.0,0.00,0.054054,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.00,0.054054,0.0,0.054054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [193]:
num_top_venues = 5

# For every neighbourhood, print its most frequent venue categories
for hood in pta_grouped['Neighbourhood']:
    print("----"+hood+"----")
    hood_rows = pta_grouped[pta_grouped['Neighbourhood'] == hood]
    # transpose so each column becomes a row: the first row holds the
    # neighbourhood name and is skipped, the rest are category frequencies
    freq_table = hood_rows.T.reset_index()
    freq_table.columns = ['venue','freq']
    freq_table = (
        freq_table.iloc[1:]
        .astype({'freq': float})
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(freq_table.head(num_top_venues))
    print('\n')

----Alphen Park, Pretoria----
          venue  freq
0         Hotel  0.17
1    Restaurant  0.17
2           Bar  0.08
3  Burger Joint  0.08
4   Supermarket  0.08


----Amandasig, Pretoria----
                        venue  freq
0               Garden Center   0.5
1        Fast Food Restaurant   0.5
2                   Nightclub   0.0
3  Modern European Restaurant   0.0
4               Movie Theater   0.0


----Arcadia, Pretoria----
                  venue  freq
0                 Hotel  0.18
1  Fast Food Restaurant  0.09
2         Shopping Mall  0.09
3                  Café  0.09
4        Breakfast Spot  0.09


----Ashlea Gardens, Pretoria----
                        venue  freq
0                       Hotel   0.4
1                  Restaurant   0.4
2          Italian Restaurant   0.2
3  Modern European Restaurant   0.0
4               Movie Theater   0.0


----Atteridgeville, Pretoria----
            venue  freq
0   Shopping Mall  0.67
1  Clothing Store  0.33
2             ATM  0.00
3 

In [194]:
# helper that ranks the venue categories of a single neighbourhood row

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`, skipping its first entry.

    `row` is a pandas Series whose first element is not a frequency (it is
    dropped positionally). The remaining values are sorted in descending
    order and the index labels of the first `num_top_venues` are returned
    as a NumPy array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [None]:
# Create a dataframe that consists of the 10 most common places in a neighbourhood

num_top_venues = 10

# ordinal suffixes for ranks 1-3; every later rank gets 'th'
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare except around the list lookup
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = pta_grouped['Neighbourhood']

# fill every row with that neighbourhood's top categories, most frequent first
for ind in np.arange(pta_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(pta_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

In [211]:
# (rows, columns) of the top-venues table
neighbourhoods_venues_sorted.shape

(87, 12)

In [196]:
# set number of clusters
kclusters = 5

# feature matrix: every column except the neighbourhood name.
# `columns=` replaces the positional axis argument, which newer pandas rejects.
pta_clustering = pta_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is pinned to 10 so the number of restarts does
# not depend on the installed scikit-learn's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(pta_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([4, 0, 4, 4, 4, 3, 4, 4, 4, 4], dtype=int32)

In [212]:
# Number of cluster labels (one per row that was clustered)
len(kmeans.labels_)

87

In [197]:
# Duplicate the suburb name under 'Neighbourhood', the key used by the venue tables
suburbs['Neighbourhood'] = suburbs['Suburb']

In [198]:
# add clustering labels as the first column; when the cell is re-run the
# existing column is overwritten, because insert() refuses a duplicate name
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

pta_merged = suburbs

# join the labelled top-venue table onto the suburbs (matched on 'Neighbourhood')
# so every suburb row carries its coordinates, cluster label and top venues
pta_merged = pta_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

In [215]:
# Drop, in place, every row with a missing value - e.g. suburbs that found no
# match in the top-venue table during the join
pta_merged.dropna(inplace=True)

In [216]:
# Store the cluster labels as integers so they can be used as list indices later
pta_merged = pta_merged.astype({'Cluster Labels': int})

Map the clusters

In [217]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: kclusters colours evenly spaced along
# matplotlib's rainbow colormap, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(pta_merged['latitude'], pta_merged['longitude'], pta_merged['Neighbourhood'], pta_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster 0 takes the last colour; the modulo states that wraparound
    # explicitly instead of relying on a negative list index
    cluster_colour = rainbow[(cluster - 1) % kclusters]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)
       


In [218]:
# Render the cluster map as the cell output
map_clusters

In [None]:
# Number of suburbs assigned to each cluster label
pta_merged['Cluster Labels'].value_counts()

Analyse the clusters

In [None]:
# Cluster 0: list each member suburb by name with its most common venue categories.
# Columns are selected by name; positional column 1 of pta_merged is 'latitude'.
venue_columns = [col for col in pta_merged.columns if str(col).endswith('Most Common Venue')]
pta_merged.loc[pta_merged['Cluster Labels'] == 0, ['Neighbourhood'] + venue_columns]

In [None]:
# Cluster 1: list each member suburb by name with its most common venue categories.
# Columns are selected by name; positional column 1 of pta_merged is 'latitude'.
venue_columns = [col for col in pta_merged.columns if str(col).endswith('Most Common Venue')]
pta_merged.loc[pta_merged['Cluster Labels'] == 1, ['Neighbourhood'] + venue_columns]

In [None]:
# Cluster 2: list each member suburb by name with its most common venue categories.
# Columns are selected by name; positional column 1 of pta_merged is 'latitude'.
venue_columns = [col for col in pta_merged.columns if str(col).endswith('Most Common Venue')]
pta_merged.loc[pta_merged['Cluster Labels'] == 2, ['Neighbourhood'] + venue_columns]

In [None]:
# Cluster 3: list each member suburb by name with its most common venue categories.
# Columns are selected by name; positional column 1 of pta_merged is 'latitude'.
venue_columns = [col for col in pta_merged.columns if str(col).endswith('Most Common Venue')]
pta_merged.loc[pta_merged['Cluster Labels'] == 3, ['Neighbourhood'] + venue_columns]

In [None]:
# Cluster 4: list each member suburb by name with its most common venue categories.
# Columns are selected by name; positional column 1 of pta_merged is 'latitude'.
venue_columns = [col for col in pta_merged.columns if str(col).endswith('Most Common Venue')]
pta_merged.loc[pta_merged['Cluster Labels'] == 4, ['Neighbourhood'] + venue_columns]