<a href="https://colab.research.google.com/github/Enell261/Python-Projects/blob/main/Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import the necessary modules

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim

We want to get a list of all the suburbs in Pretoria

---



In [None]:
# Download the Wikipedia page that lists Pretoria's suburbs and parse it
url = 'https://en.wikipedia.org/wiki/List_of_Pretoria_suburbs'
# A timeout keeps the cell from hanging indefinitely on a stalled connection
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently parsing an error page
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Get the <ul> elements corresponding to the Pretoria suburbs.
# The document is searched once and the four lists are picked by position
# (1-4); NOTE(review): these positions depend on the page layout -- re-check
# them if the Wikipedia page is restructured.
all_lists = soup.find_all('ul')
pta_east = all_lists[1]
pta_ne = all_lists[2]
pta_nw = all_lists[3]
pta_cw = all_lists[4]

In [None]:
# Gather the anchor tags of every regional list; the result holds one
# sub-list of links per region
burbs = [pta_east, pta_ne, pta_nw, pta_cw]
links = [region.find_all('a') for region in burbs]

In [None]:
# links is a nested list (one sub-list per region); flatten it with itertools
from itertools import chain
links = list(chain.from_iterable(links))

In [None]:
len(links)

102

In [None]:
# Take the child nodes of every link; these carry the suburb names
suburbs = [anchor.contents for anchor in links]

In [None]:
# Every entry of suburbs is itself a list of child nodes; flatten by one level
suburbs = list(chain.from_iterable(suburbs))

In [None]:
# Suffix each name with ', Pretoria' for easier geocoding later
suburbs = list(map(lambda name: name + ', Pretoria', suburbs))

In [None]:
suburbs = pd.DataFrame({'Suburb':suburbs})

In [None]:
suburbs.head()

Unnamed: 0,Suburb
0,"Alphen Park, Pretoria"
1,"Arcadia, Pretoria"
2,"Ashlea Gardens, Pretoria"
3,"Brooklyn, Pretoria"
4,"Brummeria, Pretoria"


We need to geocode each suburb ourselves, as no such dataset is readily available.

In [None]:
pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |███▎                            | 10kB 14.7MB/s eta 0:00:01[K     |██████▋                         | 20kB 19.2MB/s eta 0:00:01[K     |██████████                      | 30kB 10.2MB/s eta 0:00:01[K     |█████████████▎                  | 40kB 8.3MB/s eta 0:00:01[K     |████████████████▋               | 51kB 5.2MB/s eta 0:00:01[K     |████████████████████            | 61kB 5.7MB/s eta 0:00:01[K     |███████████████████████▎        | 71kB 6.1MB/s eta 0:00:01[K     |██████████████████████████▋     | 81kB 6.3MB/s eta 0:00:01[K     |██████████████████████████████  | 92kB 5.8MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 3.9MB/s 
Collecting ratelim
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592ca

In [None]:
pip install geopandas

Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/2a/9f/e8a440a993e024c0d3d4e5c7d3346367c50c9a1a3d735caf5ee3bde0aab1/geopandas-0.8.2-py2.py3-none-any.whl (962kB)
[K     |▍                               | 10kB 17.4MB/s eta 0:00:01[K     |▊                               | 20kB 10.1MB/s eta 0:00:01[K     |█                               | 30kB 9.4MB/s eta 0:00:01[K     |█▍                              | 40kB 9.0MB/s eta 0:00:01[K     |█▊                              | 51kB 4.7MB/s eta 0:00:01[K     |██                              | 61kB 4.6MB/s eta 0:00:01[K     |██▍                             | 71kB 5.3MB/s eta 0:00:01[K     |██▊                             | 81kB 5.7MB/s eta 0:00:01[K     |███                             | 92kB 5.8MB/s eta 0:00:01[K     |███▍                            | 102kB 5.9MB/s eta 0:00:01[K     |███▊                            | 112kB 5.9MB/s eta 0:00:01[K     |████                            | 122kB 5.9MB

In [None]:
pip install geopy



Get coordinates for each suburb


In [None]:
from geopy.extra.rate_limiter import RateLimiter

# Create a Nominatim geocoder instance
locator = Nominatim(user_agent="myGeocoder")
# Wrap the geocoder so that consecutive calls are at least one second apart
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# Geocode every suburb; lookups that find nothing yield a falsy value
suburbs['location'] = suburbs['Suburb'].apply(geocode)
# Turn each location into a (latitude, longitude, altitude) tuple.
# Unresolved suburbs get a 3-tuple of None so that every row has the same
# width: a bare None in the first row would make pandas build a one-column
# frame and the three-column assignment below would fail.
suburbs['point'] = suburbs['location'].apply(
    lambda loc: tuple(loc.point) if loc else (None, None, None))
# Split the point column into latitude, longitude and altitude columns
suburbs[['latitude', 'longitude', 'altitude']] = pd.DataFrame(
    suburbs['point'].tolist(), index=suburbs.index)

In [None]:
# Keep just the suburb name and its latitude/longitude
suburbs.drop(columns=['location', 'point', 'altitude'], inplace=True)

In [None]:
# Drop every row that still has a missing value; at this point those are the
# suburbs whose geocoding lookup returned nothing, so they have no coordinates.
suburbs.dropna(inplace=True)

In [None]:
suburbs.to_csv('Pretoria Geospatial data.csv', index=False)

In [None]:
suburbs.head()

Unnamed: 0,Suburb,latitude,longitude
0,"Alphen Park, Pretoria",-25.78289,28.263953
1,"Arcadia, Pretoria",-25.745627,28.210221
2,"Ashlea Gardens, Pretoria",-25.785833,28.264167
3,"Brooklyn, Pretoria",-25.765556,28.239722
4,"Brummeria, Pretoria",-25.744224,28.283561


**Mapping Pretoria and its suburbs**<br>
We now use folium to create a map of Pretoria with markers

In [None]:
# Declare the address as Pretoria

address = 'Pretoria, South Africa'

# Get the coordinates for Pretoria using the Nominatim locator
locat = locator.geocode(address)
# geocode() yields None when nothing matches; stop with a clear message
# rather than an AttributeError on the attribute access below
if locat is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = locat.latitude
longitude = locat.longitude

In [None]:
print('The geograpical coordinate of Pretoria are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Pretoria are -25.7459374, 28.1879444.


In [None]:
# Build a map centred on the Pretoria coordinates
pta_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every suburb marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False)

# Place one circle marker per suburb, with the suburb name as its popup
for lat, lng, suburb_name in zip(suburbs['latitude'], suburbs['longitude'], suburbs['Suburb']):
    folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(suburb_name, parse_html=True),
        **marker_style).add_to(pta_map)

In [None]:
pta_map

We access Foursquare for Pretoria data

In [None]:
import os
from getpass import getpass

# Foursquare credentials are taken from the environment, or prompted for when
# the variables are not set, so that they never live in the notebook source.
# NOTE(review): any key pair that was previously committed should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [None]:
# Build the Foursquare "explore" request URL for the Pretoria coordinates

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 3000 # define radius as 3km

# URL template; the placeholders are filled in the order used below
url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'

# Full URL used for the actual request
url = url_template.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    latitude,
    longitude,
    radius,
    LIMIT)

# Display the URL with the credentials masked, so the secret is never written
# into the notebook's saved output
url_template.format(
    '<CLIENT_ID>',
    '<CLIENT_SECRET>',
    VERSION,
    latitude,
    longitude,
    radius,
    LIMIT)

'https://api.foursquare.com/v2/venues/explore?&client_id=PJX3PJ4FIED23NYXJEMYAHHJH0S4UHKZRL214FEKVLMT13FW&client_secret=HHUGJ5DXTGCJ35ORN5DJXXCM5ZPS42ESFGJ4TX2ICJULGLAX&v=20180605&ll=-25.7459374,28.1879444&radius=3000&limit=100'

In [None]:
# Query Foursquare and decode the JSON response for the Pretoria venues
fs_response = requests.get(url, timeout=30)  # timeout: do not hang on a stalled connection
fs_response.raise_for_status()  # surface HTTP errors before decoding the body
results = fs_response.json()

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category of a venue row, or None.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key (e.g. a dict or a pandas Series) that holds
        the category list under ``'categories'`` or, failing that, under
        ``'venue.categories'``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the category
    list is empty or None.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being swallowed.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [None]:
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into columns. pd.json_normalize is the supported
# entry point; importing json_normalize from pandas.io.json is deprecated.
nearby_venues = pd.json_normalize(venues)

# filter columns; copy so the column assignment below works on an
# independent frame rather than on a slice
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dot-separated component of each name
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,categories,lat,lng
0,Café Riche,Café,-25.746579,28.187304
1,Pretoria Zoo,Exhibit,-25.740416,28.188903
2,State Theatre,Performing Arts Venue,-25.746167,28.195084
3,National Zoological Gardens (Pretoria Zoo),Zoo,-25.734267,28.191047
4,Manhattan Hotel Pretoria,Hotel,-25.756855,28.191166


In [None]:
#Get the number of venues returned by the function
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

53 venues were returned by Foursquare.


In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Fetch Foursquare venues around each location and return them as one table.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each neighbourhood; iterated in parallel.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter (presumably metres --
        confirm against the Foursquare API documentation).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but keeps these columns, when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each neighbourhood name as it is processed.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; the timeout avoids hanging on a stalled
        # connection and raise_for_status reports HTTP errors explicitly
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may have no category; record None instead of failing
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema even when
    # no rows were collected.
    return pd.DataFrame(rows, columns=columns)

In [None]:
#Create a dataframe
pta_venues = getNearbyVenues(names=suburbs['Suburb'],
                                   latitudes=suburbs['latitude'],
                                   longitudes=suburbs['longitude']
                                  )

Alphen Park, Pretoria
Arcadia, Pretoria
Ashlea Gardens, Pretoria
Brooklyn, Pretoria
Brummeria, Pretoria
Bryntirion, Pretoria
Colbyn, Pretoria
Constantiapark, Pretoria
Die Wilgers, Pretoria
Eastwood, Pretoria
Elarduspark, Pretoria
Erasmuskloof, Pretoria
Erasmusrand, Pretoria
Faerie Glen, Pretoria
Garsfontein, Pretoria
Groenkloof, Pretoria
Hatfield, Pretoria
Hazeldene, Pretoria
Hazelwood, Pretoria
Hillcrest, Pretoria
La Montagne, Pretoria
Lukasrand, Pretoria
Lydiana, Pretoria
Lynnwood, Pretoria
Lynnwood Glen, Pretoria
Lynnwood Manor, Pretoria
Lynnwood Park, Pretoria
Lynnwood Ridge, Pretoria
Menlo Park, Pretoria
Menlyn, Pretoria
Meyerspark, Pretoria
Monument Park, Pretoria
Moreleta Park, Pretoria
Muckleneuk, Pretoria
Murrayfield, Pretoria
New Muckleneuk, Pretoria
Newlands, Pretoria
Olympus, Pretoria
Rietvalleirand, Pretoria
Sterrewag, Pretoria
Sunnyside, Pretoria
Trevenna, Pretoria
Val de Grace, Pretoria
Wapadrand, Pretoria
Waterkloof, Pretoria
Waterkloof Glen, Pretoria
Waterkloof Park, P

In [None]:
# Check the number of venues per neighbourhood
pta_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Alphen Park, Pretoria",12,12,12,12,12,12
"Amandasig, Pretoria",2,2,2,2,2,2
"Arcadia, Pretoria",22,22,22,22,22,22
"Ashlea Gardens, Pretoria",5,5,5,5,5,5
"Atteridgeville, Pretoria",3,3,3,3,3,3
...,...,...,...,...,...,...
"Wingate Park, Pretoria",12,12,12,12,12,12
"Wolmer, Pretoria",6,6,6,6,6,6
"Wonderboom South, Pretoria",5,5,5,5,5,5
"Woodhill, Pretoria",37,37,37,37,37,37


In [None]:
# One-hot encode the venue category: one indicator column per category
category_dummies = pd.get_dummies(pta_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the neighbourhood of each venue as an extra (last) column
category_dummies['Neighbourhood'] = pta_venues['Neighbourhood']

# Rotate the last column to the front so the neighbourhood comes first
all_columns = category_dummies.columns.tolist()
pta_coded = category_dummies[all_columns[-1:] + all_columns[:-1]]

pta_coded.head()

Unnamed: 0,Neighbourhood,ATM,African Restaurant,American Restaurant,Art Museum,Asian Restaurant,Athletics & Sports,Auto Workshop,Automotive Shop,BBQ Joint,Bakery,Bar,Bed & Breakfast,Bike Rental / Bike Share,Bistro,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Burger Joint,Bus Station,Butcher,Cafeteria,Café,Casino,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Soccer Field,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cricket Ground,Deli / Bodega,Department Store,Dessert Shop,Diner,...,Paper / Office Supplies Store,Park,Performing Arts Venue,Pharmacy,Pizza Place,Platform,Playground,Plaza,Portuguese Restaurant,Pub,Rental Car Location,Rental Service,Residential Building (Apartment / Condo),Resort,Restaurant,Road,Salad Place,Seafood Restaurant,Shop & Service,Shopping Mall,Snack Place,Soccer Field,Spa,Spanish Restaurant,Sporting Goods Shop,Sports Bar,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,University,Video Store,Wine Shop,Zoo
0,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,"Alphen Park, Pretoria",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Average the indicator columns per neighbourhood, keeping the neighbourhood
# name as an ordinary column
pta_grouped = pta_coded.groupby('Neighbourhood', as_index=False).mean()
pta_grouped

Unnamed: 0,Neighbourhood,ATM,African Restaurant,American Restaurant,Art Museum,Asian Restaurant,Athletics & Sports,Auto Workshop,Automotive Shop,BBQ Joint,Bakery,Bar,Bed & Breakfast,Bike Rental / Bike Share,Bistro,Bookstore,Boutique,Brazilian Restaurant,Breakfast Spot,Burger Joint,Bus Station,Butcher,Cafeteria,Café,Casino,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,College Soccer Field,Comfort Food Restaurant,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Cricket Ground,Deli / Bodega,Department Store,Dessert Shop,Diner,...,Paper / Office Supplies Store,Park,Performing Arts Venue,Pharmacy,Pizza Place,Platform,Playground,Plaza,Portuguese Restaurant,Pub,Rental Car Location,Rental Service,Residential Building (Apartment / Condo),Resort,Restaurant,Road,Salad Place,Seafood Restaurant,Shop & Service,Shopping Mall,Snack Place,Soccer Field,Spa,Spanish Restaurant,Sporting Goods Shop,Sports Bar,Stationery Store,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Train Station,University,Video Store,Wine Shop,Zoo
0,"Alphen Park, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.083333,0.0,0.0,0.083333,0.000000,0.000000,0.0,0.000000,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Amandasig, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Arcadia, Pretoria",0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.090909,0.000000,0.0,0.0,0.0,0.090909,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.045455,0.000000,0.0,0.00,0.045455,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.00,0.000000,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Ashlea Gardens, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.400000,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Atteridgeville, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.333333,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83,"Wingate Park, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.083333,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.166667,0.000000,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84,"Wolmer, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.166667,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.166667,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00,0.00,0.000000,0.166667,0.0,0.00,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
85,"Wonderboom South, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.2,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.200000,0.0,0.0,0.0,0.000000,0.000000,0.200000,...,0.000000,0.0,0.00,0.00,0.000000,0.000000,0.0,0.00,0.000000,0.0,0.2,0.0,0.0,0.0,0.000000,0.0,0.00,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,"Woodhill, Pretoria",0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.027027,0.000000,0.0,0.0,0.000000,0.027027,0.027027,0.0,0.054054,0.054054,0.0,0.0,0.0,0.081081,0.0,0.0,0.054054,0.027027,0.054054,0.0,0.0,0.027027,0.0,0.000000,0.0,0.0,0.0,0.027027,0.027027,0.054054,...,0.027027,0.0,0.00,0.00,0.027027,0.000000,0.0,0.00,0.054054,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.00,0.054054,0.0,0.054054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
num_top_venues = 5  # number of categories printed per neighbourhood

# Print, for every neighbourhood, its most frequent venue categories
for hood in pta_grouped['Neighbourhood']:
    print("----"+hood+"----")
    # Select this neighbourhood's row and transpose it so that every column
    # (category) becomes a row
    temp = pta_grouped[pta_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # The first transposed row holds the neighbourhood name, not a frequency
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # Highest frequencies first, renumbered from 0, limited to the top entries
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Alphen Park, Pretoria----
                venue  freq
0          Restaurant  0.17
1               Hotel  0.17
2                Café  0.08
3        Intersection  0.08
4  Italian Restaurant  0.08


----Amandasig, Pretoria----
                        venue  freq
0               Garden Center   0.5
1        Fast Food Restaurant   0.5
2                         ATM   0.0
3          Miscellaneous Shop   0.0
4  Modern European Restaurant   0.0


----Arcadia, Pretoria----
                  venue  freq
0                 Hotel  0.18
1  Fast Food Restaurant  0.09
2         Shopping Mall  0.09
3                  Café  0.09
4        Breakfast Spot  0.09


----Ashlea Gardens, Pretoria----
                        venue  freq
0                       Hotel   0.4
1                  Restaurant   0.4
2          Italian Restaurant   0.2
3  Modern European Restaurant   0.0
4               Movie Theater   0.0


----Atteridgeville, Pretoria----
                        venue  freq
0               Shopping M

In [None]:
# write a function that puts the most common places in a dataframe 

def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first element of ``row`` is left out (the notebook passes rows whose
    first entry is the neighbourhood name). The remaining entries are sorted
    from highest to lowest, and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
# Create a dataframe that consists of the 10 most common places in a neighbourhood

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues:
# 'Neighbourhood', '1st Most Common Venue', '2nd Most Common Venue', ...
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 take their suffix from `indicators`; every later rank uses
    # 'th'. An explicit bounds check replaces the former bare `except`.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = pta_grouped['Neighbourhood']

# fill the ranked-venue columns row by row
for ind in np.arange(pta_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(pta_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Alphen Park, Pretoria",Restaurant,Hotel,Intersection,Burger Joint,Bistro,Italian Restaurant,Bar,Café,Supermarket,Grocery Store
1,"Amandasig, Pretoria",Garden Center,Fast Food Restaurant,Zoo,Falafel Restaurant,Food Court,Food & Drink Shop,Flower Shop,Farmers Market,Fabric Shop,French Restaurant
2,"Arcadia, Pretoria",Hotel,Shopping Mall,Fast Food Restaurant,Café,Breakfast Spot,Gas Station,Pizza Place,Multiplex,Movie Theater,Portuguese Restaurant
3,"Ashlea Gardens, Pretoria",Hotel,Restaurant,Italian Restaurant,Zoo,Food Court,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant
4,"Atteridgeville, Pretoria",Shopping Mall,Clothing Store,Cricket Ground,Food Court,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Zoo


In [None]:
neighbourhoods_venues_sorted.shape

(88, 11)

In [None]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighbourhood name. The keyword
# form is used because the positional axis argument of drop() is not accepted
# by current pandas releases.
pta_clustering = pta_grouped.drop(columns='Neighbourhood')

# run k-means clustering (fixed random_state for reproducible labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(pta_clustering)

# check the cluster labels generated for the first ten rows
kmeans.labels_[0:10]

array([2, 4, 2, 0, 1, 0, 2, 2, 0, 2], dtype=int32)

In [None]:
len(kmeans.labels_)

88

In [None]:
suburbs['Neighbourhood'] = suburbs['Suburb']

In [None]:
# Add the clustering labels as the first column. A plain insert() fails when
# this cell is re-run because the column already exists, so overwrite it then.
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and ranked venue categories to each suburb's
# coordinates, matching rows on the neighbourhood name
pta_merged = suburbs.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

In [None]:
pta_merged.dropna(inplace=True)

In [None]:
pta_merged['Cluster Labels'] = pta_merged['Cluster Labels'].astype(int)

Map the clusters

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(pta_merged['latitude'], pta_merged['longitude'], pta_merged['Neighbourhood'], pta_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the colour list
    # directly (no reliance on negative-index wraparound for cluster 0).
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)
       


In [None]:
map_clusters

In [None]:
pta_merged['Cluster Labels'].value_counts()

Analyse the clusters

In [None]:
# Suburbs in cluster 0: the suburb name followed by its ranked venue
# categories. The name column is selected by label because positional
# column 1 of pta_merged is the latitude, not the name.
pta_merged.loc[pta_merged['Cluster Labels'] == 0, ['Neighbourhood'] + list(pta_merged.columns[5:])]

Unnamed: 0,latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,-25.785833,Hotel,Restaurant,Italian Restaurant,Zoo,Food Court,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant
5,-25.740501,Restaurant,Convenience Store,Italian Restaurant,Shopping Mall,Gastropub,Park,Golf Course,Gift Shop,Deli / Bodega,Department Store
27,-25.761111,Restaurant,Cricket Ground,Seafood Restaurant,Pub,Fabric Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Exhibit
36,-25.780621,Snack Place,Clothing Store,Restaurant,Liquor Store,Pub,Steakhouse,Burger Joint,Sushi Restaurant,Gas Station,Department Store
37,-25.7625,Breakfast Spot,Gas Station,Ethiopian Restaurant,Restaurant,Market,Burger Joint,Shopping Mall,Outdoors & Recreation,Bar,Park
46,-25.7575,Breakfast Spot,Ethiopian Restaurant,Burger Joint,Market,Gas Station,Cocktail Bar,Grocery Store,Shopping Mall,Exhibit,Fabric Shop
62,-25.727339,Pizza Place,Convenience Store,Fast Food Restaurant,Portuguese Restaurant,Gas Station,Exhibit,Flower Shop,Farmers Market,Falafel Restaurant,Fabric Shop
63,-25.675556,Shopping Mall,Department Store,Gas Station,Restaurant,Falafel Restaurant,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Zoo
65,-25.7,Chinese Restaurant,Convenience Store,Diner,Rental Car Location,Gas Station,Zoo,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market
67,-25.709722,Resort,Construction & Landscaping,Fast Food Restaurant,Gas Station,Bike Rental / Bike Share,Karaoke Bar,Zoo,Food & Drink Shop,Flower Shop,Farmers Market


In [None]:
# Suburbs in cluster 1: the suburb name followed by its ranked venue
# categories. The name column is selected by label because positional
# column 1 of pta_merged is the latitude, not the name.
pta_merged.loc[pta_merged['Cluster Labels'] == 1, ['Neighbourhood'] + list(pta_merged.columns[5:])]

Unnamed: 0,latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
77,-25.65077,Light Rail Station,Platform,Road,Zoo,Falafel Restaurant,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Fabric Shop
82,-25.663056,Light Rail Station,Convenience Store,Clothing Store,Shopping Mall,Platform,Dessert Shop,Diner,Electronics Store,Ethiopian Restaurant,Exhibit
83,-25.762982,Shopping Mall,Clothing Store,Cricket Ground,Food Court,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Zoo


In [None]:
# Suburbs in cluster 2: the suburb name followed by its ranked venue
# categories. The name column is selected by label because positional
# column 1 of pta_merged is the latitude, not the name.
pta_merged.loc[pta_merged['Cluster Labels'] == 2, ['Neighbourhood'] + list(pta_merged.columns[5:])]

Unnamed: 0,latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,-25.782890,Restaurant,Hotel,Intersection,Burger Joint,Bistro,Italian Restaurant,Bar,Café,Supermarket,Grocery Store
1,-25.745627,Hotel,Shopping Mall,Fast Food Restaurant,Café,Breakfast Spot,Gas Station,Pizza Place,Multiplex,Movie Theater,Portuguese Restaurant
3,-25.765556,Café,Coffee Shop,Restaurant,Burger Joint,Breakfast Spot,Pharmacy,Pizza Place,Deli / Bodega,Shopping Mall,Greek Restaurant
4,-25.744224,Café,Shopping Mall,Road,Zoo,Cricket Ground,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant
7,-25.739981,Fast Food Restaurant,Seafood Restaurant,Burger Joint,Golf Course,Fried Chicken Joint,Convenience Store,Pharmacy,Pizza Place,Coffee Shop,Portuguese Restaurant
...,...,...,...,...,...,...,...,...,...,...,...
91,-25.757778,Pizza Place,Pharmacy,Health & Beauty Service,Food Court,Cricket Ground,Deli / Bodega,Department Store,Dessert Shop,Diner,Electronics Store
96,-25.749472,Bakery,Fast Food Restaurant,Restaurant,Zoo,French Restaurant,Food Court,Food & Drink Shop,Flower Shop,Farmers Market,Falafel Restaurant
97,-25.753611,Fast Food Restaurant,Soccer Field,Gay Bar,Zoo,Football Stadium,Food Court,Food & Drink Shop,Flower Shop,Farmers Market,Falafel Restaurant
98,-25.761111,Fast Food Restaurant,Hotel,History Museum,Train Station,Concert Hall,Auto Workshop,Coffee Shop,Zoo,Falafel Restaurant,Food Court


In [None]:
# Suburbs in cluster 3: the suburb name followed by its ranked venue
# categories. The name column is selected by label because positional
# column 1 of pta_merged is the latitude, not the name.
pta_merged.loc[pta_merged['Cluster Labels'] == 3, ['Neighbourhood'] + list(pta_merged.columns[5:])]

Unnamed: 0,latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
87,-25.708611,Grocery Store,Zoo,Fabric Shop,Food Court,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Exhibit


In [None]:
# Suburbs in cluster 4: the suburb name followed by its ranked venue
# categories. The name column is selected by label because positional
# column 1 of pta_merged is the latitude, not the name.
pta_merged.loc[pta_merged['Cluster Labels'] == 4, ['Neighbourhood'] + list(pta_merged.columns[5:])]

Unnamed: 0,latitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
66,-25.675,Garden Center,Fast Food Restaurant,Zoo,Falafel Restaurant,Food Court,Food & Drink Shop,Flower Shop,Farmers Market,Fabric Shop,French Restaurant
69,-25.663333,Garden Center,Neighborhood,Zoo,French Restaurant,Food Court,Food & Drink Shop,Flower Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant
