# Capstone Project Notebook
#### This notebook will be used for the capstone project of the IBM Data Science Professional Course

In [1]:
# import required packages
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from bs4 import BeautifulSoup
import requests
import geocoder
from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import json
import urllib

Luckily there is a data set of all UK post codes on doogal.co.uk, so read that in:

In [2]:
# set URL and read postcodes from doogal
post_url = "https://www.doogal.co.uk/PostcodeDistrictsCSV.ashx"
post = pd.read_csv(post_url)

To get the boroughs of london from the above dataset we need a list of what boroughs are considered 'London' we can get that by scraping from the wikipedia article using beautiful soup:

In [3]:
# store the target page in a variable as text
source = requests.get('https://en.wikipedia.org/wiki/List_of_London_boroughs').text

# use the 'lxml' parser to organise the data correctly
soup = BeautifulSoup(source, 'lxml')

# there is only one table in the data so a simple first find will work
table = soup.find('table')

In [4]:
# read the data scraped from wikipedia
dfB = pd.read_html(str(table))
# convert to Datframe
dfB = dfB[0]
# isolate just the Boroughs in a list, using string split to remove some unwanted text
Boroughs = dfB['Borough'].str.split(r' \[').str.get(0)

In [5]:
# london has 33 Boroughs, so check this (0 to 32 = 33)
Boroughs.shape

(32,)

In [6]:
# now we can cut down the UK wide postcode data to just postcodes in the boroughs of London.
LDN_Postcodes = post[post["Region"].isin(Boroughs)]
LDN_Postcodes = LDN_Postcodes.dropna()
LDN_Postcodes.head()

Unnamed: 0,Postcode,Latitude,Longitude,Easting,Northing,Grid Reference,Town/Area,Region,Postcodes,Active postcodes,Population,Households,Nearby districts
267,BR1,51.4107,0.019415,540541.0,169899.0,TQ405698,"Bromley, Bickley, Downham",Bromley,2074.0,1260.0,55962.0,23215.0,"BR2, BR7, SE12, BR3, SE3, BR4, SE6, SE9, SE13,..."
268,BR2,51.3904,0.02164,540758.0,167646.0,TQ407676,"Bickley, Hayes, Bromley Common, Shortlands",Bromley,1404.0,1000.0,44958.0,19024.0,"BR1, BR4, BR7, SE12, BR3, BR5, SE3, SE6, SE9, BR6"
269,BR3,51.4034,-0.031695,537009.0,168990.0,TQ370689,"Beckenham, Eden Park, Elmers End, Park Langley...",Bromley,2095.0,1145.0,47411.0,21134.0,"SE20, SE26, SE6, BR4, SE25, SE23, BR1, SE4, BR..."
270,BR4,51.3757,-0.009917,538607.0,165951.0,TQ386659,West Wickham,Bromley,585.0,368.0,19367.0,7390.0,"BR2, BR3, BR1, SE6, CR0, SE13, SE20, CR9, SE25..."
271,BR5,51.3892,0.102527,546389.0,167670.0,TQ463676,"Orpington, St Mary Cray, Petts Wood",Bromley,1309.0,937.0,46011.0,18938.0,"BR6, DA14, DA15, BR7, BR8, DA16, BR2, DA5, DA6..."


Now we can pull out all of the Lat/Long data to get ready to Parse to Foursquare

In [7]:
# set address that we want the map to be centered on
address = 'London, UK'

# use geopy to get the center of London Lat and Long
geolocator = Nominatim(user_agent="TO_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of London are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of London are 51.5073219, -0.1276474.


In [8]:
# create map of London using latitude and longitude values
map_London = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map by cycling through all the lat long data in the frame
for lat, lng, region, town in zip(LDN_Postcodes['Latitude'], LDN_Postcodes['Longitude'], LDN_Postcodes['Region'], LDN_Postcodes['Town/Area']):
    label = '{}, {}'.format(town, region)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=8,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_London)  

Let's have a look at the map we've made. Each label seems to be correctly identifying the correct area.

In [9]:
map_London

In [10]:
## set up Foursquare for API
CLIENT_ID = 'M1ULFZNV03SCK3OKSY3FLEXTQRXII5WLXFW0VQ4PCXETC3WE' 
CLIENT_SECRET = 'VUIGLGLPVIVNSN5QWTTK3ZSFM2FZBUGQXM3BV51T1JDQNH2F' 
VERSION = '20180605'
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: M1ULFZNV03SCK3OKSY3FLEXTQRXII5WLXFW0VQ4PCXETC3WE
CLIENT_SECRET:VUIGLGLPVIVNSN5QWTTK3ZSFM2FZBUGQXM3BV51T1JDQNH2F


Now we want to get the venues for the 200 meters around each area in the list above. Then we can begin to cluster the areas in London based upon what kind of amenities they have.

In [11]:
# make a function to get the nearby venues
def getNearbyVenues(names, latitudes, longitudes, radius=550):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # try to get the data from foursquare
        try:
        # make the GET request
            results = requests.get(url).json()['response']['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])

            nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
            nearby_venues.columns = ['Neighborhood', 
                      'Neighborhood Latitude', 
                      'Neighborhood Longitude', 
                      'Venue', 
                      'Venue Latitude', 
                      'Venue Longitude', 
                      'Venue Category']
        except:
            pass
    
    return(nearby_venues)

In [None]:
# run the above function and apply it to London Venues
London_venues = getNearbyVenues(names=LDN_Postcodes['Town/Area'],
                                   latitudes=LDN_Postcodes['Latitude'],
                                   longitudes=LDN_Postcodes['Longitude']
                                  )

Bromley, Bickley, Downham
Bickley, Hayes, Bromley Common, Shortlands
Beckenham, Eden Park, Elmers End, Park Langley, Shortlands
West Wickham
Orpington, St Mary Cray, Petts Wood
Orpington, Farnborough, Downe, Pratt's Bottom, Chelsfield, Well Hill
Chislehurst, Elmstead
Croydon, Addiscombe, Shirley, Addington, New Addington, Forestdale, Waddon
South Croydon, Sanderstead, Selsdon, Addington
Caterham, Whyteleafe, Chaldon, Woldingham
Mitcham, Beddington Corner
Coulsdon, Chipstead, Woodmansterne
Warlingham, Chelsham, Farleigh
Thornton Heath
Purley, Kenley
Non-geographic
Bexley, Albany Park, Joydens Wood
Bexleyheath, Upton
Bexleyheath, Barnehurst
Erith, Northumberland Heath, Slade Green
Sidcup, Foots Cray, Albany Park, Longlands
Sidcup, Blackfen, Longlands, Lamorbey, Avery Hill
Welling, Falconwood, East Wickham
Belvedere, Lessness Heath
Erith Marshes, Thamesmead
Aldgate, Bishopsgate, Whitechapel, Shoreditch, Spitalfields, Shadwell, Stepney, Mile End, Portsoken
Wapping
Bethnal Green, Haggerston

In [None]:
# have a look at the top of the data
London_venues.head()

In [None]:
#Now lets do some exploratory data analysis on this data
#we can see there are a lot of pubs in London!
London_venues['Venue Category'].value_counts().head(30).plot(kind='bar', figsize = [15,6])

In [None]:

London_venues.groupby('Neighborhood')['Venue Category'].count().sort_values(ascending = False).head(60).plot(kind = 'bar')

In [None]:
# now let's do one hot encoding to set it up for K-Means Clustering
London_onehot = pd.get_dummies(London_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
London_onehot['Neighborhood'] = London_venues['Neighborhood'] 

# set the index to Neighborhood
London_onehot.set_index('Neighborhood', inplace = True)

In [None]:
# lets check the shape
London_onehot.shape

In [None]:
# now lets group the dummies frame by Neighborhood to give us normalized data for each area
London_grouped = London_onehot.groupby('Neighborhood').mean()
London_grouped.head()

In [None]:
# reset the index
London_grouped = London_grouped.reset_index()

In [None]:
# define a function to give us the top venues in each area
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
# we can make a frame of the top 10 venues in each area by using the above function
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = London_grouped['Neighborhood']

# now the data frame is created, populate it with data
for ind in np.arange(London_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(London_grouped.iloc[ind, :], num_top_venues)

# have a check into the data
neighborhoods_venues_sorted.head()

In [None]:
# set number of clusters
kclusters = 9

# make a new frame but drop the Neighborhood col as it is not numeric
London_grouped_clustering = London_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(London_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

LDN_merged = LDN_Postcodes

# merge LDN_merged with LDN neighborhood data to add latitude/longitude for each neighborhood
LDN_merged = LDN_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Town/Area')

LDN_merged.head() # check the last columns!

In [None]:
#Drop any NaN data and change the cluster labels from string to int so the map can read the labels
LDN_merged.dropna(inplace = True)
LDN_merged['Cluster Labels'] = LDN_merged['Cluster Labels'].astype(int)

In [None]:
# the average house prices for the whole of the uk are hosted on the UK gov website here
# we need to read in the excel document from the server, just taking the sheet 'Average Price'
All_UK_prices = pd.read_excel("https://data.london.gov.uk/download/uk-house-price-index/70ac0766-8902-4eb5-aab5-01951aaed773/UK_House_price_index.xlsx", sheet_name = "Average price")

In [None]:
# get the latest house prices for each London Borough
# the last row in the data is the newest price (-1), and the first 33 cols are London Boroughs
LDN_Prices = All_UK_prices.iloc[-1,1:34]

# take that data as a dataframe
LDN_Prices = pd.DataFrame(LDN_Prices)

# reset the index to shift the borough name to a column, this is easier for folium
LDN_Prices.reset_index(inplace = True)

# rename the columns to comething sensible
LDN_Prices.rename(columns={"index": "Borough", 302 : "Price"}, inplace = True, errors="raise")

# price column is a string, so change that to a numeric
LDN_Prices["Price"] = pd.to_numeric(LDN_Prices["Price"])

# the GeoJSON has 'and', whilst the gov data has '&' so rectify the data
LDN_Prices.loc[:,'Borough'] = LDN_Prices.loc[:,'Borough'].str.replace('&', 'and')

# have a look at the data
LDN_Prices.head()

In [None]:
# url path to the geoJSON file listing
LDN_bor_url = 'https://skgrange.github.io/www/data/london_boroughs.json'

# read the geoJson file into the system 
with urllib.request.urlopen(LDN_bor_url) as url:
    data = json.loads(url.read().decode())

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10, tiles = 'cartodbpositron')

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
# colors_array = cm.Set1(np.linspace(0, 1, len(11)))
colors_array = cm.nipy_spectral(np.linspace(0, 1, kclusters))
set1 = [colors.rgb2hex(i) for i in colors_array]

# add the choropleth prices layer as the bottom layer, calling the choropleth class from folium
folium.Choropleth(
    geo_data=data,
    data=LDN_Prices,
    columns=['Borough', 'Price'],
    key_on='feature.properties.name',
    fill_color='YlGnBu', 
    fill_opacity=0.7, 
    line_opacity=0.5,
    legend_name='London Average House Prices'
).add_to(map_clusters)

# add the K-means clustering markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(LDN_merged['Latitude'], LDN_merged['Longitude'], LDN_merged['Town/Area'], LDN_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=set1[cluster-1],
        fill=True,
        fill_color=set1[cluster-1],
        fill_opacity=0.5,
        line_opacity = 0.3).add_to(map_clusters)
    
       
map_clusters

As we can see from the map we have several clusters of data:

CLuster       | Colour | Key Area Features 
------------- | -------------| -------------
0  | Light grey | Out of town retail centres: Furniture Stores, Supermarkets 
1  | Black | Commercial centres with Hotels, Theatres, Restaurants, Coffee shops, Offices and Bars 
2  | Purple | Residential areas with Coffee Shops, Restaurants, Supermarkets | Mid-budget Families
3  | Dark Green/BLue | Out of own Parks, Yoga Studio and Falafel Restaurants (Mill Hill)
4  | Light Green/Blue | Park areas with Transport Links and Restaurants | Wealthy Families
5  | Dark Green | Residential Areas with  Grocery stores/supermarkets (quieter suburbs)
6  | Light Gren | Residential Areas with a high density of pubs
7  | Yellow |  Ethinically diverse areas, predominantly Eastern European (Chingford)
8  | Red | Ethinically diverse areas, predominantly indian

We can tell that by running the below cell but changing the 'Cluster Label' value


In [None]:
LDN_merged.loc[LDN_merged['Cluster Labels'] == 3, LDN_merged.columns[[6] + list(range(14, LDN_merged.shape[1]))]]

In [None]:
map_clusters.save("LDN_Borough_Map.html")

From here we can do our analysis of the neighbourhoods. 

See the blog post:

https://theflyingdatascientist.wordpress.com/2020/05/10/analysing-london-neighbourhood-data-and-house-prices-using-k-means/

Thanks for reading!
Adam Clark