# Capstone Project: Battle of Neighbourhoods

<i>Akhilesh Mistry, 2021</i>

In [1]:
import os
import numpy as np
import pandas as pd
from pandas import json_normalize
import json

import requests
from bs4 import BeautifulSoup
import geopy
from geopy.geocoders import Nominatim
import folium
from folium.features import DivIcon

from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
address = 'Bristol, UK'

geolocator = Nominatim(user_agent="bris_explorer")
location = geolocator.geocode(address)
bris_latitude = location.latitude
bris_longitude = location.longitude
print('The geograpical coordinates of Bristol are {}, {}.'.format(bris_latitude, bris_longitude))

The geograpical coordinates of Bristol are 51.4538022, -2.5972985.


Creating a map of Bristol with middle super output areas displayed

In [3]:
# url = 'https://martinjc.github.io/UK-GeoJSON/json/eng/msoa_by_lad'
geo_url = 'https://raw.githubusercontent.com/missinglink/uk-postcode-polygons/master/geojson/BS.geojson'

# districts_topo = f"{url}/topo_E06000023.json"
post_codes_geojson = f"{geo_url}"

zone_map = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=10) 

post = folium.GeoJson(json.loads(requests.get(geo_url).text), name="Postal").add_to(zone_map)

# folium.TopoJson(
#     json.loads(requests.get(districts_topo).text),
#     "objects.E06000023",
#     name="District Borders").add_to(zone_map)
folium.LayerControl().add_to(zone_map)
folium.GeoJsonTooltip(fields = ['name']).add_to(post)



zone_map

https://tools.ietf.org/html/rfc7946#page-9


In [4]:
# m = folium.Map(location=[latitude, longitude], zoom_start=12)
# filename = 'cycle_geo_data/a.geojson'

# # directory_in_str = 'C:\Users\Akhilesh Mistry\Documents\IBM Data Science Course\Capstone_Project\cycleData'

# # directory = os.fsencode(directory_in_str)

# directory = "C:\Users\Akhilesh Mistry\Documents\IBM Data Science Course\Capstone_Project\cycleData"
    
# for filename in os.listdir(directory):
# #      filename = os.fsdecode(file)
#      if filename.endswith(".geojson"): 
#          print(filename)
#          continue
#      else:
#          continue


# # folium.GeoJson(filename, name="geojson").add_to(m)
# # folium.LayerControl().add_to(m)

# m

# Map for later

folium.Choropleth(
    geo_data=state_geo,
    name="choropleth",
    data=state_data,
    columns=["State", "Unemployment"],
    key_on="feature.id",
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name="Unemployment Rate (%)",
).add_to(m)

# Foursquare Coffee Shop Data

Using the Foursquare API to find an initial idea of the coffee shop location data.

In [43]:
from IPython.display import HTML

def hide_code():
	return HTML('''<script>
	code_show=true; 
	function code_toggle() {
	 if (code_show){
	 $("div.input").hide();
	 } else {
	 $("div.input").show();
	 }
	 code_show = !code_show
	} 
	$( document ).ready(code_toggle);
	</script>
	The raw code for this IPython notebook is by default hidden for easier reading.
	To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

search_query = 'coffee'
radius = 50000

url = """https://api.foursquare.com/v2/{type}/search?
            client_id={client_id}&client_secret={client_secret}&ll={latitude},{longitude}&oauth_token={access_token}&v={version}&query={search_query}&radius={radius}&limit={limit}""".format(
    type = 'venues',
    client_id = '4HWTUHHHKOOTWDPANQ5MUJGAFKXOY5YWCZ5M0U54A0PXXDDU',
    client_secret = '0K3ABDEUDJPLOEA24B3FLJCPXUYL0GCOEVIZBZTK25TPU5RV',
    access_token = "ZVOHVJJLHJR4LU5HLTOL1IICMTOIVEREQNQ3JQQHYIZNQH0N",
    latitude = bris_latitude,
    longitude = bris_longitude,
    version = '20210225',
    search_query = search_query,
    radius = radius,
    limit = '50'
    )

results = requests.get(url).json()

In [6]:
# Bit of code to check the json file has the correct data
# nice_json = 'nice_json.json'
# with open(nice_json, 'w') as f:
#     json.dump(results, f, indent=4)

In [7]:
# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = json_normalize(venues)
dataframe.head()
dataframe.shape

(50, 19)

In [8]:
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

print(dataframe_filtered.shape)

venues_map = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=13) # generate map centred at Bristol City Centre

# add a red circle marker to represent the City Centre
folium.CircleMarker(
    [bris_latitude, bris_longitude],
    radius=10,
    color='red',
    popup='Bristol City Centre',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the coffee shops as blue circle markers
for lat, lng, label in zip(dataframe_filtered.lat, dataframe_filtered.lng, dataframe_filtered.categories):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map

(50, 16)


# Population Data by Postal Code

In [9]:
import pgeocode

pop_data = pd.read_csv('pop_data.csv')

print(pop_data.head())

# Remove unnecessary data columns
pop_data.drop(['Males', 'Females', 'Occupied_Households'], axis=1, inplace=True)
pop_data.head()

  Postcode  Total  Males  Females  Occupied_Households
0  AL1 1AG     14      6        8                    6
1  AL1 1AJ    124     60       64                   51
2  AL1 1AR     32     17       15                   17
3  AL1 1AS     34     17       17                   13
4  AL1 1BH     52     15       37                   41


Unnamed: 0,Postcode,Total
0,AL1 1AG,14
1,AL1 1AJ,124
2,AL1 1AR,32
3,AL1 1AS,34
4,AL1 1BH,52


In [10]:
# Create new dataframe for total population of each postcode and coordinates
population_data = pd.DataFrame(columns=['Postcode', 'Population Count', 'Latitude', 'Longitude'])

# Initialise Nominatim call to obtain coordinates of each postal code
nomi = pgeocode.Nominatim('gb')

for i in range(1, 37):
    code = 'BS' + str(i)
    regex = r'^' + code
#     print(regex)

    filter = pop_data['Postcode'].str.contains(regex)
    df = pop_data[filter]

    total = df['Total'].sum()

    coords = nomi.query_postal_code(code)
    latitude = coords['latitude']
    longitude = coords['longitude']

    population_data = population_data.append({'Postcode': code, 'Population Count': total, 'Latitude': latitude, 'Longitude': longitude}, ignore_index=True)
    

In [11]:
# Check resulting data frame
population_data.head(20)

Unnamed: 0,Postcode,Population Count,Latitude,Longitude
0,BS1,238211,51.4552,-2.5966
1,BS2,184134,51.4552,-2.5966
2,BS3,231989,51.4552,-2.5966
3,BS4,101631,51.4335,-2.55525
4,BS5,52461,51.462,-2.5519
5,BS6,33615,51.4552,-2.599067
6,BS7,40530,51.4767,-2.5572
7,BS8,26345,51.4611,-2.65032
8,BS9,31160,51.4874,-2.6266
9,BS10,27876,51.5099,-2.635367


In [12]:
# Drop rows with missing data (BS12, BS16 etc. was discontinued)
population_data.dropna(axis=0, inplace=True)
population_data.head()


Unnamed: 0,Postcode,Population Count,Latitude,Longitude
0,BS1,238211,51.4552,-2.5966
1,BS2,184134,51.4552,-2.5966
2,BS3,231989,51.4552,-2.5966
3,BS4,101631,51.4335,-2.55525
4,BS5,52461,51.462,-2.5519


Create a heatmap of the population data

In [13]:
# The data needs to be in list form in order to be plotted.
heat_data = [[row['Latitude'],row['Longitude']] for index, row in population_data.iterrows()]
print(heat_data)

[[51.4552, -2.5966], [51.4552, -2.5966], [51.4552, -2.5966], [51.4335, -2.55525], [51.462, -2.5519], [51.4552, -2.5990666666666664], [51.4767, -2.5572], [51.4611, -2.6503200000000002], [51.4874, -2.6266], [51.5099, -2.635366666666666], [51.5101, -2.68215], [51.4121, -2.612], [51.4061, -2.5627], [51.4528, -2.5083], [51.4817, -2.5036], [51.482, -2.73821], [51.3983, -2.83668], [51.3847, -2.925828571428572], [51.3237, -2.9787], [51.3333, -2.91161], [51.3132, -2.8103599999999997], [51.2667, -2.867269230769231], [51.2754, -2.7549], [51.2273, -2.811271428571428], [51.3289, -2.8691], [51.466, -2.4621636363636363], [51.4084, -2.4918], [51.5441, -2.5582], [51.5241, -2.5661], [51.5975, -2.5485], [51.5233, -2.4900333333333333]]


In [14]:
from folium.plugins import HeatMap

my_map = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=10)

HeatMap(heat_data).add_to(my_map)

my_map

# Comprehensive Foursquare Data

This data obtained from Foursquare makes use of an API search call using the centre of each postal code zone as the location, therefore obtaining a comprehensive set of data from Foursquare, overcoming the limit of 50 results that were obtained using the single API call earlier.

In [42]:
from IPython.display import HTML

def hide_code():
	return HTML('''<script>
	code_show=true; 
	function code_toggle() {
	 if (code_show){
	 $("div.input").hide();
	 } else {
	 $("div.input").show();
	 }
	 code_show = !code_show
	} 
	$( document ).ready(code_toggle);
	</script>
	The raw code for this IPython notebook is by default hidden for easier reading.
	To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

search_query = 'coffee'
radius = 5000

results = []

# Only 23 postcode zones needed to be used. Any more just produced duplicate data and I aimed to reduce the number of Foursquare API calls necessary.
for postcode in range(23):
    
    latitude = heat_data[postcode][0]
    longitude = heat_data[postcode][1]
    
    url = """https://api.foursquare.com/v2/{type}/search?
                client_id={client_id}&client_secret={client_secret}&ll={latitude},{longitude}&oauth_token={access_token}&v={version}&query={search_query}&radius={radius}&limit={limit}""".format(
        type = 'venues',
        client_id = '4HWTUHHHKOOTWDPANQ5MUJGAFKXOY5YWCZ5M0U54A0PXXDDU',
        client_secret = '0K3ABDEUDJPLOEA24B3FLJCPXUYL0GCOEVIZBZTK25TPU5RV',
        access_token = "ZVOHVJJLHJR4LU5HLTOL1IICMTOIVEREQNQ3JQQHYIZNQH0N",
        latitude = latitude,
        longitude = longitude,
        version = '20210225',
        search_query = search_query,
        radius = radius,
        limit = '50'
        )

    results.append(requests.get(url).json())
    
    

In [16]:
# assign relevant part of JSON to venues
venues = []
frames = []
for i in range(len(results)):
    venues.append(results[i]['response']['venues'])

    # tranform venues into a dataframe
    frames.append(json_normalize(venues[i]))
    
dataframe = pd.concat(frames)

dataframe.head()
# dataframe.shape

Unnamed: 0,id,name,categories,referralId,hasPerk,location.address,location.lat,location.lng,location.labeledLatLngs,location.distance,location.cc,location.city,location.state,location.country,location.formattedAddress,location.crossStreet,location.postalCode,location.neighborhood,venuePage.id
0,4c4eb6dcc1f5ef3bf21847ab,Caesar's Coffee,"[{'id': '4bf58dd8d48988d148941735', 'name': 'D...",v-1614734009,False,Welsh Back,51.452915,-2.592586,"[{'label': 'display', 'lat': 51.45291523057879...",377,GB,Bristol,Bristol,United Kingdom,"[Welsh Back, Bristol]",,,,
1,4b7e4505f964a520cce72fe3,Costa Coffee,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",v-1614734009,False,Cathedral Walk,51.449823,-2.60267,"[{'label': 'display', 'lat': 51.4498228560723,...",731,GB,Bristol,Bristol,United Kingdom,"[Cathedral Walk (Millennium Square), Bristol, ...",Millennium Square,BS1 5LW,,
2,4b6d8d4df964a520177c2ce3,Costa Coffee,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",v-1614734009,False,70-78 Queens Rd,51.45715,-2.608039,"[{'label': 'display', 'lat': 51.45715033580132...",822,GB,Bristol,Bristol,United Kingdom,"[70-78 Queens Rd (The Triangle), Bristol, BS8 ...",The Triangle,BS8 1QU,Clifton,
3,4b7901d3f964a520c2e82ee3,Sufis Coffee Shop & Sandwich Bar,"[{'id': '4bf58dd8d48988d143941735', 'name': 'B...",v-1614734009,False,32 Stokes Croft,51.460443,-2.591583,"[{'label': 'display', 'lat': 51.46044261825836...",679,GB,Bristol,Bristol,United Kingdom,"[32 Stokes Croft, Bristol, BS1 3QD]",,BS1 3QD,,
4,4c3469ed3ffc952179da90f5,Two Day Coffee Roasters,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",v-1614734009,False,135 St.Michaels Hill,51.460617,-2.601543,"[{'label': 'display', 'lat': 51.46061715267110...",693,GB,Bristol,,United Kingdom,"[135 St.Michaels Hill (at Highbury Villas), Br...",at Highbury Villas,BS2 8BS,,


In [17]:
filtered_columns = ['name', 'categories'] + [col for col in dataframe.columns if col.startswith('location.')]
dataframe_filtered = dataframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]
dataframe_filtered.drop(['labeledLatLngs', 'formattedAddress', 'cc', 'city', 'state', 'country', 'crossStreet', 'neighborhood', 'distance'], axis=1, inplace=True)
dataframe_filtered.shape

# print(dataframe_filtered)
compression_opts = dict(method='zip', archive_name='out.csv') 
dataframe_filtered.to_csv('out.zip', index=False, compression=compression_opts)


In [18]:
dataframe_filtered.head(50)

Unnamed: 0,name,categories,address,lat,lng,postalCode
0,Caesar's Coffee,Donut Shop,Welsh Back,51.452915,-2.592586,
1,Costa Coffee,Coffee Shop,Cathedral Walk,51.449823,-2.60267,BS1 5LW
2,Costa Coffee,Coffee Shop,70-78 Queens Rd,51.45715,-2.608039,BS8 1QU
3,Sufis Coffee Shop & Sandwich Bar,Breakfast Spot,32 Stokes Croft,51.460443,-2.591583,BS1 3QD
4,Two Day Coffee Roasters,Coffee Shop,135 St.Michaels Hill,51.460617,-2.601543,BS2 8BS
5,Baristas Coffee Collective,Coffee Shop,29 Victoria St,51.452734,-2.589172,BS1 6AA
6,Costa Coffee,Coffee Shop,33 Broadmead,51.457925,-2.588144,BS1 3EU
7,Costa Coffee,Coffee Shop,,51.464369,-2.609983,
8,Isambard's Coffee House,Food Truck,Temple Meads Station,51.449546,-2.581066,BS1 6QF
9,Coffee#1,Coffee Shop,33 Princess Victoria St.,51.454511,-2.620599,BS8 4BU


In [19]:
final_coffee = dataframe_filtered.drop_duplicates(subset=['lat', 'lng'], keep='last')
final_coffee['postalCode'] = final_coffee['postalCode'].replace(np.nan, 'BS1')
final_coffee['postalCode'] = final_coffee['postalCode'].replace('UK', 'BS1')
final_coffee = final_coffee.dropna()
final_coffee.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_coffee['postalCode'] = final_coffee['postalCode'].replace(np.nan, 'BS1')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_coffee['postalCode'] = final_coffee['postalCode'].replace('UK', 'BS1')


(109, 6)

In [20]:
postalcode_data = [row['postalCode'] for index, row in final_coffee.iterrows()]
print(postalcode_data)
print(len(postalcode_data))

['BS1 3JD', 'BS2 8HW', 'BS1 1TP', 'BS1 2EJ', 'BS8 2PL', 'BS9 4JP', 'BS9 4JT', 'BS34 5DG', 'BS34 5DG', 'BS34 5DG', 'BS1', 'BS6 6QP', 'BS34 5TS', 'BS34 5GN', 'BS34 5DG', 'BS6 6TB', 'BS9 4JZ', 'BS34 5UL', 'BS34 5TS', 'BS34 5TQ', 'BS9 3DH', 'BS10 7SR', 'BS9 3AA', 'BS10 7TG', 'BS8 4BU', 'BS8 1QU', 'BS1 5LA', 'BS2 8BS', 'BS8 2NN', 'BS6 6LF', 'BS1 6UT', 'BS8 2PL', 'BS6 5TZ', 'BS1', 'BS1', 'BS1 5LW', 'BS13 7TJ', 'BS3 1HT', 'BS1 5TB', 'BS1', 'BS1', 'BS1', 'BS1', 'BS3', 'BS1 1XA', 'BS1', 'BS13 7TJ', 'BS31 1DS', 'BS1 6QF', 'BS1', 'BS1 3QD', 'BS2 0BH', 'BS1 3EU', 'BS1 6AA', 'BS1', 'BS1', 'BS1 3JL', 'BS1 2DU', 'BS2 9EQ', 'BS2 0SE', 'BS1 3LG', 'BS2 0EJ', 'BS1 6QP', 'BS1 6LS', 'BS1', 'BS1', 'BS1', 'BS1 3BG', 'BS1', 'BS16 7AE', 'BS1', 'BS2 0SP', 'BS16 3TT', 'BS7 8BA', 'BS15 8LP', 'BS7 8AE', 'BS1', 'bs15 8db', 'BS7 8PE', 'BS1', 'BS1', 'BS30 7DA', 'BS5', 'BS1', 'BS36 1JY', 'BS34 7JL', 'BS2 0SZ', 'BS7 8NU', 'BS6 7XW', 'BS1', 'BS34 8JH', 'BS20 7PT', 'BS1', 'B S11', 'BS11 9SA', 'BS21 6NH', 'BS24 7FN', 'BS2

In [21]:
placeholder = pd.DataFrame(columns=['PostCode'])

for code in postalcode_data:
    if code.startswith('BS') or code.startswith('bs'):
        yeet = code.split(' ')
        
    placeholder = placeholder.append({'PostCode': yeet[0]}, ignore_index=True)

print(placeholder.shape)
placeholder.head()

(109, 1)


Unnamed: 0,PostCode
0,BS1
1,BS2
2,BS1
3,BS1
4,BS8


In [22]:
# objs = [final_coffee, placeholder]
# pd.concat([final_coffee, placeholder], axis=1, ignore_index = True)
# final_coffee.head()

final_data = pd.DataFrame(columns=['Name', 'Categories', 'Latitude', 'Longitude', 'PostCode'])

final_data['Name'] = final_coffee['name']
final_data['Categories'] = final_coffee['categories']
final_data['PostCode'] = placeholder['PostCode']
final_data['Latitude'] = final_coffee['lat']
final_data['Longitude'] = final_coffee['lng']

final_data.head()

Unnamed: 0,Name,Categories,Latitude,Longitude,PostCode
43,The Crazy Fox Coffee Bar,Café,51.457546,-2.589841,BS3
25,Costa Coffee,Coffee Shop,51.458344,-2.596426,BS8
28,Playground Coffee House,Café,51.453785,-2.594831,BS8
39,Full Court Press Specialty Coffee,Coffee Shop,51.455157,-2.593258,BS1
35,Costa Coffee,Coffee Shop,51.464561,-2.609786,BS1


In [23]:
venues_map = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=11) # generate map centred at Bristol City Centre

# add a red circle marker to represent the City Centre
folium.CircleMarker(
    [bris_latitude, bris_longitude],
    radius=10,
    color='red',
    popup='Bristol City Centre',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the coffee shops as blue circle markers
for lat, lng, label in zip(final_coffee.lat, final_coffee.lng, final_coffee.categories):
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# display map
venues_map

Plot a heatmap of the coffee shop location data

In [24]:
shop_heat_data = [[row['lat'],row['lng']] for index, row in final_coffee.iterrows()]

In [25]:
shop_heat_map = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=11) # generate map centred at Bristol City Centre

HeatMap(shop_heat_data).add_to(shop_heat_map)

shop_heat_map

<h3>Plotting a chloropleth map of the population data then adding the coffee shop data.</h3>

In [26]:
choropleth = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=10) # generate map centred at Bristol City Centre


choropleth.choropleth(
 geo_data=json.loads(requests.get(geo_url).text),
 name='choropleth',
 data=population_data,
 columns=['Postcode', 'Population Count'],
 key_on='properties.name',
 fill_color='YlGn',
 fill_opacity=0.6,
 line_opacity=0.2,
 legend_name='Population Density')

folium.LayerControl().add_to(choropleth)

choropleth



# Train Station Stuff

Using requests to obtain the html data of the wikipedia page and parse with BeautifulSoup

In [27]:
# Wikipedia page Bristol train station data is available from
url = 'https://en.wikipedia.org/wiki/Category:Railway_stations_in_Bristol'
url2 = 'https://en.wikipedia.org/wiki/Category:Railway_stations_in_Bristol,_Bath_and_South_Gloucestershire'

r = requests.get(url)
r2 = requests.get(url2)
print(f"Status Code 1: {r.status_code}") # This should print '200' if everything goes well
print(f"Status Code 2: {r2.status_code}")

soup = BeautifulSoup(r.text, 'html5lib')
soup2 = BeautifulSoup(r2.text, 'html5lib')
# Use prettify to find tag with relevant information
# print(soup.prettify())

Status Code 1: 200
Status Code 2: 200


Extracting all the train station names from the html text

In [28]:
# Initialise a list of train stations in Bristol
station_list = []

# Use 'find_all' to find all 'href' tags and create a list of train stations
for link in soup.find_all('a',href=True):
    
    if link.get('href').endswith('_railway_station'):
        station_list.append(link.get('href'))

for link in soup2.find_all('a',href=True):
    
    if link.get('href').endswith('_railway_station'):
        station_list.append(link.get('href'))

# Check that our list has only train stations in it
print(station_list)

['/wiki/Avonmouth_railway_station', '/wiki/Bedminster_railway_station', '/wiki/Bristol_Temple_Meads_railway_station', '/wiki/Clifton_Down_railway_station', '/wiki/Lawrence_Hill_railway_station', '/wiki/Montpelier_railway_station', '/wiki/Parson_Street_railway_station', '/wiki/Portway_Parkway_railway_station', '/wiki/Redland_railway_station', '/wiki/St_Andrews_Road_railway_station', '/wiki/Sea_Mills_railway_station', '/wiki/Shirehampton_railway_station', '/wiki/Stapleton_Road_railway_station', '/wiki/Bristol_Parkway_railway_station', '/wiki/Filton_Abbey_Wood_railway_station', '/wiki/Freshford_railway_station', '/wiki/Keynsham_railway_station', '/wiki/Patchway_railway_station', '/wiki/Pilning_railway_station', '/wiki/Severn_Beach_railway_station', '/wiki/Yate_railway_station']


Using Nominatim to find all the train station coordinates and add them to a dataframe

In [29]:
# Initialise the geolocator client
geolocator = Nominatim(user_agent="train_agent")

# Initialse a pandas dataframe and name columns
station_data = pd.DataFrame(columns=['Station Name', 'Latitude', 'Longitude'])

# Create a loop to find the coordinates of each station
for station in station_list:
    # The original plan was to use webscraping on each railway station's wikipedia 
    # page to obtain coordinates but I found that nominatim works just as well.
    # url = 'https://en.wikipedia.org' + station
    # print('\n' + url)

    # Format string to get a name to input into dataframe and use in nominatim
    station = station.lstrip('/wiki/')
    station = station.replace('_', ' ')
    # print(station)

    location = geolocator.geocode(station)
    latitude = location.latitude
    longitude = location.longitude
    # Print a string verifying the station coordinates (just as a check)
    # print(f" The coordinates of {station} are: [{latitude}, {longitude}]")

    # Add all station data to dataframe
    station_data = station_data.append({'Station Name': station, 'Latitude': latitude, 'Longitude': longitude}, ignore_index=True)

# Check resulting dataframe for errors
station_data.head()

Unnamed: 0,Station Name,Latitude,Longitude
0,Avonmouth railway station,51.499722,-2.698862
1,Bedminster railway station,51.440789,-2.593497
2,Bristol Temple Meads railway station,51.449099,-2.580403
3,Clifton Down railway station,51.464453,-2.611157
4,Lawrence Hill railway station,51.45858,-2.56417


Creating a map of Bristol with train station markers available, also with layer control

In [30]:
# REMOVE THIS IS COORDINATES OF BRISTOL ARE FOUND EARLIER: THIS MIGHT BE REDUNDANT CODE
# address = 'Bristol, UK'

# geolocator = Nominatim(user_agent="bristol_agent")
# location = geolocator.geocode(address)
# bris_latitude = location.latitude
# bris_longitude = location.longitude

# Plot map of bristol (basically initialising map to plot station points)
station_map = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=12)

# Set up loop to add circular points to the map of each railway station
for lat, lng, station in zip(station_data['Latitude'], station_data['Longitude'], station_data['Station Name']):
    label = station
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.01,
        parse_html=False).add_to(station_map)

folium.LayerControl().add_to(station_map)
    
station_map

# K-Means Clustering

In [31]:
# one hot encoding
coffee_onehot = pd.get_dummies(final_data[['Categories']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
coffee_onehot['PostCode'] = final_data['PostCode'] 

# move neighbourhood column to the first column
fixed_columns = [coffee_onehot.columns[-1]] + list(coffee_onehot.columns[:-1])
coffee_onehot = coffee_onehot[fixed_columns]

coffee_onehot.head()

Unnamed: 0,PostCode,Breakfast Spot,Café,Coffee Roaster,Coffee Shop,College Cafeteria,Creperie,Distribution Center,Donut Shop,Food & Drink Shop,Food Truck,Furniture / Home Store,Sandwich Place
43,BS3,0,1,0,0,0,0,0,0,0,0,0,0
25,BS8,0,0,0,1,0,0,0,0,0,0,0,0
28,BS8,0,1,0,0,0,0,0,0,0,0,0,0
39,BS1,0,0,0,1,0,0,0,0,0,0,0,0
35,BS1,0,0,0,1,0,0,0,0,0,0,0,0


In [32]:
coffee_onehot.shape

(109, 13)

In [33]:
coffee_grouped = coffee_onehot.groupby('PostCode').mean().reset_index()
coffee_grouped

Unnamed: 0,PostCode,Breakfast Spot,Café,Coffee Roaster,Coffee Shop,College Cafeteria,Creperie,Distribution Center,Donut Shop,Food & Drink Shop,Food Truck,Furniture / Home Store,Sandwich Place
0,BS1,0.0,0.027778,0.0,0.833333,0.027778,0.027778,0.0,0.0,0.0,0.027778,0.055556,0.0
1,BS10,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,BS13,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BS2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BS3,0.0,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,BS31,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,BS34,0.045455,0.090909,0.0,0.818182,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0
7,BS6,0.0,0.363636,0.0,0.454545,0.0,0.0,0.090909,0.090909,0.0,0.0,0.0,0.0
8,BS8,0.0,0.125,0.125,0.6875,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0
9,BS9,0.0,0.0,0.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1


In [34]:
num_top_venues = 5

for code in coffee_grouped['PostCode']:
    print("----"+code+"----")
    temp = coffee_grouped[coffee_grouped['PostCode'] == code].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----BS1----
                    venue  freq
0             Coffee Shop  0.83
1  Furniture / Home Store  0.06
2                    Café  0.03
3       College Cafeteria  0.03
4                Creperie  0.03


----BS10----
               venue  freq
0        Coffee Shop   1.0
1     Breakfast Spot   0.0
2               Café   0.0
3     Coffee Roaster   0.0
4  College Cafeteria   0.0


----BS13----
               venue  freq
0        Coffee Shop   1.0
1     Breakfast Spot   0.0
2               Café   0.0
3     Coffee Roaster   0.0
4  College Cafeteria   0.0


----BS2----
               venue  freq
0        Coffee Shop   1.0
1     Breakfast Spot   0.0
2               Café   0.0
3     Coffee Roaster   0.0
4  College Cafeteria   0.0


----BS3----
               venue  freq
0               Café   0.5
1        Coffee Shop   0.5
2     Breakfast Spot   0.0
3     Coffee Roaster   0.0
4  College Cafeteria   0.0


----BS31----
               venue  freq
0        Coffee Shop   1.0
1     Breakfast Spot 

In [35]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [36]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['PostCode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['PostCode'] = coffee_grouped['PostCode']

for ind in np.arange(coffee_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(coffee_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,PostCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,BS1,Coffee Shop,Furniture / Home Store,Food Truck,Creperie,College Cafeteria,Café,Sandwich Place,Food & Drink Shop,Donut Shop,Distribution Center
1,BS10,Coffee Shop,Sandwich Place,Furniture / Home Store,Food Truck,Food & Drink Shop,Donut Shop,Distribution Center,Creperie,College Cafeteria,Coffee Roaster
2,BS13,Coffee Shop,Sandwich Place,Furniture / Home Store,Food Truck,Food & Drink Shop,Donut Shop,Distribution Center,Creperie,College Cafeteria,Coffee Roaster
3,BS2,Coffee Shop,Sandwich Place,Furniture / Home Store,Food Truck,Food & Drink Shop,Donut Shop,Distribution Center,Creperie,College Cafeteria,Coffee Roaster
4,BS3,Coffee Shop,Café,Sandwich Place,Furniture / Home Store,Food Truck,Food & Drink Shop,Donut Shop,Distribution Center,Creperie,College Cafeteria


In [37]:
# set number of clusters
kclusters = 6

coffee_grouped_clustering = coffee_grouped.drop('PostCode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(coffee_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([4, 3, 3, 3, 0, 3, 4, 5, 2, 1])

In [38]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

coffee_merged = final_data

coffee_merged = coffee_merged.join(neighbourhoods_venues_sorted.set_index('PostCode'), on='PostCode')

coffee_merged.head() # show to check the last columns

Unnamed: 0,Name,Categories,Latitude,Longitude,PostCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
43,The Crazy Fox Coffee Bar,Café,51.457546,-2.589841,BS3,0,Coffee Shop,Café,Sandwich Place,Furniture / Home Store,Food Truck,Food & Drink Shop,Donut Shop,Distribution Center,Creperie,College Cafeteria
25,Costa Coffee,Coffee Shop,51.458344,-2.596426,BS8,2,Coffee Shop,Coffee Roaster,Café,Food & Drink Shop,Sandwich Place,Furniture / Home Store,Food Truck,Donut Shop,Distribution Center,Creperie
28,Playground Coffee House,Café,51.453785,-2.594831,BS8,2,Coffee Shop,Coffee Roaster,Café,Food & Drink Shop,Sandwich Place,Furniture / Home Store,Food Truck,Donut Shop,Distribution Center,Creperie
39,Full Court Press Specialty Coffee,Coffee Shop,51.455157,-2.593258,BS1,4,Coffee Shop,Furniture / Home Store,Food Truck,Creperie,College Cafeteria,Café,Sandwich Place,Food & Drink Shop,Donut Shop,Distribution Center
35,Costa Coffee,Coffee Shop,51.464561,-2.609786,BS1,4,Coffee Shop,Furniture / Home Store,Food Truck,Creperie,College Cafeteria,Café,Sandwich Place,Food & Drink Shop,Donut Shop,Distribution Center


In [39]:
print(coffee_merged.shape)

(109, 16)


In [40]:
# create map
map_clusters = folium.Map(location=[bris_latitude, bris_longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

map_clusters.choropleth(
 geo_data=json.loads(requests.get(geo_url).text),
 name='choropleth',
 data=population_data,
 columns=['Postcode', 'Population Count'],
 key_on='properties.name',
 fill_color='YlGn',
 fill_opacity=0.6,
 line_opacity=0.2,
 legend_name='Population Density')

# HeatMap(shop_heat_data).add_to(map_clusters)
HeatMap(heat_data).add_to(map_clusters)



# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(coffee_merged['Latitude'], coffee_merged['Longitude'], coffee_merged['PostCode'], coffee_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=2,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

folium.LayerControl().add_to(map_clusters)
       
map_clusters

