In [32]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import folium
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
import geocoder
import requests



### Scraping Wikipedia entry using Beautiful Soup and saving Wikitable Sortable as a new dataframe

In [33]:
url = 'https://en.wikipedia.org/wiki/List_of_London_boroughs'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html5lib")
borough_table_soup=soup.find_all('table', class_='wikitable sortable')


In [34]:
A=[]
B=[]
C=[]
D=[]
E=[]
F=[]
G=[]
H=[]
I=[]
J=[]

for row in borough_table_soup[0].findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==10:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))
        D.append(cells[3].find(text=True))
        E.append(cells[4].find(text=True))
        F.append(cells[5].find(text=True))
        G.append(cells[6].find(text=True))
        H.append(cells[7].find(text=True))
        I.append(cells[8].find(text=True))
        J.append(cells[9].find(text=True))

borough_table=pd.DataFrame(A,columns=['Borough'])
borough_table['Inner']=B
borough_table['Status']=C
borough_table['Local authority']=D
borough_table['Political control']=E
borough_table['Headquarters']=F
borough_table['Area (sq mi)']=G
borough_table['Population (2019 est)']=H
borough_table['Co-ordinates']=I
borough_table['Nr. in map']=J



### Data cleaning and preperation

In [35]:
borough_table.drop(['Inner', 'Status', 'Co-ordinates','Nr. in map',  'Headquarters'], axis=1, inplace=True)
borough_table['Area (sq mi)'] = borough_table['Area (sq mi)'].str.replace(r'\n', '')
borough_table['Population (2019 est)'] = borough_table['Population (2019 est)'].str.replace(r'\n', '')

  borough_table['Area (sq mi)'] = borough_table['Area (sq mi)'].str.replace(r'\n', '')
  borough_table['Population (2019 est)'] = borough_table['Population (2019 est)'].str.replace(r'\n', '')


In [36]:
COL = {'Borough':'City of London', 'Local authority':'Corporation of London', 'Political control':'N/A', 'Area (sq mi)':'1.12', 'Population (2019 est)':'9721'}
borough_table = borough_table.append(COL, ignore_index=True)
borough_coordinates = pd.read_csv('London Boroughs.csv')
borough_table = pd.merge(borough_table, borough_coordinates, on='Borough', how='inner')
borough_table

Unnamed: 0,Borough,Local authority,Political control,Area (sq mi),Population (2019 est),Latitude,Longitude
0,Barking and Dagenham,Barking and Dagenham London Borough Council,Labour,13.93,212906,51.5607,0.1557
1,Barnet,Barnet London Borough Council,Conservative,33.49,395896,51.6252,-0.1517
2,Bexley,Bexley London Borough Council,Conservative,23.38,248287,51.4549,0.1505
3,Brent,Brent London Borough Council,Labour,16.7,329771,51.5588,-0.2817
4,Bromley,Bromley London Borough Council,Conservative,57.97,332336,51.4039,0.0198
5,Camden,Camden London Borough Council,Labour,8.4,270029,51.529,-0.1255
6,Croydon,Croydon London Borough Council,Labour,33.41,386710,51.3714,-0.0977
7,Ealing,Ealing London Borough Council,Labour,21.44,341806,51.513,-0.3089
8,Enfield,Enfield London Borough Council,Labour,31.74,333794,51.6538,-0.0799
9,Greenwich,Greenwich London Borough Council,Labour,18.28,287942,51.4892,0.0648


### Creating new dataframe using Geocoder to get coordinates of geographic center or boroughs

In [37]:
def get_latlng(Borough):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, London, United Kingdom'.format(Borough))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [38]:
coords = [get_latlng(Borough) for Borough in borough_table["Borough"].tolist() ]

In [39]:
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])
ldn_df = pd.DataFrame (borough_table['Borough'])
ldn_df['Latitude'] = df_coords['Latitude']
ldn_df['Longitude'] = df_coords['Longitude']
ldn_df.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Barking and Dagenham,51.537452,0.07204
1,Barnet,51.6273,-0.25376
2,Bexley,51.452078,0.069931
3,Brent,51.609783,-0.194672
4,Bromley,51.601511,-0.066365


In [40]:
address = 'London, United Kingdom'

geolocator = Nominatim(user_agent="http")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London, United Kingdom is {}, {}.'.format(latitude, longitude))

The geograpical coordinate of London, United Kingdom is 51.5073219, -0.1276474.


### Mapping coordinates

In [41]:
london_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, Borough in zip(ldn_df['Latitude'], ldn_df['Longitude'], ldn_df['Borough']):
    label = '{}'.format(Borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(london_map)  
    
london_map

In [42]:
CLIENT_ID = 'xxxxxxxxxxxxxxxxxxxxxxxxxxx'
CLIENT_SECRET = 'xxxxxxxxxxxxxxxxxxxxxxxxxxx'
VERSION = '20180605'
LIMIT = 200


### Using Foursquare API to get list of venues around geographic center of boroughs

In [43]:
def getNearbyVenues(names, latitudes, longitudes, radius=6000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [44]:
london_venues = getNearbyVenues(names=ldn_df['Borough'], latitudes=ldn_df['Latitude'], longitudes=ldn_df['Longitude'])
london_venues

Barking and Dagenham
Barnet
Bexley
Brent
Bromley
Camden
Croydon
Ealing
Enfield
Greenwich
Hackney
Hammersmith and Fulham
Haringey
Harrow
Havering
Hillingdon
Hounslow
Islington
Kensington and Chelsea
Kingston upon Thames
Lambeth
Lewisham
Merton
Newham
Redbridge
Richmond upon Thames
Southwark
Sutton
Tower Hamlets
Waltham Forest
Wandsworth
Westminster


Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barking and Dagenham,51.537452,0.072040,Barking Abbey,51.535352,0.076054,Park
1,Barking and Dagenham,51.537452,0.072040,Barking Park,51.545217,0.086134,Park
2,Barking and Dagenham,51.537452,0.072040,McDonald's,51.534031,0.053797,Fast Food Restaurant
3,Barking and Dagenham,51.537452,0.072040,Wanstead Park,51.567301,0.041202,Park
4,Barking and Dagenham,51.537452,0.072040,Pets at Home,51.520473,0.070494,Pet Store
...,...,...,...,...,...,...,...
3195,Westminster,51.628249,0.012986,M&S Simply Food,51.593830,0.024610,Grocery Store
3196,Westminster,51.628249,0.012986,Loughton Leisure Centre,51.648444,0.058119,Pool
3197,Westminster,51.628249,0.012986,GAIL's Bakery,51.578719,0.025727,Bakery
3198,Westminster,51.628249,0.012986,George's Souvlaki Bar,51.593929,0.024862,Greek Restaurant


### Listing unique venues produced from list

In [45]:
london_venues['Venue Category'].unique()


array(['Park', 'Fast Food Restaurant', 'Pet Store', 'Go Kart Track',
       'Pub', 'Gastropub', 'Restaurant', 'History Museum',
       'Gym / Fitness Center', 'Supermarket', 'Bar', 'Hotel',
       'Grocery Store', 'Café', 'Bakery', 'Thai Restaurant',
       'Dim Sum Restaurant', 'Pool', 'Ice Cream Shop', 'Music Venue',
       'Dam', 'Lingerie Store', 'Soccer Field', 'Fish & Chips Shop',
       'Theater', 'Flea Market', 'Toy / Game Store', 'Clothing Store',
       'Coffee Shop', 'Pier', 'Canal Lock', 'Butcher', 'Nightclub',
       'Indie Theater', 'Modern European Restaurant', 'Beer Bar',
       'Lighthouse', 'Asian Restaurant', 'Rafting', 'Italian Restaurant',
       'Burger Joint', 'Mediterranean Restaurant', 'Pizza Place',
       'Mexican Restaurant', 'Nature Preserve', 'Sandwich Place',
       'Department Store', 'Wine Shop', 'Electronics Store', 'Brewery',
       'Vietnamese Restaurant', 'Gym', 'Golf Course',
       'Turkish Restaurant', 'Chinese Restaurant',
       'Portuguese Res

### Performing onehot on all venues

In [46]:
london_onehot = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")
london_onehot['Borough'] = london_venues['Borough']
fixed_columns = [london_onehot.columns[-1]] + list(london_onehot.columns[:-1])
london_onehot = london_onehot[fixed_columns]

In [47]:
london_grouped = london_onehot.groupby('Borough').mean().reset_index()
london_grouped

Unnamed: 0,Borough,Afghan Restaurant,American Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,...,Vietnamese Restaurant,Warehouse Store,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Barking and Dagenham,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
1,Barnet,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bexley,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01
3,Brent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0
4,Bromley,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Camden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0
6,Croydon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0
7,Ealing,0.0,0.0,0.01,0.02,0.0,0.0,0.01,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Enfield,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.02,0.0,0.0,0.0,0.0
9,Greenwich,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.01,...,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.01


### Creating new dataframe with onehot results for Movie Theater and Indie Movie Theater

In [48]:
len(london_grouped[london_grouped["Movie Theater"] > 0])
london_cinema = london_grouped[["Borough","Movie Theater", "Indie Movie Theater"]]
london_cinema


Unnamed: 0,Borough,Movie Theater,Indie Movie Theater
0,Barking and Dagenham,0.0,0.0
1,Barnet,0.01,0.0
2,Bexley,0.0,0.0
3,Brent,0.02,0.0
4,Bromley,0.0,0.01
5,Camden,0.02,0.01
6,Croydon,0.0,0.01
7,Ealing,0.0,0.0
8,Enfield,0.01,0.0
9,Greenwich,0.0,0.0


### Identifying best K for clustering

In [None]:
london_cinema_clustering = london_cinema.drop('Borough', 1)

def cluster_variance(n):
    variances=[]
    kmeans=[]
    outputs=[]
    K=[i for i in range(1,n+1)]
    for i in range(1,n+1):
        variance=0
        model=KMeans(n_clusters=i,random_state=82,verbose=2).fit(london_cinema_clustering)
        kmeans.append(model)
        variances.append(model.inertia_)
        
    return variances,K,n

variances,K,n=cluster_variance(10)
plt.plot(K,variances)
plt.ylabel("Inertia (Total Distance)")
plt.xlabel("K Value")
plt.xticks([i for i in range(1,n+1)])
plt.show()

### Clustering using K=3

In [50]:
kclusters = 3
cinema_clustering = london_cinema.drop(["Borough"], 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(cinema_clustering)
kmeans.labels_[0:20]

array([1, 2, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 0, 1, 1, 0, 1, 1, 2, 1])

In [51]:
london_merged = london_cinema.copy()
london_merged["Cluster Labels"] = kmeans.labels_
london_merged

Unnamed: 0,Borough,Movie Theater,Indie Movie Theater,Cluster Labels
0,Barking and Dagenham,0.0,0.0,1
1,Barnet,0.01,0.0,2
2,Bexley,0.0,0.0,1
3,Brent,0.02,0.0,2
4,Bromley,0.0,0.01,0
5,Camden,0.02,0.01,2
6,Croydon,0.0,0.01,0
7,Ealing,0.0,0.0,1
8,Enfield,0.01,0.0,2
9,Greenwich,0.0,0.0,1


In [52]:
london_merged = london_merged.join(ldn_df.set_index("Borough"), on="Borough")
london_merged

Unnamed: 0,Borough,Movie Theater,Indie Movie Theater,Cluster Labels,Latitude,Longitude
0,Barking and Dagenham,0.0,0.0,1,51.537452,0.07204
1,Barnet,0.01,0.0,2,51.6273,-0.25376
2,Bexley,0.0,0.0,1,51.452078,0.069931
3,Brent,0.02,0.0,2,51.609783,-0.194672
4,Bromley,0.0,0.01,0,51.601511,-0.066365
5,Camden,0.02,0.01,2,51.59118,-0.16504
6,Croydon,0.0,0.01,0,51.59347,-0.08338
7,Ealing,0.0,0.0,1,51.508383,-0.3052
8,Enfield,0.01,0.0,2,51.540024,-0.077502
9,Greenwich,0.0,0.0,1,51.47789,-0.01334


### Mapping all clusters

In [53]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(london_merged['Latitude'], london_merged['Longitude'], london_merged['Borough'], london_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Breaking down clusters

In [54]:
london_merged.loc[london_merged['Cluster Labels'] == 0]

Unnamed: 0,Borough,Movie Theater,Indie Movie Theater,Cluster Labels,Latitude,Longitude
4,Bromley,0.0,0.01,0,51.601511,-0.066365
6,Croydon,0.0,0.01,0,51.59347,-0.08338
11,Hammersmith and Fulham,0.01,0.01,0,51.4826,-0.21288
12,Haringey,0.0,0.01,0,51.58927,-0.106405
15,Hillingdon,0.0,0.01,0,51.48423,-0.096477
21,Lewisham,0.0,0.01,0,51.46528,-0.01321
24,Redbridge,0.0,0.01,0,51.475773,-0.080698
25,Richmond upon Thames,0.0,0.01,0,51.48027,-0.23754


In [55]:
london_merged.loc[london_merged['Cluster Labels'] == 1]

Unnamed: 0,Borough,Movie Theater,Indie Movie Theater,Cluster Labels,Latitude,Longitude
0,Barking and Dagenham,0.0,0.0,1,51.537452,0.07204
2,Bexley,0.0,0.0,1,51.452078,0.069931
7,Ealing,0.0,0.0,1,51.508383,-0.3052
9,Greenwich,0.0,0.0,1,51.47789,-0.01334
13,Harrow,0.0,0.0,1,51.51318,-0.10698
14,Havering,0.0,0.0,1,51.54461,-0.14426
16,Hounslow,0.0,0.0,1,51.471393,-0.351374
17,Islington,0.0,0.0,1,51.53438,-0.10894
19,Kingston upon Thames,0.0,0.0,1,51.410881,-0.291933
20,Lambeth,0.0,0.0,1,51.494471,-0.120066


In [56]:
london_merged.loc[london_merged['Cluster Labels'] == 2]

Unnamed: 0,Borough,Movie Theater,Indie Movie Theater,Cluster Labels,Latitude,Longitude
1,Barnet,0.01,0.0,2,51.6273,-0.25376
3,Brent,0.02,0.0,2,51.609783,-0.194672
5,Camden,0.02,0.01,2,51.59118,-0.16504
8,Enfield,0.01,0.0,2,51.540024,-0.077502
10,Hackney,0.01,0.0,2,51.53182,-0.06178
18,Kensington and Chelsea,0.02,0.0,2,51.52266,-0.20793
22,Merton,0.02,0.0,2,51.54452,-0.16686
28,Tower Hamlets,0.01,0.0,2,51.49999,-0.01045


### Identifying best cluster for further analysis, based on total lack of current amenities

In [59]:
no_theater = london_merged.loc[london_merged['Cluster Labels'] == 1]
borough_joined = pd.merge(borough_table, no_theater, on='Borough', how='inner')
borough_joined.drop(['Indie Movie Theater', 'Movie Theater'], axis=1, inplace=True)
borough_joined.sort_values(['Population (2019 est)'], ascending=False, inplace=True)
borough_joined

Unnamed: 0,Borough,Local authority,Political control,Area (sq mi),Population (2019 est),Latitude_x,Longitude_x,Cluster Labels,Latitude_y,Longitude_y
10,Newham,Newham London Borough Council,Labour,13.98,353134,51.5077,0.0469,1,51.519937,0.055882
2,Ealing,Ealing London Borough Council,Labour,21.44,341806,51.513,-0.3089,1,51.508383,-0.3052
14,Wandsworth,Wandsworth London Borough Council,Conservative,13.23,329677,51.4567,-0.191,1,51.467826,-0.144992
9,Lambeth,Lambeth London Borough Council,Labour,10.36,326034,51.4607,-0.1163,1,51.494471,-0.120066
11,Southwark,Southwark London Borough Council,Labour,11.14,318830,51.5035,-0.0804,1,51.505734,-0.100002
3,Greenwich,Greenwich London Borough Council,Labour,18.28,287942,51.4892,0.0648,1,51.47789,-0.01334
13,Waltham Forest,Waltham Forest London Borough Council,Labour,14.99,276983,51.5908,-0.0134,1,51.581765,-0.276968
6,Hounslow,Hounslow London Borough Council,Labour,21.61,271523,51.4746,-0.368,1,51.471393,-0.351374
15,Westminster,Westminster City Council,Conservative,8.29,261317,51.4973,-0.1372,1,51.628249,0.012986
5,Havering,Havering London Borough Council,Conservative,43.35,259552,51.5812,0.1837,1,51.54461,-0.14426
