# Data Science Capstone Project
<h2>Identifying a good location for a new restaurant in London</h2>

**Notes:** find the number of restaurants per household/person in each area.

<H2>PART 1 - Downloading and Cleaning the Data</H2>

In [44]:
import pandas as pd

In [45]:
# Load the London postcode table from its CSV file into a dataframe and preview the first rows
POSTCODES_CSV = 'London postcodes.csv'
df = pd.read_csv(POSTCODES_CSV)
df.head()

Unnamed: 0,Postcode,District,Population,Households,DistrictPC,Latitude,Longitude
0,BR1 1AA,Bromley,,,BR1,51.401546,0.015415
1,BR1 1AB,Bromley,,,BR1,51.406333,0.015208
2,BR1 1AD,Bromley,,,BR1,51.400057,0.016715
3,BR1 1AE,Bromley,34.0,21.0,BR1,51.404543,0.014195
4,BR1 1AF,Bromley,,,BR1,51.401392,0.014948


In [47]:
# A missing Population / Households value is treated as 0, and both columns should be integers.
# Only these two columns are filled: a blanket df.fillna(0) would also turn a missing
# Latitude/Longitude into 0.0, which would silently drag a district's mean location off the map.
# fillna() returns a new frame here (no inplace), so re-running the cell is harmless.
df = df.fillna({'Population': 0, 'Households': 0}).astype({'Population': int, 'Households': int})
df.head()

Unnamed: 0,Postcode,District,Population,Households,DistrictPC,Latitude,Longitude
0,BR1 1AA,Bromley,0,0,BR1,51.401546,0.015415
1,BR1 1AB,Bromley,0,0,BR1,51.406333,0.015208
2,BR1 1AD,Bromley,0,0,BR1,51.400057,0.016715
3,BR1 1AE,Bromley,34,21,BR1,51.404543,0.014195
4,BR1 1AF,Bromley,0,0,BR1,51.401392,0.014948


In [48]:
# Collapse the postcode-level rows to one row per postcode district (DistrictPC):
# the district's total population and the average latitude/longitude of its postcodes.
# Individual postcodes are not needed beyond this point - only the postcode district.

# Assumption: the mean of all postcode locations in a district approximates the district centre.

district_aggregations = {'Population': 'sum', 'Latitude': 'mean', 'Longitude': 'mean'}
london_districts = df.groupby(by='DistrictPC').agg(district_aggregations)

london_districts.head()

Unnamed: 0_level_0,Population,Latitude,Longitude
DistrictPC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BR1,55962,51.410732,0.019429
BR2,44958,51.390378,0.021642
BR3,47411,51.403392,-0.031774
BR4,19367,51.375569,-0.009892
BR5,46011,51.389195,0.102525


<h2>PART 2 - Clustering Analysis</h2>

In [49]:
%matplotlib inline
import folium

In [52]:
# Continue with a frame in which DistrictPC is an ordinary column again.
# reset_index() without inplace returns a NEW frame, so london_districts keeps its
# DistrictPC index and re-running this cell cannot stack an extra "index" column onto the
# data (which aliasing df to london_districts and resetting in place would do).
df = london_districts.reset_index()
df.head()

Unnamed: 0,DistrictPC,Population,Latitude,Longitude
0,BR1,55962,51.410732,0.019429
1,BR2,44958,51.390378,0.021642
2,BR3,47411,51.403392,-0.031774
3,BR4,19367,51.375569,-0.009892
4,BR5,46011,51.389195,0.102525


<h3>Creating a map of neighbourhoods</h3>

In [53]:
# Base map of London
map_london = folium.Map(location=[51.500153, -0.1262362], zoom_start=10)

# One circle marker per postcode district, with the district code shown in its popup
for district_lat, district_lng, district_code in zip(df['Latitude'], df['Longitude'], df['DistrictPC']):
    district_popup = folium.Popup(district_code, parse_html=True)
    district_marker = folium.CircleMarker(
        [district_lat, district_lng],
        radius=5,
        popup=district_popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False
    )
    district_marker.add_to(map_london)

map_london

<h3>Analysing the neighbourhoods using Foursquare</h3>

In [54]:
# Foursquare credentials are read from a local .env file so they never appear in the notebook
import os
from dotenv import load_dotenv

load_dotenv('.env')

CLIENT_ID = os.getenv('CLIENT_ID')
CLIENT_SECRET = os.getenv('CLIENT_SECRET')

# os.getenv() returns None for an unset variable. Stop here with a clear message instead of
# sending the literal text "None" to the API and failing later while parsing the response.
_missing_credentials = [
    env_name
    for env_name, env_value in (('CLIENT_ID', CLIENT_ID), ('CLIENT_SECRET', CLIENT_SECRET))
    if not env_value
]
if _missing_credentials:
    raise RuntimeError(
        'Missing Foursquare credential(s): {}. Define them in .env or in the environment.'.format(
            ', '.join(_missing_credentials)))

VERSION = '20180605'  # sent as the "v" parameter of every Foursquare request
LIMIT = 100  # sent as the "limit" parameter of every Foursquare request

In [136]:
import requests

def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each search centre. They are iterated together with
        zip(), so iteration stops at the shortest of the three.
    radius : int, default 1000
        Value sent as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the columns Neighbourhood,
        Neighbourhood_Latitude, Neighbourhood_Longitude, Venue, Venue_Latitude,
        Venue_Longitude and Venue_Category. The underscore names are the ones the
        later cells of this notebook select and query on. The frame is empty, but
        still has these columns, when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighbourhood',
               'Neighbourhood_Latitude',
               'Neighbourhood_Longitude',
               'Venue',
               'Venue_Latitude',
               'Venue_Longitude',
               'Venue_Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of formatting the URL by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # never hang the notebook on a stalled connection
        )
        # Surface HTTP failures as a clear HTTPError rather than a KeyError further down
        response.raise_for_status()

        # A location with no results may come back without any group/items; treat it as empty
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # A venue can carry an empty category list; record None instead of raising IndexError
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing columns to the constructor keeps the schema even when there are zero rows
    return pd.DataFrame(venue_rows, columns=columns)

In [137]:
# Collect the venues around every postcode district, searching from each district's mean coordinates
london_venues = getNearbyVenues(df['DistrictPC'], df['Latitude'], df['Longitude'])

BR1
BR2
BR3
BR4
BR5
BR6
BR7
BR8
CM13
CM14
CM23
CR0
CR2
CR3
CR4
CR44
CR5
CR6
CR7
CR8
CR9
CR90
DA1
DA14
DA15
DA16
DA17
DA18
DA5
DA6
DA7
DA8
E1
E10
E11
E12
E13
E14
E15
E16
E17
E18
E1W
E2
E20
E3
E4
E5
E6
E7
E77
E8
E9
E98
EC1A
EC1M
EC1N
EC1P
EC1R
EC1V
EC1Y
EC2A
EC2M
EC2N
EC2P
EC2R
EC2V
EC2Y
EC3A
EC3B
EC3M
EC3N
EC3P
EC3R
EC3V
EC4A
EC4M
EC4N
EC4P
EC4R
EC4V
EC4Y
EC50
EC88
EN1
EN2
EN3
EN4
EN5
EN6
EN7
EN8
EN9
HA0
HA1
HA2
HA3
HA4
HA5
HA6
HA7
HA8
HA9
IG1
IG11
IG2
IG3
IG4
IG5
IG6
IG7
IG8
IG9
KT1
KT17
KT18
KT19
KT2
KT22
KT3
KT4
KT5
KT6
KT7
KT8
KT9
N1
N10
N11
N12
N13
N14
N15
N16
N17
N18
N19
N1C
N1P
N2
N20
N21
N22
N3
N4
N5
N6
N7
N8
N81
N9
NW1
NW10
NW11
NW1W
NW2
NW26
NW3
NW4
NW5
NW6
NW7
NW8
NW9
RM1
RM10
RM11
RM12
RM13
RM14
RM15
RM2
RM3
RM4
RM5
RM50
RM6
RM7
RM8
RM9
SE1
SE10
SE11
SE12
SE13
SE14
SE15
SE16
SE17
SE18
SE19
SE1P
SE2
SE20
SE21
SE22
SE23
SE24
SE25
SE26
SE27
SE28
SE3
SE4
SE5
SE6
SE7
SE8
SE9
SE99
SM1
SM2
SM3
SM4
SM5
SM6
SM7
SW10
SW11
SW12
SW13
SW14
SW15
SW16
SW17
SW18
SW19
SW1A
SW1E
SW1H
SW1P
SW1

In [96]:
# Show the collected venues table: one row per venue, labelled with the district it was found near
london_venues

Unnamed: 0,Neighbourhood,Neighbourhood_Latitude,Neighbourhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
0,BR1,51.410732,0.019429,Cinnamon Culture,51.414196,0.020883,Indian Restaurant
1,BR1,51.410732,0.019429,Anglesey Arms,51.408703,0.020846,Pub
2,BR1,51.410732,0.019429,O'Neill's,51.407091,0.015784,Irish Pub
3,BR1,51.410732,0.019429,The Railway Tavern,51.407902,0.016725,Bar
4,BR1,51.410732,0.019429,Cow and Pig,51.407401,0.016438,English Restaurant
...,...,...,...,...,...,...,...
11104,WD3,51.625126,-0.492695,Springwell Lock No83,51.625535,-0.494634,Canal Lock
11105,WD6,51.642800,-0.256293,Morrisons,51.644229,-0.255991,Supermarket
11106,WD6,51.642800,-0.256293,Miller & Carter,51.642270,-0.254592,Steakhouse
11107,WD6,51.642800,-0.256293,Smyths Toys,51.644423,-0.255462,Toy / Game Store


In [102]:
# Extract all pubs
london_pubs = london_venues.query("Venue_Category == 'Pub'")
london_pubs.reset_index(drop=True, inplace=True)
london_pubs

Unnamed: 0,Neighbourhood,Neighbourhood_Latitude,Neighbourhood_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
0,BR1,51.410732,0.019429,Anglesey Arms,51.408703,0.020846,Pub
1,BR1,51.410732,0.019429,Crown & Anchor,51.407753,0.021074,Pub
2,BR3,51.403392,-0.031774,Bricklayers Arms,51.406735,-0.031739,Pub
3,BR4,51.375569,-0.009892,The Swan,51.375924,-0.014897,Pub
4,BR6,51.367569,0.091802,The Maxwell,51.371964,0.090931,Pub
...,...,...,...,...,...,...,...
648,WC2R,51.512226,-0.118394,The Cheshire Cheese,51.512531,-0.113178,Pub
649,WC2R,51.512226,-0.118394,The George,51.513211,-0.113205,Pub
650,WC2R,51.512226,-0.118394,George IV,51.514754,-0.116801,Pub
651,WC2R,51.512226,-0.118394,Ye Old White Horse,51.514607,-0.117135,Pub


In [124]:
# Count the pubs found in each district; the columns are named DistrictPC / Pubs
pubs_pop = (
    london_pubs
    .groupby('Neighbourhood', as_index=False)
    .agg({'Venue_Category': 'count'})
    .rename(columns={'Neighbourhood': 'DistrictPC', 'Venue_Category': 'Pubs'})
)
pubs_pop

Unnamed: 0,DistrictPC,Pubs
0,BR1,2
1,BR3,1
2,BR4,1
3,BR6,1
4,BR7,1
...,...,...
209,WC2A,11
210,WC2B,2
211,WC2E,1
212,WC2N,2


In [126]:
# Attach each district's population and coordinates to its pub count.
# The default join is an inner join on DistrictPC, so only districts present in both tables remain.
merged = pubs_pop.merge(df, on='DistrictPC')
merged

Unnamed: 0,DistrictPC,Pubs,Population,Latitude,Longitude
0,BR1,2,55962,51.410732,0.019429
1,BR3,1,47411,51.403392,-0.031774
2,BR4,1,19367,51.375569,-0.009892
3,BR6,1,45775,51.367569,0.091802
4,BR7,1,17322,51.411787,0.059159
...,...,...,...,...,...
209,WC2A,11,151,51.516091,-0.115392
210,WC2B,2,1831,51.514962,-0.120952
211,WC2E,1,647,51.512478,-0.123816
212,WC2N,2,861,51.509824,-0.125336


In [127]:
# Residents per pub in each district: total population divided by the number of pubs found
merged = merged.assign(PopPerPub=merged['Population'].div(merged['Pubs']))
merged

Unnamed: 0,DistrictPC,Pubs,Population,Latitude,Longitude,PopPerPub
0,BR1,2,55962,51.410732,0.019429,27981.000000
1,BR3,1,47411,51.403392,-0.031774,47411.000000
2,BR4,1,19367,51.375569,-0.009892,19367.000000
3,BR6,1,45775,51.367569,0.091802,45775.000000
4,BR7,1,17322,51.411787,0.059159,17322.000000
...,...,...,...,...,...,...
209,WC2A,11,151,51.516091,-0.115392,13.727273
210,WC2B,2,1831,51.514962,-0.120952,915.500000
211,WC2E,1,647,51.512478,-0.123816,647.000000
212,WC2N,2,861,51.509824,-0.125336,430.500000


In [133]:
# Rank the districts from most to fewest residents per pub and renumber the rows to match
merged = merged.sort_values('PopPerPub', ascending=False).reset_index(drop=True)
merged

Unnamed: 0,DistrictPC,Pubs,Population,Latitude,Longitude,PopPerPub
0,CR0,1,153812,51.373244,-0.078287,153812.0
1,NW10,1,91959,51.539231,-0.248872,91959.0
2,E14,1,82542,51.507595,-0.020758,82542.0
3,IG1,1,63585,51.559172,0.072751,63585.0
4,SE9,1,63082,51.446107,0.054583,63082.0
...,...,...,...,...,...,...
209,SE1P,3,0,51.492775,-0.080151,0.0
210,EC50,7,0,51.524503,-0.112088,0.0
211,EC3B,2,0,51.514030,-0.081823,0.0
212,E98,4,0,51.508604,-0.064100,0.0


In [147]:
# Coordinates of each district together with its residents-per-pub value
heat_columns = ['Latitude', 'Longitude', 'PopPerPub']
locdata = merged[heat_columns]
locdata

Unnamed: 0,Latitude,Longitude,PopPerPub
0,51.373244,-0.078287,153812.0
1,51.539231,-0.248872,91959.0
2,51.507595,-0.020758,82542.0
3,51.559172,0.072751,63585.0
4,51.446107,0.054583,63082.0
...,...,...,...
209,51.492775,-0.080151,0.0
210,51.524503,-0.112088,0.0
211,51.514030,-0.081823,0.0
212,51.508604,-0.064100,0.0


In [148]:
from folium.plugins import HeatMap

# Heat map of residents per pub: each district's coordinates are passed with its PopPerPub value
hmap = folium.Map(
    location=[51.500153, -0.1262362],
    tiles='stamentoner',
    control_scale=True,
    zoom_start=10,
)
pop_per_pub_layer = HeatMap(
    merged[['Latitude', 'Longitude', 'PopPerPub']],
    radius=25,
    gradient={.4: 'blue', .65: 'lime', 1: 'red'},
)
hmap.add_child(pop_per_pub_layer)