In [None]:
# Import every dependency up front. A failed import is reported and then re-raised:
# carrying on with a partially imported environment only produces confusing
# NameErrors in later cells.
print("Importing libraries...\n")
try:
    from progressbar import ProgressBar
    from bs4 import BeautifulSoup as bts # library for web scraping
    import numpy as np # library to handle data in a vectorized manner
    import pandas as pd # library for data analysis
    try:
        from pandas import json_normalize # top-level location (pandas >= 1.0)
    except ImportError:
        from pandas.io.json import json_normalize # legacy location in older pandas
    import matplotlib.cm as cm
    import matplotlib.colors as colors
    import requests # library to handle requests
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
    import matplotlib as mp # library for visualization
    from sklearn.cluster import KMeans # import k-means from clustering stage
    import folium # map rendering library
    import lxml
    import re
    from time import sleep

    from matplotlib import pyplot as plt
    from matplotlib.pyplot import figure

    import datetime
    import dateutil
except ImportError as import_error:
    print("ERROR: Could not import all libraries!\n")
    print(import_error)  # name the missing package before stopping
    raise
else:
    print("All libraries imported successfully!\n")

%matplotlib inline


# Mapping London

In [None]:
# Geocode the city itself; these coordinates centre the London-wide maps drawn below.
address = 'London'

geolocator = Nominatim(user_agent="ldn_explorer")
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print(f'The geograpical coordinate of London are {latitude}, {longitude}.')

## 1. Boroughs

London can be subdivided into boroughs. The borough boundaries are available as a GeoJSON overlay, 'london_boroughs_proper.geojson'. Much of the data provided by the London Datastore is at borough level.

In [None]:
# Base map centred on London with the borough boundaries drawn from the local GeoJSON file
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(
    location=[latitude, longitude],
    zoom_start=10,
)
folium.GeoJson(
    lnd_geo,
    name="geojson",
).add_to(lnd_map)

lnd_map

## 2. Wards

In [None]:
# Base map centred on London with the 2014 ward boundaries drawn from the local GeoJSON file
lnd_ward_geo = r'london-wards-2014.geojson'
lnd_map = folium.Map(
    location=[latitude, longitude],
    zoom_start=10,
)
folium.GeoJson(
    lnd_ward_geo,
    name="geojson",
).add_to(lnd_map)

lnd_map

## 3. Neighbourhoods

Rather than modern political divisions, London communities centre around neighbourhoods, often based on old parishes and manor houses, and often predating the expansion of London.

No official data on these neighbourhoods was sourced, but a list is available on Wikipedia.  Many of these neighbourhoods are geocodable, so the Wikipedia list can be web-scraped and a geolocator used to obtain coordinates.

> Scrape the table of neighbourhoods from wikipedia using beautiful soup

In [None]:
from io import StringIO  # read_html expects a file-like object rather than a literal HTML string

# Fetch the Wikipedia page; fail loudly on an HTTP error instead of parsing an error page.
london_response = requests.get('https://en.wikipedia.org/wiki/List_of_areas_of_London', timeout=30)
london_response.raise_for_status()
london_source = london_response.text

# Keep only the sortable wikitable(s), which hold the list of areas.
london_soup = bts(london_source, 'lxml')
london_table = london_soup.find_all('table', class_="wikitable sortable")

# Parse every matching table and stack them. ignore_index gives each row a unique label,
# which the later per-row .loc[idx, ...] updates rely on.
wiki_tables = pd.read_html(StringIO(str(london_table)))
wiki_df = pd.concat(wiki_tables, ignore_index=True)
wiki_df.head()

**Observation:** Neighbourhoods can cross London boroughs

In [None]:
# Number of distinct location names in the scraped table
unique_location_count = len(wiki_df['Location'].unique())
print(f'The list contains {unique_location_count} London Neighbourhoods')

### Clean neighbourhood data

> Check whether any locations contain characters other than letters and spaces

In [None]:
# Rows whose location name, ignoring spaces, is not purely alphabetic
wiki_df.loc[wiki_df['Location'].str.replace(" ", "").str.isalpha().eq(False)]

> Clean the data in the location column - remove notes in brackets and additional spaces

In [None]:
# Remove bracketed notes "(...)" from the location names.
# regex=True is required: newer pandas treats the pattern as a literal string by default,
# which would leave the brackets in place.
wiki_df['Location'] = wiki_df['Location'].str.replace(r"\(.*\)", "", regex=True)
# Remove trailing/leading spaces left behind by the substitution
wiki_df['Location'] = wiki_df['Location'].str.strip()

# Re-run the check: any rows listed here still contain characters other than letters and spaces
wiki_df[wiki_df['Location'].str.replace(" ", "").str.isalpha()==False]

> Obtain coordinates for each neighbourhood.  This is done using geolocator.  Due to timeout issues, not all coordinates are retrieved on the first pass, so this code has been designed to be re-runnable, retrying locations where data is absent

In [None]:
# Add the coordinate columns as NaN placeholders; NaN marks a neighbourhood that still
# needs geocoding. Existing columns are left untouched so that re-running this cell
# does not discard coordinates that were already retrieved.
for coordinate_column in ('latitude', 'longitude'):
    if coordinate_column not in wiki_df.columns:
        wiki_df[coordinate_column] = np.nan

In [None]:
## Re-runnable if the geocoder errors: only rows still missing a latitude are looked up.

geolocator = Nominatim(user_agent="ldn_explorer")  # one client serves the whole loop

for idx, place in zip(wiki_df.index, wiki_df['Location']):
    lat = wiki_df.loc[idx, 'latitude']

    if not pd.isna(lat):
        print(place + ' already has latitude ' + str(lat))
        continue

    try:
        location = geolocator.geocode(place + ", London, United Kingdom")
    except Exception:
        # Problem reaching the geocoding service (timeout, rate limit, ...). Skip the row so
        # that a stale result from the previous place is never written; a re-run retries it.
        print('Geocoder error ' + place)
        continue

    if location is None:
        # The service answered but found no match, i.e. the neighbourhood is not geocodable.
        print("Error " + place)
        continue

    wiki_df.loc[idx, 'latitude'] = location.latitude
    wiki_df.loc[idx, 'longitude'] = location.longitude
    print('The geograpical coordinate of {} is {}, {}.'.format(place, location.latitude, location.longitude))

**Observation:** Two neighbourhoods, Somerstown and Aldborough Hatch, are not geocodable. Coordinates have been obtained for the other neighbourhoods. 

**Data Validation:**    A sense check has been applied to validate this data: all latitudes truncate to 51 and all longitudes are around 0.  This indicates that geolocator has correctly geocoded the neighbourhoods.

In [None]:
# Display the last rows to inspect the newly added coordinate columns
wiki_df.tail()

> Remove the two neighbourhoods which were not geocodable (ie have no latitude data)

In [None]:
# Drop only the neighbourhoods that could not be geocoded (or have no name).
# Restricting the check to these columns keeps rows that merely lack an unrelated
# attribute elsewhere in the scraped table.
wiki_df.dropna(axis=0, subset=['Location', 'latitude'], inplace=True)

> Save the scraped neighbourhood list and the obtained coordinates for use in later analysis

In [None]:
# HACK: save the data so it can be reloaded later, avoiding unnecessary geocoder calls
# and enabling offline analysis.
wiki_df.to_csv('london_neighbourhoods.csv')
# To work offline, uncomment the next line instead of re-running the scraping/geocoding cells:
#wiki_df=pd.read_csv('london_neighbourhoods.csv')

### Neighbourhoods in relation to wards

The neighbourhoods can be plotted with the ward boundaries superimposed

In [None]:


# Ward boundaries with one circle marker per geocoded neighbourhood on top
lnd_ward_geo = r'london-wards-2014.geojson'
lnd_map = folium.Map(
    location=[latitude, longitude],
    zoom_start=10,
)
folium.GeoJson(
    lnd_ward_geo,
    name="geojson",
).add_to(lnd_map)

# add markers to the map
markers_colors = []
neighbourhood_points = zip(wiki_df['latitude'], wiki_df['longitude'], wiki_df['Location'])
for lat, lon, name in neighbourhood_points:
    label = folium.Popup(name, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7,
    )
    marker.add_to(lnd_map)

lnd_map

**Observation:** Both ward and neighbourhoods provide a reasonable granular coverage of london. Ward data is comprehensive, covering all of London, whereas the neighbourhoods provide a good sample across London.

**Conclusion:** A visualisation of ward level data compared with neighbourhood data, may have potential for providing meaningful insights

# Obtaining Foursquare data

## Introduction to Foursquare API

The Foursquare API returns venues in a location based on either a location coordinate or a geocodable place name.  The explore endpoint prioritises recommended locations.  

In [None]:
 
import os
from getpass import getpass

# Foursquare credentials are never stored in the notebook. They are read from the
# environment (FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET) and, if absent,
# prompted for without echoing, so they cannot leak through version control.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ')  # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')  # Foursquare Secret


In [None]:

VERSION = '20180605' # Foursquare API version, sent as the `v` query parameter of every request
LIMIT = 150 # Requested number of venues per call, sent as `limit`. NOTE(review): the notebook later observes a Foursquare maximum of 100 - confirm whether 150 is honoured

## 1. Borough

An assessment of the best method of obtaining London borough level venue data from Foursquare API

### Using Borough Coordinates with Foursquare API

A point within each London borough can be located, and venues returned within a radius around this point.  To use this approach suitable coordinates should be identified.  

#### Borough Coordinates obtained from Geolocator

> First obtain a list of London boroughs, sourcing this from the UK government London Datastore. Also retrieve population data for use in data exploration

In [None]:
# London borough profiles published on the London Datastore
file = 'https://data.london.gov.uk/download/london-borough-profiles/c1693b82-68b1-44ee-beb2-3decf17dc1f8/london-borough-profiles.csv'
boroughsData = pd.read_csv(file, encoding='latin1')

# The profile holds many indicators; only the area name and the 2017 population estimate are kept
boroughs = (
    boroughsData[['Area_name', 'GLA_Population_Estimate_2017']]
    .rename(columns={'Area_name': 'Borough', 'GLA_Population_Estimate_2017': 'Population'})
    .set_index('Borough')
)
boroughs.head()

> The Borough list also contains summary statistics, remove these rows.  

In [None]:

# Aggregate rows in the profile that are not individual boroughs
summary_rows = ['Inner London', 'Outer London', 'London', 'England', 'United Kingdom']
boroughs.drop(labels=summary_rows, axis=0, inplace=True)

> Add longitude and latitude columns to the borough list, and loop through using geolocator to obtain coordinates.  Note that the "City of London" is not technically a London borough, so it is geocoded differently.

In [None]:
# Float NaN placeholders: the columns already have the float dtype the geocoder results
# need (no integer-to-float upcast on assignment), and a borough that was never geocoded
# stays visibly missing instead of sitting at the valid-looking coordinate (0, 0).
boroughs['longitude'] = np.nan
boroughs['latitude'] = np.nan

In [None]:

geolocator = Nominatim(user_agent="ldn_explorer")  # one client serves the whole loop

for borough in boroughs.index:
    # The City of London is not a London borough, so it is searched for by its plain name.
    if borough != 'City of London':
        query = "The London Borough of " + borough + ", London"
    else:
        query = borough + ", London"

    location = geolocator.geocode(query)
    if location is None:
        # No match: leave the placeholder coordinates and report it rather than crash.
        print('No geographical coordinate found for {}.'.format(borough))
        continue

    boroughs.loc[borough, 'latitude'] = location.latitude
    boroughs.loc[borough, 'longitude'] = location.longitude
    print('The geograpical coordinate of {} is {}, {}.'.format(borough, location.latitude, location.longitude))

In [None]:
# Display the first boroughs to check the coordinates written by the loop above
boroughs.head()

In [None]:
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# The choropleth joins on a regular column, so expose the (stripped) index as 'Borough'
boroughs['Borough'] = boroughs.index.str.strip()

# Shade each borough by population. folium.Choropleth replaces the deprecated
# Map.choropleth method, which is no longer available in current folium releases.
folium.Choropleth(
    geo_data=lnd_geo,
    data=boroughs,
    columns=['Borough', 'Population'],
    key_on='feature.properties.name',
    fill_color='RdPu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Population',
).add_to(lnd_map)

# add a marker at the geocoded point of every borough
markers_colors = []
for lat, lon, poi in zip(boroughs['latitude'], boroughs['longitude'], boroughs.index):
    label = folium.Popup(str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7).add_to(lnd_map)

lnd_map

With the exception of Westminster, the coordinates appear to be in the centre of each borough, based on the extremes.  The Westminster location looks wrong, perhaps showing the tube station rather than the centre of the borough; this could be because it is commonly referred to as "The City of Westminster" rather than as a London borough.

> Search for better Westminster coordinates and add to map

In [None]:

# Look Westminster up under its usual name and overwrite the borough's stored coordinates
address = 'City of Westminster'

geolocator = Nominatim(user_agent="ldn_explorer")
wmlocation = geolocator.geocode(address)
wmlatitude, wmlongitude = wmlocation.latitude, wmlocation.longitude
print(f'The geograpical coordinate of Westminster are {wmlatitude}, {wmlongitude}.')

boroughs.loc['Westminster', 'longitude'] = wmlongitude
boroughs.loc['Westminster', 'latitude'] = wmlatitude

In [None]:
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# Shade each borough by population. folium.Choropleth replaces the deprecated
# Map.choropleth method, which is no longer available in current folium releases.
# Relies on the 'Borough' column already present on the boroughs frame.
folium.Choropleth(
    geo_data=lnd_geo,
    data=boroughs,
    columns=['Borough', 'Population'],
    key_on='feature.properties.name',
    fill_color='RdPu',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Population',
).add_to(lnd_map)

# add a marker at the (updated) geocoded point of every borough
markers_colors = []
for lat, lon, poi in zip(boroughs['latitude'], boroughs['longitude'], boroughs.index):
    label = folium.Popup(str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7).add_to(lnd_map)

lnd_map

Westminster is now located more within the borough.

**Observation:**
These locations could be used for obtaining venue data (ie by searching for venues within a fixed radius around the borough), subject to the following challenges:
1. The centre of the borough may or may not be close to where venues are located, this is particularly relevant in larger outer London boroughs.
2. The slice of a borough taken by searching a radius around a central point may or may not be representative of the borough.
3. Due to the irregular shapes of London boroughs, the centre points can be very close to borough boundaries, so venues returned are likely to be from other boroughs.
4. An appropriate radius for inner London boroughs (which are smaller, and may have venues more densely located) may not be appropriate for outer London boroughs.

**Conclusion:** Given these limitations, the robustness of any analysis on the areas will be compromised; therefore a better approach is required.

### Using Borough Name with Foursquare API

The Foursquare API recognises London Boroughs.  This is a validation and exploration of the data returned

> Define a function to obtain data per borough.  Note that Foursquare does not recognise Westminster as a borough

In [None]:
def getLondonBoroughVenues(boroughs):
    """Fetch recommended Foursquare venues for each London borough, searching by name.

    Parameters
    ----------
    boroughs : iterable of str
        Borough names. 'Westminster' is searched for as "City of Westminster";
        every other name has ", Greater London, United Kingdom" appended.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Borough', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but still has
        these columns) when no venues were retrieved.

    Notes
    -----
    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT. The request
    URL is deliberately not printed because it contains the client secret.
    """
    columns = ['Borough',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for borough in boroughs:
        print(borough)

        if borough == 'Westminster':
            search_borough = "City of Westminster, Greater London, United Kingdom"
        else:
            search_borough = borough + ", Greater London, United Kingdom"

        # Let requests build and escape the query string instead of patching spaces by hand
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'near': search_borough,
                'limit': LIMIT,
            },
            timeout=30)
        payload = response.json()

        groups = payload.get('response', {}).get('groups') or []
        if not groups:
            # Failed request (e.g. unrecognised place): report it and carry on with the rest
            print('No venues returned for {}: {}'.format(borough, payload.get('meta')))
            continue

        # keep only the relevant information for each venue
        for item in groups[0].get('items', []):
            venue = item['venue']
            categories = venue.get('categories') or []  # a venue may have no category
            venue_rows.append((
                borough,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names here keeps the frame well-formed even with zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

> Retrieve venues for each London borough

In [None]:
# Query Foursquare once per borough name (the index of the boroughs frame)
borough_venues = getLondonBoroughVenues(boroughs=boroughs.index)
# Single-borough variant, handy when debugging one request:
#london_venues = getLondonBoroughVenues(['City of London'])

# Report the size of the result, then preview the first rows
print(borough_venues.shape)
borough_venues.head()

**Observation:** Immediate inspection of this, applying local knowledge, shows an issue - the City of London results are from all over London, not just within the "City of London".  This can be demonstrated by plotting on a map. 

As the city of London is a unique case, a different coordinate approach could be used.

In [None]:
# Plot the venues returned for a single borough against the borough boundaries
localBorough = "City of London"

localLng = boroughs.at[localBorough, "longitude"]
locallat = boroughs.at[localBorough, "latitude"]
venues_by_borough = borough_venues.set_index('Borough')
local_venues = venues_by_borough.loc[[localBorough]]

# map centred on the borough's geocoded point
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(
    location=[locallat, localLng],
    zoom_start=10,
    tiles="cartodbpositron",
)
folium.GeoJson(lnd_geo, name="geojson").add_to(lnd_map)

# one marker per venue, labelled with the venue name
markers_colors = local_venues
venue_points = zip(
    local_venues['Venue Latitude'],
    local_venues['Venue Longitude'],
    local_venues.index,
    local_venues['Venue'],
)
for lat, lon, poi, venue in venue_points:
    label = folium.Popup(str(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7,
    )
    marker.add_to(lnd_map)

lnd_map

In [None]:
# Previously used to save a snapshot (note it refers to `london_venues`, not `borough_venues`):
#london_venues.to_csv('borough_venues17Juneall.csv')
# Replace the freshly fetched results with the saved snapshot read from disk.
# NOTE(review): the file must already exist next to the notebook; no cell shown here writes it.
borough_venues = pd.read_csv('borough_venues17Juneall.csv')

Looking at the data returned

In [None]:
# Number of distinct venue categories across all boroughs
borough_category_count = len(borough_venues['Venue Category'].unique())
print(f'There are {borough_category_count} uniques categories of venue.')

In [None]:
# Number of venues returned for each borough
borough_venues.groupby('Borough')['Venue'].count()

**Observation:** There is substantial variation in the amount of data available at borough level: many boroughs hit the maximum Foursquare venue limit (100), whereas others have significantly fewer.  Where the limit is reached, the results will be the most recommended venues, but may not represent the diversity of venues present.

Looking more closely at Newham results

In [None]:
# Plot the venues returned for a single borough against the borough boundaries
localBorough = "Newham"

localLng = boroughs.at[localBorough, "longitude"]
locallat = boroughs.at[localBorough, "latitude"]
venues_by_borough = borough_venues.set_index('Borough')
local_venues = venues_by_borough.loc[[localBorough]]

# map centred on the borough's geocoded point
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(
    location=[locallat, localLng],
    zoom_start=12,
    tiles="cartodbpositron",
)
folium.GeoJson(lnd_geo, name="geojson").add_to(lnd_map)

# one marker per venue, labelled with the venue name
markers_colors = local_venues
venue_points = zip(
    local_venues['Venue Latitude'],
    local_venues['Venue Longitude'],
    local_venues.index,
    local_venues['Venue'],
)
for lat, lon, poi, venue in venue_points:
    label = folium.Popup(str(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7,
    )
    marker.add_to(lnd_map)

lnd_map

Newham venues are not exclusively in Newham; they include a sizeable cluster in Tower Hamlets. For those not familiar with London, Newham is the borough containing West Ham and East Ham. Tower Hamlets is the borough containing Bow and Whitechapel.

Looking more closely at Richmond upon Thames results

In [None]:
# Plot the venues returned for a single borough against the borough boundaries
localBorough = "Richmond upon Thames"

localLng = boroughs.at[localBorough, "longitude"]
locallat = boroughs.at[localBorough, "latitude"]
venues_by_borough = borough_venues.set_index('Borough')
local_venues = venues_by_borough.loc[[localBorough]]

# map centred on the borough's geocoded point
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(
    location=[locallat, localLng],
    zoom_start=12,
    tiles="cartodbpositron",
)
folium.GeoJson(lnd_geo, name="geojson").add_to(lnd_map)

# one marker per venue, labelled with the venue name
markers_colors = local_venues
venue_points = zip(
    local_venues['Venue Latitude'],
    local_venues['Venue Longitude'],
    local_venues.index,
    local_venues['Venue'],
)
for lat, lon, poi, venue in venue_points:
    label = folium.Popup(str(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7,
    )
    marker.add_to(lnd_map)

lnd_map

These venues are mainly in the borough (borough containing Teddington), but not exclusively, and do not seem to be taken from the entire borough.

Looking more closely at Hounslow results

In [None]:
# Plot the venues returned for a single borough against the borough boundaries
localBorough = "Hounslow"

localLng = boroughs.at[localBorough, "longitude"]
locallat = boroughs.at[localBorough, "latitude"]
venues_by_borough = borough_venues.set_index('Borough')
local_venues = venues_by_borough.loc[[localBorough]]

# map centred on the borough's geocoded point
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(
    location=[locallat, localLng],
    zoom_start=12,
    tiles="cartodbpositron",
)
folium.GeoJson(lnd_geo, name="geojson").add_to(lnd_map)

# one marker per venue, labelled with the venue name
markers_colors = local_venues
venue_points = zip(
    local_venues['Venue Latitude'],
    local_venues['Venue Longitude'],
    local_venues.index,
    local_venues['Venue'],
)
for lat, lon, poi, venue in venue_points:
    label = folium.Popup(str(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7,
    )
    marker.add_to(lnd_map)

lnd_map

**Observation:** The foursquare API  is not returning comprehensive or exclusive results for each borough. 

In some instances it appears to be using a simplified representation of a London borough, defined by just two coordinates; due to the highly irregular shapes of London boroughs, this results in inaccurate responses.

In other instances results appear to be clustered around a location within the borough.

### Conclusion
The Foursquare API is not suitable for obtaining data at a borough level for comparison with borough-level crime data.  However, other data sources exist which indicate venues and industries present at a borough level. These can be used for further investigation.

## 2. Wards

Wards are not geocodable, and no set of coordinates for their central points was located. Further, they do not represent community centres, so may lack venues.

## 3. Neighbourhoods

> Define a function to obtain neighbourhood venues from foursquare.  This takes the following parameters


Parameter | Usage
---|---
 names| A list of neighbourhood names to search for. These will be presumed to be in London.
 latitudes| The corresponding list of latitude coordinates, to be used if name search fails. 
 longitudes| The corresponding list of longitude coordinates, to be used if name search fails.  
 isLondons| The coresponding flag as to whether or not the neighbourhood has a London postal address. This will be used to determine search radius if coordinates used

In [None]:
def _explore_foursquare(search_params):
    """Call the Foursquare explore endpoint; return (items, meta), items being None on failure."""
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'limit': LIMIT,
    }
    query.update(search_params)
    try:
        # requests builds and escapes the query string
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=query, timeout=30)
        payload = response.json()
    except (requests.RequestException, ValueError) as request_error:
        # connection problem or a body that is not JSON
        return None, {'errorType': 'request_failed', 'errorDetail': str(request_error)}

    meta = payload.get('meta', {})
    groups = payload.get('response', {}).get('groups') or []
    if not groups:
        return None, meta
    return groups[0].get('items', []), meta


def _venue_rows(name, lat, lng, items, geocodable):
    """Flatten explore items into result tuples for one neighbourhood."""
    rows = []
    for item in items:
        venue = item['venue']
        categories = venue.get('categories') or []  # a venue may have no category
        rows.append((
            name,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None,
            geocodable))
    return rows


def getNearbyVenues(names, latitudes, longitudes, isLondons):
    """Fetch recommended Foursquare venues for each neighbourhood.

    Each neighbourhood is first searched for by name ("<name> London, United Kingdom").
    If Foursquare cannot geocode the name, the search is repeated around the supplied
    coordinates, using a 600 m radius when the neighbourhood has a London post town
    and 1200 m otherwise.

    Parameters
    ----------
    names : iterable of str
        Neighbourhood names.
    latitudes, longitudes : iterable of float
        Coordinates matching `names`, used for the fallback radius search.
    isLondons : iterable of bool
        Flags matching `names`; True selects the smaller fallback radius.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude',
        'Venue Category' and 'Neighborhood Geocodable' (True for a name search,
        False for a coordinate search). A completely empty DataFrame is returned
        when more than 500 requests fail, which suggests a systematic problem such
        as an exhausted account quota.

    Notes
    -----
    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT. Request URLs
    are deliberately not printed because they contain the client secret.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category',
               'Neighborhood Geocodable']

    errCount = 0
    errTolerance = 500  # failed requests tolerated before giving up
    venue_rows = []

    for name, lat, lng, isLondon in zip(names, latitudes, longitudes, isLondons):
        # first attempt: search by neighbourhood name
        geocodable = True
        items, meta = _explore_foursquare({'near': name + " London, United Kingdom"})

        if items is None and meta.get('errorType') == 'failed_geocode':
            # name not recognised: search around the coordinates instead, with a larger
            # radius for neighbourhoods outside the London post town
            geocodable = False
            radius = 600 if isLondon else 1200
            items, meta = _explore_foursquare({'ll': '{},{}'.format(lat, lng),
                                               'radius': radius})

        if items is None:
            # the request failed: report it and continue with the next neighbourhood
            errCount = errCount + 1
            request_kind = 'neighbourhood' if geocodable else 'coordinate'
            print('Error with ' + request_kind + ' request for ' + name)
            print(str(meta))
            if errCount > errTolerance:
                return(pd.DataFrame())  # only expected with connection/quota issues
            continue

        venue_rows.extend(_venue_rows(name, lat, lng, items, geocodable))

    # Passing the column names here keeps the frame well-formed even with zero rows
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

> Obtain venues for all neighbourhoods

In [None]:

# Query Foursquare for every neighbourhood; the post-town flag selects the fallback radius
neighbourhood_venues = getNearbyVenues(
    names=wiki_df['Location'],
    latitudes=wiki_df['latitude'],
    longitudes=wiki_df['longitude'],
    isLondons=wiki_df['Post town'].eq('LONDON'),
)

print(neighbourhood_venues.shape)
neighbourhood_venues.head()

> The defined function distinguishes between neighbourhoods foursquare could and could not geocode. Radius search has been used where foursquare could not geocode

In [None]:
#summarise how many results from each request type (geocode/coords)

# Number of neighbourhoods answered by each request type (name search vs coordinate search)
(
    neighbourhood_venues
    .groupby(['Neighborhood Geocodable', 'Neighborhood'])
    .count()
    .groupby(['Neighborhood Geocodable'])['Venue']
    .count()
)

>Save venue data to csv for use in later analysis

In [None]:
# Save the venue data so later analysis does not need to call the Foursquare API again
neighbourhood_venues.to_csv('london_venues.csv')
# HACK: to re-run without the Foursquare API, uncomment the next line and skip the fetch cell
#neighbourhood_venues=pd.read_csv('london_venues.csv')  #HACK if want to rerun without foursquare API

In [None]:
# Number of distinct venue categories across all neighbourhoods
neighbourhood_category_count = len(neighbourhood_venues['Venue Category'].unique())
print(f'There are {neighbourhood_category_count} uniques categories of venue.')

> remove any duplicates or rows with blank values

In [None]:
# Discard rows with any missing value, then exact duplicate rows
neighbourhood_venues = neighbourhood_venues.dropna(axis=0).drop_duplicates()

> Look at what the most popular categories are (i.e. those with more than 100 such venues)

In [None]:
# Venues per category, keeping only categories with more than 100 venues, largest first
category_count = neighbourhood_venues.groupby(['Venue Category']).size()
popular = category_count.loc[category_count.gt(100)]
popular.sort_values(ascending=False)

**Observation:** Pubs, Coffee Shops, Grocery Stores, Cafés and Parks are the most popular.  Noted that bus stops and petrol (gas) stations feature highly.


    

Perform a sense check on the data: consider a neighbourhood where the venues are known and see whether those returned make sense.  Plot on a map to visualise.

In [None]:
# All venues returned for one well-known neighbourhood
sense_check = neighbourhood_venues.loc[neighbourhood_venues['Neighborhood'].eq('Leyton')]
sense_check

In [None]:

# The neighbourhood coordinates are repeated on every row, so take them from the first one
neighborhoodLng = sense_check['Neighborhood Longitude'].tolist()[0]
neighborhoodLat = sense_check['Neighborhood Latitude'].tolist()[0]

# map centred on the neighbourhood under examination
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(
    location=[neighborhoodLat, neighborhoodLng],
    zoom_start=13,
)
folium.GeoJson(lnd_geo, name="geojson").add_to(lnd_map)

# one marker per venue, labelled with the venue name
venue_points = zip(
    sense_check['Venue Latitude'],
    sense_check['Venue Longitude'],
    sense_check.index,
    sense_check['Venue'],
)
for lat, lon, poi, venue in venue_points:
    label = folium.Popup(str(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7,
    )
    marker.add_to(lnd_map)

# red marker at the neighbourhood's own coordinates
centre_marker = folium.CircleMarker(
    [neighborhoodLat, neighborhoodLng],
    radius=5,
    color="red",
    fill=True,
    fill_color="red",
    fill_opacity=0.7,
)
centre_marker.add_to(lnd_map)
lnd_map

The red circle shows the geocoder coordinates for Leyton; the black dots show the venues returned. Applying local knowledge, with a couple of exceptions the venues are within what would be considered Leyton.

Repeat for Forest Gate

In [None]:
# All venues returned for a second well-known neighbourhood
sense_check = neighbourhood_venues.loc[neighbourhood_venues['Neighborhood'].eq('Forest Gate')]
sense_check

In [None]:

# The neighbourhood coordinates are repeated on every row, so take them from the first one
neighborhoodLng = sense_check['Neighborhood Longitude'].tolist()[0]
neighborhoodLat = sense_check['Neighborhood Latitude'].tolist()[0]

# map centred on the neighbourhood
lnd_geo = r'london_boroughs_proper.geojson'
lnd_map = folium.Map(
    location=[neighborhoodLat, neighborhoodLng],
    zoom_start=13,
)
folium.GeoJson(lnd_geo, name="geojson").add_to(lnd_map)

# one marker per venue, labelled with the venue name
venue_points = zip(
    sense_check['Venue Latitude'],
    sense_check['Venue Longitude'],
    sense_check.index,
    sense_check['Venue'],
)
for lat, lon, poi, venue in venue_points:
    label = folium.Popup(str(venue), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=1,
        fill=True,
        fill_color=1,
        fill_opacity=0.7,
    )
    marker.add_to(lnd_map)

# red marker at the neighbourhood's own coordinates
centre_marker = folium.CircleMarker(
    [neighborhoodLat, neighborhoodLng],
    radius=5,
    color="red",
    fill=True,
    fill_color="red",
    fill_opacity=0.7,
)
centre_marker.add_to(lnd_map)
lnd_map

The venues follow a not unreasonable but quite generous definition of forest gate

## Remove neighbourhoods with less than 10 categories of venues retrieved, as there is insufficient data to conduct meaningful analysis

In [None]:
# Venues per (neighbourhood, category) pair, then distinct categories per neighbourhood
venue_count = neighbourhood_venues.groupby(['Neighborhood', 'Venue Category'])['Venue'].count()
venue_category_count = venue_count.groupby(['Neighborhood']).count()

# Neighbourhoods offering fewer than 10 different categories
limited_venues_categories = venue_category_count.loc[venue_category_count.lt(10)]
limited_venues_categories

In [None]:
# Total venues per neighbourhood, listing those with fewer than 10
venue_count = neighbourhood_venues.groupby(['Neighborhood'])['Venue'].count()
limited_venues = venue_count.loc[venue_count.lt(10)]
limited_venues

Remove neighbourhoods with limited venue data

In [None]:
# Remove every venue row that belongs to a neighbourhood with too few venue categories
in_limited_neighbourhood = neighbourhood_venues['Neighborhood'].isin(limited_venues_categories.index)
indexNames = neighbourhood_venues.loc[in_limited_neighbourhood].index
neighbourhood_venues_with_data = neighbourhood_venues.drop(indexNames)

In [None]:
# Preview the venue rows that remain after the limited-data neighbourhoods were removed
neighbourhood_venues_with_data.head()

In [None]:
# Save the cleaned venue data to CSV
neighbourhood_venues_with_data.to_csv('cleaned_neighborhood_venue.csv')

In [None]:
# Venue rows per neighbourhood, restricted to neighbourhoods found by a Foursquare name search
found_by_name = neighbourhood_venues['Neighborhood Geocodable'].eq(True)
full_neigbourhoods = neighbourhood_venues.loc[found_by_name]
full_neigbourhoods.groupby(['Neighborhood'])['Venue Category'].size()