### Capstone Project
## London's Asian Restaurants: A Cluster Analysis

In [1]:
import pandas as pd
import numpy as np
import json # library to handle JSON files
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup


#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


#### Creating the first dataframe for London Postcodes and their geo coordinates

#### 1. Parsing Wikipedia page with Beautiful Soup to extract postcodes for London 

In [3]:

# shorter list of postcodes for the london area from Wikipedia
url= 'https://en.wikipedia.org/wiki/London_postal_district'
wiki= urlopen(url).read()
soup= BeautifulSoup(wiki, 'lxml')
soup.prettify()
# retrieve postcode table 

tables= soup.findAll('table', class_='wikitable')

table1=soup.find('table', class_='wikitable')
table2= table1.findNext('table', class_='wikitable')
table3= table2.findNext('table', class_='wikitable')
tdic= [table1, table2, table3]



def dictify(ul):
    results={}
    for v in ul.findAll('a'):
        for v in ul.findAll('i'):
            key = next(v.stripped_strings)
            ul= v.findNext('td')
            if ul:          
                results[key]= [line for line in ul if line.name !='br']
            else:
                results[key]= None
    return(results)

new_d={}
for chunk in iter(tdic):
    c= dictify(chunk)
    new_d.update(c)


df= pd.DataFrame.from_dict(new_d, orient='index')

names= []
for x in range(len(df.index)):
    for sid in range(len(df.loc[df.index[x]])):
        try:
            names.append(df.loc[df.index[x]][sid].text)
        except:
            names.append(df.loc[df.index[x]][sid])
            #print(names)           

data=[]
for w in names:
    try:
        data.append(w.lstrip().rstrip().split(' ',1))
        #print(data)
    except:
        pass

df2= pd.DataFrame(data, columns=['postcode','Name'])
df2

Unnamed: 0,postcode,Name
0,E1,Head district
1,E2,Bethnal Green
2,E3,Bow
3,E4,Chingford
4,E5,Clapton
5,E6,East Ham
6,E7,Forest Gate
7,E8,Hackney
8,E9,Homerton
9,E10,Leyton


#### 2. Parsing Directory from London Official Statistics to match geocoordinates to postcodes

In [6]:
# dataset containing greater London Official Statistics
#from https://data.london.gov.uk/dataset/postcode-directory-for-london
df1= pd.read_excel('London_postcode.xlsx', names=['postcode2', 'lat', 'long'])

lat_m= []
long_m= []

for q in df2.postcode:
    lat_arr= []
    long_arr=[] 
    lat_arr.append(df1[df1['postcode2'].str.startswith(q, na=False)]['lat'])
    long_arr.append(df1[df1['postcode2'].str.startswith(q, na=False)]['long'])
    lat_m.append(np.mean(lat_arr))
    long_m.append(np.mean(long_arr))

df2['lat']= lat_m
df2['long']= long_m
print(df2)


    postcode                               Name        lat      long
0         E1                      Head district  51.540401 -0.013949
1         E2                      Bethnal Green  51.531427 -0.058075
2         E3                                Bow  51.528275 -0.026031
3         E4                          Chingford  51.622955 -0.003646
4         E5                            Clapton  51.559278 -0.054478
5         E6                           East Ham  51.525164  0.055107
6         E7                        Forest Gate  51.546236  0.022561
7         E8                            Hackney  51.541439 -0.065154
8         E9                           Homerton  51.539590 -0.047696
9        E10                             Leyton  51.568695 -0.012815
10       E11                        Leytonstone  51.567447  0.011728
11       E12                         Manor Park  51.551006  0.050935
12       E13                           Plaistow  51.528114  0.025558
13       E14                      

#### Define Foursquare Credentials and Version

#### Use geopy library to get the latitude and longitude values of London

In [7]:
address = 'London, UK'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of London are 51.5073219, -0.1276474.


#### Create a map of London with neighborhoods superimposed on top.

In [8]:
# create map of London using latitude and longitude values
map_london = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df2['lat'], df2['long'], df2['postcode'], df2['Name']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_london)  
    
map_london

### Downloading Foursquare data

#### Define Foursquare Credentials and Version

In [9]:
CLIENT_ID = '0RXEV5SNS5DY5OOWCMCRSWFVX52ZLTQO2SGLGDIYJ2DT23GJ' # your Foursquare ID
CLIENT_SECRET = 'OW1MMRBDTYJEKUB42MH0NRQA3H3HEUXMW24JBPXFFOPP5QVF' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0RXEV5SNS5DY5OOWCMCRSWFVX52ZLTQO2SGLGDIYJ2DT23GJ
CLIENT_SECRET:OW1MMRBDTYJEKUB42MH0NRQA3H3HEUXMW24JBPXFFOPP5QVF


In [10]:
df2.loc[0, 'Name']

'Head district'

Get the neighborhood's latitude and longitude values.

In [11]:
neighborhood_latitude = df2.loc[0, 'lat'] # neighborhood latitude value
neighborhood_longitude = df2.loc[0, 'long'] # neighborhood longitude value

neighborhood_name = df2.loc[0, 'Name'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Head district are 51.54040111158825, -0.013949019513735396.


#### Now, let's get the top 100 venues that are in Head District within a radius of 800 meters.

In [12]:

LIMIT=100
radius=800
url= 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=0RXEV5SNS5DY5OOWCMCRSWFVX52ZLTQO2SGLGDIYJ2DT23GJ&client_secret=OW1MMRBDTYJEKUB42MH0NRQA3H3HEUXMW24JBPXFFOPP5QVF&v=20180605&ll=51.54040111158825,-0.013949019513735396&radius=800&limit=100'

In [13]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5cf23a9ff594df57ed58ef88'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Stratford and New Town',
  'headerFullLocation': 'Stratford and New Town, London',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 86,
  'suggestedBounds': {'ne': {'lat': 51.54760111878826,
    'lng': -0.002394364127987889},
   'sw': {'lat': 51.53320110438825, 'lng': -0.025503674899482902}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '530d3b2c498e64025c0cb3bb',
       'name': 'Better London Aquatics Centre',
       'location': {'address': 'Queen Elizabeth Olympic Park',
        'lat': 51.54016340113069,
        'lng': -0.01144062660283663,
        'l

Now we are ready to clean the json and structure it into a *pandas* dataframe.

In [14]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Better London Aquatics Centre,Pool,51.540163,-0.011441
1,Queen Elizabeth Olympic Park,Park,51.540296,-0.012938
2,Olympic Stadium (London Stadium),Stadium,51.538628,-0.016565
3,The Slide,Outdoor Sculpture,51.538354,-0.012858
4,Gymbox,Gym / Fitness Center,51.542276,-0.008089
5,Stour Space,Art Gallery,51.540082,-0.020939
6,West Ham Stadium Store,Sporting Goods Shop,51.537453,-0.015113
7,Waitrose & Partners,Supermarket,51.543524,-0.009548
8,John Lewis & Partners,Department Store,51.543532,-0.009546
9,Crate Brewery,Brewery,51.542973,-0.022063


In [15]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

86 venues were returned by Foursquare.


## 2. Explore Neighborhoods in London

#### Let's create a function to repeat the same process to all the neighborhoods in Manhattan

In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

#### Run the above function on each neighborhood and create a new dataframe called *london_venues*.

In [17]:
london_venues= getNearbyVenues(names= df2['Name'], latitudes= df2['lat'], longitudes=df2['long'])


Head district
Bethnal Green
Bow
Chingford
Clapton
East Ham
Forest Gate
Hackney
 Homerton
Leyton
Leytonstone
Manor Park
Plaistow
Poplar
Stratford
Victoria Docks and North Woolwich
Walthamstow
Woodford and South Woodford
Olympic Park
Head district
Bishopsgate
Fenchurch Street
Fleet Street
Head district
East Finchley
Finchley
Finsbury Park
Highbury
Highgate
Holloway
Hornsey
Lower Edmonton
Muswell Hill
New Southgate
North Finchley
Palmers Green
Southgate
South Tottenham
Stoke Newington
Tottenham
Upper Edmonton
Upper Holloway
Whetstone
Winchmore Hill
Wood Green
Head district
Cricklewood
Hampstead
Hendon
Kentish Town
Kilburn
Mill Hill
St John's Wood
The Hyde
Willesden
Golders Green
Head district
Abbey Wood
Blackheath
Brockley
Camberwell
Catford
Charlton
Deptford
Eltham
Greenwich
Kennington
Lee
Lewisham
New Cross
Peckham
Rotherhithe
Walworth
Woolwich
Norwood
Anerley
Dulwich
East Dulwich
Forest Hill
Herne Hill
South Norwood
Sydenham
West Norwood
Thamesmead
Head district
Brixton
Chelsea
Clapham

#### Let's check the size of the resulting dataframe

In [18]:
print(london_venues.shape)
london_venues.head()

(4103, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Head district,51.540401,-0.013949,Better London Aquatics Centre,51.540163,-0.011441,Pool
1,Head district,51.540401,-0.013949,Queen Elizabeth Olympic Park,51.540296,-0.012938,Park
2,Head district,51.540401,-0.013949,Olympic Stadium (London Stadium),51.538628,-0.016565,Stadium
3,Head district,51.540401,-0.013949,The Slide,51.538354,-0.012858,Outdoor Sculpture
4,Head district,51.540401,-0.013949,Gymbox,51.542276,-0.008089,Gym / Fitness Center


Let's check how many venues were returned for each neighborhood

In [19]:
london_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Homerton,22,22,22,22,22,22
Abbey Wood,7,7,7,7,7,7
Acton,27,27,27,27,27,27
Anerley,14,14,14,14,14,14
Balham,53,53,53,53,53,53
Barnes,18,18,18,18,18,18
Bethnal Green,42,42,42,42,42,42
Bishopsgate,74,74,74,74,74,74
Blackheath,3,3,3,3,3,3
Bow,14,14,14,14,14,14


#### Let's find out how many unique categories can be curated from all the returned venues

In [20]:
print('There are {} uniques categories.'.format(len(london_venues['Venue Category'].unique())))


There are 312 uniques categories.


## 3. Analyze Each Neighborhood


Let's first analyze unique categories to determine which ones to keep, in this case Asian denominated restaurants

In [21]:
# one hot encoding
london_onehot = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
london_onehot['Neighborhood'] = london_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = list(london_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop(fixed_columns.index('Neighborhood')))
london_onehot = london_onehot[fixed_columns]
london_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,Art Gallery,...,Video Game Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio
0,Head district,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Head district,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Head district,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Head district,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Head district,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
london_onehot.columns.to_list()

['Neighborhood',
 'Accessories Store',
 'Afghan Restaurant',
 'African Restaurant',
 'American Restaurant',
 'Antique Shop',
 'Arcade',
 'Arepa Restaurant',
 'Argentinian Restaurant',
 'Art Gallery',
 'Art Museum',
 'Arts & Crafts Store',
 'Asian Restaurant',
 'Athletics & Sports',
 'Australian Restaurant',
 'Austrian Restaurant',
 'Auto Garage',
 'BBQ Joint',
 'Baby Store',
 'Bagel Shop',
 'Bakery',
 'Bank',
 'Bar',
 'Baseball Field',
 'Bed & Breakfast',
 'Beer Bar',
 'Beer Garden',
 'Beer Store',
 'Belgian Restaurant',
 'Bike Rental / Bike Share',
 'Bike Shop',
 'Bistro',
 'Boarding House',
 'Bookstore',
 'Botanical Garden',
 'Boutique',
 'Bowling Alley',
 'Boxing Gym',
 'Brasserie',
 'Brazilian Restaurant',
 'Breakfast Spot',
 'Brewery',
 'Bridal Shop',
 'Bubble Tea Shop',
 'Burger Joint',
 'Burrito Place',
 'Bus Station',
 'Bus Stop',
 'Butcher',
 'Cable Car',
 'Café',
 'Canal',
 'Canal Lock',
 'Candy Store',
 'Cantonese Restaurant',
 'Caribbean Restaurant',
 'Casino',
 'Cemetery',

Let's filter categories of interest and regroup under *general cuisine label*:

In [23]:

a= london_venues[['Venue Category']].values

me= ['Middle Eastern Restaurant','Turkish Restaurant','Persian Restaurant','Lebanese Restaurant','Israeli Restaurant','Iraqi Restaurant','Afghan Restaurant' ]
chinese=['Chinese','Chinese Restaurant','Ramen Restaurant','Szechuan Restaurant','Xinjiang Restaurant','Cantonese Restaurant']
se_asian=['Thai Restaurant','Vietnamese Restaurant','Malay Restaurant']
indian=['Pakistani Restaurant','South Indian Restaurant','North Indian Restaurant','Indian Restaurant','Himalaya']
japanese= ['Okonomiyaki Restaurant','Sushi Restaurant','Japanese Restaurant']
korean= ['Korean Restaurant']
# just for fun, and as a constant let's add fast foods as a constant as they are a mainstay of London's food scene
fast_food= ['Fried Chicken Joint','Doner Restaurant','Fast Food Restaurant']


for k in range(0, len(a)):
    if a[k] in me:
        a[k]= 'Middle_Eastern'
    elif a[k] in chinese:
        a[k]= 'Chinese' 
    elif a[k] in se_asian:
        a[k]= 'South_East_Asian'
    elif a[k] in indian:
        a[k]= 'Indian'
    elif a[k] in japanese:
        a[k]= 'Japanese'
    elif a[k] in korean:
        a[k]= 'Korean'
    #elif a[k] in fast_food:
        #a[k]= 'Fast_Food'
    london_venues.loc[k,'Venue Category']= a[k]
#print(london_venues)


In [24]:
# one hot encoding
london_onehot2 = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
london_onehot2['Neighborhood'] = london_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns2 = list(london_onehot2.columns)
fixed_columns2.insert(0, fixed_columns2.pop(fixed_columns2.index('Neighborhood')))
london_onehot2 = london_onehot2[fixed_columns2]

london_grouped= london_onehot2.groupby('Neighborhood')['Middle_Eastern','South_East_Asian','Indian', 'Chinese', 'Japanese', 'Korean'].mean()[lambda x: x>0].reset_index()
london_grouped= london_grouped.set_index('Neighborhood')
london_grouped= london_grouped.dropna(axis=0, how='all')
london_grouped= london_grouped.fillna(0).reset_index()
london_grouped


Unnamed: 0,Neighborhood,Middle_Eastern,South_East_Asian,Indian,Chinese,Japanese,Korean
0,Homerton,0.045455,0.000000,0.045455,0.000000,0.000000,0.000000
1,Acton,0.037037,0.000000,0.000000,0.000000,0.000000,0.000000
2,Balham,0.000000,0.000000,0.056604,0.000000,0.000000,0.000000
3,Bethnal Green,0.000000,0.000000,0.023810,0.000000,0.023810,0.000000
4,Bishopsgate,0.000000,0.027027,0.000000,0.013514,0.027027,0.000000
5,Bow,0.000000,0.000000,0.000000,0.071429,0.000000,0.000000
6,Brixton,0.037037,0.000000,0.074074,0.000000,0.037037,0.000000
7,Brockley,0.000000,0.028571,0.057143,0.057143,0.000000,0.000000
8,Camberwell,0.035714,0.035714,0.017857,0.071429,0.000000,0.000000
9,Catford,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000


#### Let's confirm the new size

In [25]:
london_grouped.shape

(81, 7)

#### Let's print each neighborhood along with the top most common venue

In [27]:
num_top_venues = 1

for hood in london_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = london_grouped[london_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']    
    temp = temp.iloc[1:] 
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})

    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

---- Homerton----
            venue  freq
0  Middle_Eastern  0.05


----Acton----
            venue  freq
0  Middle_Eastern  0.04


----Balham----
    venue  freq
0  Indian  0.06


----Bethnal Green----
    venue  freq
0  Indian  0.02


----Bishopsgate----
              venue  freq
0  South_East_Asian  0.03


----Bow----
     venue  freq
0  Chinese  0.07


----Brixton----
    venue  freq
0  Indian  0.07


----Brockley----
    venue  freq
0  Indian  0.06


----Camberwell----
     venue  freq
0  Chinese  0.07


----Catford----
            venue  freq
0  Middle_Eastern  0.25


----Charlton----
              venue  freq
0  South_East_Asian  0.08


----Chelsea----
      venue  freq
0  Japanese  0.05


----Chiswick----
              venue  freq
0  South_East_Asian  0.05


----Clapham----
            venue  freq
0  Middle_Eastern  0.02


----Clapton----
            venue  freq
0  Middle_Eastern  0.05


----Cricklewood----
            venue  freq
0  Middle_Eastern  0.04


----Deptford----
    

#### Let's put that into a *pandas* dataframe

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 4 venues for each neighborhood.

In [29]:
num_top_venues = 1

indicators = ['st'] #'nd''rd'

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = london_grouped['Neighborhood']

for ind in np.arange(london_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(london_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue
0,Homerton,Indian
1,Acton,Middle_Eastern
2,Balham,Indian
3,Bethnal Green,Japanese
4,Bishopsgate,Japanese
5,Bow,Chinese
6,Brixton,Indian
7,Brockley,Chinese
8,Camberwell,Chinese
9,Catford,Middle_Eastern


## 4. Cluster Neighborhoods

In [30]:
# set number of clusters
kclusters = 6

london_clustering = london_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(london_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 5, 2, 2, 0, 5, 0, 0, 1], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [31]:
london_merged = london_venues.groupby('Neighborhood')['Neighborhood Latitude','Neighborhood Longitude'].mean() 

df4= pd.merge(london_merged, neighborhoods_venues_sorted, on= 'Neighborhood', how='right')
df4['Cluster Labels'] = kmeans.labels_
df4

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,1st Most Common Venue,Cluster Labels
0,Homerton,51.539590,-0.047696,Indian,3
1,Acton,51.508427,-0.270080,Middle_Eastern,3
2,Balham,51.446179,-0.149888,Indian,5
3,Bethnal Green,51.531427,-0.058075,Japanese,2
4,Bishopsgate,51.519560,-0.091753,Japanese,2
5,Bow,51.528275,-0.026031,Chinese,0
6,Brixton,51.437822,-0.155505,Indian,5
7,Brockley,51.461696,-0.035418,Chinese,0
8,Camberwell,51.474077,-0.092799,Chinese,0
9,Catford,51.438467,-0.017474,Middle_Eastern,1


Finally, let's visualize the resulting clusters

In [32]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df4['Neighborhood Latitude'], df4['Neighborhood Longitude'], df4['Neighborhood'], df4['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

#### Cluster 0

In [33]:
df4.loc[df4['Cluster Labels'] == 0, df4.columns[[0] + list(range(3, df4.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,Cluster Labels
5,Bow,Chinese,0
7,Brockley,Chinese,0
8,Camberwell,Chinese,0
23,Forest Gate,Chinese,0
35,Holloway,Chinese,0
54,Rotherhithe,Chinese,0
58,Southgate,Chinese,0
69,Victoria Docks and North Woolwich,Chinese,0
79,Woodford and South Woodford,Chinese,0
80,Woolwich,Chinese,0


#### Cluster 1

In [34]:
df4.loc[df4['Cluster Labels'] == 1, df4.columns[[0] + list(range(3, df4.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,Cluster Labels
9,Catford,Middle_Eastern,1
21,Finchley,Middle_Eastern,1
31,Hendon,Middle_Eastern,1
43,Lower Edmonton,Middle_Eastern,1
67,Upper Edmonton,Middle_Eastern,1
73,West Ealing,Middle_Eastern,1
78,Winchmore Hill,Middle_Eastern,1


#### Cluster 2

In [35]:
df4.loc[df4['Cluster Labels'] == 2, df4.columns[[0] + list(range(3, df4.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,Cluster Labels
3,Bethnal Green,Japanese,2
4,Bishopsgate,Japanese,2
11,Chelsea,Japanese,2
19,East Dulwich,Japanese,2
22,Fleet Street,Japanese,2
28,Hammersmith,Japanese,2
34,Highgate,Japanese,2
46,Muswell Hill,Japanese,2
48,North Finchley,Japanese,2
53,Poplar,Japanese,2


#### Cluster 3

In [36]:
df4.loc[df4['Cluster Labels'] == 3, df4.columns[[0] + list(range(3, df4.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,Cluster Labels
0,Homerton,Indian,3
1,Acton,Middle_Eastern,3
13,Clapham,Japanese,3
14,Clapton,Middle_Eastern,3
15,Cricklewood,Middle_Eastern,3
20,Fenchurch Street,Japanese,3
25,Fulham,South_East_Asian,3
26,Golders Green,Middle_Eastern,3
30,Head district,Middle_Eastern,3
32,Herne Hill,Middle_Eastern,3


#### Cluster 4

In [37]:
df4.loc[df4['Cluster Labels'] == 4, df4.columns[[0] + list(range(3, df4.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,Cluster Labels
10,Charlton,South_East_Asian,4
12,Chiswick,South_East_Asian,4
16,Deptford,South_East_Asian,4
17,Ealing,South_East_Asian,4
24,Forest Hill,South_East_Asian,4
29,Hampstead,Indian,4
33,Highbury,South_East_Asian,4
39,Kentish Town,South_East_Asian,4
49,Norwood,South_East_Asian,4
55,Shepherds Bush,Indian,4


#### Cluster 5

In [38]:
df4.loc[df4['Cluster Labels'] == 5, df4.columns[[0] + list(range(3, df4.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,Cluster Labels
2,Balham,Indian,5
6,Brixton,Indian,5
18,Earls Court,Indian,5
27,Greenwich,Indian,5
37,Kennington,Indian,5
40,Kilburn,Indian,5
57,South Norwood,Indian,5
60,Stockwell,Indian,5
64,Sydenham,Indian,5
66,Tooting,Indian,5
