# Data Science Capstone - The Battle of Neighborhoods - Final Assignment

##  Data Wrangling 

In [None]:
!pip install beautifulsoup4
!pip install geopy
!pip install folium

In [1]:
# import libraries
import pandas as pd                           # library for data analysis
import requests                               # library to handle requests
from bs4 import BeautifulSoup                 # library to parse HTML documents
from geopy.geocoders import Nominatim         # convert an address into latitude and longitude values
import folium                                 # map rendering library
from pandas.io.json import json_normalize     # tranform JSON file into a pandas dataframe
import numpy as np                            # library to handle data in a vectorized manner
from sklearn.cluster import KMeans            # import k-means from clustering stage
import matplotlib.cm as cm                    # Matplotlib and associated plotting modules
import matplotlib.colors as colors

#### Scrape the Wikipedia page containing the List of municipalities in Brussels-Capital region.

In [2]:
# get the response in the form of html
wikiurl="https://en.wikipedia.org/wiki/List_of_municipalities_of_the_Brussels-Capital_Region"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wikiurl)
print(response.status_code)

200


In [3]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
municipalities_table=soup.find('table',{'class':"wikitable"})

In [4]:
municipalities_list=pd.read_html(str(municipalities_table))
# convert list to dataframe
df=pd.DataFrame(municipalities_list[0])
df.head(2)

Unnamed: 0.1,Unnamed: 0,French name,Dutch name,Flag,CoA,postcode,Population(1/1/2017),Area,Population density(km²),Ref.
0,1,Anderlecht,Anderlecht,,,1070,118241,2 (6.8 sq mi),6680,[7]
1,2,Auderghem,Oudergem,,,1160,33313,2 (3.5 sq mi),3701,[8]


#### Prepare dataframe for geocoder

In [5]:
# select columns you need to a new dataframe and rename "French name" column to "Municipality" and 'postcode' to 'PostalCode'
df_bxl=df.filter(['French name','postcode']).rename(columns = {'French name':'Municipality','postcode':'PostalCode'})
df_bxl.head(4)

Unnamed: 0,Municipality,PostalCode
0,Anderlecht,1070
1,Auderghem,1160
2,Berchem-Sainte-Agathe,1082
3,Ville de Bruxelles*,1000102010301040105011201130


In [6]:
# clean the value of 'Municipality' and 'PostalCode' in 4th row
df_bxl['Municipality']=df_bxl['Municipality'].replace(['Ville de Bruxelles*'],'Bruxelles')
df_bxl['PostalCode']=df_bxl['PostalCode'].replace(['1000102010301040105011201130'],'1000')
df_bxl.head(4)

Unnamed: 0,Municipality,PostalCode
0,Anderlecht,1070
1,Auderghem,1160
2,Berchem-Sainte-Agathe,1082
3,Bruxelles,1000


In [7]:
# add 'Address' column that will contain the concatenation of values from 'PostalCode' and 'Municipality' columns
df_bxl['Address']=df_bxl['PostalCode'].astype(str)+', '+df_bxl['Municipality']
df_bxl.head(2)

Unnamed: 0,Municipality,PostalCode,Address
0,Anderlecht,1070,"1070, Anderlecht"
1,Auderghem,1160,"1160, Auderghem"


#### Use geocoder to get coordinates for each municipality

In [19]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="my_explorer")

from geopy.extra.rate_limiter import RateLimiter
# function to delay between geocoding calls as required by Nominatim
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
# create location column
df_bxl['location'] = df_bxl['Address'].apply(geocode)
# create point column from location column (returns tuple)
df_bxl['point'] = df_bxl['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# split point column into latitude, longitude and altitude columns
df_bxl[['Latitude', 'Longitude', 'altitude']] = pd.DataFrame(df_bxl['point'].tolist(), index=df_bxl.index)

In [20]:
df_bxl

Unnamed: 0,Municipality,PostalCode,Address,location,point,Latitude,Longitude,altitude
1,Anderlecht,1070,"1070, Anderlecht","(Anderlecht, Brussel-Hoofdstad - Bruxelles-Cap...","(50.8390983, 4.3296526, 0.0)",50.839098,4.329653,0.0
2,Auderghem,1160,"1160, Auderghem","(Auderghem - Oudergem, Brussel-Hoofdstad - Bru...","(50.8172355, 4.4268982, 0.0)",50.817236,4.426898,0.0
3,Berchem-Sainte-Agathe,1082,"1082, Berchem-Sainte-Agathe","(Berchem-Sainte-Agathe - Sint-Agatha-Berchem, ...","(50.8649231, 4.2946734, 0.0)",50.864923,4.294673,0.0
4,Bruxelles,1000,"1000, Bruxelles","(Ville de Bruxelles - Stad Brussel, Brussel-Ho...","(50.8465573, 4.351697, 0.0)",50.846557,4.351697,0.0
5,Etterbeek,1040,"1040, Etterbeek","(Etterbeek, Brussel-Hoofdstad - Bruxelles-Capi...","(50.8361447, 4.3861737, 0.0)",50.836145,4.386174,0.0
6,Evere,1140,"1140, Evere","(Evere, Brussel-Hoofdstad - Bruxelles-Capitale...","(50.8720096, 4.4034182, 0.0)",50.87201,4.403418,0.0
7,Forest,1190,"1190, Forest","(Forest - Vorst, Brussel-Hoofdstad - Bruxelles...","(50.811795, 4.3181187, 0.0)",50.811795,4.318119,0.0
8,Ganshoren,1083,"1083, Ganshoren","(Ganshoren, Brussel-Hoofdstad - Bruxelles-Capi...","(50.8703273, 4.307798, 0.0)",50.870327,4.307798,0.0
9,Ixelles,1050,"1050, Ixelles","(Ixelles - Elsene, Brussel-Hoofdstad - Bruxell...","(50.8331141, 4.3668279, 0.0)",50.833114,4.366828,0.0
10,Jette,1090,"1090, Jette","(Jette, Brussel-Hoofdstad - Bruxelles-Capitale...","(50.8759585, 4.3245702, 0.0)",50.875959,4.32457,0.0


In [21]:
# take a backup in case Nominatim number of retries is exceeded
df_bxl.to_csv(r'Data\\Municipalities with Latitude and Longitude.csv', index = False, sep = ';')

In [8]:
# read from backup if needed
df_bxl=pd.read_csv(r'Data\\Municipalities with Latitude and Longitude.csv', sep = ';')
df_bxl.head(2)

Unnamed: 0,Municipality,PostalCode,Address,location,point,Latitude,Longitude,altitude
0,Anderlecht,1070,"1070, Anderlecht","Anderlecht, Brussel-Hoofdstad - Bruxelles-Capi...","(50.8390983, 4.3296526, 0.0)",50.839098,4.329653,0.0
1,Auderghem,1160,"1160, Auderghem","Auderghem - Oudergem, Brussel-Hoofdstad - Brux...","(50.8172355, 4.4268982, 0.0)",50.817235,4.426898,0.0


In [9]:
# select columns you need to a new dataframe
neighborhoods=df_bxl.filter(['Municipality','PostalCode','Latitude','Longitude'])
neighborhoods.head()

Unnamed: 0,Municipality,PostalCode,Latitude,Longitude
0,Anderlecht,1070,50.839098,4.329653
1,Auderghem,1160,50.817235,4.426898
2,Berchem-Sainte-Agathe,1082,50.864923,4.294673
3,Bruxelles,1000,50.846557,4.351697
4,Etterbeek,1040,50.836145,4.386174


In [29]:
# take a backup of the file ready for further processing
neighborhoods.to_csv(r'Data\\Municipalities_ready.csv', index = False, sep = ';')

In [42]:
# read from backup if you need to restart from this point on
# see how index start at 0 again
neighborhoods=pd.read_csv(r'Data\\Municipalities_ready.csv', sep=';')
neighborhoods.head()

Unnamed: 0,Municipality,PostalCode,Latitude,Longitude
0,Anderlecht,1070,50.839098,4.329653
1,Auderghem,1160,50.817235,4.426898
2,Berchem-Sainte-Agathe,1082,50.864923,4.294673
3,Bruxelles,1000,50.846557,4.351697
4,Etterbeek,1040,50.836145,4.386174


#### This ends "Data Wrangling" part - we have a file with 19 municipalities in Brussels with latitude and longitude ready for further processing

## Segmentation and Clustering

Use geopy to get the latitude and longitude values of Brussels-Capital.

In [10]:
# for the map get coordinates of Brussel-Capital

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent='my_explorer')
location = geolocator.geocode('Bruxelles-Capitale')
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

50.83879505 4.375304132256188


In [11]:
# in case Nominatim number of retries is exceeded keep values for 'Bruxelles-Capitale' longitude and latitude
#latitude=50.83879505
#longitude=4.375304132256188

In [12]:
# create map of Brussels-Capital using latitude and longitude values
import folium 
map_brussels_capital = folium.Map(location=[latitude, longitude], zoom_start=12, control_scale=True)

For the map of Brussels, "folium.Circle" has been used instead of "folium.CircleMarker" in order to visualize the radius in meters rather than in pixels. It gives us an idea about the overlaping areas of municipalities.

In [16]:
map_brussels_capital = folium.Map(location=[latitude, longitude], zoom_start=12, control_scale=True)
# add markers to map
for lat, lng, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Municipality']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.Circle(
        [lat, lng],
        radius=1000,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.2,
        parse_html=False).add_to(map_brussels_capital)
map_brussels_capital

Use the Foursquare API to explore the neighborhoods and segment them.

In [17]:
# Define Foursquare Credentials and Version
CLIENT_ID = '5NNQESMYHJBEXIEAEL4UR5G5P2QFQK0KYN3VALWTGGEEASSW'     # your Foursquare ID
CLIENT_SECRET = 'KV4AIAK1HQ3N11RDTM2OLRDQJHM13DRA0ZD32C4HVNDI4GBB' # your Foursquare Secret
VERSION = '20180605'                                               # Foursquare API version
LIMIT = 100                                                        # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 5NNQESMYHJBEXIEAEL4UR5G5P2QFQK0KYN3VALWTGGEEASSW
CLIENT_SECRET:KV4AIAK1HQ3N11RDTM2OLRDQJHM13DRA0ZD32C4HVNDI4GBB


Let's explore the first neighborhood in our dataframe.

In [18]:
# Get the neighborhood's name.
neighborhoods.loc[0, 'Municipality']

'Anderlecht'

In [19]:
#Get the neighborhood's latitude and longitude values.
neighborhood_latitude = neighborhoods.loc[0, 'Latitude']   # neighborhood latitude value
neighborhood_longitude = neighborhoods.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = neighborhoods.loc[0, 'Municipality']   # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Anderlecht are 50.839098299999996, 4.3296526.


Get top 100 venues that are in Anderlecht within a radius of 1000 meters.¶

In [20]:
# create the GET request URL.

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=5NNQESMYHJBEXIEAEL4UR5G5P2QFQK0KYN3VALWTGGEEASSW&client_secret=KV4AIAK1HQ3N11RDTM2OLRDQJHM13DRA0ZD32C4HVNDI4GBB&v=20180605&ll=50.839098299999996,4.3296526&radius=1000&limit=100'

In [21]:
import requests
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '6025448d3916cb7a77fa0ae2'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Anderlecht',
  'headerFullLocation': 'Anderlecht, Brussels',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 86,
  'suggestedBounds': {'ne': {'lat': 50.84809830900001,
    'lng': 4.343877778376644},
   'sw': {'lat': 50.830098290999985, 'lng': 4.315427421623356}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4c29f0aded0ac9b6ee7b62aa',
       'name': 'Chez Rosario',
       'location': {'address': 'Rue Eloystraat 22',
        'lat': 50.83624016401138,
        'lng': 4.331006922553949,
        'labeledLatLngs': [{'label': 'display',
          'lat': 50.83

As all the information is in the items key, we will use a function to get the category type.

In [22]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Now we are ready to clean the json and structure it into a pandas dataframe.

In [23]:
from pandas import json_normalize # tranform JSON file into a pandas dataframe

In [24]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Chez Rosario,Deli / Bodega,50.83624,4.331007
1,Brasserie Cantillon Brouwerij (Cantillon - Bro...,Brewery,50.841487,4.335451
2,Maharaja Tandoori Restaurant I,Indian Restaurant,50.839015,4.332212
3,Crep' & Cream,Creperie,50.83905,4.330786
4,Boeremet,Cocktail Bar,50.842882,4.326992


In [25]:
# Number of venues returned by Foursquare
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

86 venues were returned by Foursquare.


### Explore Neighborhoods in Brussels-Capital Region

Let's create a function to repeat the same process to all the neighborhoods in Brussels-Capital

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Now write the code to run the above function on each neighborhood and create a new dataframe called brussels_venues.

In [27]:
brussels_venues = getNearbyVenues(names=neighborhoods['Municipality'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )

Anderlecht
Auderghem
Berchem-Sainte-Agathe
Bruxelles
Etterbeek
Evere
Forest
Ganshoren
Ixelles
Jette
Koekelberg
Molenbeek-Saint-Jean
Saint-Gilles
Saint-Josse-ten-Noode
Schaerbeek
Uccle
Watermael-Boitsfort
Woluwe-Saint-Lambert
Woluwe-Saint-Pierre


Let's check the size of the resulting dataframe

In [28]:
brussels_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Anderlecht,50.839098,4.329653,Chez Rosario,50.836240,4.331007,Deli / Bodega
1,Anderlecht,50.839098,4.329653,Brasserie Cantillon Brouwerij (Cantillon - Bro...,50.841487,4.335451,Brewery
2,Anderlecht,50.839098,4.329653,Maharaja Tandoori Restaurant I,50.839015,4.332212,Indian Restaurant
3,Anderlecht,50.839098,4.329653,Crep' & Cream,50.839050,4.330786,Creperie
4,Anderlecht,50.839098,4.329653,Boeremet,50.842882,4.326992,Cocktail Bar
...,...,...,...,...,...,...,...
1594,Woluwe-Saint-Pierre,50.837025,4.427464,Pharmacie Soetaert,50.845165,4.426873,Pharmacy
1595,Woluwe-Saint-Pierre,50.837025,4.427464,Atlantische Oceaan (MIVB),50.831150,4.418478,Bus Station
1596,Woluwe-Saint-Pierre,50.837025,4.427464,Louis Delhaize - Kelle,50.833627,4.439888,Grocery Store
1597,Woluwe-Saint-Pierre,50.837025,4.427464,Terrain de Foot Rue Kellestraat Voetbalterrein,50.834163,4.440236,Soccer Field


In [29]:
brussels_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Anderlecht,86,86,86,86,86,86
Auderghem,80,80,80,80,80,80
Berchem-Sainte-Agathe,72,72,72,72,72,72
Bruxelles,100,100,100,100,100,100
Etterbeek,100,100,100,100,100,100
Evere,70,70,70,70,70,70
Forest,51,51,51,51,51,51
Ganshoren,72,72,72,72,72,72
Ixelles,100,100,100,100,100,100
Jette,96,96,96,96,96,96


In [30]:
# Number of unique categories from all the returned venues
print('There are {} uniques categories.'.format(len(brussels_venues['Venue Category'].unique())))

There are 243 uniques categories.


In [31]:
brussels_venues.shape

(1599, 7)

### Analyze Each Neighborhood

In [32]:
# one hot encoding
brussels_onehot = pd.get_dummies(brussels_venues[['Venue Category']], prefix="", prefix_sep="")
# add neighborhood column back to dataframe
brussels_onehot['Neighborhood'] = brussels_venues['Neighborhood'] 
# move neighborhood column to the first column
fixed_columns = [brussels_onehot.columns[-1]] + list(brussels_onehot.columns[:-1])
brussels_onehot = brussels_onehot[fixed_columns]
brussels_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Vegetarian / Vegan Restaurant,Vehicle Inspection Station,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Anderlecht,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Anderlecht,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Anderlecht,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Anderlecht,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Anderlecht,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# dataframe size
brussels_onehot.shape

(1599, 244)

In [34]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
brussels_grouped = brussels_onehot.groupby('Neighborhood').mean().reset_index()
brussels_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Vegetarian / Vegan Restaurant,Vehicle Inspection Station,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,Anderlecht,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,...,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Auderghem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berchem-Sainte-Agathe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013889,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bruxelles,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0
4,Etterbeek,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,...,0.02,0.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0,0.0


In [35]:
# dataframe size
brussels_grouped.shape

(19, 244)

Let's print each neighborhood along with the top 5 most common venues

In [36]:
num_top_venues = 5

for hood in brussels_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = brussels_grouped[brussels_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Anderlecht----
               venue  freq
0     Sandwich Place  0.09
1              Hotel  0.07
2        Coffee Shop  0.06
3  French Restaurant  0.05
4        Supermarket  0.03


----Auderghem----
                venue  freq
0  Italian Restaurant  0.06
1              Bakery  0.05
2      Sandwich Place  0.04
3          Restaurant  0.04
4               Plaza  0.04


----Berchem-Sainte-Agathe----
               venue  freq
0         Restaurant  0.06
1  Electronics Store  0.06
2        Supermarket  0.06
3   Greek Restaurant  0.04
4                Gym  0.04


----Bruxelles----
            venue  freq
0     Coffee Shop  0.06
1  Chocolate Shop  0.06
2             Bar  0.06
3        Beer Bar  0.05
4           Plaza  0.05


----Etterbeek----
                venue  freq
0  Italian Restaurant  0.11
1              Bakery  0.04
2               Plaza  0.03
3               Hotel  0.03
4      Sandwich Place  0.03


----Evere----
          venue  freq
0          Park  0.07
1   Supermarket  0.06
2  

Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order.

In [37]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [38]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = brussels_grouped['Neighborhood']

for ind in np.arange(brussels_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(brussels_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anderlecht,Sandwich Place,Hotel,Coffee Shop,French Restaurant,Supermarket,Greek Restaurant,Bar,Seafood Restaurant,Food Court,Metro Station
1,Auderghem,Italian Restaurant,Bakery,Pharmacy,Pizza Place,Plaza,Sandwich Place,Park,Fast Food Restaurant,Restaurant,French Restaurant
2,Berchem-Sainte-Agathe,Electronics Store,Supermarket,Restaurant,Bar,Greek Restaurant,Gym,Cosmetics Shop,Furniture / Home Store,Brasserie,Plaza
3,Bruxelles,Chocolate Shop,Coffee Shop,Bar,Plaza,Bookstore,Beer Bar,Hotel,Italian Restaurant,Seafood Restaurant,Sandwich Place
4,Etterbeek,Italian Restaurant,Bakery,Plaza,History Museum,Greek Restaurant,Coffee Shop,Sandwich Place,Hotel,Restaurant,Bar


### Cluster Neighborhoods

Run k-means to cluster the neighborhood into 4 clusters.

In [39]:
from sklearn.cluster import KMeans        # import k-means from clustering stage
# set number of clusters
kclusters = 4
brussels_grouped_clustering = brussels_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(brussels_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:18] 

array([0, 2, 1, 0, 0, 3, 3, 1, 0, 1, 3, 0, 0, 0, 1, 1, 2, 2])

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [40]:
neighborhoods_venues_sorted.head(2)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anderlecht,Sandwich Place,Hotel,Coffee Shop,French Restaurant,Supermarket,Greek Restaurant,Bar,Seafood Restaurant,Food Court,Metro Station
1,Auderghem,Italian Restaurant,Bakery,Pharmacy,Pizza Place,Plaza,Sandwich Place,Park,Fast Food Restaurant,Restaurant,French Restaurant


In [41]:
neighborhoods.head(2)

Unnamed: 0,Municipality,PostalCode,Latitude,Longitude
0,Anderlecht,1070,50.839098,4.329653
1,Auderghem,1160,50.817235,4.426898


In [42]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

brussels_merged = neighborhoods

brussels_merged = brussels_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Municipality')

brussels_merged.head()

Unnamed: 0,Municipality,PostalCode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anderlecht,1070,50.839098,4.329653,0,Sandwich Place,Hotel,Coffee Shop,French Restaurant,Supermarket,Greek Restaurant,Bar,Seafood Restaurant,Food Court,Metro Station
1,Auderghem,1160,50.817235,4.426898,2,Italian Restaurant,Bakery,Pharmacy,Pizza Place,Plaza,Sandwich Place,Park,Fast Food Restaurant,Restaurant,French Restaurant
2,Berchem-Sainte-Agathe,1082,50.864923,4.294673,1,Electronics Store,Supermarket,Restaurant,Bar,Greek Restaurant,Gym,Cosmetics Shop,Furniture / Home Store,Brasserie,Plaza
3,Bruxelles,1000,50.846557,4.351697,0,Chocolate Shop,Coffee Shop,Bar,Plaza,Bookstore,Beer Bar,Hotel,Italian Restaurant,Seafood Restaurant,Sandwich Place
4,Etterbeek,1040,50.836145,4.386174,0,Italian Restaurant,Bakery,Plaza,History Museum,Greek Restaurant,Coffee Shop,Sandwich Place,Hotel,Restaurant,Bar


In [43]:
# display all rows so we are able to explore clusters
pd.set_option('display.max_rows', None)
brussels_merged

Unnamed: 0,Municipality,PostalCode,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anderlecht,1070,50.839098,4.329653,0,Sandwich Place,Hotel,Coffee Shop,French Restaurant,Supermarket,Greek Restaurant,Bar,Seafood Restaurant,Food Court,Metro Station
1,Auderghem,1160,50.817235,4.426898,2,Italian Restaurant,Bakery,Pharmacy,Pizza Place,Plaza,Sandwich Place,Park,Fast Food Restaurant,Restaurant,French Restaurant
2,Berchem-Sainte-Agathe,1082,50.864923,4.294673,1,Electronics Store,Supermarket,Restaurant,Bar,Greek Restaurant,Gym,Cosmetics Shop,Furniture / Home Store,Brasserie,Plaza
3,Bruxelles,1000,50.846557,4.351697,0,Chocolate Shop,Coffee Shop,Bar,Plaza,Bookstore,Beer Bar,Hotel,Italian Restaurant,Seafood Restaurant,Sandwich Place
4,Etterbeek,1040,50.836145,4.386174,0,Italian Restaurant,Bakery,Plaza,History Museum,Greek Restaurant,Coffee Shop,Sandwich Place,Hotel,Restaurant,Bar
5,Evere,1140,50.87201,4.403418,3,Park,Bar,Snack Place,Supermarket,Sandwich Place,Tram Station,Paper / Office Supplies Store,Hotel,Brasserie,Train Station
6,Forest,1190,50.811795,4.318119,3,Park,Supermarket,Sandwich Place,Furniture / Home Store,Brasserie,Bus Stop,Pet Store,Bookstore,Snack Place,Bus Station
7,Ganshoren,1083,50.870327,4.307798,1,Bar,Restaurant,Italian Restaurant,Plaza,Pizza Place,Park,Supermarket,Chinese Restaurant,Hockey Field,Electronics Store
8,Ixelles,1050,50.833114,4.366828,0,Boutique,Hotel,Italian Restaurant,Vegetarian / Vegan Restaurant,Bar,Tea Room,Bakery,Coffee Shop,Sandwich Place,Wine Bar
9,Jette,1090,50.875959,4.32457,1,Bar,Plaza,Bakery,Pizza Place,Sandwich Place,Italian Restaurant,Park,Snack Place,Convenience Store,Platform


Finally, let's visualize the resulting clusters: hover over the circle marker to display municipality name and cluster.

In [45]:
import folium
import matplotlib.cm as cm                # Matplotlib and associated plotting modules
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12, control_scale=True, tiles='OpenStreetMap')

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.ocean(np.linspace(0, 1, len(ys)))
ocean = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(brussels_merged['Latitude'], brussels_merged['Longitude'], brussels_merged['Municipality'], brussels_merged['Cluster Labels']):
    text = folium.Tooltip(str(poi)+', Cluster '+ str(cluster))
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        color=ocean[cluster-0],
        tooltip=text,
        fill=True,
        fill_color=ocean[cluster-0],
        fill_opacity=0.5).add_to(map_clusters)
       
map_clusters

### Examine Clusters

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster.

Cluster 0

In [46]:
brussels_merged.loc[brussels_merged['Cluster Labels'] == 0, brussels_merged.columns[[0] + list(range(3, brussels_merged.shape[1]))]]

Unnamed: 0,Municipality,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Anderlecht,4.329653,0,Sandwich Place,Hotel,Coffee Shop,French Restaurant,Supermarket,Greek Restaurant,Bar,Seafood Restaurant,Food Court,Metro Station
3,Bruxelles,4.351697,0,Chocolate Shop,Coffee Shop,Bar,Plaza,Bookstore,Beer Bar,Hotel,Italian Restaurant,Seafood Restaurant,Sandwich Place
4,Etterbeek,4.386174,0,Italian Restaurant,Bakery,Plaza,History Museum,Greek Restaurant,Coffee Shop,Sandwich Place,Hotel,Restaurant,Bar
8,Ixelles,4.366828,0,Boutique,Hotel,Italian Restaurant,Vegetarian / Vegan Restaurant,Bar,Tea Room,Bakery,Coffee Shop,Sandwich Place,Wine Bar
11,Molenbeek-Saint-Jean,4.338636,0,Bar,Seafood Restaurant,Hotel,Belgian Restaurant,French Restaurant,Theater,Restaurant,Plaza,Bakery,Burger Joint
12,Saint-Gilles,4.345484,0,Bar,Italian Restaurant,Brasserie,French Restaurant,Bakery,Restaurant,Plaza,Pizza Place,Hotel,Coffee Shop
13,Saint-Josse-ten-Noode,4.369163,0,Italian Restaurant,Hotel,Concert Hall,Sandwich Place,Bookstore,Pizza Place,Park,Plaza,Turkish Restaurant,Japanese Restaurant


Cluster 1

In [47]:
brussels_merged.loc[brussels_merged['Cluster Labels'] == 1, brussels_merged.columns[[0] + list(range(3, brussels_merged.shape[1]))]]

Unnamed: 0,Municipality,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Berchem-Sainte-Agathe,4.294673,1,Electronics Store,Supermarket,Restaurant,Bar,Greek Restaurant,Gym,Cosmetics Shop,Furniture / Home Store,Brasserie,Plaza
7,Ganshoren,4.307798,1,Bar,Restaurant,Italian Restaurant,Plaza,Pizza Place,Park,Supermarket,Chinese Restaurant,Hockey Field,Electronics Store
9,Jette,4.32457,1,Bar,Plaza,Bakery,Pizza Place,Sandwich Place,Italian Restaurant,Park,Snack Place,Convenience Store,Platform
14,Schaerbeek,4.373712,1,Snack Place,Supermarket,Bar,Turkish Restaurant,Gym / Fitness Center,Tram Station,Italian Restaurant,Coffee Shop,Diner,Music Venue
15,Uccle,4.333844,1,Supermarket,French Restaurant,Sandwich Place,Plaza,Bar,Bakery,Park,Pizza Place,Chinese Restaurant,Italian Restaurant


Cluster 2

In [48]:
brussels_merged.loc[brussels_merged['Cluster Labels'] == 2, brussels_merged.columns[[0] + list(range(3, brussels_merged.shape[1]))]]

Unnamed: 0,Municipality,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Auderghem,4.426898,2,Italian Restaurant,Bakery,Pharmacy,Pizza Place,Plaza,Sandwich Place,Park,Fast Food Restaurant,Restaurant,French Restaurant
16,Watermael-Boitsfort,4.417644,2,Restaurant,Bus Stop,Italian Restaurant,French Restaurant,Park,Ice Cream Shop,Gastropub,Chinese Restaurant,Farmers Market,Event Service
17,Woluwe-Saint-Lambert,4.425673,2,Italian Restaurant,French Restaurant,Supermarket,Sushi Restaurant,Fast Food Restaurant,Restaurant,Park,Gourmet Shop,Asian Restaurant,Bakery
18,Woluwe-Saint-Pierre,4.427464,2,Italian Restaurant,Park,Restaurant,Pharmacy,Supermarket,French Restaurant,Sandwich Place,Sushi Restaurant,Belgian Restaurant,Bistro


Cluster 3

In [49]:
brussels_merged.loc[brussels_merged['Cluster Labels'] == 3, brussels_merged.columns[[0] + list(range(3, brussels_merged.shape[1]))]]

Unnamed: 0,Municipality,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Evere,4.403418,3,Park,Bar,Snack Place,Supermarket,Sandwich Place,Tram Station,Paper / Office Supplies Store,Hotel,Brasserie,Train Station
6,Forest,4.318119,3,Park,Supermarket,Sandwich Place,Furniture / Home Store,Brasserie,Bus Stop,Pet Store,Bookstore,Snack Place,Bus Station
10,Koekelberg,4.33155,3,Supermarket,Snack Place,Sandwich Place,French Restaurant,Convenience Store,Hostel,Gym / Fitness Center,Italian Restaurant,Park,History Museum


The first cluster (cluster label 0, green marker on the map, 7 municipalities)
All municipalities from this cluster are regrouped in the center or near the center of Brussels. They are adjacent
one to another. With its 7 municipalities it is the largest cluster. We notice that the most common venues are
restaurants serving dishes from all over the world, bars, fast-foods and hotels. It suggests lively urban areas and
the presence of many touristes. These kinds of places might be attractive to younger expats, singles or couples
without children enjoying to go out and have fun with friends after their work, without having to do long distances to
get there.

The second cluster (cluster label 1, indigo marker on the map, 5 municipalities)
The majority of municipalities from this cluster - 4 of them - are located outside the center, in the northern part of
Brussels. They are adjacent one to another. One municipality being located outside as well but at the extreme
southern part and not adjacent to others. We count less restaurants and bars than in the first cluster and we start
to have among the most common venues parks, gyms and even a hockey field. Supermarkets are also present.
The municipalities from this cluster being located outside the center and well equipped for day to day life suggest
that they are better suited for expats families with children.

The third cluster (cluster label 2, blue marker on the map, 4 municipalities)
All municipalities from this cluster are located outside the center and close to The Sonian Forest at the southeast
edge of Brussels. They are adjacent one to another. The most common venues are parks and all kinds of
restaurants. There are also bakeries, pharmacies, supermarkets and even an ice cream shop. These municipalities
are very well suited for expats who enjoy green areas and value the near presence of the large forest where they
can go walking or biking after work.

The fourth cluster (cluster label 3, white marker on the map, 3 municipalities)
All 3 municipalities from this cluster are spread from nord to south and are not adjacent one to each other.
The most common venues are supermarkets and fast-foods. It is difficult to distinguish dominating characteristics
here and furthermore give any general recommendations.