# Clustering Frankfurt's Neighborhoods

Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

import requests # Return HTML pages as string
import json # Extract data from JSON files
import folium # Map plotting library
from geopy.geocoders import Nominatim # Convert an address into latitude and longitude values
from bs4 import BeautifulSoup # Extract data from HTML page
import warnings # Suppres Pandas Warnings

import the neighborhoods csv file and preprocess the dataset

In [2]:
# Suppress pandas warning on .loc
warnings.filterwarnings('ignore')

# Read the csv file containing the names of districts in the Frankfurt Municipality
neighborhoods = pd.read_csv('ffm_neighborhoods.csv', index_col=False)

neighborhoods['Latitude'] = np.nan
neighborhoods['Longitude'] = np.nan

geolocator = Nominatim(user_agent='to_explorer')

for neighborhood in neighborhoods['Neighborhood']:
    location = geolocator.geocode(neighborhood + ', Frankfurt am Main')
    if location:
        neighborhoods.loc[neighborhoods['Neighborhood']==neighborhood, 'Latitude'] = location.latitude
        neighborhoods.loc[neighborhoods['Neighborhood']==neighborhood, 'Longitude'] = location.longitude
        
neighborhoods.head()        

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Gallus,50.10384,8.643101
1,Gutleutviertel,50.099203,8.663941
2,Bahnhofsviertel,50.099203,8.663941
3,Altstadt,50.108052,8.682161
4,Innenstadt,50.108052,8.682161


Clean the dataframe

In [3]:
neighborhoods.dropna(inplace=True)
neighborhoods.drop_duplicates(subset=['Latitude', 'Longitude'], inplace=True)
neighborhoods.reset_index(drop=True, inplace=True)
neighborhoods.shape

(40, 3)

Visualize districts on the map

In [4]:
geolocator = Nominatim(user_agent='to_explorer')
location = geolocator.geocode('Frankfurt am Main')


map_ffm = folium.Map(location=[location.latitude, location.longitude], zoom_start=12)

for latitude, longitude, district in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Neighborhood']):
    label = folium.Popup(str(district))
    folium.CircleMarker([latitude, longitude], radius=5, popup=label, fill=True, fill_opacity=0.7).add_to(map_ffm)
       
map_ffm

<br>
Defining a function that will exctract venue data with the Fourquare API
<br>
<br>

In [4]:
def getNearbyVenues(names, latitudes, longitudes, radius=1500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 
                  'Venue Latitude', 'Venue Longitude', 'Venue Category']
    print('Venues extracted!')
    return(nearby_venues)

In [5]:
CLIENT_ID = '#################################' # your Foursquare ID
CLIENT_SECRET = '#################################' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 200 # limit of number of venues returned by Foursquare API

ffm = getNearbyVenues(names=neighborhoods['Neighborhood'], latitudes=neighborhoods['Latitude'], longitudes=neighborhoods['Longitude'])
ffm.head()

Venues extracted!


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Gallus,50.10384,8.643101,DORMERO Hotel,50.108106,8.644107,Hotel
1,Gallus,50.10384,8.643101,Basaglia,50.104273,8.64717,Café
2,Gallus,50.10384,8.643101,Capri by Fraser Frankfurt,50.108896,8.64879,Hotel
3,Gallus,50.10384,8.643101,Alim's Fischimbiss,50.102392,8.63699,Seafood Restaurant
4,Gallus,50.10384,8.643101,Melody Karaoke,50.10106,8.634424,Karaoke Bar


In [6]:
# Eliminate duplicate venues
ffm.drop_duplicates(subset='Venue', inplace=True)

print('The number of venues extracted is: ' + str(ffm.shape[0]) + '.')
print('There are {} uniques venue categories.'.format(len(ffm['Venue Category'].unique())))
print('Those venues are located in {} different neighborhoods.'.format(len(ffm['Neighborhood'].unique())))

# Export the results in csv to avoid querying Foursquare too much
#ffm.to_csv('ffm_venues.csv', index=False)

The number of venues extracted is: 1198.
There are 239 uniques venue categories.
Those venues are located in 38 different neighborhoods.


In [7]:
geolocator = Nominatim(user_agent='to_explorer')
location = geolocator.geocode('Frankfurt am Main')

map_ffm = folium.Map(location=[location.latitude, location.longitude], zoom_start=12)

for latitude, longitude, venue in zip(ffm['Venue Latitude'], ffm['Venue Longitude'], ffm['Venue']):
    label = folium.Popup(str(venue))
    folium.CircleMarker([latitude, longitude], radius=4, popup=label, fill=True, fill_opacity=0.7).add_to(map_ffm)
       
map_ffm

In [8]:
ffm.groupby('Neighborhood')['Venue'].count()

Neighborhood
Altstadt             71
Bergen-Enkheim        8
Berkersheim           9
Bockenheim           82
Bonames              15
Dornbusch            17
Eckenheim            23
Eschersheim           6
Fechenheim           16
Flughafen            92
Gallus              100
Ginnheim             18
Gutleutviertel       77
Hausen               14
Heddernheim          14
Höchst               15
Kalbach-Riedberg     13
Nied                 17
Nieder-Erlenbach      5
Nieder-Eschbach       5
Niederrad            58
Niederursel          16
Nordend-Ost          55
Nordend-West         21
Oberrad              88
Ostend               46
Praunheim            29
Preungesheim          4
Riederwald           28
Rödelheim            40
Schwanheim           27
Seckbach             26
Sindlingen            5
Sossenheim           18
Unterliederbach      43
Westend-Nord         46
Westend-Süd          25
Zeilsheim             6
Name: Venue, dtype: int64

## Analyze Neighborhoods

In [9]:
# lets get a one hot encoding of all differen
ffm_onehot = pd.get_dummies(ffm[['Venue Category']], prefix="", prefix_sep="")

# add District column to dataframe
ffm_onehot.insert(0, 'Neighborhood', ffm['Neighborhood'])
ffm_onehot.shape

(1198, 240)

Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [10]:
ffm_grouped = ffm_onehot.groupby('Neighborhood').mean().reset_index()
ffm_grouped.head(10)

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Apple Wine Pub,Arcade,...,Transportation Service,Trattoria/Osteria,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Water Park,Waterfront,Wine Bar,Wine Shop
0,Altstadt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056338,0.0,...,0.0,0.028169,0.0,0.0,0.0,0.014085,0.0,0.0,0.028169,0.0
1,Bergen-Enkheim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0
2,Berkersheim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bockenheim,0.0,0.0,0.0,0.0,0.0,0.0,0.012195,0.0,0.0,...,0.0,0.0,0.02439,0.0,0.0,0.012195,0.0,0.0,0.036585,0.0
4,Bonames,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Dornbusch,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0
6,Eckenheim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Eschersheim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Fechenheim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Flughafen,0.01087,0.0,0.01087,0.195652,0.119565,0.01087,0.01087,0.0,0.0,...,0.01087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01087,0.0


In [11]:
ffm_grouped.shape

(38, 240)

In [12]:
num_top_venues = 5

for hood in ffm_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ffm_grouped[ffm_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 3})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Altstadt----
               venue   freq
0               Café  0.099
1     Apple Wine Pub  0.056
2  German Restaurant  0.056
3                Bar  0.042
4              Plaza  0.042


----Bergen-Enkheim----
                venue   freq
0      Ice Cream Shop  0.125
1  Italian Restaurant  0.125
2             Taverna  0.125
3   German Restaurant  0.125
4               Trail  0.125


----Berkersheim----
                venue   freq
0         Supermarket  0.111
1            Pharmacy  0.111
2   German Restaurant  0.111
3  Light Rail Station  0.111
4            Bus Stop  0.111


----Bockenheim----
                venue   freq
0  Italian Restaurant  0.122
1                Café  0.122
2    Asian Restaurant  0.061
3    Botanical Garden  0.049
4                 Bar  0.037


----Bonames----
                venue   freq
0                Café  0.133
1        Burger Joint  0.067
2  Chinese Restaurant  0.067
3         Golf Course  0.067
4   Electronics Store  0.067


----Dornbusch----
             

Let's put that into a pandas dataframe
First, let's write a function to sort the venues in descending order.

In [13]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:] # exclude District column
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [14]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
district_venues_sorted = pd.DataFrame(columns=columns)
district_venues_sorted['Neighborhood'] = ffm_grouped['Neighborhood']

for ind in np.arange(ffm_grouped.shape[0]):
    district_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ffm_grouped.iloc[ind, :], num_top_venues)

district_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Altstadt,Café,Apple Wine Pub,German Restaurant,Plaza,Bar,Scenic Lookout,Thai Restaurant,Falafel Restaurant,Electronics Store,Italian Restaurant
1,Bergen-Enkheim,Trail,Taverna,Water Park,Plaza,Ice Cream Shop,Paper / Office Supplies Store,Italian Restaurant,German Restaurant,Wine Shop,Donut Shop
2,Berkersheim,Bakery,Train Station,Hotel,Soccer Field,Light Rail Station,Pharmacy,Bus Stop,Supermarket,German Restaurant,Duty-free Shop
3,Bockenheim,Italian Restaurant,Café,Asian Restaurant,Botanical Garden,Bakery,Wine Bar,Bar,Spanish Restaurant,Pizza Place,Japanese Restaurant
4,Bonames,Café,Italian Restaurant,Metro Station,Event Service,Electronics Store,Burger Joint,Garden Center,Doner Restaurant,Athletics & Sports,Golf Course


## Cluster Neighborhoods

In [15]:
num_clusters = 5

X = ffm_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(X)

Create a new dataframe that includes the cluster and the top 10 venues for each neighborhood.

In [16]:
# add clustering labels
district_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ffm_merged = neighborhoods

# merge labels and data about venues to district data and latitude plus longitude data to have all in one dataframe
ffm_merged = ffm_merged.join(district_venues_sorted.set_index('Neighborhood'), how='inner', on='Neighborhood')

ffm_merged.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Gallus,50.10384,8.643101,0,Hotel,Italian Restaurant,Restaurant,Asian Restaurant,Pizza Place,Café,Indian Restaurant,Coffee Shop,Supermarket,Japanese Restaurant
1,Gutleutviertel,50.099203,8.663941,0,Café,Hotel,Seafood Restaurant,Vietnamese Restaurant,Indian Restaurant,Bar,Art Museum,Asian Restaurant,Cocktail Bar,Bakery
2,Altstadt,50.108052,8.682161,0,Café,Apple Wine Pub,German Restaurant,Plaza,Bar,Scenic Lookout,Thai Restaurant,Falafel Restaurant,Electronics Store,Italian Restaurant
3,Bockenheim,50.123311,8.646056,0,Italian Restaurant,Café,Asian Restaurant,Botanical Garden,Bakery,Wine Bar,Bar,Spanish Restaurant,Pizza Place,Japanese Restaurant
4,Westend-Süd,50.115245,8.66227,0,Café,Steakhouse,Park,Japanese Restaurant,Italian Restaurant,Breakfast Spot,Bookstore,Food & Drink Shop,Market,Lounge


Visualize clusters in a Map

In [17]:
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=12)

# set color scheme for the clusters
indian_red = '#CD5C5C'
blue = '#2980B9'
purple = '#5B2C6F'
gold = '#F1C40F'
green = '#239B56'
x = np.arange(num_clusters)
rainbow = [indian_red, blue, purple, gold, green]
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ffm_merged['Latitude'], ffm_merged['Longitude'], ffm_merged['Neighborhood'], ffm_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

In [18]:
cluster0 = ffm_merged.loc[ffm_merged['Cluster Labels'] == 0, ffm_merged.columns[[0] + list(range(4, ffm_merged.shape[1]))]]
cluster0['1st Most Common Venue'].value_counts()

Café                  7
Italian Restaurant    3
Airport Lounge        1
Clothing Store        1
Hotel                 1
Mexican Restaurant    1
Restaurant            1
Park                  1
Name: 1st Most Common Venue, dtype: int64

In [19]:
cluster1 = ffm_merged.loc[ffm_merged['Cluster Labels'] == 1, ffm_merged.columns[[0] + list(range(4, ffm_merged.shape[1]))]]
cluster1['1st Most Common Venue'].value_counts()

Intersection    1
Name: 1st Most Common Venue, dtype: int64

In [20]:
cluster2 = ffm_merged.loc[ffm_merged['Cluster Labels'] == 2, ffm_merged.columns[[0] + list(range(4, ffm_merged.shape[1]))]]
cluster2['1st Most Common Venue'].value_counts()

Furniture / Home Store    1
Name: 1st Most Common Venue, dtype: int64

In [21]:
cluster3 = ffm_merged.loc[ffm_merged['Cluster Labels'] == 3, ffm_merged.columns[[0] + list(range(4, ffm_merged.shape[1]))]]
cluster3['1st Most Common Venue'].value_counts()

Hotel                  4
German Restaurant      3
Metro Station          3
Tram Station           2
Bakery                 2
Sporting Goods Shop    1
Park                   1
Bowling Alley          1
Trail                  1
Intersection           1
Name: 1st Most Common Venue, dtype: int64

In [22]:
cluster4 = ffm_merged.loc[ffm_merged['Cluster Labels'] == 4, ffm_merged.columns[[0] + list(range(4, ffm_merged.shape[1]))]]
cluster4['1st Most Common Venue'].value_counts()

Hotel    1
Name: 1st Most Common Venue, dtype: int64