In [1]:
import pandas as pd
import numpy as np
import wikipedia # Wikipedia is a Python library that makes it easy to access and parse data from Wikipedia
import geopandas as gpd
import requests
import shapely
from shapely.geometry import Point, Polygon
from sklearn.cluster import KMeans # import k-means for clustering 
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium # map rendering library
import json
from geopandas import GeoDataFrame
import sys

# Create a pandas dataframe including the boroughs, and their including neighbourhoods, for the city of Munich, Germany

## Collect the required data from a wikipedia page

In [2]:
wikipedia.set_lang('de') # we need to change the language since the page is in german

In [3]:
wiki_munich=wikipedia.page("Liste der Stadtteile Münchens")

In [4]:
df_boroughs = pd.read_html(wiki_munich.url, header =0)[1] # we want to get the data from the second table on the page and use the first row as header
df_boroughs.head(2)

Unnamed: 0,Stadtteil,"Stadtbezirk, in dem der Stadtteil (größtenteils) liegt",Stadt-bezirks-nr.,Quartiere und Siedlungen im Stadtteil
0,Allach,Allach-Untermenzing,23,"Allach, Gerberau"
1,Altstadt,Altstadt-Lehel,1,"Angerviertel, Graggenauviertel, Hackenviertel,..."


In [5]:
df_boroughs.columns=['Neighbourhood', 'Borough', 'Borough #', 'Drop'] #rename the columns
df_boroughs=df_boroughs.drop(columns='Drop')# drop the 4th column
df_boroughs.head(2)

Unnamed: 0,Neighbourhood,Borough,Borough #
0,Allach,Allach-Untermenzing,23
1,Altstadt,Altstadt-Lehel,1


## Group the Neighbourhoods by the Boroughs and join them into one row

In [6]:
df_boroughs=df_boroughs.groupby(['Borough', 'Borough #'])['Neighbourhood'].apply(list).apply(lambda x:', '.join(x)).reset_index()

In [7]:
df_boroughs.head(2)

Unnamed: 0,Borough,Borough #,Neighbourhood
0,Allach-Untermenzing,23,"Allach, Untermenzing"
1,Altstadt-Lehel,1,"Altstadt, Lehel"


## In order to get geodata for the Boroughs we use https://geoconverter.hsr.ch to convert a WFS from https://www.opengov-muenchen.de/dataset/verwaltungseinheiten-der-landeshauptstadt-muenchen into a .shp file. Then we import that file and add the geometry data to df_boroughs, for that we order df_boroughs in alphabetical order and concat the column from df_bg.

In [8]:
fname='/Users/achimpeichl/Downloads/4293d9ed-3d60-4db1-ac58-8cb6998dd3a8/converted.shp/ms:bgl0.shp'

In [9]:
df_bg=gpd.read_file(fname)

In [10]:
df_bg.head(2)

Unnamed: 0,gml_id,X,Y,SB_NUMMER,NAME,FLAECHE_QM,SHAPE_AREA,SHAPE_LEN,geometry
0,,4472560.1,5332114.7,14,Berg am Laim,6314554.17067163,0.0,0.0,"POLYGON ((11.64551067005214 48.13598140892672,..."
1,,4469603.81,5329427.85,17,Obergiesing-Fasangarten,5720359.98324519,0.0,0.0,"POLYGON ((11.5980388917605 48.12258581610786, ..."


In [11]:
df_bg.X=df_bg.geometry.centroid.x
df_bg.Y=df_bg.geometry.centroid.y

In [12]:
df_bg.sort_values(by=['NAME'], inplace=True)
df_boroughs=df_boroughs.reset_index(drop=True)

In [13]:
df_boroughs = pd.concat([df_boroughs, df_bg['geometry'],df_bg['X'],df_bg['Y']], axis=1, join_axes=[df_boroughs.index])

In [14]:
df_boroughs.head(2)

Unnamed: 0,Borough,Borough #,Neighbourhood,geometry,X,Y
0,Allach-Untermenzing,23,"Allach, Untermenzing","POLYGON ((11.64551067005214 48.13598140892672,...",11.632368,48.126861
1,Altstadt-Lehel,1,"Altstadt, Lehel","POLYGON ((11.5980388917605 48.12258581610786, ...",11.593314,48.102699


# Collect data on the rents and housing quality/ living conditions in each of the Boroughs/Neighbourhoods

![title](https://suedbayerische-immobilien.de/sites/default/files/Wohnqualitaet-Muenchen-Toplagen/Wohnqualitaet-Muenchen-Wohnviertel-Toplagen-Stadtteile.png)


## I have used the above image to rate the locations from 1 (worst) to 4 (best) and the written the data into a csv file. We will now import that file and create a pandas dataframe

In [15]:
location='/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich/location_rating.csv'
location_df=pd.read_csv(location, sep=';')
location_df.head(2)

Unnamed: 0,Location,Points
0,Altstadt-Lehel,4
1,Maxvorstadt,4


## Retrieve the Zip Codes and add them to the dataframe

In [16]:
url_zip = 'https://www.muenchen.de/leben/service/postleitzahlen.html'

In [17]:
df_zip = pd.read_html(url_zip, header =0)[0]
df_zip.rename({'Stadtteil':'Borough', 'Postleitzahl': 'ZIP'}, axis='columns', inplace=True)
df_zip.head(2)

Unnamed: 0,Borough,ZIP
0,Allach-Untermenzing,"80995, 80997, 80999, 81247, 81249"
1,Altstadt-Lehel,"80331, 80333, 80335, 80336, 80469, 80538, 80539"


#### Since the Names ob the Boroughs are in alphabetical order we can just concat the ZIP column from df_zip to  df_boroughs.

In [18]:
df_muc = pd.concat([df_boroughs, df_zip['ZIP']], axis=1, join_axes=[df_boroughs.index])

In [19]:
df_muc.head(2)

Unnamed: 0,Borough,Borough #,Neighbourhood,geometry,X,Y,ZIP
0,Allach-Untermenzing,23,"Allach, Untermenzing","POLYGON ((11.64551067005214 48.13598140892672,...",11.632368,48.126861,"80995, 80997, 80999, 81247, 81249"
1,Altstadt-Lehel,1,"Altstadt, Lehel","POLYGON ((11.5980388917605 48.12258581610786, ...",11.593314,48.102699,"80331, 80333, 80335, 80336, 80469, 80538, 80539"


## Next we will collect rent data for the Neighbourhoods

In [20]:
url_rent='https://www.miet-check.de/mietpreise/plz/muenchen/6562/'

In [21]:
df_rent=pd.read_html(url_rent, header=0)[0]
df_rent.head(2)

Unnamed: 0,#,PLZ,Mietpreis pro m2,Anzahl Einträge,Informationen
0,1,80331,23.2 Euro,178,mehr Infos
1,2,80333,22.9 Euro,244,mehr Infos


#### We only need the columns "PLZ"-->"ZIP" and "Mietpreis pro m2"-->"rpm2" (rpm2 = rent per m2)

In [22]:
df_rent=df_rent.drop(columns= ['#', 'Anzahl Einträge', 'Informationen'])

In [23]:
df_rent.columns=['ZIP', 'rpm2']

#### Remove the Euro

In [24]:
df_rent['rpm2'] = df_rent['rpm2'].map(lambda x: str(x)[:-5])

## Now we will need to add the rent data to the existing data

### Since the rent data does not match the bouroughs nor the neighbourhoods exactly we will have to join the data using the ZIP codes. To achieve this we will first create a dictionary out of df_rent (with the ZIP code as the key) and then use that dictonary to replace the ZIP code values in  df_zip with the rpm2 values for the corresponding ZIP code. At the end we will calculate the mean/average rent for each Borough.

#### Create the dictonary

In [25]:
rdic=dict(zip(df_rent.ZIP,df_rent.rpm2))

#### Replace the Zip codes in df_zip  with the rpm2 values

In [26]:
df_zip.head(2)

Unnamed: 0,Borough,ZIP
0,Allach-Untermenzing,"80995, 80997, 80999, 81247, 81249"
1,Altstadt-Lehel,"80331, 80333, 80335, 80336, 80469, 80538, 80539"


##### First we need to split up the ZIP code column into one  ZIP code per column

In [27]:
z=df_zip

In [28]:
z=pd.concat([z[['Borough']], z['ZIP'].str.split(',', expand=True)], axis=1)

In [29]:
z.set_index('Borough', inplace=True)
z.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Allach-Untermenzing,80995,80997,80999,81247,81249,,,,
Altstadt-Lehel,80331,80333,80335,80336,80469,80538.0,80539.0,,


In [30]:
df_zip=z

##### Then we can replace the values

In [31]:
df_zip = df_zip.apply(pd.to_numeric)

In [32]:
df_zip.replace(rdic, inplace=True)
df_zip.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Allach-Untermenzing,15.95,16.95,15.56,17.29,16.11,,,,
Altstadt-Lehel,23.2,22.9,21.74,20.38,23.22,22.76,22.25,,


##### Next we will calculate the average rent per Borough (we will need to convert the df to numeric values before we can calculate the average)

In [33]:
df_zip=df_zip.apply(pd.to_numeric)

In [34]:
df_zip['average rent'] = df_zip.mean(numeric_only=True, axis=1)
df_zip.head(2)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,average rent
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Allach-Untermenzing,15.95,16.95,15.56,17.29,16.11,,,,,16.372
Altstadt-Lehel,23.2,22.9,21.74,20.38,23.22,22.76,22.25,,,22.35


In [35]:
rent=df_zip[['average rent']]

In [36]:
rent.reset_index(inplace=True)
rent.head(2)


Unnamed: 0,Borough,average rent
0,Allach-Untermenzing,16.372
1,Altstadt-Lehel,22.35


# Miete mit der Wohnqualität vergleichen (daten zum DF hinzufügen und dann nach Mieten (absteigend) ordnen. Korrelation zu Wohnqualität gegeben?

## In the next step we will collect data from foursquare and use that data for clustering

#### Foursquare credentials

In [37]:
CLIENT_ID = '0MJA3NYYG3U2ZY1LTZN2OYEHS3Y3WVSON2GBSO3IL4EDYVIR' # your Foursquare ID
CLIENT_SECRET = 'WGWSAF2TKVUQPE3PD0N3EOITFVBY5EYP1VCZI3BMUG0ROUS5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

#### Import geodata from https://public.opendatasoft.com/explore/dataset/postleitzahlen-deutschland/table/ using geopandas

In [38]:
fname='/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich/muc.geojson'
df_geo=gpd.read_file(fname)
df_geo.head(2)

Unnamed: 0,note,plz,geometry
0,München,80997,"POLYGON ((11.4597967 48.2117073, 11.460079 48...."
1,München,81669,"POLYGON ((11.5825324 48.1269235, 11.5837205 48..."


#### Get the latitude/y and longitude/x of the centroid for each polygon/ZIP

In [39]:
df_geo['latitude']=df_geo['geometry'].centroid.y
df_geo['longitude']=df_geo['geometry'].centroid.x
df_geo['point']=df_geo['geometry'].centroid

In [40]:
df_geo['geometry'].dtypes

dtype('O')

In [41]:
df_geo.head(2)

Unnamed: 0,note,plz,geometry,latitude,longitude,point
0,München,80997,"POLYGON ((11.4597967 48.2117073, 11.460079 48....",48.191879,11.482502,POINT (11.48250248003875 48.19187885565856)
1,München,81669,"POLYGON ((11.5825324 48.1269235, 11.5837205 48...",48.120075,11.601146,POINT (11.60114591581964 48.12007527984551)


In [42]:
df_lalo=df_geo[['plz','latitude','longitude']]

### Collect the Data from foursquare

In [70]:
def getNearbyVenues(latitudes, longitudes):
    radius=1150
    LIMIT=700
    venues_list=[]
    for lat, lng in zip(latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([( 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [71]:
df_venues_ = getNearbyVenues(latitudes=df_lalo['latitude'], longitudes=df_lalo['longitude'])

In [45]:
df_venues = df_venues_.copy()

In [46]:
df_venues.head(2)

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,48.191879,11.482502,Trattoria Olive,48.189905,11.46697,Trattoria/Osteria
1,48.191879,11.482502,Dehner,48.192386,11.500665,Garden Center


### Since not every Venue has a ZIP code in it's description we need to find another way to assign a ZIP code to each venue. To achieve this we use geopandas and shapley. First we use the longitude and latitude values of each venue to create a POINT. The df_geo dataframe contains a POLYGON that describes the boundaries for each ZIP code. This allows us then to check for each POLYGON (in the df_geo dataframe) if any of the POINTS in  df_venues is within that POLYGON. If True we then  write the correspondending ZIP code into a new column 'ZIP' in df_venues.

In [47]:
df_venues = gpd.GeoDataFrame(
    df_venues, geometry=gpd.points_from_xy(df_venues['Venue Longitude'], df_venues['Venue Latitude']))

In [48]:
df_venues.head(2)

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry
0,48.191879,11.482502,Trattoria Olive,48.189905,11.46697,Trattoria/Osteria,POINT (11.46697010051472 48.18990470844716)
1,48.191879,11.482502,Dehner,48.192386,11.500665,Garden Center,POINT (11.50066461188822 48.19238559597119)


In [49]:
x=0
for row in df_geo.iterrows():
    df_venues.loc[df_venues.geometry.within(df_geo.geometry[x]), 'ZIP'] = df_geo.plz[x]
    x=x+1

In [50]:
df_venues


Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry,ZIP
0,48.191879,11.482502,Trattoria Olive,48.189905,11.466970,Trattoria/Osteria,POINT (11.46697010051472 48.18990470844716),80999
1,48.191879,11.482502,Dehner,48.192386,11.500665,Garden Center,POINT (11.50066461188822 48.19238559597119),80993
2,48.191879,11.482502,Sport Bittl,48.191447,11.466553,Sporting Goods Shop,POINT (11.46655298953007 48.19144707663582),80999
3,48.191879,11.482502,Harem - The art of turkish cuisine,48.179742,11.485618,Turkish Restaurant,POINT (11.48561753386705 48.1797424336464),80997
4,48.191879,11.482502,Hit,48.192460,11.499244,Supermarket,POINT (11.49924428639979 48.19245989722315),80997
5,48.191879,11.482502,dm-drogerie markt,48.194118,11.465640,Drugstore,POINT (11.46563990065636 48.1941177124669),80999
6,48.191879,11.482502,NORMA,48.183488,11.478840,Supermarket,POINT (11.47884015458715 48.18348755505669),80997
7,48.191879,11.482502,Rossmann,48.193301,11.466388,Drugstore,POINT (11.46638833281933 48.19330073453552),80999
8,48.191879,11.482502,Lidl,48.194468,11.465456,Supermarket,POINT (11.46545648574829 48.19446839874346),80999
9,48.191879,11.482502,EDEKA,48.181741,11.476108,Supermarket,POINT (11.47610792 48.18174069),80997


## Now the same for bouroughs from df_muc

In [51]:
x=0
for row in df_boroughs.iterrows():
    df_venues.loc[df_venues.geometry.within(df_boroughs.geometry[x]), 'Borough'] = df_boroughs['Borough'][x]
    x=x+1

In [52]:
df_venues.head()

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,geometry,ZIP,Borough
0,48.191879,11.482502,Trattoria Olive,48.189905,11.46697,Trattoria/Osteria,POINT (11.46697010051472 48.18990470844716),80999,Aubing-Lochhausen-Langwied
1,48.191879,11.482502,Dehner,48.192386,11.500665,Garden Center,POINT (11.50066461188822 48.19238559597119),80993,Pasing-Obermenzing
2,48.191879,11.482502,Sport Bittl,48.191447,11.466553,Sporting Goods Shop,POINT (11.46655298953007 48.19144707663582),80999,Aubing-Lochhausen-Langwied
3,48.191879,11.482502,Harem - The art of turkish cuisine,48.179742,11.485618,Turkish Restaurant,POINT (11.48561753386705 48.1797424336464),80997,Aubing-Lochhausen-Langwied
4,48.191879,11.482502,Hit,48.19246,11.499244,Supermarket,POINT (11.49924428639979 48.19245989722315),80997,Pasing-Obermenzing


#### Since we did a search by radius we will have duplicates in the dataframe (some venues might be within the radius from more than one ZIP centroid). Right now df_venues.drop_duplicates() will not eliminated those duplicates since they differ in the 'Neighborhood Latitude/Longitude' columns ('geometry' is also dropped since it creates problems performin .drop_duplicates ). So we can drop those columns and then use .drop_duplicates

In [53]:
df_venues.drop(columns=['Neighborhood Latitude', 'Neighborhood Longitude', 'geometry'], inplace=True)

In [54]:
df_venues.drop_duplicates(inplace=True)

In [55]:
df_venues.shape

(2766, 6)

In [56]:
df_venues.groupby('Borough').count()

Unnamed: 0_level_0,Venue,Venue Latitude,Venue Longitude,Venue Category,ZIP
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Allach-Untermenzing,76,76,76,76,76
Altstadt-Lehel,71,71,71,71,71
Au-Haidhausen,161,161,161,161,161
Aubing-Lochhausen-Langwied,36,36,36,36,36
Berg am Laim,150,150,150,150,150
Bogenhausen,52,52,52,52,52
Feldmoching-Hasenbergl,99,99,99,99,99
Hadern,160,160,160,160,160
Laim,80,80,80,80,79
Ludwigsvorstadt-Isarvorstadt,38,38,38,38,38


# Consolidate all important data into one df

In [57]:
df_muc['latitude']=df_muc['Y']
df_muc['longitude']=df_muc['X']

In [58]:
df_munich=df_muc.drop(columns='geometry')

In [59]:
df_munich = df_munich.join(rent.set_index('Borough'), on='Borough')

In [60]:
df_munich.head(2)

Unnamed: 0,Borough,Borough #,Neighbourhood,X,Y,ZIP,latitude,longitude,average rent
0,Allach-Untermenzing,23,"Allach, Untermenzing",11.632368,48.126861,"80995, 80997, 80999, 81247, 81249",48.126861,11.632368,16.372
1,Altstadt-Lehel,1,"Altstadt, Lehel",11.593314,48.102699,"80331, 80333, 80335, 80336, 80469, 80538, 80539",48.102699,11.593314,22.35


# Now lets analyze the Boroughs

In [61]:
# one hot encoding
muc_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
muc_onehot['Borough'] = df_venues['Borough'] 

# move neighborhood column to the first column
cols=list(muc_onehot.columns.values)
cols.pop(cols.index('Borough'))
muc_onehot=muc_onehot[['Borough']+cols]

muc_onehot.head(2)

Unnamed: 0,Borough,ATM,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,...,Vietnamese Restaurant,Volleyball Court,Water Park,Waterfall,Wine Bar,Wine Shop,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Aubing-Lochhausen-Langwied,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Pasing-Obermenzing,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [62]:
muc_grouped = muc_onehot.groupby('Borough').mean().reset_index()
muc_grouped

Unnamed: 0,Borough,ATM,Accessories Store,Afghan Restaurant,African Restaurant,American Restaurant,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,...,Vietnamese Restaurant,Volleyball Court,Water Park,Waterfall,Wine Bar,Wine Shop,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Allach-Untermenzing,0.0,0.013158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Altstadt-Lehel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Au-Haidhausen,0.0,0.0,0.0,0.0,0.006211,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Aubing-Lochhausen-Langwied,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Berg am Laim,0.0,0.0,0.006667,0.0,0.0,0.0,0.006667,0.0,0.0,...,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Bogenhausen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Feldmoching-Hasenbergl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.010101,0.0,0.0,0.0,0.0,0.010101,0.0,0.0,0.0,0.0
7,Hadern,0.0,0.0,0.00625,0.00625,0.0,0.0,0.0,0.0,0.0125,...,0.01875,0.0,0.0,0.0,0.0125,0.0,0.00625,0.00625,0.0,0.0
8,Laim,0.0,0.0,0.0,0.0,0.0,0.0125,0.0,0.0,0.0,...,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0125,0.2
9,Ludwigsvorstadt-Isarvorstadt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's print each neighborhood along with the top 5 most common venues

In [63]:
num_top_venues = 5

for borough in muc_grouped['Borough']:
    print("----"+borough+"----")
    temp = muc_grouped[muc_grouped['Borough'] == borough].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Allach-Untermenzing----
                venue  freq
0         Supermarket  0.11
1               Hotel  0.07
2  Italian Restaurant  0.07
3           Drugstore  0.05
4        Tram Station  0.05


----Altstadt-Lehel----
               venue  freq
0        Supermarket  0.08
1   Greek Restaurant  0.07
2              Hotel  0.07
3  German Restaurant  0.06
4     Ice Cream Shop  0.06


----Au-Haidhausen----
               venue  freq
0        Supermarket  0.09
1             Bakery  0.07
2  German Restaurant  0.06
3           Bus Stop  0.05
4          Drugstore  0.05


----Aubing-Lochhausen-Langwied----
                venue  freq
0         Supermarket  0.14
1            Bus Stop  0.11
2              Bakery  0.08
3  Italian Restaurant  0.06
4  Light Rail Station  0.06


----Berg am Laim----
                venue  freq
0                Café  0.15
1  Italian Restaurant  0.05
2          Steakhouse  0.03
3          Art Museum  0.03
4      Ice Cream Shop  0.03


----Bogenhausen----
         venu

### Let's put that into a pandas dataframe

#### First, let's write a function to sort the venues in descending order.

In [64]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

#### Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [65]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['Borough'] = muc_grouped['Borough']

for ind in np.arange(muc_grouped.shape[0]):
    boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(muc_grouped.iloc[ind, :], num_top_venues)

boroughs_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allach-Untermenzing,Supermarket,Hotel,Italian Restaurant,Tram Station,Bakery,Drugstore,Asian Restaurant,Bank,Doner Restaurant,Gym / Fitness Center
1,Altstadt-Lehel,Supermarket,Hotel,Greek Restaurant,Ice Cream Shop,German Restaurant,Café,Bakery,Pizza Place,Plaza,Indian Restaurant
2,Au-Haidhausen,Supermarket,Bakery,German Restaurant,Bus Stop,Drugstore,Italian Restaurant,Hotel,Ice Cream Shop,Greek Restaurant,Gym / Fitness Center
3,Aubing-Lochhausen-Langwied,Supermarket,Bus Stop,Bakery,Sporting Goods Shop,Hotel,Light Rail Station,Italian Restaurant,Bavarian Restaurant,Drugstore,Automotive Shop
4,Berg am Laim,Café,Italian Restaurant,Bar,Art Museum,Vietnamese Restaurant,Plaza,Steakhouse,Ice Cream Shop,Restaurant,Sushi Restaurant


## Cluster Neighbourhoods

#### Run k-means to cluster the neighbourhood into 5 clusters.

In [66]:
# set number of clusters
kclusters = 5

muc_grouped_clustering = muc_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(muc_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 1, 3, 2, 0, 2, 3, 0, 4, 2, 1, 3, 1, 2, 3, 3, 1, 0, 1, 0, 0, 3,
       3, 1, 3], dtype=int32)

#### Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [67]:
muc_merged = df_munich

# add clustering labels
muc_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
muc_merged = muc_merged.join(boroughs_venues_sorted.set_index('Borough'), on='Borough')

muc_merged.head() # check the last columns!

Unnamed: 0,Borough,Borough #,Neighbourhood,X,Y,ZIP,latitude,longitude,average rent,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Allach-Untermenzing,23,"Allach, Untermenzing",11.632368,48.126861,"80995, 80997, 80999, 81247, 81249",48.126861,11.632368,16.372,3,Supermarket,Hotel,Italian Restaurant,Tram Station,Bakery,Drugstore,Asian Restaurant,Bank,Doner Restaurant,Gym / Fitness Center
1,Altstadt-Lehel,1,"Altstadt, Lehel",11.593314,48.102699,"80331, 80333, 80335, 80336, 80469, 80538, 80539",48.102699,11.593314,22.35,1,Supermarket,Hotel,Greek Restaurant,Ice Cream Shop,German Restaurant,Café,Bakery,Pizza Place,Plaza,Indian Restaurant
2,Au-Haidhausen,5,"Au, Haidhausen",11.513538,48.086758,"81541, 81543, 81667, 81669, 81671, 81675, 81677",48.086758,11.513538,19.818571,3,Supermarket,Bakery,German Restaurant,Bus Stop,Drugstore,Italian Restaurant,Hotel,Ice Cream Shop,Greek Restaurant,Gym / Fitness Center
3,Aubing-Lochhausen-Langwied,22,"Aubing, Freiham, Langwied, Lochhausen",11.46332,48.193009,"81243, 81245, 81249",48.193009,11.46332,16.42,2,Supermarket,Bus Stop,Bakery,Sporting Goods Shop,Hotel,Light Rail Station,Italian Restaurant,Bavarian Restaurant,Drugstore,Automotive Shop
4,Berg am Laim,14,Berg am Laim,11.564687,48.148558,"81671, 81673, 81735, 81825",48.148558,11.564687,16.5625,0,Café,Italian Restaurant,Bar,Art Museum,Vietnamese Restaurant,Plaza,Steakhouse,Ice Cream Shop,Restaurant,Sushi Restaurant


# Clusters visualization - Munich Boroughs

### Let's create a map of Munich showing the clusters

In [68]:
# Latitude and longitude for Munich:
latitude=48.137154
longitude=11.576124

In [69]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(muc_merged['latitude'], muc_merged['longitude'], muc_merged['Borough'], muc_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
folium.GeoJson(
    bezirke,
    name='geojson'
).add_to(map_clusters)

folium.LayerControl().add_to(map_clusters)
                   
    
map_clusters

NameError: name 'bezirke' is not defined

#### Create a choropleth map using the shapefile for the Boroughs and the rent data

In [None]:
# I used https://mapshaper.org to convert the shapefile int a geojson file
#gjson='/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich/ms_bgl0.json'

In [None]:
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar
from bokeh.palettes import brewer

In [76]:
df_choro=pd.concat([df_munich[['Borough', 'average rent']], df_muc['geometry']], axis=1)

In [77]:
gdf_choro=GeoDataFrame(df_choro) #convert df_choro to a geodataframe

In [82]:
gdf_choro.head(1)

Unnamed: 0,Borough,average rent,geometry
0,Allach-Untermenzing,16.372,"POLYGON ((11.64551067005214 48.13598140892672,..."


In [83]:
gdf_choro.to_file('/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich/choro.geojson',driver='GeoJSON')
#write the geodataframe to a geojson file

In [89]:
bezirke = '/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich/choro.geojson'

In [90]:
# Latitude and longitude for Munich:
latitude=48.137154
longitude=11.576124
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
    
#create a choropleth layer displaying the average rent
map_clusters.choropleth(
    geo_data=bezirke,
    name='choropleth',
    data=gdf_choro,
    columns=['Borough', 'average rent'],
    key_on='feature.properties.Borough',
    fill_color='YlGn',
    fill_opacity=0.5,
    line_opacity=0.2,
    legend_name='average rent'
)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(muc_merged['latitude'], muc_merged['longitude'], muc_merged['Borough'], muc_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_clusters)

folium.LayerControl().add_to(map_clusters)


#display the map
map_clusters

# Use ZIP instead of Borough