In [1]:
import os

import geopandas as gpd
import numpy as np
import pandas as pd
import requests
import shapely
import wikipedia # Wikipedia is a Python library that makes it easy to access and parse data from Wikipedia
from shapely.geometry import Point, Polygon

# Create a pandas dataframe containing the boroughs, and the neighbourhoods they include, for the city of Munich, Germany

## Collect the required data from a wikipedia page

In [2]:
# The page we want only exists on the German Wikipedia, so switch the language first.
wikipedia.set_lang("de")

In [3]:
# Look up the article listing Munich's districts ("Stadtteile").
wiki_munich = wikipedia.page(title="Liste der Stadtteile Münchens")

In [4]:
# Parse every HTML table on the page, treating the first row of each as its header,
# and keep the second table (index 1).
wiki_tables = pd.read_html(wiki_munich.url, header=0)
df = wiki_tables[1]
df.head()

Unnamed: 0,Stadtteil,"Stadtbezirk, in dem der Stadtteil (größtenteils) liegt",Stadt-bezirks-nr.,Quartiere und Siedlungen im Stadtteil
0,Allach,Allach-Untermenzing,23,"Allach, Gerberau"
1,Altstadt,Altstadt-Lehel,1,"Angerviertel, Graggenauviertel, Hackenviertel,..."
2,Am Hart,Milbertshofen-Am Hart,11,"Am Hart, Harthof (Ostteil), Nordhaide"
3,Am Moosfeld,Trudering-Riem,15,Am Moosfeld
4,Am Riesenfeld,Milbertshofen-Am Hart,11,"Studentenviertel Oberwiesenfeld, Am Oberwiesen..."


In [5]:
# Give the four columns English names (assigned by position), then discard the
# last one, which is not needed any further.
df.columns = ['Neighbourhood', 'Borough', 'Borough #', 'Drop']
df = df.drop('Drop', axis=1)
df.head()

Unnamed: 0,Neighbourhood,Borough,Borough #
0,Allach,Allach-Untermenzing,23
1,Altstadt,Altstadt-Lehel,1
2,Am Hart,Milbertshofen-Am Hart,11
3,Am Moosfeld,Trudering-Riem,15
4,Am Riesenfeld,Milbertshofen-Am Hart,11


## Group the Neighbourhoods by the Boroughs and join them into one row

In [6]:
# One row per (Borough, Borough #) with its neighbourhood names joined into a
# single comma-separated string.
grouped = (
    df.groupby(['Borough', 'Borough #'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

In [7]:
# Preview the first rows of the per-borough table.
grouped.head()

Unnamed: 0,Borough,Borough #,Neighbourhood
0,Allach-Untermenzing,23,"Allach, Untermenzing"
1,Altstadt-Lehel,1,"Altstadt, Lehel"
2,Au-Haidhausen,5,"Au, Haidhausen"
3,Aubing-Lochhausen-Langwied,22,"Aubing, Freiham, Langwied, Lochhausen"
4,Berg am Laim,14,Berg am Laim


# Collect data on the rents and housing quality/ living conditions in each of the Boroughs/Neighbourhoods

![title](https://suedbayerische-immobilien.de/sites/default/files/Wohnqualitaet-Muenchen-Toplagen/Wohnqualitaet-Muenchen-Wohnviertel-Toplagen-Stadtteile.png)


## I have used the above image to rate the locations from 1 (worst) to 4 (best) and then wrote the data into a csv file. We will now import that file and create a pandas dataframe.

In [8]:
# Directory holding the hand-made rating CSV. It can be overridden with the
# MUNICH_DATA_DIR environment variable so the notebook is not tied to one
# machine's folder layout; the default keeps the original location.
location = os.path.join(
    os.environ.get('MUNICH_DATA_DIR', '/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich'),
    'location_rating.csv',
)
# The file is semicolon-separated.
location_df = pd.read_csv(location, sep=';')
location_df.head()

Unnamed: 0,Location,Points
0,Altstadt-Lehel,4
1,Maxvorstadt,4
2,Schwabing,4
3,Altbogenhausen,4
4,Au-Haidhausen,4


## Retrieve the Zip Codes and add them to the dataframe

In [9]:
# Source page for the ZIP code table that is parsed in the next cell.
url_zip = "https://www.muenchen.de/leben/service/postleitzahlen.html"

In [10]:
# Take the first table on the page (first row as header) and give its two
# columns the English names used throughout this notebook.
zip_tables = pd.read_html(url_zip, header=0)
df_zip = zip_tables[0]
df_zip.rename(columns={'Stadtteil': 'Borough', 'Postleitzahl': 'ZIP'}, inplace=True)
df_zip.head()

Unnamed: 0,Borough,ZIP
0,Allach-Untermenzing,"80995, 80997, 80999, 81247, 81249"
1,Altstadt-Lehel,"80331, 80333, 80335, 80336, 80469, 80538, 80539"
2,Au-Haidhausen,"81541, 81543, 81667, 81669, 81671, 81675, 81677"
3,Aubing-Lochhausen-Langwied,"81243, 81245, 81249"
4,Berg am Laim,"81671, 81673, 81735, 81825"


#### Since the names of the Boroughs differ slightly, we cannot join both dfs using the Borough column. Instead we will sort the grouped df by the Borough column and then concatenate the column with the zip codes (df_zip is also sorted alphabetically).

In [11]:
# Order the boroughs alphabetically and renumber the rows 0..n-1 so they can be
# lined up by position with another table.
grouped = grouped.sort_values('Borough').reset_index(drop=True)
grouped.head()

Unnamed: 0,Borough,Borough #,Neighbourhood
0,Allach-Untermenzing,23,"Allach, Untermenzing"
1,Altstadt-Lehel,1,"Altstadt, Lehel"
2,Au-Haidhausen,5,"Au, Haidhausen"
3,Aubing-Lochhausen-Langwied,22,"Aubing, Freiham, Langwied, Lochhausen"
4,Berg am Laim,14,Berg am Laim


In [12]:
# Attach the ZIP column to the grouped table by row position.
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the result on
# grouped's index is the documented replacement and keeps exactly grouped's rows.
df_muc = pd.concat([grouped, df_zip['ZIP']], axis=1).reindex(grouped.index)

In [13]:
# Preview the combined borough / neighbourhood / ZIP table.
df_muc.head()

Unnamed: 0,Borough,Borough #,Neighbourhood,ZIP
0,Allach-Untermenzing,23,"Allach, Untermenzing","80995, 80997, 80999, 81247, 81249"
1,Altstadt-Lehel,1,"Altstadt, Lehel","80331, 80333, 80335, 80336, 80469, 80538, 80539"
2,Au-Haidhausen,5,"Au, Haidhausen","81541, 81543, 81667, 81669, 81671, 81675, 81677"
3,Aubing-Lochhausen-Langwied,22,"Aubing, Freiham, Langwied, Lochhausen","81243, 81245, 81249"
4,Berg am Laim,14,Berg am Laim,"81671, 81673, 81735, 81825"


## Next we will collect rent data for the Neighbourhoods

In [14]:
# Source page for the rent-per-ZIP table that is parsed in the next cell.
url_rent = "https://www.miet-check.de/mietpreise/plz/muenchen/6562/"

In [15]:
# The rent table is the first table on the page; its first row is the header.
rent_tables = pd.read_html(url_rent, header=0)
df_rent = rent_tables[0]
df_rent.head(3)

Unnamed: 0,#,PLZ,Mietpreis pro m2,Anzahl Einträge,Informationen
0,1,80331,23.2 Euro,178,mehr Infos
1,2,80333,22.9 Euro,244,mehr Infos
2,3,80335,21.74 Euro,211,mehr Infos


#### We only need the columns "PLZ"-->"ZIP" and "Mietpreis pro m2"-->"rpm2" (rpm2 = rent per m2)

In [16]:
# Discard the rank, entry-count and info-link columns; only PLZ and price remain.
df_rent = df_rent.drop(['#', 'Anzahl Einträge', 'Informationen'], axis=1)

In [17]:
# Rename the two remaining columns by position: first -> 'ZIP', second -> 'rpm2'.
# The assignment raises if df_rent does not have exactly two columns.
df_rent.columns=['ZIP', 'rpm2']

#### Remove the Euro

In [18]:
# Strip the literal " Euro" suffix from the price strings.
# Removing the suffix by name (instead of slicing off the last five characters)
# keeps this cell safe to re-run: values that are already clean stay unchanged.
# The column stays a string column; it is converted to numbers further below.
df_rent['rpm2'] = df_rent['rpm2'].str.replace(' Euro', '', regex=False)

## Now we will need to add the rent data to the existing data

### Since the rent data does not match the boroughs or the neighbourhoods exactly, we will have to join the data using the ZIP codes. To achieve this we will first create a dictionary out of df_rent (with the ZIP code as the key) and then use that dictionary to replace the ZIP code values in df_zip with the rpm2 values for the corresponding ZIP code. At the end we will calculate the mean/average rent for each Borough.

#### Create the dictionary

In [19]:
# Lookup table: ZIP code -> rent per square metre (later rows win on duplicate ZIPs).
rdic = {plz: price for plz, price in zip(df_rent['ZIP'], df_rent['rpm2'])}

#### Replace the Zip codes in df_zip  with the rpm2 values

In [20]:
# Reminder of the current shape of df_zip: one comma-separated ZIP string per borough.
df_zip.head()

Unnamed: 0,Borough,ZIP
0,Allach-Untermenzing,"80995, 80997, 80999, 81247, 81249"
1,Altstadt-Lehel,"80331, 80333, 80335, 80336, 80469, 80538, 80539"
2,Au-Haidhausen,"81541, 81543, 81667, 81669, 81671, 81675, 81677"
3,Aubing-Lochhausen-Langwied,"81243, 81245, 81249"
4,Berg am Laim,"81671, 81673, 81735, 81825"


##### First we need to split up the ZIP code column into one  ZIP code per column

In [21]:
# `z` is a second name for the same DataFrame object as df_zip (no copy is made here).
z=df_zip

In [22]:
# Keep the Borough column and append one column per ZIP code, obtained by
# splitting the comma-separated ZIP string (new columns are labelled 0, 1, 2, ...).
z = z[['Borough']].join(z['ZIP'].str.split(',', expand=True))

In [23]:
# Use the borough name as the row label so only ZIP columns remain as data.
z = z.set_index('Borough')
z.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Allach-Untermenzing,80995,80997,80999,81247.0,81249.0,,,,
Altstadt-Lehel,80331,80333,80335,80336.0,80469.0,80538.0,80539.0,,
Au-Haidhausen,81541,81543,81667,81669.0,81671.0,81675.0,81677.0,,
Aubing-Lochhausen-Langwied,81243,81245,81249,,,,,,
Berg am Laim,81671,81673,81735,81825.0,,,,,


In [24]:
# From here on df_zip refers to the wide, borough-indexed ZIP table built in `z`.
df_zip=z

##### Then we can replace the values

In [25]:
# Convert every ZIP column from text to numbers, column by column.
df_zip = df_zip.apply(pd.to_numeric, axis=0)

In [26]:
# Look every ZIP code up in the rent dictionary.
# A plain DataFrame.replace would leave ZIP codes that are missing from the rent
# table untouched, so a five-digit ZIP would silently be averaged in as if it
# were a rent. An explicit lookup turns those cells into NaN instead.
# The result is derived from `z` (the untouched ZIP table) rather than from
# df_zip itself, so re-running this cell gives the same result every time.
df_zip = z.apply(pd.to_numeric).apply(
    lambda col: col.map(lambda plz: rdic.get(plz, np.nan))
)
df_zip.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Allach-Untermenzing,15.95,16.95,15.56,17.29,16.11,,,,
Altstadt-Lehel,23.2,22.9,21.74,20.38,23.22,22.76,22.25,,
Au-Haidhausen,20.24,19.53,21.27,19.08,16.39,20.21,22.01,,
Aubing-Lochhausen-Langwied,15.5,17.65,16.11,,,,,,
Berg am Laim,16.39,16.98,16.34,16.54,,,,,


##### Next we will calculate the average rent per Borough (we will need to convert the df to numeric values before we can calculate the average)

In [27]:
# The rent values looked up from rdic are strings, so convert every column to a
# numeric dtype before averaging, then show the resulting dtypes as a check.
df_zip=df_zip.apply(pd.to_numeric)
df_zip.dtypes

0    float64
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8    float64
dtype: object

In [28]:
# Row-wise mean over the numeric columns (NaN cells are skipped), stored as a
# new 'average rent' column.
df_zip = df_zip.assign(**{'average rent': df_zip.mean(axis=1, numeric_only=True)})
df_zip.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,average rent
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Allach-Untermenzing,15.95,16.95,15.56,17.29,16.11,,,,,16.372
Altstadt-Lehel,23.2,22.9,21.74,20.38,23.22,22.76,22.25,,,22.35
Au-Haidhausen,20.24,19.53,21.27,19.08,16.39,20.21,22.01,,,19.818571
Aubing-Lochhausen-Langwied,15.5,17.65,16.11,,,,,,,16.42
Berg am Laim,16.39,16.98,16.34,16.54,,,,,,16.5625


In [29]:
# Keep only the per-borough average as a one-column DataFrame.
rent = df_zip.loc[:, ['average rent']]

In [30]:
# Preview the average rent per borough.
rent.head()

Unnamed: 0_level_0,average rent
Borough,Unnamed: 1_level_1
Allach-Untermenzing,16.372
Altstadt-Lehel,22.35
Au-Haidhausen,19.818571
Aubing-Lochhausen-Langwied,16.42
Berg am Laim,16.5625


# TODO: Compare the rent with the housing quality (add the data to the DF and then sort by rent, descending). Is there a correlation with housing quality?

## In the next step we will collect data from foursquare and use that data for clustering

#### Foursquare credentials

In [31]:
# Foursquare credentials are read from the environment instead of being written
# into the notebook, so they are not leaked when the notebook is shared or
# committed. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. Credentials that were ever committed in plain text
# should be revoked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

#### Import geodata from https://public.opendatasoft.com/explore/dataset/postleitzahlen-deutschland/table/ using geopandas

In [33]:
# Directory holding the downloaded GeoJSON. It can be overridden with the
# MUNICH_DATA_DIR environment variable so the notebook is not tied to one
# machine's folder layout; the default keeps the original location.
fname = os.path.join(
    os.environ.get('MUNICH_DATA_DIR', '/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich'),
    'muc.geojson',
)
df_geo = gpd.read_file(fname)
df_geo.head()

Unnamed: 0,note,plz,geometry
0,München,80997,"POLYGON ((11.4597967 48.2117073, 11.460079 48...."
1,München,81669,"POLYGON ((11.5825324 48.1269235, 11.5837205 48..."
2,München,80686,"POLYGON ((11.490444 48.1377022, 11.4907062 48...."
3,München,80802,"POLYGON ((11.579715 48.15565839999999, 11.5798..."
4,München,80331,"POLYGON ((11.5631981 48.13660969999999, 11.563..."


#### Get the latitude/y and longitude/x of the centroid for each polygon/ZIP

In [34]:
# Compute the centroid of every ZIP polygon once and reuse it for all three
# columns (the original recomputed it for each column).
centroids = df_geo['geometry'].centroid
df_geo['latitude'] = centroids.y   # y coordinate of the centroid
df_geo['longitude'] = centroids.x  # x coordinate of the centroid
df_geo['point'] = centroids

In [35]:
# Preview the geo table with the added centroid columns.
df_geo.head()

Unnamed: 0,note,plz,geometry,latitude,longitude,point
0,München,80997,"POLYGON ((11.4597967 48.2117073, 11.460079 48....",48.191879,11.482502,POINT (11.48250248003875 48.19187885565856)
1,München,81669,"POLYGON ((11.5825324 48.1269235, 11.5837205 48...",48.120075,11.601146,POINT (11.60114591581964 48.12007527984551)
2,München,80686,"POLYGON ((11.490444 48.1377022, 11.4907062 48....",48.132127,11.512238,POINT (11.51223754740396 48.13212689623667)
3,München,80802,"POLYGON ((11.579715 48.15565839999999, 11.5798...",48.159255,11.591139,POINT (11.5911393228924 48.15925479186028)
4,München,80331,"POLYGON ((11.5631981 48.13660969999999, 11.563...",48.135964,11.572905,POINT (11.5729048502942 48.13596423775832)


In [36]:
# Reduce the geo table to ZIP code plus centroid coordinates.
df_lalo = df_geo.loc[:, ['plz', 'latitude', 'longitude']]

In [37]:
# Preview the ZIP / latitude / longitude table.
df_lalo.head()

Unnamed: 0,plz,latitude,longitude
0,80997,48.191879,11.482502
1,81669,48.120075,11.601146
2,80686,48.132127,11.512238
3,80802,48.159255,11.591139
4,80331,48.135964,11.572905


#### (We will add the Borough as an index to this data, in order to do this we are going to create a df/list with just one column from df_zip and remove the rows without a zip code and then join it with df_lalo on plz/ZIP)

In [38]:
# NOTE: disabled draft, nothing in this cell executes.
# The commented-out steps would stack the ZIP columns 0..8 of `z` into a single
# column, drop the empty entries, rename that column to 'ZIP' and sort by index.
#zl=pd.concat([z[0], z[1],z[2],z[3],z[4],z[5],z[6],z[7],z[8]])
#df_zl=pd.DataFrame(zl)
#df_zl=df_zl.dropna()
#df_zl.rename(columns={0:'ZIP'}, inplace=True)
#df_zl.sort_index(inplace=True)
#df_zl.head()

### Collect the Data from foursquare

### Extract the Data we want to use

### Cluster the Boroughs/Neighbourhoods

In [39]:
def getNearbyVenues(latitudes, longitudes, radius=1000, limit=900):
    """Query the Foursquare "explore" endpoint around each coordinate pair.

    Parameters
    ----------
    latitudes, longitudes : iterables of numbers
        Paired coordinates; they are consumed together with ``zip``, so the
        shorter one determines how many locations are queried.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter.
    limit : int, default 900
        Value sent as the ``limit`` query parameter.
        NOTE(review): the API may cap this at a much smaller number - confirm.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. The frame is empty (but has these columns) when
        nothing was found or no coordinates were given.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If a request takes longer than 30 seconds.
    """
    column_names = ['Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for lat, lng in zip(latitudes, longitudes):
        # Let requests build and escape the query string.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # Fail loudly on HTTP errors and never hang forever on a stalled request.
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # Some venues carry no category; record None instead of crashing.
            categories = venue.get('categories') or []
            rows.append((
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names here keeps the schema intact even with zero rows.
    return pd.DataFrame(rows, columns=column_names)

In [41]:
# Fetch the venues around the centroid of every ZIP area.
test_venues = getNearbyVenues(df_lalo['latitude'], df_lalo['longitude'])

In [42]:
# (rows, columns) of the venue table, i.e. venues found and fields per venue.
test_venues.shape

(4052, 6)

In [43]:
# Preview the first venues returned.
test_venues.head()

Unnamed: 0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,48.191879,11.482502,NORMA,48.183488,11.47884,Supermarket
1,48.191879,11.482502,H Manzostraße,48.183397,11.484077,Bus Stop
2,48.191879,11.482502,BrotUndKaffee,48.18345,11.478196,Bakery
3,48.191879,11.482502,München Nord Rangierbahnhof,48.194881,11.495227,Train Station
4,48.120075,11.601146,Thessaloniki,48.121549,11.599192,Greek Restaurant
