In [1]:
import pandas as pd
import numpy as np
import wikipedia # Wikipedia is a Python library that makes it easy to access and parse data from Wikipedia

### Create a pandas dataframe including the boroughs, and their including neighbourhoods, for the city of Munich, Germany

#### Collect the required data from a wikipedia page

In [2]:
wikipedia.set_lang('de') # we need to change the language since the page is in german

In [3]:
wiki_munich=wikipedia.page("Liste der Stadtteile Münchens")

In [4]:
df = pd.read_html(wiki_munich.url, header =0)[1] # we want to get the data from the second table on the page and use the first row as header
df.head()

Unnamed: 0,Stadtteil,"Stadtbezirk, in dem der Stadtteil (größtenteils) liegt",Stadt-bezirks-nr.,Quartiere und Siedlungen im Stadtteil
0,Allach,Allach-Untermenzing,23,"Allach, Gerberau"
1,Altstadt,Altstadt-Lehel,1,"Angerviertel, Graggenauviertel, Hackenviertel,..."
2,Am Hart,Milbertshofen-Am Hart,11,"Am Hart, Harthof (Ostteil), Nordhaide"
3,Am Moosfeld,Trudering-Riem,15,Am Moosfeld
4,Am Riesenfeld,Milbertshofen-Am Hart,11,"Studentenviertel Oberwiesenfeld, Am Oberwiesen..."


In [5]:
df.columns=['Neighbourhood', 'Borough', 'Borough #', 'Drop'] #rename the columns
df=df.drop(columns='Drop')# drop the 4th column
df.head()

Unnamed: 0,Neighbourhood,Borough,Borough #
0,Allach,Allach-Untermenzing,23
1,Altstadt,Altstadt-Lehel,1
2,Am Hart,Milbertshofen-Am Hart,11
3,Am Moosfeld,Trudering-Riem,15
4,Am Riesenfeld,Milbertshofen-Am Hart,11


In [6]:
#Group the Neighbourhoods by the Boroughs and join them into one row

In [7]:
grouped=df.groupby(['Borough #', 'Borough'])['Neighbourhood'].apply(list).apply(lambda x:', '.join(x)).reset_index()

In [8]:
grouped.head()

Unnamed: 0,Borough #,Borough,Neighbourhood
0,1,Altstadt-Lehel,"Altstadt, Lehel"
1,2,Ludwigsvorstadt-Isarvorstadt,"Isarvorstadt, Ludwigsvorstadt"
2,3,Maxvorstadt,Maxvorstadt
3,4,Schwabing-West,Schwabing (Westteil)
4,5,Au-Haidhausen,"Au, Haidhausen"


### Collect data on the rents and housing quality/ living conditions in each of the Boroughs/Neighbourhoods

![title](https://suedbayerische-immobilien.de/sites/default/files/Wohnqualitaet-Muenchen-Toplagen/Wohnqualitaet-Muenchen-Wohnviertel-Toplagen-Stadtteile.png)


#### I have used the above image to rate the locations from 1 (worst) to 4 (best) and the written the data into a csv file. We will now import that file and create a pandas dataframe

In [9]:
location='/Users/achimpeichl/Documents/GitHub/Coursera_Capstone/Munich/location_rating.csv'
location_df=pd.read_csv(location, sep=';')
location_df.head()

Unnamed: 0,Location,Points
0,Altstadt-Lehel,4
1,Maxvorstadt,4
2,Schwabing,4
3,Altbogenhausen,4
4,Au-Haidhausen,4


#### Retrieve the Zip Codes and add them to the dataframe

In [10]:
url_zip = 'https://www.muenchen.de/leben/service/postleitzahlen.html'

In [11]:
df_zip = pd.read_html(url_zip, header =0)[0]
df_zip.rename({'Postleitzahl': 'ZIP'}, axis='columns', inplace=True)
df_zip.head()

Unnamed: 0,Stadtteil,ZIP
0,Allach-Untermenzing,"80995, 80997, 80999, 81247, 81249"
1,Altstadt-Lehel,"80331, 80333, 80335, 80336, 80469, 80538, 80539"
2,Au-Haidhausen,"81541, 81543, 81667, 81669, 81671, 81675, 81677"
3,Aubing-Lochhausen-Langwied,"81243, 81245, 81249"
4,Berg am Laim,"81671, 81673, 81735, 81825"


Since the Names ob the Boroughs differ slightly we can not join both dfs using the Borough column, instead we will sort the grouped df using the Borough column and then concat the the column with the zip codes (df_zip is also sorted alphabetical)

In [12]:
grouped.sort_values(by=['Borough'], inplace=True)
grouped.reset_index(inplace=True)

In [13]:
grouped.head()

Unnamed: 0,index,Borough #,Borough,Neighbourhood
0,22,23,Allach-Untermenzing,"Allach, Untermenzing"
1,0,1,Altstadt-Lehel,"Altstadt, Lehel"
2,4,5,Au-Haidhausen,"Au, Haidhausen"
3,21,22,Aubing-Lochhausen-Langwied,"Aubing, Freiham, Langwied, Lochhausen"
4,13,14,Berg am Laim,Berg am Laim


In [14]:
df_muc = pd.concat([grouped, df_zip['ZIP']], axis=1, join_axes=[grouped.index])

In [15]:
df_muc.head()

Unnamed: 0,index,Borough #,Borough,Neighbourhood,ZIP
0,22,23,Allach-Untermenzing,"Allach, Untermenzing","80995, 80997, 80999, 81247, 81249"
1,0,1,Altstadt-Lehel,"Altstadt, Lehel","80331, 80333, 80335, 80336, 80469, 80538, 80539"
2,4,5,Au-Haidhausen,"Au, Haidhausen","81541, 81543, 81667, 81669, 81671, 81675, 81677"
3,21,22,Aubing-Lochhausen-Langwied,"Aubing, Freiham, Langwied, Lochhausen","81243, 81245, 81249"
4,13,14,Berg am Laim,Berg am Laim,"81671, 81673, 81735, 81825"


#### Now we need to collect rent data for the Neighbourhoods and add them to our existing data

In [16]:
url_rent='https://www.wohnungsboerse.net/mietspiegel-Muenchen/2091'

In [17]:
dfs_rent=pd.read_html(url_rent, header=0)
df_rent = pd.concat(dfs_rent[3:5])
df_rent.head()

Unnamed: 0,STADTTEIL,€/m²
0,Allach pdfData.whgDistrictPrices.push({ distr...,"16,44 €"
1,Altstadt pdfData.whgDistrictPrices.push({ dis...,"24,12 €"
2,Am Hart pdfData.whgDistrictPrices.push({ dist...,"15,82 €"
3,Au pdfData.whgDistrictPrices.push({ district:...,"21,53 €"
4,Aubing pdfData.whgDistrictPrices.push({ distr...,"17,35 €"


#### Split the string to remove evrything but the name

In [18]:
df_rent["STADTTEIL"]= df_rent['STADTTEIL'].str.split(" pdf", n = 1, expand = True)
df_rent.head()

Unnamed: 0,STADTTEIL,€/m²
0,Allach,"16,44 €"
1,Altstadt,"24,12 €"
2,Am Hart,"15,82 €"
3,Au,"21,53 €"
4,Aubing,"17,35 €"


In [19]:
df2=df_rent
df2.head()

Unnamed: 0,STADTTEIL,€/m²
0,Allach,"16,44 €"
1,Altstadt,"24,12 €"
2,Am Hart,"15,82 €"
3,Au,"21,53 €"
4,Aubing,"17,35 €"


In [20]:
df2.reset_index(inplace=True)

In [21]:
df2.head()

Unnamed: 0,index,STADTTEIL,€/m²
0,0,Allach,"16,44 €"
1,1,Altstadt,"24,12 €"
2,2,Am Hart,"15,82 €"
3,3,Au,"21,53 €"
4,4,Aubing,"17,35 €"


In [22]:
df_rent.head()

Unnamed: 0,index,STADTTEIL,€/m²
0,0,Allach,"16,44 €"
1,1,Altstadt,"24,12 €"
2,2,Am Hart,"15,82 €"
3,3,Au,"21,53 €"
4,4,Aubing,"17,35 €"


In [23]:
df['Model'] = [x.split(',')[0].replace('(', '') for x in df['Raw']]


KeyError: 'Raw'

#### Since the Locations does not match the bouroughs nor the neighbourhoods exactly we will have to explore the best way to join the data

In [None]:
def waspasst(Bezirk):
    for location in Bezirk:
        if df[df['Borough'].isin([location])].empty is True:
            print(location+' nicht gefunden')
    

In [None]:
waspasst(df_zip['Stadtteil'])

In [None]:
def matches_out(list1, list2):
    return list(set(list1).intersection(set(list2)))

In [None]:
matches_out(df['Neighbourhood'], location_df['Location'])

In [None]:
len(matches_out(df['Borough'], df_zip['Stadtteil']))