### Importing Packages

In [33]:
import pandas as pd # data processing
import fnmatch # file searching
import os
import folium # visualizing weather stations
from folium.plugins import MarkerCluster
from datetime import datetime # converting int into datetime

### Collecting all geographic data

In [34]:
# collecting all metadata with geographical informations
dfs = []
# going through all files
for filename in os.listdir('Daten/Wetterdaten'):
    if fnmatch.fnmatch(filename, 'tageswerte*'):
        for filenames in os.listdir(f'Daten/Wetterdaten/{filename}'):
            # files with "geographie" in it
            if fnmatch.fnmatch(filenames, "*Geographie*"):
                df = pd.read_csv(f"Daten/Wetterdaten/{filename}/{filenames}", encoding="unicode_escape", delimiter=";")
                # lower case, because of different labeling
                df.columns = df.columns.str.lower()
                dfs.append(df)
# all geographic data in one dataframe
geodaten = pd.concat(dfs)
geodaten = geodaten.reset_index(drop=True)
geodaten.info()
geodaten.head(200)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   stations_id    86 non-null     int64  
 1   stationshoehe  86 non-null     float64
 2   geogr.breite   86 non-null     float64
 3   geogr.laenge   86 non-null     float64
 4   von_datum      86 non-null     int64  
 5   bis_datum      86 non-null     object 
 6   stationsname   86 non-null     object 
dtypes: float64(3), int64(2), object(2)
memory usage: 4.8+ KB


Unnamed: 0,stations_id,stationshoehe,geogr.breite,geogr.laenge,von_datum,bis_datum,stationsname
0,377,181.0,49.1017,7.9920,19261101,19390831,"Bergzabern, Bad"
1,377,181.0,49.1017,7.9920,19410101,19431231,"Bergzabern, Bad"
2,377,181.0,49.1017,7.9920,19470101,19650831,"Bergzabern, Bad"
3,377,185.0,49.1040,7.9967,19650901,19770623,"Bergzabern, Bad"
4,377,180.0,49.1014,8.0034,19770624,20021231,"Bergzabern, Bad"
...,...,...,...,...,...,...,...
81,5717,178.0,51.2504,7.1455,19241001,19320331,Wuppertal-Buchenhofen
82,5717,126.0,51.2300,7.1100,19320401,20060831,Wuppertal-Buchenhofen
83,5717,134.0,51.2256,7.1052,20060901,,Wuppertal-Buchenhofen
84,5792,2956.0,47.4210,10.9848,19000719,20071016,Zugspitze


### Turning von_datum and bis_datum into datetime types

In [35]:
def zahl_zu_datum(zahl):
        try:
            datum_string = str(zahl)
            datum_datetime = datetime.strptime(datum_string, '%Y%m%d')
            datum_datetime2 = datetime.strftime(datum_datetime, '%m.%d.%Y')
            return datum_datetime2
        except:
            return None

geodaten['von_datum'] = geodaten['von_datum'].apply(zahl_zu_datum)
geodaten['von_datum'] = pd.to_datetime(geodaten["von_datum"])

geodaten["bis_datum"] = geodaten["bis_datum"].apply(zahl_zu_datum)
geodaten['bis_datum'] = pd.to_datetime(geodaten["bis_datum"])

geodaten.head(20)


Unnamed: 0,stations_id,stationshoehe,geogr.breite,geogr.laenge,von_datum,bis_datum,stationsname
0,377,181.0,49.1017,7.992,1926-11-01,1939-08-31,"Bergzabern, Bad"
1,377,181.0,49.1017,7.992,1941-01-01,1943-12-31,"Bergzabern, Bad"
2,377,181.0,49.1017,7.992,1947-01-01,1965-08-31,"Bergzabern, Bad"
3,377,185.0,49.104,7.9967,1965-09-01,1977-06-23,"Bergzabern, Bad"
4,377,180.0,49.1014,8.0034,1977-06-24,2002-12-31,"Bergzabern, Bad"
5,377,210.0,49.107,7.9967,2003-03-11,NaT,"Bergzabern, Bad"
6,433,47.0,52.4641,13.4003,1928-01-01,1943-12-31,Berlin-Tempelhof
7,433,48.0,52.4705,13.4041,1944-01-01,1945-02-28,Berlin-Tempelhof
8,433,48.0,52.4705,13.4041,1945-03-01,1947-06-30,Berlin-Tempelhof
9,433,48.2,52.4686,13.4039,1947-07-01,1951-02-28,Berlin-Tempelhof


In [36]:
geodaten.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   stations_id    86 non-null     int64         
 1   stationshoehe  86 non-null     float64       
 2   geogr.breite   86 non-null     float64       
 3   geogr.laenge   86 non-null     float64       
 4   von_datum      86 non-null     datetime64[ns]
 5   bis_datum      71 non-null     datetime64[ns]
 6   stationsname   86 non-null     object        
dtypes: datetime64[ns](2), float64(3), int64(1), object(1)
memory usage: 4.8+ KB


### Deleting all data before 1994

In [46]:
geodaten = geodaten[(geodaten['bis_datum'] >= pd.to_datetime('1994-01-01')) | (geodaten["bis_datum"].isna())]
geodaten.head(500)

Unnamed: 0,stations_id,stationshoehe,geogr.breite,geogr.laenge,von_datum,bis_datum,stationsname
4,377,180.0,49.1014,8.0034,1977-06-24,2002-12-31,"Bergzabern, Bad"
5,377,210.0,49.107,7.9967,2003-03-11,NaT,"Bergzabern, Bad"
13,433,47.72,52.4675,13.4021,1970-09-10,2022-09-12,Berlin-Tempelhof
14,433,47.74,52.4676,13.402,2022-09-13,NaT,Berlin-Tempelhof
20,555,77.0,51.4917,7.2157,1979-01-01,1994-04-30,Bochum
21,555,101.0,51.4789,7.2697,2007-03-19,2019-03-31,Bochum
22,555,110.0,51.5026,7.2289,2019-04-01,NaT,Bochum
24,850,39.0,52.5958,10.028,1979-01-01,2007-06-13,Celle
25,850,39.0,52.5959,10.0296,2007-06-14,2017-03-30,Celle
26,850,44.74,52.5959,10.0296,2017-03-31,NaT,Celle


### Using the average of the station postition

In [62]:
mittelpunkt_stationen_br = geodaten.groupby("stationsname")["geogr.breite"].mean()
mittelpunkt_stationen_la = geodaten.groupby("stationsname")["geogr.laenge"].mean()
stations_mittelwert = geodaten.groupby("stationsname")["stationshoehe"].median()

mittelpunkt_koordinaten = pd.merge(mittelpunkt_stationen_br,mittelpunkt_stationen_la, on="stationsname")
mittelpunkt_koordinaten = pd.merge(mittelpunkt_koordinaten, stations_mittelwert, on="stationsname")

mittelpunkt_koordinaten.reset_index(drop=False, inplace=True)
mittelpunkt_koordinaten.to_csv("geographie.csv")


In [63]:
median_hoehe = stations_mittelwert.median()
durschnitt_hoehe = round(stations_mittelwert.mean(),2)
print(median_hoehe)
print(durschnitt_hoehe)

130.0
357.76


In [43]:
median_breite = mittelpunkt_stationen_br.median()
median_laenge = mittelpunkt_stationen_la.median()
durschnitt_laenge = round(mittelpunkt_stationen_la.mean(),4)
durschnitt_breite = round(mittelpunkt_stationen_br.mean(),4)
median_breite

51.2278

### Plotting the locations of the weather stations

In [45]:
map_deutschland = folium.Map(location=[51.1657, 10.4515], zoom_start=6)

# Marker Cluster erstellen
marker_cluster = MarkerCluster().add_to(map_deutschland)

# Wetterstationen hinzufügen
for index, row in mittelpunkt_koordinaten.iterrows():
    folium.Marker(location=[row['geogr.breite'], row['geogr.laenge']],
                  popup=row['stationsname']).add_to(marker_cluster)
    

folium.Marker(location=[median_breite, median_laenge],
                  popup="Median").add_to(map_deutschland)

folium.Marker(location=[durschnitt_breite, durschnitt_laenge],
                  popup="Durchschnitt").add_to(map_deutschland)

map_deutschland