In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
import os
try:
	# Step one directory up from the current working directory so that the
	# relative 'data/...' paths used by later cells resolve.
	# Note: every execution of this cell moves up one more level.
	os.chdir(os.path.join(os.getcwd(), '..'))
	print(os.getcwd())
except OSError as err:
	# Best effort: stay in the current directory, but report the reason
	# instead of silently swallowing every kind of exception.
	print("Could not change working directory:", err)


In [1]:
# import libraries
import pandas as pd


 # Weather data from the German Meteorological Service (Deutscher Wetterdienst, DWD)

In [2]:
# The list of weather stations in DE is a fixed-width formatted text file.
# First, extract only the column names from its header line; they are
# separated by one or more spaces. nrows = 1 limits parsing to a single line
# after the header, and only the column labels are kept.
# The encoding is stated explicitly because the file holds accented Latin
# characters (e.g., ü) that are not valid UTF-8, the default; this matches
# the encoding used when the data rows are read.
cols_stn = pd.read_csv(
    'data/Meteo/DE/wind_hourly/FF_Stundenwerte_Beschreibung_Stationen.txt',
    sep = r"\s+",
    nrows = 1,
    encoding = "ISO-8859-1",
).columns.tolist()


In [3]:
# Next, load the station records from the same fixed-width file: the first
# two lines are skipped and the column names extracted earlier are applied.
# ISO-8859-1 is required because the text contains accented Latin
# characters (e.g., ü).
df_stn = pd.read_fwf(
    'data/Meteo/DE/wind_hourly/FF_Stundenwerte_Beschreibung_Stationen.txt',
    names = cols_stn,
    skiprows = 2,
    encoding = "ISO-8859-1",
)
df_stn.dtypes # inspect the inferred data type of every column


Stations_id        int64
von_datum          int64
bis_datum          int64
Stationshoehe      int64
geoBreite        float64
geoLaenge        float64
Stationsname      object
Bundesland        object
dtype: object

In [4]:
# translate column titles to English
# An explicit German -> English mapping is passed to rename() rather than
# calling set_axis(..., inplace = False): the inplace argument of set_axis
# was removed in pandas 2.0 and raises a TypeError there. rename() returns
# a new frame and ignores labels that are not present, so re-running this
# cell after the columns are already translated is harmless.
df_stn = df_stn.rename(columns = {
    'Stations_id': 'station_id',
    'von_datum': 'start_date',
    'bis_datum': 'end_date',
    'Stationshoehe': 'station_height',
    'geoBreite': 'latitude',
    'geoLaenge': 'longitude',
    'Stationsname': 'station_name',
    'Bundesland': 'state',
})


In [5]:
# keep only the stations whose record covers 2018-01-01 through 2018-12-31:
# drop every station that started after 2018-01-01 or ended before 2018-12-31
# (the date columns are compared as yyyymmdd integers here)
df_stn = df_stn.drop(
    index = df_stn.index[(df_stn['start_date'] > 20180101) | (df_stn['end_date'] < 20181231)]
)
df_stn # show the remaining stations


Unnamed: 0,station_id,start_date,end_date,station_height,latitude,longitude,station_name,state
1,11,19800901,20190417,680,47.9737,8.5205,Donaueschingen (Landeplatz),Baden-Württemberg
5,90,19880219,20190417,305,50.7557,9.2583,Alsfeld,Hessen
7,102,20010404,20190417,32,53.8633,8.1275,Leuchtturm Alte Weser,Niedersachsen
9,125,19740301,20190331,756,47.8342,10.8667,Altenstadt,Bayern
11,161,19801201,20190417,75,50.4237,7.4202,Andernach,Rheinland-Pfalz
12,164,19790101,20190417,54,53.0316,13.9908,Angermünde,Brandenburg
16,183,19730101,20190417,42,54.6792,13.4343,Arkona,Mecklenburg-Vorpommern
18,197,19900801,20190417,365,51.3218,9.0558,"Arolsen-Volkhardinghausen, Bad",Hessen
19,198,19610101,20190417,164,51.3745,11.2920,Artern,Thüringen
21,232,19610101,20190417,461,48.4254,10.9420,Augsburg,Bayern


In [6]:
# parse the yyyymmdd values of end_date and start_date into datetime columns
df_stn['end_date'] = pd.to_datetime(df_stn.end_date, format = "%Y%m%d")
df_stn['start_date'] = pd.to_datetime(df_stn.start_date, format = "%Y%m%d")
df_stn # show the frame with the converted date columns


Unnamed: 0,station_id,start_date,end_date,station_height,latitude,longitude,station_name,state
1,11,1980-09-01,2019-04-17,680,47.9737,8.5205,Donaueschingen (Landeplatz),Baden-Württemberg
5,90,1988-02-19,2019-04-17,305,50.7557,9.2583,Alsfeld,Hessen
7,102,2001-04-04,2019-04-17,32,53.8633,8.1275,Leuchtturm Alte Weser,Niedersachsen
9,125,1974-03-01,2019-03-31,756,47.8342,10.8667,Altenstadt,Bayern
11,161,1980-12-01,2019-04-17,75,50.4237,7.4202,Andernach,Rheinland-Pfalz
12,164,1979-01-01,2019-04-17,54,53.0316,13.9908,Angermünde,Brandenburg
16,183,1973-01-01,2019-04-17,42,54.6792,13.4343,Arkona,Mecklenburg-Vorpommern
18,197,1990-08-01,2019-04-17,365,51.3218,9.0558,"Arolsen-Volkhardinghausen, Bad",Hessen
19,198,1961-01-01,2019-04-17,164,51.3745,11.2920,Artern,Thüringen
21,232,1961-01-01,2019-04-17,461,48.4254,10.9420,Augsburg,Bayern


In [7]:
# list the data type of each column to check the result of the date conversion
df_stn.dtypes


station_id                 int64
start_date        datetime64[ns]
end_date          datetime64[ns]
station_height             int64
latitude                 float64
longitude                float64
station_name              object
state                     object
dtype: object

In [8]:
# distinct federal states represented among the remaining stations
df_stn['state'].unique()


array(['Baden-Württemberg', 'Hessen', 'Niedersachsen', 'Bayern',
       'Rheinland-Pfalz', 'Brandenburg', 'Mecklenburg-Vorpommern',
       'Thüringen', 'Berlin', 'Saarland', 'Nordrhein-Westfalen', 'Bremen',
       'Sachsen-Anhalt', 'Schleswig-Holstein', 'Sachsen', 'Hamburg'],
      dtype=object)