In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
import os
try:
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError as err:
    # Best effort only: stay in the current directory if the parent cannot be
    # entered, but report it instead of silently hiding the failure.
    # Only OS-level errors are caught, so interrupts are no longer swallowed.
    print('Could not change to the workspace root: {}'.format(err))


 # Weather data from the German meteorological service

In [1]:
# import libraries
import pandas as pd


In [2]:
# set jupyter notebook display options
# allow up to 900 rows to be shown before a displayed dataframe is truncated
pd.set_option('display.max_rows', 900)
# prevent truncation of text strings in columns
# (None means "no limit"; the legacy value -1 was deprecated in pandas 1.0)
pd.set_option('display.max_colwidth', None)


 ## Wind data

In [3]:
# read fixed width formatted text file with list of weather stations in DE
# first, extract list of column names (separated by space(s)) from the header line
# the encoding is given explicitly, matching the one used when the data rows
# of this same file are loaded, instead of relying on the default decoding
cols_stn = pd.read_csv(
    'data/Meteo/DE/wind_hourly/FF_Stundenwerte_Beschreibung_Stationen.txt',
    sep=r"\s+",
    encoding="ISO-8859-1",
    nrows=1,
).columns.tolist()


In [4]:
# load the station records themselves: the first two lines of the file are
# skipped and the previously extracted column names are applied instead
# ISO-8859-1 decoding is needed for the accented latin characters (e.g., ü)
df_stn = pd.read_fwf(
    'data/Meteo/DE/wind_hourly/FF_Stundenwerte_Beschreibung_Stationen.txt',
    names=cols_stn,
    skiprows=2,
    encoding="ISO-8859-1",
)
# show the data type inferred for each column
df_stn.dtypes


Stations_id      int64  
von_datum        int64  
bis_datum        int64  
Stationshoehe    int64  
geoBreite        float64
geoLaenge        float64
Stationsname     object 
Bundesland       object 
dtype: object

In [5]:
# translate column titles to English
# an explicit old -> new mapping is used so the renaming does not depend on the
# column order; it also avoids DataFrame.set_axis(..., inplace=...), whose
# `inplace` argument was removed in pandas 2.0
df_stn = df_stn.rename(columns={
    'Stations_id': 'station_id',
    'von_datum': 'start_date',
    'bis_datum': 'end_date',
    'Stationshoehe': 'station_height',
    'geoBreite': 'latitude',
    'geoLaenge': 'longitude',
    'Stationsname': 'station_name',
    'Bundesland': 'state',
})


In [6]:
# keep only stations whose record covers all of 2018, i.e. it starts on or
# before 2018-01-01 and ends on or after 2018-12-31 (dates are YYYYMMDD integers)
outside_2018 = (df_stn['start_date'] > 20180101) | (df_stn['end_date'] < 20181231)
df_stn = df_stn.drop(index=df_stn.index[outside_2018])


In [7]:
# parse the integer YYYYMMDD values of start_date and end_date into datetimes
df_stn = df_stn.assign(
    start_date=pd.to_datetime(df_stn['start_date'], format="%Y%m%d"),
    end_date=pd.to_datetime(df_stn['end_date'], format="%Y%m%d"),
)


In [8]:
# order the stations by state and renumber the rows from 0
df_stn = df_stn.sort_values('state').reset_index(drop=True)
df_stn  # display the sorted dataframe


Unnamed: 0,station_id,start_date,end_date,station_height,latitude,longitude,station_name,state
0,11,1980-09-01,2019-04-17,680,47.9737,8.5205,Donaueschingen (Landeplatz),Baden-Württemberg
1,3761,1952-01-01,2019-04-16,276,49.207,9.5175,Öhringen,Baden-Württemberg
2,3402,2008-01-24,2019-04-16,750,48.3851,9.4837,Münsingen-Apfelstetten,Baden-Württemberg
3,3362,2006-07-01,2019-04-16,244,48.9721,8.8734,Mühlacker,Baden-Württemberg
4,3268,2013-10-01,2019-04-16,900,48.1694,8.9433,Meßstetten-Appental,Baden-Württemberg
5,2886,1970-01-01,2019-03-31,539,48.2176,9.9097,Laupheim,Baden-Württemberg
6,2812,1977-01-01,2019-04-16,155,48.3647,7.828,Lahr,Baden-Württemberg
7,3925,1989-04-06,2019-04-16,333,48.9329,8.6973,Pforzheim-Ispringen,Baden-Württemberg
8,2712,1959-07-01,2019-04-16,443,47.6774,9.1901,Konstanz,Baden-Württemberg
9,2485,2011-05-01,2019-04-16,489,48.9171,9.6872,Kaisersbach-Cronhütte,Baden-Württemberg


In [9]:
# save as csv
# make sure the output folder exists first, because to_csv does not create
# missing directories
os.makedirs('data/output', exist_ok=True)
# cp1252 encoding used for latin characters; the row index is not written
df_stn.to_csv('data/output/DE_wind_stn_2018.csv', index=False, encoding="cp1252")


In [10]:
# check the column types again: start_date and end_date were converted with
# pd.to_datetime in an earlier cell
df_stn.dtypes # return data types of each column


station_id        int64         
start_date        datetime64[ns]
end_date          datetime64[ns]
station_height    int64         
latitude          float64       
longitude         float64       
station_name      object        
state             object        
dtype: object

In [11]:
# distinct state names present in the station table (states with data available)
df_stn['state'].unique()


array(['Baden-Württemberg', 'Bayern', 'Berlin', 'Brandenburg', 'Bremen',
       'Hamburg', 'Hessen', 'Mecklenburg-Vorpommern', 'Niedersachsen',
       'Nordrhein-Westfalen', 'Rheinland-Pfalz', 'Saarland', 'Sachsen',
       'Sachsen-Anhalt', 'Schleswig-Holstein', 'Thüringen'], dtype=object)