In [0]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
import os
try:
    # step up one level from the current working directory and report where we ended up
    os.chdir(os.path.join(os.getcwd(), '..'))
    print(os.getcwd())
except OSError:
    # best effort only: if the directory change fails (missing directory, no permission, ...)
    # stay where we are; unrelated exceptions such as KeyboardInterrupt are no longer swallowed
    pass


 # Weather data from the German Meteorological Service (Deutscher Wetterdienst, DWD)

In [1]:
# import libraries
import pandas as pd


In [2]:
# set jupyter notebook display options
# show up to 900 rows of a dataframe before pandas truncates the output
pd.set_option('display.max_rows', 900)
# prevent truncation of text strings in columns: None removes the column width limit
# (the older sentinel -1 was deprecated in pandas 1.0 and is no longer accepted by current releases)
pd.set_option('display.max_colwidth', None)


 ## Solar data

In [3]:
# read fixed width formatted text file with list of weather stations in DE
# first, extract list of column names (separated by space(s)) from the header line
# the file contains accented latin characters (e.g., ü), so its ISO-8859-1 encoding is
# declared explicitly instead of relying on the UTF-8 default
cols_stn = pd.read_csv(
    'data/Meteo/DE/solar_hourly/ST_Stundenwerte_Beschreibung_Stationen.txt',
    sep=r"\s+",
    encoding="ISO-8859-1",
    nrows=1,
).columns.tolist()


In [4]:
# second pass over the same file: load the station records themselves
# the first two rows are skipped and the column names collected in cols_stn are applied
# ISO-8859-1 decoding is required because of accented latin characters (e.g., ü)
df_stn = pd.read_fwf(
    'data/Meteo/DE/solar_hourly/ST_Stundenwerte_Beschreibung_Stationen.txt',
    names=cols_stn,
    skiprows=2,
    encoding="ISO-8859-1",
)
df_stn.dtypes # return data types of each column


Stations_id      int64  
von_datum        int64  
bis_datum        int64  
Stationshoehe    int64  
geoBreite        float64
geoLaenge        float64
Stationsname     object 
Bundesland       object 
dtype: object

In [5]:
# translate column titles to English
# an explicit German -> English mapping replaces set_axis(..., inplace=False): the inplace
# argument was removed from set_axis in pandas 2.0, and a mapping does not rely on column order
df_stn = df_stn.rename(
    columns={
        'Stations_id': 'station_id',
        'von_datum': 'start_date',
        'bis_datum': 'end_date',
        'Stationshoehe': 'station_height',
        'geoBreite': 'latitude',
        'geoLaenge': 'longitude',
        'Stationsname': 'station_name',
        'Bundesland': 'state',
    },
    errors='raise',  # fail loudly if one of the expected German column names is missing
)


In [6]:
# keep only stations whose record spans 2018-01-01 through 2018-12-31
# (at this point the dates are still plain integers in YYYYMMDD form)
starts_too_late = df_stn['start_date'] > 20180101
ends_too_early = df_stn['end_date'] < 20181231
# collect the index labels of every station failing either condition and drop those rows
rows_to_drop = df_stn.index[(starts_too_late | ends_too_early).values]
df_stn = df_stn.drop(index=rows_to_drop)


In [7]:
# turn the integer YYYYMMDD values in start_date and end_date into datetime columns
# both columns share the same format, so they are parsed in one loop
for date_col in ('start_date', 'end_date'):
    df_stn[date_col] = pd.to_datetime(df_stn[date_col], format="%Y%m%d")


In [8]:
# order the stations by state, then renumber the rows from 0 (the old index is discarded)
df_stn = df_stn.sort_values(by=['state']).reset_index(drop=True)
df_stn # return dataframe


Unnamed: 0,station_id,start_date,end_date,station_height,latitude,longitude,station_name,state
0,5906,1979-01-01,2019-06-30,98,49.5062,8.5585,Mannheim,Baden-Württemberg
1,4928,1979-01-01,2019-06-30,314,48.8281,9.2,Stuttgart (Schnarrenberg),Baden-Württemberg
2,2712,1977-01-01,2019-06-30,443,47.6774,9.1901,Konstanz,Baden-Württemberg
3,5792,2013-01-01,2019-06-30,2964,47.4209,10.9847,Zugspitze,Bayern
4,5705,1957-01-01,2019-06-30,268,49.7703,9.9577,Würzburg,Bayern
5,5404,1961-01-01,2019-06-30,477,48.4025,11.6946,Weihenstephan-Dürnast,Bayern
6,5856,1997-01-02,2019-06-30,476,48.5451,13.3531,Fürstenzell,Bayern
7,2290,1953-01-01,2019-06-30,977,47.8009,11.0108,Hohenpeißenberg,Bayern
8,3668,2009-01-01,2019-06-30,314,49.503,11.0549,Nürnberg,Bayern
9,3015,1981-01-01,2019-06-30,98,52.2085,14.118,Lindenberg,Brandenburg


In [9]:
# write the station list to csv without the row index
# cp1252 encoding used for latin characters
df_stn.to_csv(
    'data/output/DE_solar_stn_2018.csv',
    encoding="cp1252",
    index=False,
)


In [10]:
# inspect the column dtypes of df_stn again, now with the English column names
df_stn.dtypes # return data types of each column


station_id        int64         
start_date        datetime64[ns]
end_date          datetime64[ns]
station_height    int64         
latitude          float64       
longitude         float64       
station_name      object        
state             object        
dtype: object

In [11]:
# list the distinct states present in the station table
df_stn['state'].unique()


array(['Baden-Württemberg', 'Bayern', 'Brandenburg', 'Bremen', 'Hamburg',
       'Mecklenburg-Vorpommern', 'Niedersachsen', 'Rheinland-Pfalz',
       'Saarland', 'Sachsen', 'Sachsen-Anhalt', 'Schleswig-Holstein'],
      dtype=object)