In [1]:
import pandas as pd
from datetime import datetime, date, time, timedelta, timezone
import dateutil.parser as parser

import warnings
warnings.filterwarnings('ignore')

### Field data Sugarbeet

In [3]:
# Read the sugar beet field data supplied by the company
# (semicolon-separated file, comma used as decimal mark).
df_sugarbeet = pd.read_csv('data/DatenfürGina_recorded2.csv', delimiter=';', decimal=',')

# Lowercase every header first, so the rename keys below can be written in lowercase.
df_sugarbeet.columns = df_sugarbeet.columns.str.lower()

# Give the company-specific columns readable names; 'fieldid' becomes the
# station key 'station_location'.
column_renames = {
    'ginams!': 'ms_comp',
    'ginaotype!': 'otype_comp',
    'ginapoll!': 'pollinator_comp',
    'ginaseednames!': 'seednames_coded',
    'fieldid': 'station_location',
}
df_sugarbeet = df_sugarbeet.rename(columns=column_renames)

# Strip the field numbering from the station names. The patterns are applied
# one after another in exactly this order: every '1', then '_2', then any
# remaining '2'.
station_names = df_sugarbeet['station_location']
for number_pattern in (r'1', r'_2', r'2'):
    station_names = station_names.replace(number_pattern, r'', regex=True)
df_sugarbeet['station_location'] = station_names

# Columns judged unnecessary after discussion with the stakeholder.
unused_columns = [
    'cropid', 'bm', 'breedid', 'locationid', 'fieldblock', 'fieldsubblock', 'filler',
    'labnr', 'layoutnr', 'plotid', 'plotindex', 'rep', 'spectraname', 'trial', 'year',
    'anzahl', 'standardind',
]
df_sugarbeet = df_sugarbeet.drop(columns=unused_columns)

# Keep only rows with a non-negative betaine_nir reading (removes the negative
# outlier; rows where betaine_nir is missing also fail this comparison), then
# drop every row that still has a missing value (0.19 % according to the
# original note).
has_valid_betaine = df_sugarbeet['betaine_nir'] >= 0
df_sugarbeet = df_sugarbeet[has_valid_betaine].dropna()

In [4]:
# Translate the German region names to English.
# The entries are applied one after another in the order listed, each as a
# regex substring replacement on the 'region' column.
region_translations = {
    r'Niederlande': r'Netherlands',
    r'Franken': r'Franconia',
    r'Niederbayern': r'Lower_Bavaria',
    r'Südfrankreich': r'Southern_France',
    r'Ostdeutschland': r'Eastern_Germany',
    r'Nordrheinwestfalen': r'NRW',
    r'Normandie': r'Normandy',
    r'Norddeutschland': r'Northern_Germany',
}
region_english = df_sugarbeet['region']
for german_name, english_name in region_translations.items():
    region_english = region_english.replace(german_name, english_name, regex=True)
df_sugarbeet['region'] = region_english

In [5]:
# OPTIONAL
# Only use this cell when working with the field weather stations.
# It points some field locations at the station whose weather data should be
# used for them (agreed with the stakeholder). Each pair is applied in order
# as a regex substring replacement on 'station_location'.
station_substitutions = [
    (r'Hamm', r'Soest'),
    (r'Oberviehhausen', r'Mattenkofen'),
    (r'Vierhöfen', r'Mattenkofen'),
]
remapped_stations = df_sugarbeet['station_location']
for field_name, weather_station in station_substitutions:
    remapped_stations = remapped_stations.replace(field_name, weather_station, regex=True)
df_sugarbeet['station_location'] = remapped_stations

### Weather data: field stations

In [6]:
# English column names for the hourly weather-station export. They replace the
# header row of the CSV (header=0 together with names=...), which also names
# the first two columns 'station_location' and 'country'.
_general_columns = [
    'station_location', 'country', 'Date_time', 'Day', 'Month', 'Year', 'Hour',
    'air_temperature_avg', 'air_temperature_max', 'air_temperature_min',
    'dew_point_avg', 'dew_point_min', 'solar_radiation',
    'saturation_VPD_avg', 'saturation_VPD_min',
    'relative_humidity_avg', 'relative_humidity_max', 'relative_humidity_min',
    'precipitation', 'leaf_wetness',
    'wind_speed_avg', 'wind_speed_max', 'wind_speed_gusts', 'wind_direction',
]
# The soil columns come in six numbered groups (1..6).
_soil_moisture_columns = [f'EAG_soil_moisture_{idx}' for idx in range(1, 7)]
_soil_salinity_columns = [f'soil_salinity_{idx}' for idx in range(1, 7)]
# NOTE(review): the '_vg' suffix looks like a typo for '_avg'; it is kept
# unchanged because these names end up in the saved data - confirm before renaming.
_soil_temperature_columns = [
    f'soil_temperature_{idx}_{stat}'
    for idx in range(1, 7)
    for stat in ('vg', 'max', 'min')
]
_device_columns = ['solar_panel', 'battery', 'deltaT_avg', 'deltaT_max', 'deltaT_min', 'ET0']

weather_column_names = (
    _general_columns
    + _soil_moisture_columns
    + _soil_salinity_columns
    + _soil_temperature_columns
    + _device_columns
)

# Read the semicolon-separated export (comma as decimal mark) using the names above.
df_weatherstations = pd.read_csv(
    'data/wetterdaten_all-stations_hourly.csv',
    delimiter=';',
    decimal=',',
    header=0,
    names=weather_column_names,
)
# Work with lowercase column names from here on.
df_weatherstations = df_weatherstations.rename(columns=str.lower)

In [7]:
# Parse the timestamp text once, then overwrite 'date_time' with the parsed
# values and refresh the 'month' and 'day' columns from them.
parsed_timestamps = pd.to_datetime(
    df_weatherstations['date_time'],
    yearfirst=True,
    format="%Y-%m-%d %H:%M:%S",
)
df_weatherstations = df_weatherstations.assign(
    date_time=parsed_timestamps,
    month=parsed_timestamps.dt.month,
    day=parsed_timestamps.dt.day,
)


### Location and sowing/harvesting dates

In [8]:
# Read the field locations file (GPS coordinates and dates, per the file name).
df_locations = pd.read_csv('data/fieldid_gps_dates.csv')

# Make both coordinate columns numeric: remove the degree sign, go through
# str, and finally convert to float.
for coordinate in ('latitude', 'longitude'):
    without_degree_sign = df_locations[coordinate].replace(r'°', r'', regex=True)
    df_locations[coordinate] = without_degree_sign.astype(str).astype(float)

### Weather data: OpenWeather

In [9]:
def _read_openweather_csv(station):
    """Read the comma-separated OpenWeather file 'data/<station>.csv' into a DataFrame."""
    return pd.read_csv(f"data/{station}.csv", delimiter=",")


# One frame per location; the variable names are kept so each location can
# still be inspected on its own.
df_anklam = _read_openweather_csv("Anklam")
df_bautzen = _read_openweather_csv("Bautzen")
df_emmeloord = _read_openweather_csv("Emmeloord")
df_goderville = _read_openweather_csv("Goderville")
df_hamm = _read_openweather_csv("Hamm")
df_herchsheim = _read_openweather_csv("Herchsheim")
df_lamotte = _read_openweather_csv("Lamotte")
df_lelystad = _read_openweather_csv("Lelystad")
df_mattenkofen = _read_openweather_csv("Mattenkofen")
df_oberviehhausen = _read_openweather_csv("Oberviehhausen")
df_pithiviers = _read_openweather_csv("Pithiviers")
df_soest = _read_openweather_csv("Soest")
df_sommepy1 = _read_openweather_csv("Sommepy1")
df_sommepy2 = _read_openweather_csv("Sommepy2")
df_stadthagen = _read_openweather_csv("Stadthagen")
df_vierhoefen = _read_openweather_csv("Vierhöfen")

# Stack all locations into a single frame with a fresh 0..n-1 index.
openweather_frames = [
    df_anklam, df_bautzen, df_emmeloord, df_goderville,
    df_hamm, df_herchsheim, df_lamotte, df_lelystad,
    df_mattenkofen, df_oberviehhausen, df_pithiviers, df_soest,
    df_sommepy1, df_sommepy2, df_stadthagen, df_vierhoefen,
]
df_openweather = pd.concat(openweather_frames, ignore_index=True)

In [10]:
# Keep only the text in front of the first '+' of the dt_iso string, i.e. the
# timestamp without what follows the '+'.
# The vectorised .str accessor is used instead of a per-row lambda: it gives
# the same result for strings, and a missing dt_iso value becomes NaT below
# instead of raising AttributeError. A single-character pattern is treated as
# a literal by str.split, so '+' is not interpreted as a regex quantifier.
timestamp_text = df_openweather['dt_iso'].str.split('+', n=1).str[0]

# Parse the remaining text into a datetime column.
df_openweather['date'] = pd.to_datetime(timestamp_text)

# Calendar parts derived from 'date'; 'plotting_date' is the day of the year.
date_parts = df_openweather['date'].dt
df_openweather['year'] = date_parts.year
df_openweather['month'] = date_parts.month
df_openweather['day'] = date_parts.day
df_openweather['plotting_date'] = date_parts.dayofyear

In [11]:
# Remove the OpenWeather columns that are not needed for the analysis.
openweather_unused_columns = [
    'visibility',
    'sea_level',
    'grnd_level',
    'wind_gust',
    'rain_1h',
    'rain_3h',
    'snow_1h',
    'snow_3h',
]
df_openweather = df_openweather.drop(columns=openweather_unused_columns)

In [12]:
# Restrict the OpenWeather data to 2021, the time frame of the sugar beet data.
recorded_in_2021 = df_openweather['year'] == 2021
df_openweather_2021 = df_openweather[recorded_in_2021]

### Save dataframes as pickles for next notebooks

In [13]:
# Write every prepared frame to its pickle file so the next notebooks can load it.
# NOTE(review): presumably the 'pickles/' folder has to exist already - confirm.
frames_to_save = {
    'pickles/01_df_sugarbeet.pkl': df_sugarbeet,
    'pickles/01_df_weatherstations.pkl': df_weatherstations,
    'pickles/01_df_locations.pkl': df_locations,
    'pickles/01_df_openweather.pkl': df_openweather,
    'pickles/01_df_openweather_2021.pkl': df_openweather_2021,
}
for pickle_path, frame in frames_to_save.items():
    frame.to_pickle(pickle_path)