In [1]:
# Import packages
import numpy as np
import pandas as pd

# import functions
from src.preprocessing.preprocessing_functions import column_rename, drop_rows, drop_columns, merge_frames, make_datetime, combine_datetime, column_transform, pivot_frame 

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, date, time, timedelta, timezone
import dateutil.parser as parser


In [2]:
#load data from pickles
df_sugarbeet = pd.read_pickle('pickles/01_df_sugarbeet.pkl')
df_weatherstations = pd.read_pickle('pickles/01_df_weatherstations.pkl')
df_locations = pd.read_pickle('pickles/01_df_locations.pkl')
df_openweather = pd.read_pickle('pickles/01_df_openweather.pkl')
df_openweather_2021 = pd.read_pickle('pickles/01_df_openweather_2021.pkl')

#### Weather data from Openweather Stations

In [3]:
df_openweather_2021.columns

Index(['dt', 'dt_iso', 'timezone', 'station_location', 'lat', 'lon', 'temp',
       'dew_point', 'feels_like', 'temp_min', 'temp_max', 'pressure',
       'humidity', 'wind_speed', 'wind_deg', 'clouds_all', 'weather_id',
       'weather_main', 'weather_description', 'weather_icon', 'date', 'year',
       'month', 'day', 'plotting_date'],
      dtype='object')

In [4]:
# exclude jan, feb, mar, nov, dec because most plants were not 
monthkeep = [4, 5,  6,  7,  8,  9, 10]
df_openweather_2021 = df_openweather_2021[df_openweather_2021.month.isin(monthkeep)]

In [5]:
# join weatherdata and location data on station_location columns
# to be able to include sowing and harvesting dates for development stage calculations
df_weatherlocations = merge_frames(df_openweather_2021, df_locations, 'station_location', 'outer')

In [6]:
# create sowing and harvesting date columns
combine_datetime(df_weatherlocations, 'sowing_year', 'sowing_month', 'sowing_day', 'sowing_date')
combine_datetime(df_weatherlocations, 'harvesting_year', 'harvesting_month', 'havesting_day', 'harvesting_date')

Unnamed: 0,dt,dt_iso,timezone,station_location,lat,lon,temp,dew_point,feels_like,temp_min,...,latitude,longitude,sowing_year,sowing_month,sowing_day,harvesting_year,harvesting_month,havesting_day,sowing_date,harvesting_date
0,1.617235e+09,2021-04-01 00:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.80,5.44,6.74,7.22,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
1,1.617239e+09,2021-04-01 01:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.88,5.85,6.91,7.30,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
2,1.617242e+09,2021-04-01 02:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.87,5.68,6.95,7.09,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
3,1.617246e+09,2021-04-01 03:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.75,5.39,6.96,6.87,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
4,1.617250e+09,2021-04-01 04:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,8.11,5.05,7.39,7.33,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82377,,,,Vierhöfen,,,,,,,...,48.776446,12.737804,2021.0,3.0,31.0,2021.0,10.0,11.0,2021-03-31,2021-10-11
82378,,,,Söllingen,,,,,,,...,52.109175,10.926463,2021.0,4.0,24.0,2021.0,11.0,1.0,2021-04-24,2021-11-01
82379,,,,Berklingen,,,,,,,...,52.102719,10.732768,2021.0,4.0,1.0,2021.0,9.0,11.0,2021-04-01,2021-09-11
82380,,,,Rittershausen,,,,,,,...,49.603966,10.013308,2021.0,4.0,14.0,2021.0,10.0,13.0,2021-04-14,2021-10-13


In [7]:
# drop unnecessary columns
dropcollist2 = ['sowing_year', 'sowing_month', 'sowing_day', 'harvesting_year', 'harvesting_month', 'havesting_day']
drop_columns(df_weatherlocations, dropcollist2)

Unnamed: 0,dt,dt_iso,timezone,station_location,lat,lon,temp,dew_point,feels_like,temp_min,...,weather_icon,date,year,month,day,plotting_date,latitude,longitude,sowing_date,harvesting_date
0,1.617235e+09,2021-04-01 00:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.80,5.44,6.74,7.22,...,04n,2021-04-01 00:00:00,2021.0,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01
1,1.617239e+09,2021-04-01 01:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.88,5.85,6.91,7.30,...,04n,2021-04-01 01:00:00,2021.0,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01
2,1.617242e+09,2021-04-01 02:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.87,5.68,6.95,7.09,...,04n,2021-04-01 02:00:00,2021.0,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01
3,1.617246e+09,2021-04-01 03:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.75,5.39,6.96,6.87,...,04n,2021-04-01 03:00:00,2021.0,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01
4,1.617250e+09,2021-04-01 04:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,8.11,5.05,7.39,7.33,...,04n,2021-04-01 04:00:00,2021.0,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82377,,,,Vierhöfen,,,,,,,...,,NaT,,,,,48.776446,12.737804,2021-03-31,2021-10-11
82378,,,,Söllingen,,,,,,,...,,NaT,,,,,52.109175,10.926463,2021-04-24,2021-11-01
82379,,,,Berklingen,,,,,,,...,,NaT,,,,,52.102719,10.732768,2021-04-01,2021-09-11
82380,,,,Rittershausen,,,,,,,...,,NaT,,,,,49.603966,10.013308,2021-04-14,2021-10-13


In [8]:
# the merge of the dataframes created some extra columns that are mostly empty.
#remove missing values according to the datetime columns
df_weatherlocations = df_weatherlocations.dropna(subset=['dt_iso'], axis=0)

#### Development stage definition

In [9]:
# create columns for the first and the last growth stage
df_weatherlocations['s1'] = 30
df_weatherlocations['s2'] = - 45

# to create a development category, use np.where. First: create conditions (time frames) and values (category names)
conditions = [(df_weatherlocations.date >= (df_weatherlocations.sowing_date)) & (df_weatherlocations.date <= (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s1, unit='d'))),
              (df_weatherlocations.date > (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s1, unit='d'))) & (df_weatherlocations.date <= (df_weatherlocations.harvesting_date + pd.to_timedelta(df_weatherlocations.s2, unit='d'))),
              (df_weatherlocations.date > (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s2, unit='d'))) & (df_weatherlocations.date <= (df_weatherlocations.harvesting_date))]
              #(df_weatherlocations.date_time > (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s3, unit='d'))) & (df_weatherlocations.date_time <= (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s4, unit='d')))
            

values = [1,2,3]
df_weatherlocations['development_category'] = np.select(conditions, values)

In [10]:
# it created a dev stage 0 category
df_weatherlocations.query('development_category == 0')

Unnamed: 0,dt,dt_iso,timezone,station_location,lat,lon,temp,dew_point,feels_like,temp_min,...,month,day,plotting_date,latitude,longitude,sowing_date,harvesting_date,s1,s2,development_category
10009,1.634778e+09,2021-10-21 01:00:00 +0000 UTC,7200.0,Bautzen,51.206614,14.397561,15.69,11.87,15.35,13.86,...,10.0,21.0,294.0,51.201418,14.22874,2021-04-27,2021-10-21,30,-45,0
10010,1.634782e+09,2021-10-21 02:00:00 +0000 UTC,7200.0,Bautzen,51.206614,14.397561,16.03,11.81,15.67,14.82,...,10.0,21.0,294.0,51.201418,14.22874,2021-04-27,2021-10-21,30,-45,0
10011,1.634785e+09,2021-10-21 03:00:00 +0000 UTC,7200.0,Bautzen,51.206614,14.397561,15.40,12.35,15.14,13.82,...,10.0,21.0,294.0,51.201418,14.22874,2021-04-27,2021-10-21,30,-45,0
10012,1.634789e+09,2021-10-21 04:00:00 +0000 UTC,7200.0,Bautzen,51.206614,14.397561,14.52,12.38,14.30,13.45,...,10.0,21.0,294.0,51.201418,14.22874,2021-04-27,2021-10-21,30,-45,0
10013,1.634792e+09,2021-10-21 05:00:00 +0000 UTC,7200.0,Bautzen,51.206614,14.397561,14.63,12.49,14.42,12.75,...,10.0,21.0,294.0,51.201418,14.22874,2021-04-27,2021-10-21,30,-45,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82371,1.635707e+09,2021-10-31 19:00:00 +0000 UTC,3600.0,VierhÃ¶fen,48.747079,12.711756,8.18,4.76,6.21,6.73,...,10.0,31.0,304.0,,,NaT,NaT,30,-45,0
82372,1.635710e+09,2021-10-31 20:00:00 +0000 UTC,3600.0,VierhÃ¶fen,48.747079,12.711756,7.20,4.34,5.36,6.18,...,10.0,31.0,304.0,,,NaT,NaT,30,-45,0
82373,1.635714e+09,2021-10-31 21:00:00 +0000 UTC,3600.0,VierhÃ¶fen,48.747079,12.711756,6.38,3.88,4.68,5.62,...,10.0,31.0,304.0,,,NaT,NaT,30,-45,0
82374,1.635718e+09,2021-10-31 22:00:00 +0000 UTC,3600.0,VierhÃ¶fen,48.747079,12.711756,5.79,3.80,4.41,5.57,...,10.0,31.0,304.0,,,NaT,NaT,30,-45,0


In [11]:
devstagedroplist = [0] # drop because most field weatherdata is missing
# drop location with lots of missing values
drop_rows(df_weatherlocations, 'development_category', devstagedroplist)

Unnamed: 0,dt,dt_iso,timezone,station_location,lat,lon,temp,dew_point,feels_like,temp_min,...,month,day,plotting_date,latitude,longitude,sowing_date,harvesting_date,s1,s2,development_category
0,1.617235e+09,2021-04-01 00:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.80,5.44,6.74,7.22,...,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,3
1,1.617239e+09,2021-04-01 01:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.88,5.85,6.91,7.30,...,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,3
2,1.617242e+09,2021-04-01 02:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.87,5.68,6.95,7.09,...,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,3
3,1.617246e+09,2021-04-01 03:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,7.75,5.39,6.96,6.87,...,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,3
4,1.617250e+09,2021-04-01 04:00:00 +0000 UTC,7200.0,Anklam,53.940211,13.600744,8.11,5.05,7.39,7.33,...,4.0,1.0,91.0,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77235,1.635707e+09,2021-10-31 19:00:00 +0000 UTC,3600.0,Stadthagen,52.358144,9.241481,16.11,11.27,15.68,14.50,...,10.0,31.0,304.0,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
77236,1.635710e+09,2021-10-31 20:00:00 +0000 UTC,3600.0,Stadthagen,52.358144,9.241481,15.77,10.53,15.26,13.83,...,10.0,31.0,304.0,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
77237,1.635714e+09,2021-10-31 21:00:00 +0000 UTC,3600.0,Stadthagen,52.358144,9.241481,16.09,10.19,15.53,14.38,...,10.0,31.0,304.0,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
77238,1.635718e+09,2021-10-31 22:00:00 +0000 UTC,3600.0,Stadthagen,52.358144,9.241481,17.22,10.36,16.67,15.00,...,10.0,31.0,304.0,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3


#### Feature engineering of weather according to development stages

In [12]:
# make lists based on which columns are transformed (mean or sum) according to specific grouping (in this case month and location)

new_col = ['temp_dev_stage', 
           'temp_min_dev_stage', 
           'temp_max_dev_stage', 
            'dew_point_dev_stage', 
           'pressure_dev_stage', 
           'humidity_dev_stage', 
           'wind_speed_dev_stage',
           'wind_deg_dev_stage'
           ]
grouping = ['station_location', 'development_category']
col_transform = ['temp',
                 'temp_min',
                 'temp_max',
                 'dew_point',
                 'pressure',
                 'humidity',
                 'wind_speed',
                 'wind_deg'
                 ]
dropcollist3 = ['year', 
                'dt', 
                'day', 
                'dt_iso', 
                'timezone', 
                'feels_like',
                'weather_main', 
                'weather_description', 
                'weather_icon', 
                'weather_id',
                'clouds_all',
                's1', 
                's2',
                'plotting_date',
                'lat', 
                'lon', 
                'date'
                ]

In [13]:
# create dev_stage dataframe
df_weatherlocations_dev_stage = df_weatherlocations.copy()
df_weatherlocations_dev_stage = drop_columns(df_weatherlocations_dev_stage, dropcollist3)

In [14]:
# make monthly average out of columns in col_transform list
column_transform(df_weatherlocations_dev_stage, new_col, grouping, col_transform, how='mean')

Unnamed: 0,station_location,month,latitude,longitude,sowing_date,harvesting_date,development_category,temp_dev_stage,temp_min_dev_stage,temp_max_dev_stage,dew_point_dev_stage,pressure_dev_stage,humidity_dev_stage,wind_speed_dev_stage,wind_deg_dev_stage
0,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,3,10.183559,9.599720,10.724579,7.245270,1016.329978,82.676492,4.037275,213.990654
1,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,3,10.183559,9.599720,10.724579,7.245270,1016.329978,82.676492,4.037275,213.990654
2,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,3,10.183559,9.599720,10.724579,7.245270,1016.329978,82.676492,4.037275,213.990654
3,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,3,10.183559,9.599720,10.724579,7.245270,1016.329978,82.676492,4.037275,213.990654
4,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,3,10.183559,9.599720,10.724579,7.245270,1016.329978,82.676492,4.037275,213.990654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61750,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,3,9.323733,8.038129,10.675776,5.300826,1017.592362,77.067030,4.803118,223.324240
61751,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,3,9.323733,8.038129,10.675776,5.300826,1017.592362,77.067030,4.803118,223.324240
61752,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,3,9.323733,8.038129,10.675776,5.300826,1017.592362,77.067030,4.803118,223.324240
61753,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,3,9.323733,8.038129,10.675776,5.300826,1017.592362,77.067030,4.803118,223.324240


In [15]:
# create list of columns to include in pivot table
pivotvaluedevstagelist = ['latitude', 
                         'longitude', 
                         'sowing_date', 
                         'harvesting_date',
                         'temp_dev_stage', 
                         'temp_min_dev_stage',
                         'temp_max_dev_stage', 
                         'dew_point_dev_stage', 
                         'pressure_dev_stage',
                         'humidity_dev_stage', 
                         'wind_speed_dev_stage', 
                         'wind_deg_dev_stage']


In [16]:
# make column content to str to facilitate flattening in the pivot dataframe later
df_weatherlocations_dev_stage.development_category = df_weatherlocations_dev_stage.development_category.astype(str)

In [17]:
# pivot table on station_location and create dev-stage sorted weather phenomena
df_weatherlocations_devstageypiv = pivot_frame(df_weatherlocations_dev_stage, 'station_location', 'development_category', pivotvaluedevstagelist)
df_weatherlocations_devstageypiv

Unnamed: 0,station_location,dew_point_dev_stage_1,dew_point_dev_stage_2,dew_point_dev_stage_3,humidity_dev_stage_1,humidity_dev_stage_2,humidity_dev_stage_3,latitude_1,latitude_2,latitude_3,...,temp_max_dev_stage_3,temp_min_dev_stage_1,temp_min_dev_stage_2,temp_min_dev_stage_3,wind_deg_dev_stage_1,wind_deg_dev_stage_2,wind_deg_dev_stage_3,wind_speed_dev_stage_1,wind_speed_dev_stage_2,wind_speed_dev_stage_3
0,Anklam,3.540666,12.959269,7.24527,73.570042,76.972884,82.676492,53.939204,53.939204,53.939204,...,10.724579,7.40889,16.858393,9.59972,189.174757,203.333664,213.990654,3.730347,3.072626,4.037275
1,Bautzen,6.386963,13.989677,7.247917,73.033287,76.472222,79.633216,51.201418,51.201418,51.201418,...,12.031761,10.216283,17.157827,9.479401,216.725381,219.898693,228.839789,3.717268,2.5329,3.074472
2,Emmeloord,4.815132,11.598677,8.652186,72.135922,70.289414,71.951923,52.698061,52.698061,52.698061,...,15.235923,8.269334,15.895495,12.652981,191.661581,198.396396,206.214744,3.620014,2.097354,2.706654
3,Goderville,2.44828,12.835102,10.084498,73.460472,83.249433,83.22097,49.645645,49.645645,49.645645,...,14.042638,6.413703,14.737336,11.666077,119.846047,188.081633,188.165727,4.390818,4.15788,5.225682
4,Hamm,5.104202,13.614637,6.954753,71.269071,78.292003,80.202919,51.609609,51.609609,51.609609,...,11.555351,9.351165,16.733149,9.238151,203.973648,194.602151,208.312022,2.556463,2.602077,3.327019
5,Herchsheim,2.70111,12.777462,9.149242,69.84466,77.352652,78.486364,49.638659,49.638659,49.638659,...,14.356038,6.793065,15.826167,11.551811,185.228849,193.878788,178.018939,2.773245,2.623598,2.574432
6,Lamotte,1.579085,12.227196,11.2235,72.105409,80.645607,81.803939,49.87938,49.87938,49.87938,...,15.388729,6.113454,15.135275,13.885828,143.467406,192.944061,161.225604,4.500527,4.163712,3.534038
7,Lelystad,5.679334,12.80451,10.094613,75.744799,75.226351,77.043535,52.552513,52.552513,52.552513,...,15.317973,8.332857,16.056396,12.756082,183.62552,165.853604,176.204678,4.139196,3.033159,3.651832
8,Mattenkofen,1.81214,13.0012,11.525472,71.023774,78.606681,78.736111,48.776446,48.776446,48.776446,...,16.484083,4.85312,15.421911,14.521694,245.408618,204.95079,197.181481,2.613789,2.394828,2.210759
9,Oberviehhausen,2.372288,12.424009,9.361619,67.572816,71.454796,74.391509,48.702083,48.702083,48.702083,...,14.929748,6.554924,16.481325,12.701934,236.231623,206.337657,212.659591,2.565631,2.416046,2.571879


In [18]:
df_weatherlocations_devstageypiv

Unnamed: 0,station_location,dew_point_dev_stage_1,dew_point_dev_stage_2,dew_point_dev_stage_3,humidity_dev_stage_1,humidity_dev_stage_2,humidity_dev_stage_3,latitude_1,latitude_2,latitude_3,...,temp_max_dev_stage_3,temp_min_dev_stage_1,temp_min_dev_stage_2,temp_min_dev_stage_3,wind_deg_dev_stage_1,wind_deg_dev_stage_2,wind_deg_dev_stage_3,wind_speed_dev_stage_1,wind_speed_dev_stage_2,wind_speed_dev_stage_3
0,Anklam,3.540666,12.959269,7.24527,73.570042,76.972884,82.676492,53.939204,53.939204,53.939204,...,10.724579,7.40889,16.858393,9.59972,189.174757,203.333664,213.990654,3.730347,3.072626,4.037275
1,Bautzen,6.386963,13.989677,7.247917,73.033287,76.472222,79.633216,51.201418,51.201418,51.201418,...,12.031761,10.216283,17.157827,9.479401,216.725381,219.898693,228.839789,3.717268,2.5329,3.074472
2,Emmeloord,4.815132,11.598677,8.652186,72.135922,70.289414,71.951923,52.698061,52.698061,52.698061,...,15.235923,8.269334,15.895495,12.652981,191.661581,198.396396,206.214744,3.620014,2.097354,2.706654
3,Goderville,2.44828,12.835102,10.084498,73.460472,83.249433,83.22097,49.645645,49.645645,49.645645,...,14.042638,6.413703,14.737336,11.666077,119.846047,188.081633,188.165727,4.390818,4.15788,5.225682
4,Hamm,5.104202,13.614637,6.954753,71.269071,78.292003,80.202919,51.609609,51.609609,51.609609,...,11.555351,9.351165,16.733149,9.238151,203.973648,194.602151,208.312022,2.556463,2.602077,3.327019
5,Herchsheim,2.70111,12.777462,9.149242,69.84466,77.352652,78.486364,49.638659,49.638659,49.638659,...,14.356038,6.793065,15.826167,11.551811,185.228849,193.878788,178.018939,2.773245,2.623598,2.574432
6,Lamotte,1.579085,12.227196,11.2235,72.105409,80.645607,81.803939,49.87938,49.87938,49.87938,...,15.388729,6.113454,15.135275,13.885828,143.467406,192.944061,161.225604,4.500527,4.163712,3.534038
7,Lelystad,5.679334,12.80451,10.094613,75.744799,75.226351,77.043535,52.552513,52.552513,52.552513,...,15.317973,8.332857,16.056396,12.756082,183.62552,165.853604,176.204678,4.139196,3.033159,3.651832
8,Mattenkofen,1.81214,13.0012,11.525472,71.023774,78.606681,78.736111,48.776446,48.776446,48.776446,...,16.484083,4.85312,15.421911,14.521694,245.408618,204.95079,197.181481,2.613789,2.394828,2.210759
9,Oberviehhausen,2.372288,12.424009,9.361619,67.572816,71.454796,74.391509,48.702083,48.702083,48.702083,...,14.929748,6.554924,16.481325,12.701934,236.231623,206.337657,212.659591,2.565631,2.416046,2.571879


In [19]:
df_weatherlocations_devstageypiv.to_pickle('pickles/df_openweather_devstage.pkl')

#### Feature engineering monthly weather values

In [20]:
# make lists based on which columns are transformed (mean or sum) according to specific grouping (in this case month and location)
# 
new_col = ['temp_monthly', 
           'temp_min_monthly', 
           'temp_max_monthly', 
            'dew_point_monthly', 
           'pressure_monthly', 
           'humidity_monthly', 
           'wind_speed_monthly',
           'wind_deg_monthly'
           ]
grouping = ['station_location', 'month']
col_transform = ['temp',
                 'temp_min',
                 'temp_max',
                 'dew_point',
                 'pressure',
                 'humidity',
                 'wind_speed',
                 'wind_deg'
                 ]
dropcollist4 = ['year', 
                'dt', 
                'day', 
                'dt_iso', 
                'timezone', 
                'feels_like',
                'weather_main', 
                'development_category',
                'weather_description', 
                'weather_icon', 
                'weather_id',
                'clouds_all',
                's1', 
                's2',
                'plotting_date',
                'lat', 
                'lon', 
                'date'
                ]

In [21]:
# create monthly dataframe
df_weatherlocations_monthly = df_weatherlocations.copy()
df_weatherlocations_monthly = drop_columns(df_weatherlocations_monthly, dropcollist4)

In [22]:
# make monthly average out of columns in col_transform list
column_transform(df_weatherlocations_monthly, new_col, grouping, col_transform, how='mean')

Unnamed: 0,station_location,month,latitude,longitude,sowing_date,harvesting_date,temp_monthly,temp_min_monthly,temp_max_monthly,dew_point_monthly,pressure_monthly,humidity_monthly,wind_speed_monthly,wind_deg_monthly
0,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,5.893292,5.066958,6.614986,1.361167,1016.927778,74.056944,3.824083,207.911111
1,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,5.893292,5.066958,6.614986,1.361167,1016.927778,74.056944,3.824083,207.911111
2,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,5.893292,5.066958,6.614986,1.361167,1016.927778,74.056944,3.824083,207.911111
3,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,5.893292,5.066958,6.614986,1.361167,1016.927778,74.056944,3.824083,207.911111
4,Anklam,4.0,53.939204,13.595342,2021-04-14,2021-11-01,5.893292,5.066958,6.614986,1.361167,1016.927778,74.056944,3.824083,207.911111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61750,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,11.481095,10.160107,12.802483,8.012617,1016.435247,79.985314,4.862951,210.033378
61751,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,11.481095,10.160107,12.802483,8.012617,1016.435247,79.985314,4.862951,210.033378
61752,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,11.481095,10.160107,12.802483,8.012617,1016.435247,79.985314,4.862951,210.033378
61753,Stadthagen,10.0,52.260091,10.261494,2021-04-23,2021-11-15,11.481095,10.160107,12.802483,8.012617,1016.435247,79.985314,4.862951,210.033378


In [23]:
# create list of columns to include in pivot table
pivotvaluemonthlylist = ['latitude', 
                         'longitude', 
                         'sowing_date', 
                         'harvesting_date',
                         'temp_monthly', 
                         'temp_min_monthly',
                         'temp_max_monthly', 
                         'dew_point_monthly', 
                         'pressure_monthly',
                         'humidity_monthly', 
                         'wind_speed_monthly', 
                         'wind_deg_monthly']


In [24]:
# make column name to str to facilitate flattening in the pivot dataframe later
df_weatherlocations_monthly.month = df_weatherlocations_monthly.month.astype(str)

In [25]:
df_weatherlocations_monthlypiv = pivot_frame(df_weatherlocations_monthly, 'station_location', 'month', pivotvaluemonthlylist)
df_weatherlocations_monthlypiv

Unnamed: 0,station_location,dew_point_monthly_10.0,dew_point_monthly_4.0,dew_point_monthly_5.0,dew_point_monthly_6.0,dew_point_monthly_7.0,dew_point_monthly_8.0,dew_point_monthly_9.0,humidity_monthly_10.0,humidity_monthly_4.0,...,wind_deg_monthly_7.0,wind_deg_monthly_8.0,wind_deg_monthly_9.0,wind_speed_monthly_10.0,wind_speed_monthly_4.0,wind_speed_monthly_5.0,wind_speed_monthly_6.0,wind_speed_monthly_7.0,wind_speed_monthly_8.0,wind_speed_monthly_9.0
0,Anklam,7.891169,1.361167,7.320282,13.370319,15.453817,12.789691,12.337028,83.663978,74.056944,...,200.834677,193.145161,204.701389,4.013226,3.824083,3.596142,2.678528,3.06375,3.611411,3.177056
1,Bautzen,7.769626,1.892556,7.016465,14.110694,15.383065,13.698992,12.618542,81.717256,74.922222,...,215.642473,236.961022,218.748611,3.119293,3.429069,3.685215,2.425486,2.454973,2.620134,2.619792
2,Emmeloord,,2.310111,6.274691,12.113208,12.852755,11.593427,11.23961,,74.422222,...,207.428763,220.100806,165.561039,,4.07325,3.269005,2.050597,2.000444,2.019651,1.883247
3,Goderville,9.976868,2.6615,7.648616,13.325028,14.771922,13.846626,13.995194,84.096774,73.481944,...,192.36828,177.314516,167.116667,5.121962,4.448611,5.478481,3.101889,4.027809,4.522917,3.868236
4,Hamm,8.450188,2.191903,7.162473,14.772319,14.83422,13.63332,12.771208,82.149194,72.786111,...,206.157258,212.346774,174.652778,3.988495,2.098292,3.121599,2.224403,2.37914,3.099973,2.633139
5,Herchsheim,7.920381,1.321347,6.469879,14.432708,14.315874,13.349301,12.123069,83.017301,68.756944,...,184.413978,222.802419,155.361111,2.825744,2.743028,3.676559,2.062194,2.445013,2.650806,2.112514
6,Lamotte,8.873747,1.794278,7.000599,13.831547,14.399709,13.355871,12.700692,86.646778,72.065278,...,186.12037,205.448549,146.518672,3.829093,4.565917,5.005459,3.500493,3.916878,4.094987,3.463956
7,Lelystad,,2.674869,7.337003,13.304806,14.096774,13.491586,13.499612,,74.639004,...,172.833333,182.575269,143.498615,,4.6926,4.09129,2.777014,2.940188,3.369341,2.385069
8,Mattenkofen,9.709517,2.176875,7.646452,14.558389,16.044194,13.749745,11.874306,79.317241,70.916667,...,201.133065,221.989247,185.979167,2.590345,2.619,3.253266,2.100472,2.068051,2.242594,1.962125
9,Oberviehhausen,9.566138,1.155083,6.128602,13.328611,14.552944,13.186196,11.405736,79.510345,65.6625,...,204.823925,220.802419,190.452778,2.570483,2.725528,3.275914,2.257181,2.163898,2.339879,2.110153


In [26]:
df_weatherlocations_monthlypiv.to_pickle('pickles/df_openweather_monthly.pkl')

### merge with sugarbeet data

In [27]:
# merge sugar beet dataframe with the pivoted monthly weather info
df_merge_weatherloc_monthly = merge_frames(df_weatherlocations_monthlypiv, df_sugarbeet, 'station_location')
print(f'the sugarbeet monthly dataframe has {df_merge_weatherloc_monthly.shape[0]} rows and {df_merge_weatherloc_monthly.shape[1]} columns')

the sugarbeet monthly dataframe has 16479 rows and 88 columns


In [28]:
drop_rows(df_merge_weatherloc_monthly, 'seednames_coded', [np.nan])

Unnamed: 0,station_location,dew_point_monthly_10.0,dew_point_monthly_4.0,dew_point_monthly_5.0,dew_point_monthly_6.0,dew_point_monthly_7.0,dew_point_monthly_8.0,dew_point_monthly_9.0,humidity_monthly_10.0,humidity_monthly_4.0,...,ms_comp,obj,otype_comp,pollinator_comp,sc_nir,seednames_coded,seriesid,totaln_nir,x,y
0,Anklam,7.891169,1.361167,7.320282,13.370319,15.453817,12.789691,12.337028,83.663978,74.056944,...,2.0,10.0,1.0,12.0,17.1952,108.0,1503.0,0.1521,96.0,13.0
1,Anklam,7.891169,1.361167,7.320282,13.370319,15.453817,12.789691,12.337028,83.663978,74.056944,...,2.0,14.0,1.0,26.0,17.6798,1191.0,1503.0,0.1631,96.0,14.0
2,Anklam,7.891169,1.361167,7.320282,13.370319,15.453817,12.789691,12.337028,83.663978,74.056944,...,2.0,13.0,1.0,19.0,17.3721,103.0,1503.0,0.1553,96.0,15.0
3,Anklam,7.891169,1.361167,7.320282,13.370319,15.453817,12.789691,12.337028,83.663978,74.056944,...,2.0,9.0,1.0,11.0,16.9056,107.0,1503.0,0.1477,96.0,16.0
4,Anklam,7.891169,1.361167,7.320282,13.370319,15.453817,12.789691,12.337028,83.663978,74.056944,...,2.0,17.0,1.0,37.0,17.1998,1194.0,1503.0,0.1542,97.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16474,Vierhoefen,,,,,,,,,,...,1.0,50.0,3.0,1096.0,14.5338,628.0,1516.0,0.1641,83.0,8.0
16475,Vierhoefen,,,,,,,,,,...,7.0,4.0,8.0,1192.0,13.6961,1205.0,1516.0,0.1639,83.0,9.0
16476,Vierhoefen,,,,,,,,,,...,1.0,33.0,3.0,589.0,12.9078,466.0,1516.0,0.1445,83.0,10.0
16477,Vierhoefen,,,,,,,,,,...,1.0,51.0,3.0,1097.0,13.4423,629.0,1516.0,0.1346,83.0,11.0


In [29]:
# merge sugar beet dataframe with the pivoted devstage weather info
df_merge_weatherloc_devstage = merge_frames(df_weatherlocations_devstageypiv, df_sugarbeet, 'station_location')
print(f'the sugarbeet devstage dataframe has {df_merge_weatherloc_devstage.shape[0]} rows and {df_merge_weatherloc_devstage.shape[1]} columns')

the sugarbeet devstage dataframe has 16479 rows and 48 columns


In [30]:
drop_rows(df_merge_weatherloc_devstage, 'seednames_coded', [np.nan])

Unnamed: 0,station_location,dew_point_dev_stage_1,dew_point_dev_stage_2,dew_point_dev_stage_3,humidity_dev_stage_1,humidity_dev_stage_2,humidity_dev_stage_3,latitude_1,latitude_2,latitude_3,...,ms_comp,obj,otype_comp,pollinator_comp,sc_nir,seednames_coded,seriesid,totaln_nir,x,y
0,Anklam,3.540666,12.959269,7.24527,73.570042,76.972884,82.676492,53.939204,53.939204,53.939204,...,2.0,10.0,1.0,12.0,17.1952,108.0,1503.0,0.1521,96.0,13.0
1,Anklam,3.540666,12.959269,7.24527,73.570042,76.972884,82.676492,53.939204,53.939204,53.939204,...,2.0,14.0,1.0,26.0,17.6798,1191.0,1503.0,0.1631,96.0,14.0
2,Anklam,3.540666,12.959269,7.24527,73.570042,76.972884,82.676492,53.939204,53.939204,53.939204,...,2.0,13.0,1.0,19.0,17.3721,103.0,1503.0,0.1553,96.0,15.0
3,Anklam,3.540666,12.959269,7.24527,73.570042,76.972884,82.676492,53.939204,53.939204,53.939204,...,2.0,9.0,1.0,11.0,16.9056,107.0,1503.0,0.1477,96.0,16.0
4,Anklam,3.540666,12.959269,7.24527,73.570042,76.972884,82.676492,53.939204,53.939204,53.939204,...,2.0,17.0,1.0,37.0,17.1998,1194.0,1503.0,0.1542,97.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16474,Vierhoefen,,,,,,,,,,...,1.0,50.0,3.0,1096.0,14.5338,628.0,1516.0,0.1641,83.0,8.0
16475,Vierhoefen,,,,,,,,,,...,7.0,4.0,8.0,1192.0,13.6961,1205.0,1516.0,0.1639,83.0,9.0
16476,Vierhoefen,,,,,,,,,,...,1.0,33.0,3.0,589.0,12.9078,466.0,1516.0,0.1445,83.0,10.0
16477,Vierhoefen,,,,,,,,,,...,1.0,51.0,3.0,1097.0,13.4423,629.0,1516.0,0.1346,83.0,11.0


In [31]:
# pickle it
df_merge_weatherloc_monthly.to_pickle('pickles/df_openweather_monthly_sugarbeet.pkl')
df_merge_weatherloc_devstage.to_pickle('pickles/df_openweather_devstage_sugarbeet.pkl')

### Weatherdata from fieldstations

In [32]:
df_weatherstations.columns

Index(['station_location', 'country', 'date_time', 'day', 'month', 'year',
       'hour', 'air_temperature_avg', 'air_temperature_max',
       'air_temperature_min', 'dew_point_avg', 'dew_point_min',
       'solar_radiation', 'saturation_vpd_avg', 'saturation_vpd_min',
       'relative_humidity_avg', 'relative_humidity_max',
       'relative_humidity_min', 'precipitation', 'leaf_wetness',
       'wind_speed_avg', 'wind_speed_max', 'wind_speed_gusts',
       'wind_direction', 'eag_soil_moisture_1', 'eag_soil_moisture_2',
       'eag_soil_moisture_3', 'eag_soil_moisture_4', 'eag_soil_moisture_5',
       'eag_soil_moisture_6', 'soil_salinity_1', 'soil_salinity_2',
       'soil_salinity_3', 'soil_salinity_4', 'soil_salinity_5',
       'soil_salinity_6', 'soil_temperature_1_vg', 'soil_temperature_1_max',
       'soil_temperature_1_min', 'soil_temperature_2_vg',
       'soil_temperature_2_max', 'soil_temperature_2_min',
       'soil_temperature_3_vg', 'soil_temperature_3_max',
       'soil_t

In [33]:
# exclude jan, feb, mar, nov, dec because most plants were not 
monthkeep = [4, 5,  6,  7,  8,  9, 10]
df_weatherstations = df_weatherstations[df_weatherstations.month.isin(monthkeep)]

In [34]:
# join weatherdata and location data on station_location columns
# to be able to include sowing and harvesting dates for development stage calculations
df_fieldweatherlocations = merge_frames(df_weatherstations, df_locations, 'station_location', 'outer')

In [35]:
# create sowing and harvesting date columns
combine_datetime(df_fieldweatherlocations, 'sowing_year', 'sowing_month', 'sowing_day', 'sowing_date')
combine_datetime(df_fieldweatherlocations, 'harvesting_year', 'harvesting_month', 'havesting_day', 'harvesting_date')

Unnamed: 0,station_location,country,date_time,day,month,year,hour,air_temperature_avg,air_temperature_max,air_temperature_min,...,latitude,longitude,sowing_year,sowing_month,sowing_day,harvesting_year,harvesting_month,havesting_day,sowing_date,harvesting_date
0,Anklam,D,2021-04-14 17:00:00,14.0,4.0,2021.0,17:00,8.12,8.49,7.55,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
1,Anklam,D,2021-04-14 18:00:00,14.0,4.0,2021.0,18:00,7.76,8.17,7.32,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
2,Anklam,D,2021-04-14 19:00:00,14.0,4.0,2021.0,19:00,6.56,7.23,5.68,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
3,Anklam,D,2021-04-14 20:00:00,14.0,4.0,2021.0,20:00,4.02,5.34,2.69,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
4,Anklam,D,2021-04-14 21:00:00,14.0,4.0,2021.0,21:00,2.82,3.34,2.17,...,53.939204,13.595342,2021.0,4.0,14.0,2021.0,11.0,1.0,2021-04-14,2021-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53375,Rittershausen,,NaT,,,,,,,,...,49.603966,10.013308,2021.0,4.0,14.0,2021.0,10.0,13.0,2021-04-14,2021-10-13
53376,Pithiviers,,NaT,,,,,,,,...,48.169585,2.326618,2021.0,3.0,24.0,2021.0,9.0,30.0,2021-03-24,2021-09-30
53377,Oberviehhausen,,NaT,,,,,,,,...,48.702083,12.862987,2021.0,4.0,9.0,2021.0,10.0,7.0,2021-04-09,2021-10-07
53378,Goderville,,NaT,,,,,,,,...,49.645645,0.433038,2021.0,4.0,2.0,2021.0,11.0,10.0,2021-04-02,2021-11-10


In [36]:
# drop unnecessary columns
dropcollist5 = ['sowing_year', 'sowing_month', 'sowing_day', 'harvesting_year', 'harvesting_month', 'havesting_day']
drop_columns(df_fieldweatherlocations, dropcollist5)

Unnamed: 0,station_location,country,date_time,day,month,year,hour,air_temperature_avg,air_temperature_max,air_temperature_min,...,solar_panel,battery,deltat_avg,deltat_max,deltat_min,et0,latitude,longitude,sowing_date,harvesting_date
0,Anklam,D,2021-04-14 17:00:00,14.0,4.0,2021.0,17:00,8.12,8.49,7.55,...,7069.0,6747.0,5.0,5.0,5.0,,53.939204,13.595342,2021-04-14,2021-11-01
1,Anklam,D,2021-04-14 18:00:00,14.0,4.0,2021.0,18:00,7.76,8.17,7.32,...,6906.0,6639.0,5.0,5.0,4.0,,53.939204,13.595342,2021-04-14,2021-11-01
2,Anklam,D,2021-04-14 19:00:00,14.0,4.0,2021.0,19:00,6.56,7.23,5.68,...,2545.0,6525.0,4.0,5.0,4.0,,53.939204,13.595342,2021-04-14,2021-11-01
3,Anklam,D,2021-04-14 20:00:00,14.0,4.0,2021.0,20:00,4.02,5.34,2.69,...,0.0,6434.0,3.0,3.0,2.0,,53.939204,13.595342,2021-04-14,2021-11-01
4,Anklam,D,2021-04-14 21:00:00,14.0,4.0,2021.0,21:00,2.82,3.34,2.17,...,0.0,6368.0,1.0,2.0,1.0,,53.939204,13.595342,2021-04-14,2021-11-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53375,Rittershausen,,NaT,,,,,,,,...,,,,,,,49.603966,10.013308,2021-04-14,2021-10-13
53376,Pithiviers,,NaT,,,,,,,,...,,,,,,,48.169585,2.326618,2021-03-24,2021-09-30
53377,Oberviehhausen,,NaT,,,,,,,,...,,,,,,,48.702083,12.862987,2021-04-09,2021-10-07
53378,Goderville,,NaT,,,,,,,,...,,,,,,,49.645645,0.433038,2021-04-02,2021-11-10


In [37]:
# the merge of the dataframes created some extra columns that are mostly empty.
#remove missing values according to the datetime columns
df_fieldweatherlocations = df_fieldweatherlocations.dropna(subset=['date_time'], axis=0)

#### Development stage definition

In [38]:
# create columns for the first and the last growth stage
df_fieldweatherlocations['s1'] = 30
df_fieldweatherlocations['s2'] = - 45

# to create a development category, use np.where. First: create conditions (time frames) and values (category names)
conditions = [(df_fieldweatherlocations.date_time >= (df_fieldweatherlocations.sowing_date)) & (df_fieldweatherlocations.date_time <= (df_fieldweatherlocations.sowing_date + pd.to_timedelta(df_fieldweatherlocations.s1, unit='d'))),
              (df_fieldweatherlocations.date_time > (df_fieldweatherlocations.sowing_date + pd.to_timedelta(df_fieldweatherlocations.s1, unit='d'))) & (df_fieldweatherlocations.date_time <= (df_fieldweatherlocations.harvesting_date + pd.to_timedelta(df_fieldweatherlocations.s2, unit='d'))),
              (df_fieldweatherlocations.date_time > (df_fieldweatherlocations.sowing_date + pd.to_timedelta(df_fieldweatherlocations.s2, unit='d'))) & (df_fieldweatherlocations.date_time <= (df_fieldweatherlocations.harvesting_date))]
              #(df_fieldweatherlocations.date_time > (df_fieldweatherlocations.sowing_date + pd.to_timedelta(df_fieldweatherlocations.s3, unit='d'))) & (df_fieldweatherlocations.date_time <= (df_fieldweatherlocations.sowing_date + pd.to_timedelta(df_fieldweatherlocations.s4, unit='d')))
            

values = [1,2,3]
df_fieldweatherlocations['development_category'] = np.select(conditions, values)

In [39]:
devstagedroplist = [0] # drop because most field weatherdata is missing
# drop location with lots of missing values
drop_rows(df_fieldweatherlocations, 'development_category', devstagedroplist)

Unnamed: 0,station_location,country,date_time,day,month,year,hour,air_temperature_avg,air_temperature_max,air_temperature_min,...,deltat_max,deltat_min,et0,latitude,longitude,sowing_date,harvesting_date,s1,s2,development_category
0,Anklam,D,2021-04-14 17:00:00,14.0,4.0,2021.0,17:00,8.12,8.49,7.55,...,5.0,5.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
1,Anklam,D,2021-04-14 18:00:00,14.0,4.0,2021.0,18:00,7.76,8.17,7.32,...,5.0,4.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
2,Anklam,D,2021-04-14 19:00:00,14.0,4.0,2021.0,19:00,6.56,7.23,5.68,...,5.0,4.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
3,Anklam,D,2021-04-14 20:00:00,14.0,4.0,2021.0,20:00,4.02,5.34,2.69,...,3.0,2.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
4,Anklam,D,2021-04-14 21:00:00,14.0,4.0,2021.0,21:00,2.82,3.34,2.17,...,2.0,1.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53367,Stadthagen,D,2021-10-31 19:00:00,31.0,10.0,2021.0,19:00,16.73,16.98,16.59,...,1.0,1.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
53368,Stadthagen,D,2021-10-31 20:00:00,31.0,10.0,2021.0,20:00,17.05,17.54,16.75,...,2.0,2.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
53369,Stadthagen,D,2021-10-31 21:00:00,31.0,10.0,2021.0,21:00,16.78,17.09,16.29,...,2.0,1.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
53370,Stadthagen,D,2021-10-31 22:00:00,31.0,10.0,2021.0,22:00,16.93,17.65,16.61,...,2.0,2.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3


In [40]:
# make lists based on which columns are transformed (mean or sum) according to specific grouping (in this case month and location)

new_col_mean = ['deltat_avg_devstage', 
           'deltat_max_devstage', 
           'deltat_min_devstage',
           'dew_point_avg_devstage', 
           'dew_point_min_devstage', 
           'saturation_vpd_avg_devstage', 
           'saturation_vpd_min_devstage',
           'wind_speed_avg_devstage',
           'wind_speed_max_devstage', 
           'wind_speed_gusts_devstage', 
           'wind_direction_devstage',
           'air_temperature_avg_devstage', 
           'air_temperature_max_devstage', 
           'air_temperature_min_devstage', 
           'eag_soil_moisture_1_devstage',
           'eag_soil_moisture_2_devstage',
           'eag_soil_moisture_3_devstage',
           'eag_soil_moisture_4_devstage',
           'eag_soil_moisture_5_devstage',
           'eag_soil_moisture_6_devstage',
           'soil_salinity_1_devstage',
           'soil_salinity_2_devstage',
           'soil_salinity_3_devstage',
           'soil_salinity_4_devstage',
           'soil_salinity_5_devstage',
           'soil_salinity_6_devstage',
           'soil_temperature_1_vg_devstage',
           'soil_temperature_2_vg_devstage',
           'soil_temperature_3_vg_devstage',
           'soil_temperature_4_vg_devstage',
           'soil_temperature_5_vg_devstage',
           'soil_temperature_6_vg_devstage',
           'soil_temperature_1_min_devstage',
           'soil_temperature_2_min_devstage',
           'soil_temperature_3_min_devstage',
           'soil_temperature_4_min_devstage',
           'soil_temperature_5_min_devstage',
           'soil_temperature_6_min_devstage',
           'soil_temperature_1_max_devstage',
           'soil_temperature_2_max_devstage',
           'soil_temperature_3_max_devstage',
           'soil_temperature_4_max_devstage',
           'soil_temperature_5_max_devstage',
           'soil_temperature_6_max_devstage',
           'relative_humidity_avg_devstage',
           'relative_humidity_max_devstage',
           'relative_humidity_min_devstage'
           ]
new_col_sum = ['solar_radiation_devstage',
                'et0_devstage',
                'precipitation_devstage',
                'leaf_wetness_devstage']
grouping = ['country', 'station_location', 'development_category']
col_transform_mean = ['deltat_avg', 
           'deltat_max', 
           'deltat_min',
           'dew_point_avg', 
           'dew_point_min', 
           'saturation_vpd_avg', 
           'saturation_vpd_min',
           'wind_speed_avg',
           'wind_speed_max', 
           'wind_speed_gusts', 
           'wind_direction',
           'air_temperature_avg', 
           'air_temperature_max', 
           'air_temperature_min', 
           'eag_soil_moisture_1',
           'eag_soil_moisture_2',
           'eag_soil_moisture_3',
           'eag_soil_moisture_4',
           'eag_soil_moisture_5',
           'eag_soil_moisture_6',
           'soil_salinity_1',
           'soil_salinity_2',
           'soil_salinity_3',
           'soil_salinity_4',
           'soil_salinity_5',
           'soil_salinity_6',
           'soil_temperature_1_vg',
           'soil_temperature_2_vg',
           'soil_temperature_3_vg',
           'soil_temperature_4_vg',
           'soil_temperature_5_vg',
           'soil_temperature_6_vg',
           'soil_temperature_1_min',
           'soil_temperature_2_min',
           'soil_temperature_3_min',
           'soil_temperature_4_min',
           'soil_temperature_5_min',
           'soil_temperature_6_min',
           'soil_temperature_1_max',
           'soil_temperature_2_max',
           'soil_temperature_3_max',
           'soil_temperature_4_max',
           'soil_temperature_5_max',
           'soil_temperature_6_max',
           'relative_humidity_avg',
           'relative_humidity_max',
           'relative_humidity_min'
           ]
col_transform_sum = ['solar_radiation',
                'et0',
                'precipitation',
                'leaf_wetness']
dropcollist6 = ['year', 
                'solar_panel', 
                'battery'
                ]

In [41]:
# create dev_stage dataframe
df_fieldweatherlocations_dev_stage = df_fieldweatherlocations.copy()
drop_columns(df_fieldweatherlocations_dev_stage, dropcollist6)

Unnamed: 0,station_location,country,date_time,day,month,hour,air_temperature_avg,air_temperature_max,air_temperature_min,dew_point_avg,...,deltat_max,deltat_min,et0,latitude,longitude,sowing_date,harvesting_date,s1,s2,development_category
0,Anklam,D,2021-04-14 17:00:00,14.0,4.0,17:00,8.12,8.49,7.55,-3.8,...,5.0,5.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
1,Anklam,D,2021-04-14 18:00:00,14.0,4.0,18:00,7.76,8.17,7.32,-3.8,...,5.0,4.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
2,Anklam,D,2021-04-14 19:00:00,14.0,4.0,19:00,6.56,7.23,5.68,-2.8,...,5.0,4.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
3,Anklam,D,2021-04-14 20:00:00,14.0,4.0,20:00,4.02,5.34,2.69,-1.9,...,3.0,2.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
4,Anklam,D,2021-04-14 21:00:00,14.0,4.0,21:00,2.82,3.34,2.17,-0.3,...,2.0,1.0,,53.939204,13.595342,2021-04-14,2021-11-01,30,-45,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53367,Stadthagen,D,2021-10-31 19:00:00,31.0,10.0,19:00,16.73,16.98,16.59,15.2,...,1.0,1.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
53368,Stadthagen,D,2021-10-31 20:00:00,31.0,10.0,20:00,17.05,17.54,16.75,14.2,...,2.0,2.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
53369,Stadthagen,D,2021-10-31 21:00:00,31.0,10.0,21:00,16.78,17.09,16.29,13.8,...,2.0,1.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3
53370,Stadthagen,D,2021-10-31 22:00:00,31.0,10.0,22:00,16.93,17.65,16.61,13.7,...,2.0,2.0,,52.260091,10.261494,2021-04-23,2021-11-15,30,-45,3


In [42]:
# make dev_stage average out of columns in col_transform_mean list
column_transform(df_fieldweatherlocations_dev_stage, new_col_mean, grouping, col_transform_mean, how='mean')
# make dev_stage average out of columns in col_transform_sum list
column_transform(df_fieldweatherlocations_dev_stage, new_col_sum, grouping, col_transform_sum, how='sum')



Unnamed: 0,station_location,country,date_time,day,month,hour,latitude,longitude,sowing_date,harvesting_date,...,soil_temperature_4_max_devstage,soil_temperature_5_max_devstage,soil_temperature_6_max_devstage,relative_humidity_avg_devstage,relative_humidity_max_devstage,relative_humidity_min_devstage,solar_radiation_devstage,et0_devstage,precipitation_devstage,leaf_wetness_devstage
0,Anklam,D,2021-04-14 17:00:00,14.0,4.0,17:00,53.939204,13.595342,2021-04-14,2021-11-01,...,8.761974,8.551234,8.411030,77.222472,80.229205,74.118153,160993.0,81.0,15.2,10235.0
1,Anklam,D,2021-04-14 18:00:00,14.0,4.0,18:00,53.939204,13.595342,2021-04-14,2021-11-01,...,8.761974,8.551234,8.411030,77.222472,80.229205,74.118153,160993.0,81.0,15.2,10235.0
2,Anklam,D,2021-04-14 19:00:00,14.0,4.0,19:00,53.939204,13.595342,2021-04-14,2021-11-01,...,8.761974,8.551234,8.411030,77.222472,80.229205,74.118153,160993.0,81.0,15.2,10235.0
3,Anklam,D,2021-04-14 20:00:00,14.0,4.0,20:00,53.939204,13.595342,2021-04-14,2021-11-01,...,8.761974,8.551234,8.411030,77.222472,80.229205,74.118153,160993.0,81.0,15.2,10235.0
4,Anklam,D,2021-04-14 21:00:00,14.0,4.0,21:00,53.939204,13.595342,2021-04-14,2021-11-01,...,8.761974,8.551234,8.411030,77.222472,80.229205,74.118153,160993.0,81.0,15.2,10235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43892,Stadthagen,D,2021-10-31 19:00:00,31.0,10.0,19:00,52.260091,10.261494,2021-04-23,2021-11-15,...,10.988551,10.967966,11.098051,91.840158,93.624214,89.926248,69972.0,29.3,41.4,1735.0
43893,Stadthagen,D,2021-10-31 20:00:00,31.0,10.0,20:00,52.260091,10.261494,2021-04-23,2021-11-15,...,10.988551,10.967966,11.098051,91.840158,93.624214,89.926248,69972.0,29.3,41.4,1735.0
43894,Stadthagen,D,2021-10-31 21:00:00,31.0,10.0,21:00,52.260091,10.261494,2021-04-23,2021-11-15,...,10.988551,10.967966,11.098051,91.840158,93.624214,89.926248,69972.0,29.3,41.4,1735.0
43895,Stadthagen,D,2021-10-31 22:00:00,31.0,10.0,22:00,52.260091,10.261494,2021-04-23,2021-11-15,...,10.988551,10.967966,11.098051,91.840158,93.624214,89.926248,69972.0,29.3,41.4,1735.0


In [43]:
df_fieldweatherlocations_dev_stage.isnull().sum()

station_location                    0
country                             0
date_time                           0
day                                 0
month                               0
                                 ... 
relative_humidity_min_devstage    900
solar_radiation_devstage            0
et0_devstage                        0
precipitation_devstage              0
leaf_wetness_devstage               0
Length: 64, dtype: int64

In [44]:
fieldpivotvaluedevstagelist = ['deltat_avg_devstage', 'deltat_max_devstage',
       'deltat_min_devstage', 'dew_point_avg_devstage', 'dew_point_min_devstage',
       'saturation_vpd_avg_devstage', 'saturation_vpd_min_devstage',
       'wind_speed_avg_devstage', 'wind_speed_max_devstage',
       'wind_speed_gusts_devstage', 'wind_direction_devstage',
       'air_temperature_avg_devstage', 'air_temperature_max_devstage',
       'air_temperature_min_devstage', 'eag_soil_moisture_1_devstage',
       'eag_soil_moisture_2_devstage', 'eag_soil_moisture_3_devstage',
       'eag_soil_moisture_4_devstage', 'eag_soil_moisture_5_devstage',
       'eag_soil_moisture_6_devstage', 'soil_salinity_1_devstage',
       'soil_salinity_2_devstage', 'soil_salinity_3_devstage',
       'soil_salinity_4_devstage', 'soil_salinity_5_devstage',
       'soil_salinity_6_devstage', 'soil_temperature_1_vg_devstage',
       'soil_temperature_2_vg_devstage', 'soil_temperature_3_vg_devstage',
       'soil_temperature_4_vg_devstage', 'soil_temperature_5_vg_devstage',
       'soil_temperature_6_vg_devstage', 'soil_temperature_1_min_devstage',
       'soil_temperature_2_min_devstage', 'soil_temperature_3_min_devstage',
       'soil_temperature_4_min_devstage', 'soil_temperature_5_min_devstage',
       'soil_temperature_6_min_devstage', 'soil_temperature_1_max_devstage',
       'soil_temperature_2_max_devstage', 'soil_temperature_3_max_devstage',
       'soil_temperature_4_max_devstage', 'soil_temperature_5_max_devstage',
       'soil_temperature_6_max_devstage', 'solar_radiation_devstage', 'et0_devstage',
       'relative_humidity_avg_devstage', 'relative_humidity_max_devstage',
       'relative_humidity_min_devstage', 'precipitation_devstage',
       'leaf_wetness_devstage', 'latitude', 'longitude']

In [45]:
# make column content to str to facilitate flattening in the pivot dataframe later
df_fieldweatherlocations_dev_stage.development_category = df_fieldweatherlocations_dev_stage.development_category.astype(str)
# pivot table on station_location and create dev-stage sorted weather phenomena
df_fieldweatherlocations_dev_stage = pivot_frame(df_fieldweatherlocations_dev_stage, 'station_location', 'development_category', fieldpivotvaluedevstagelist)
df_fieldweatherlocations_dev_stage

Unnamed: 0,station_location,air_temperature_avg_devstage_1,air_temperature_avg_devstage_2,air_temperature_avg_devstage_3,air_temperature_max_devstage_1,air_temperature_max_devstage_2,air_temperature_max_devstage_3,air_temperature_min_devstage_1,air_temperature_min_devstage_2,air_temperature_min_devstage_3,...,wind_direction_devstage_3,wind_speed_avg_devstage_1,wind_speed_avg_devstage_2,wind_speed_avg_devstage_3,wind_speed_gusts_devstage_1,wind_speed_gusts_devstage_2,wind_speed_gusts_devstage_3,wind_speed_max_devstage_1,wind_speed_max_devstage_2,wind_speed_max_devstage_3
0,Anklam,8.338935,16.866417,11.236964,8.900398,17.500953,11.651974,7.801023,16.263338,10.837335,...,196.921844,2.932386,1.555776,1.903607,5.117756,3.252698,3.642184,3.56733,2.009434,2.362525
1,Bautzen,11.665617,13.581773,,12.238316,14.303045,,11.09775,12.872182,,...,271.285556,2.657184,1.091258,1.594667,5.323803,2.80625,3.173556,3.230044,1.506005,1.894222
2,Berklingen,5.772407,16.144389,16.712087,6.376752,16.752382,17.37796,5.184957,15.561804,16.091486,...,204.290094,2.774786,1.661174,1.579599,5.405556,3.665246,3.593396,3.630199,2.262737,2.09092
3,Emmeloord,9.86423,16.986667,16.63934,10.311323,17.47545,17.161149,9.43064,16.512095,16.133584,...,162.221843,2.899709,1.797241,1.1,5.343605,3.870664,2.951536,3.606541,2.293074,1.495336
4,Lamotte,8.227941,16.281905,14.64,8.987353,16.922718,15.247335,7.499412,15.671448,14.055506,...,42.688022,1.567647,1.31334,1.903807,2.705882,3.214214,3.364253,1.829412,1.679738,2.220427
5,Lelystad,,17.800221,16.833018,,18.291215,17.357777,,17.332319,16.319648,...,257.288732,,1.597654,1.510865,,3.575431,3.3334,,2.034576,1.877465
6,Mattenkofen,6.581278,16.67691,14.751236,7.260045,17.329443,15.431581,5.918083,16.035846,14.097695,...,143.919822,2.327043,1.605354,0.824833,4.571322,3.409845,2.190312,3.137444,2.221667,1.211247
7,Soest,10.136539,17.089907,12.274703,10.73278,17.72246,12.818569,9.557773,16.491147,11.744155,...,275.863014,3.569645,2.177893,2.789802,6.020709,3.882094,4.567428,4.28539,2.699967,3.307763
8,Sommepy,9.549906,15.581569,16.968401,10.491415,16.229912,17.629574,8.682264,14.948308,16.310085,...,53.747863,3.780189,2.397759,2.797436,5.585849,4.253725,4.179487,4.379245,2.945991,3.222863
9,Stadthagen,10.308391,16.988278,10.786236,10.896602,17.596174,11.313812,9.732399,16.397193,10.268697,...,267.689403,2.957143,1.791407,1.863946,5.638558,3.509198,3.941291,3.756865,2.220146,2.327527


In [46]:
df_fieldweatherlocations_dev_stage.to_pickle('pickles/df_fieldweather_devstage.pkl')

### merge with sugarbeet data

In [47]:
# merge sugar beet dataframe with the pivoted monthly weather info
df_merge_fieldweather_monthly = merge_frames(df_fieldweatherlocations_dev_stage, df_sugarbeet, 'station_location')
print(f'the sugarbeet monthly dataframe has {df_merge_fieldweather_monthly.shape[0]} rows and {df_merge_fieldweather_monthly.shape[1]} columns')

the sugarbeet monthly dataframe has 16479 rows and 177 columns


In [48]:
drop_rows(df_merge_fieldweather_monthly, 'seednames_coded', [np.nan])

Unnamed: 0,station_location,air_temperature_avg_devstage_1,air_temperature_avg_devstage_2,air_temperature_avg_devstage_3,air_temperature_max_devstage_1,air_temperature_max_devstage_2,air_temperature_max_devstage_3,air_temperature_min_devstage_1,air_temperature_min_devstage_2,air_temperature_min_devstage_3,...,ms_comp,obj,otype_comp,pollinator_comp,sc_nir,seednames_coded,seriesid,totaln_nir,x,y
0,Anklam,8.338935,16.866417,11.236964,8.900398,17.500953,11.651974,7.801023,16.263338,10.837335,...,2.0,10.0,1.0,12.0,17.1952,108.0,1503.0,0.1521,96.0,13.0
1,Anklam,8.338935,16.866417,11.236964,8.900398,17.500953,11.651974,7.801023,16.263338,10.837335,...,2.0,14.0,1.0,26.0,17.6798,1191.0,1503.0,0.1631,96.0,14.0
2,Anklam,8.338935,16.866417,11.236964,8.900398,17.500953,11.651974,7.801023,16.263338,10.837335,...,2.0,13.0,1.0,19.0,17.3721,103.0,1503.0,0.1553,96.0,15.0
3,Anklam,8.338935,16.866417,11.236964,8.900398,17.500953,11.651974,7.801023,16.263338,10.837335,...,2.0,9.0,1.0,11.0,16.9056,107.0,1503.0,0.1477,96.0,16.0
4,Anklam,8.338935,16.866417,11.236964,8.900398,17.500953,11.651974,7.801023,16.263338,10.837335,...,2.0,17.0,1.0,37.0,17.1998,1194.0,1503.0,0.1542,97.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16474,Goderville,,,,,,,,,,...,1.0,37.0,3.0,428.0,15.4889,222.0,1513.0,0.1254,106.0,7.0
16475,Goderville,,,,,,,,,,...,1.0,11.0,3.0,338.0,17.5512,144.0,1513.0,0.1176,106.0,8.0
16476,Goderville,,,,,,,,,,...,1.0,65.0,3.0,969.0,15.6572,351.0,1513.0,0.1271,106.0,9.0
16477,Goderville,,,,,,,,,,...,1.0,57.0,3.0,957.0,14.4877,339.0,1513.0,0.1429,106.0,11.0


In [49]:
df_merge_fieldweather_monthly.to_pickle('pickles/df_fieldweather_devstage_sugarbeet.pkl')