In [1]:
# Import packages
import numpy as np
import pandas as pd
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, date, time, timedelta, timezone
import dateutil.parser as parser


import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.2f' % x) # change decimal places



In [2]:
df_sugarbeet = pd.read_pickle('pickles/01_df_sugarbeet.pkl')
df_weatherstations = pd.read_pickle('pickles/01_df_weatherstations.pkl')
df_locations = pd.read_pickle('pickles/01_df_locations.pkl')

### join field weather data with location data

In [3]:
# check the columns to see on which column the tables can be joined
print(f'the columns of the field weatherstation data are: {df_weatherstations.columns}')
print(f'the columns of the field location data are: {df_locations.columns}')

the columns of the field weatherstation data are: Index(['station_location', 'country', 'date_time', 'day', 'month', 'year',
       'hour', 'air_temperature_avg', 'air_temperature_max',
       'air_temperature_min', 'dew_point_avg', 'dew_point_min',
       'solar_radiation', 'saturation_vpd_avg', 'saturation_vpd_min',
       'relative_humidity_avg', 'relative_humidity_max',
       'relative_humidity_min', 'precipitation', 'leaf_wetness',
       'wind_speed_avg', 'wind_speed_max', 'wind_speed_gusts',
       'wind_direction', 'eag_soil_moisture_1', 'eag_soil_moisture_2',
       'eag_soil_moisture_3', 'eag_soil_moisture_4', 'eag_soil_moisture_5',
       'eag_soil_moisture_6', 'soil_salinity_1', 'soil_salinity_2',
       'soil_salinity_3', 'soil_salinity_4', 'soil_salinity_5',
       'soil_salinity_6', 'soil_temperature_1_vg', 'soil_temperature_1_max',
       'soil_temperature_1_min', 'soil_temperature_2_vg',
       'soil_temperature_2_max', 'soil_temperature_2_min',
       'soil_temperatu

In [4]:
# rename location fieldid column to station_location for table join
df_locations.rename(columns={'fieldid': 'station_location'}, inplace=True)

In [5]:
# join them on station_location columns
df_weatherlocations = df_weatherstations.merge(df_locations,
                                       on='station_location',
                                       how='outer')

In [6]:
df_weatherlocations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53691 entries, 0 to 53690
Data columns (total 68 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   station_location        53691 non-null  object        
 1   country                 53683 non-null  object        
 2   date_time               53683 non-null  datetime64[ns]
 3   day                     53683 non-null  float64       
 4   month                   53683 non-null  float64       
 5   year                    53683 non-null  float64       
 6   hour                    53683 non-null  object        
 7   air_temperature_avg     50554 non-null  float64       
 8   air_temperature_max     50554 non-null  float64       
 9   air_temperature_min     50554 non-null  float64       
 10  dew_point_avg           50554 non-null  float64       
 11  dew_point_min           50554 non-null  float64       
 12  solar_radiation         53682 non-null  float6

In [7]:
# create sowing and harvesting date columns as datetime
df_weatherlocations['sowing_date'] = pd.to_datetime(dict(year=df_weatherlocations['sowing_year'], 
                                                        month=df_weatherlocations['sowing_month'],
                                                        day=df_weatherlocations['sowing_day']))
df_weatherlocations['harvesting_date'] = pd.to_datetime(dict(year=df_weatherlocations['harvesting_year'], 
                                                        month=df_weatherlocations['harvesting_month'],
                                                        day=df_weatherlocations['havesting_day']))


In [8]:
# create timedelta to check growth times to see how to tackle growth stage classification
df_weatherlocations['growth_time'] = df_weatherlocations['harvesting_date'] - df_weatherlocations['sowing_date']
df_weatherlocations.groupby('station_location')['growth_time'].value_counts()

station_location  growth_time
Anklam            201 days       4726
Bautzen           177 days       4037
Berklingen        163 days       3662
Emmeloord         149 days       3343
Goderville        222 days          1
Hamm              199 days          1
Herchsheim        185 days          1
Herchsheim_2      170 days          1
Lamotte           199 days       4361
Lelystad          149 days       2443
Mattenkofen       191 days       4387
Oberviehhausen    181 days          1
Pithiviers        190 days          1
Rittershausen     182 days          1
Soest             200 days       4362
Sommepy           207 days       3743
Stadthagen        206 days       4962
Söllingen         191 days       4456
Vierhöfen         194 days          1
Name: growth_time, dtype: int64

In [9]:
df_weatherlocations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53691 entries, 0 to 53690
Data columns (total 71 columns):
 #   Column                  Non-Null Count  Dtype          
---  ------                  --------------  -----          
 0   station_location        53691 non-null  object         
 1   country                 53683 non-null  object         
 2   date_time               53683 non-null  datetime64[ns] 
 3   day                     53683 non-null  float64        
 4   month                   53683 non-null  float64        
 5   year                    53683 non-null  float64        
 6   hour                    53683 non-null  object         
 7   air_temperature_avg     50554 non-null  float64        
 8   air_temperature_max     50554 non-null  float64        
 9   air_temperature_min     50554 non-null  float64        
 10  dew_point_avg           50554 non-null  float64        
 11  dew_point_min           50554 non-null  float64        
 12  solar_radiation         53682 no

In [10]:
# drop unnecessary columns
df_weatherlocations.drop(['sowing_year', 'sowing_month', 'sowing_day',
       'harvesting_year', 'harvesting_month', 'havesting_day'], axis=1, inplace=True)

In [11]:
# replace station locations according to discussions with the stakeholder
df_weatherlocations['station_location'] = df_weatherlocations['station_location'].replace(r'Hamm', r'Soest', regex=True)
df_weatherlocations['station_location'] = df_weatherlocations['station_location'].replace(r'Herchsheim_2', r'Herchsheim', regex=True)
df_weatherlocations['station_location'] = df_weatherlocations['station_location'].replace(r'Rittershausen', r'Herchsheim', regex=True)
df_weatherlocations['station_location'] = df_weatherlocations['station_location'].replace(r'Oberviehhausen', r'Mattenkofen', regex=True)
df_weatherlocations['station_location'] = df_weatherlocations['station_location'].replace(r'Vierhöfen', r'Mattenkofen', regex=True)

In [12]:
# pickle the frame
df_weatherlocations.to_pickle('pickles/df_weatherlocations.pkl')

In [13]:
# drop because most field weatherdata is missing
#locationdroplist = ['Bautzen', 'Berklingen', 'Groningen', 'Peine', 'Söllingen'] 
# drop location with lots of missing values
#df_weatherlocations.drop(df_weatherlocations[
    #df_weatherlocations.station_location
    #.isin(locationdroplist)]
    #.index, 
    #axis=0, 
    #inplace=True)

##### join weatherlocations with sugarbeet dataframe

first: define growth stages based on sowing and harvesting date, then group weather data according to these stages

In [14]:
# create columns for the first and the last growth stage
df_weatherlocations['s1'] = 30
df_weatherlocations['s2'] = - 45

# to create a development category, use np.where. First: create conditions (time frames) and values (category names)
conditions = [(df_weatherlocations.date_time >= (df_weatherlocations.sowing_date)) & (df_weatherlocations.date_time <= (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s1, unit='d'))),
              (df_weatherlocations.date_time > (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s1, unit='d'))) & (df_weatherlocations.date_time <= (df_weatherlocations.harvesting_date + pd.to_timedelta(df_weatherlocations.s2, unit='d'))),
              (df_weatherlocations.date_time > (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s2, unit='d'))) & (df_weatherlocations.date_time <= (df_weatherlocations.harvesting_date))]
              #(df_weatherlocations.date_time > (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s3, unit='d'))) & (df_weatherlocations.date_time <= (df_weatherlocations.sowing_date + pd.to_timedelta(df_weatherlocations.s4, unit='d')))
            

values = [1,2,3]
df_weatherlocations['development_category'] = np.select(conditions, values)

In [15]:
# we see there is also a fourth condition (0), which we can drop
df_weatherlocations.development_category.value_counts()

2    28662
3     9788
0     9483
1     5758
Name: development_category, dtype: int64

In [16]:
locationdroplist = [0] # drop because most field weatherdata is missing
# drop location with lots of missing values
df_weatherlocations.drop(df_weatherlocations[
    df_weatherlocations.development_category
    .isin(locationdroplist)]
    .index, 
    axis=0, 
    inplace=True)

In [17]:
# pickle the frame
df_weatherlocations.to_pickle('pickles/df_weatherlocations_devcat.pkl')

create a monthly calculation dataframe as well

In [20]:
df_weatherlocations_monthly = df_weatherlocations.drop(['year', 'solar_panel', 'battery'], axis=1)
# ddeltat monthly average
df_weatherlocations_monthly['deltat_avg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['deltat_avg'].transform('mean')
df_weatherlocations_monthly['deltat_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['deltat_max'].transform('mean')
df_weatherlocations_monthly['deltat_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['deltat_min'].transform('mean')


# dew point and saturation monthly average
df_weatherlocations_monthly['dew_point_avg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['dew_point_avg'].transform('mean')
df_weatherlocations_monthly['dew_point_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['dew_point_min'].transform('mean')
df_weatherlocations_monthly['saturation_vpd_avg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['saturation_vpd_avg'].transform('mean')
df_weatherlocations_monthly['saturation_vpd_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['saturation_vpd_min'].transform('mean')


# wind speed monthly average
df_weatherlocations_monthly['wind_speed_avg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['wind_speed_avg'].transform('mean')
df_weatherlocations_monthly['wind_speed_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['wind_speed_max'].transform('mean')
df_weatherlocations_monthly['wind_speed_gusts_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['wind_speed_gusts'].transform('mean')
df_weatherlocations_monthly['wind_direction_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['wind_direction'].transform('mean')

# air temp monthly average 
df_weatherlocations_monthly['air_temperature_avg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['air_temperature_avg'].transform('mean')
df_weatherlocations_monthly['air_temperature_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['air_temperature_max'].transform('mean')
df_weatherlocations_monthly['air_temperature_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['air_temperature_min'].transform('mean')

# soil monthly average
df_weatherlocations_monthly['eag_soil_moisture_1_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['eag_soil_moisture_1'].transform('mean')
df_weatherlocations_monthly['eag_soil_moisture_2_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['eag_soil_moisture_2'].transform('mean')
df_weatherlocations_monthly['eag_soil_moisture_3_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['eag_soil_moisture_3'].transform('mean')
df_weatherlocations_monthly['eag_soil_moisture_4_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['eag_soil_moisture_4'].transform('mean')
df_weatherlocations_monthly['eag_soil_moisture_5_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['eag_soil_moisture_5'].transform('mean')
df_weatherlocations_monthly['eag_soil_moisture_6_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['eag_soil_moisture_6'].transform('mean')

df_weatherlocations_monthly['soil_salinity_1_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_salinity_1'].transform('mean')
df_weatherlocations_monthly['soil_salinity_2_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_salinity_2'].transform('mean')
df_weatherlocations_monthly['soil_salinity_3_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_salinity_3'].transform('mean')
df_weatherlocations_monthly['soil_salinity_4_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_salinity_4'].transform('mean')
df_weatherlocations_monthly['soil_salinity_5_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_salinity_5'].transform('mean')
df_weatherlocations_monthly['soil_salinity_6_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_salinity_6'].transform('mean')

df_weatherlocations_monthly['soil_temperature_1_vg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_1_vg'].transform('mean')
df_weatherlocations_monthly['soil_temperature_2_vg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_2_vg'].transform('mean')
df_weatherlocations_monthly['soil_temperature_3_vg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_3_vg'].transform('mean')
df_weatherlocations_monthly['soil_temperature_4_vg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_4_vg'].transform('mean')
df_weatherlocations_monthly['soil_temperature_5_vg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_5_vg'].transform('mean')
df_weatherlocations_monthly['soil_temperature_6_vg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_6_vg'].transform('mean')

df_weatherlocations_monthly['soil_temperature_1_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_1_min'].transform('mean')
df_weatherlocations_monthly['soil_temperature_2_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_2_min'].transform('mean')
df_weatherlocations_monthly['soil_temperature_3_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_3_min'].transform('mean')
df_weatherlocations_monthly['soil_temperature_4_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_4_min'].transform('mean')
df_weatherlocations_monthly['soil_temperature_5_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_5_min'].transform('mean')
df_weatherlocations_monthly['soil_temperature_6_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_6_min'].transform('mean')

df_weatherlocations_monthly['soil_temperature_1_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_1_max'].transform('mean')
df_weatherlocations_monthly['soil_temperature_2_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_2_max'].transform('mean')
df_weatherlocations_monthly['soil_temperature_3_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_3_max'].transform('mean')
df_weatherlocations_monthly['soil_temperature_4_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_4_max'].transform('mean')
df_weatherlocations_monthly['soil_temperature_5_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_5_max'].transform('mean')
df_weatherlocations_monthly['soil_temperature_6_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['soil_temperature_6_max'].transform('mean')

#sunshine and evapotranspiration
df_weatherlocations_monthly['solar_radiation_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['solar_radiation'].transform('sum')
df_weatherlocations_monthly['et0_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['et0'].transform('sum')

#humidity
df_weatherlocations_monthly['relative_humidity_avg_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['relative_humidity_avg'].transform('mean')
df_weatherlocations_monthly['relative_humidity_max_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['relative_humidity_max'].transform('mean')
df_weatherlocations_monthly['relative_humidity_min_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['relative_humidity_min'].transform('mean')
df_weatherlocations_monthly['precipitation_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['precipitation'].transform('sum')
df_weatherlocations_monthly['leaf_wetness_monthly'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'development_category'])['leaf_wetness'].transform('sum')


In [21]:
# # reset index of weather frame
df_weatherlocations_monthly = df_weatherlocations_monthly.reset_index()
# # drop index column created by step above
df_weatherlocations_monthly.drop(['index'], axis=1, inplace=True)

In [22]:
df_weatherlocations_monthly.to_pickle('pickles/df_weatherlocations_monthly.pkl')

daily

In [23]:
# # create shell for monthly calculations of weatherdata
df_weatherlocations_daily = df_weatherlocations.drop(['year', 'solar_panel', 'battery'], axis=1)
# # ddeltat dayly average
df_weatherlocations_daily['deltat_avg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['deltat_avg'].transform('mean')
df_weatherlocations_daily['deltat_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['deltat_max'].transform('mean')
df_weatherlocations_daily['deltat_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['deltat_min'].transform('mean')


# dew point and saturation dayly average
df_weatherlocations_daily['dew_point_avg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['dew_point_avg'].transform('mean')
df_weatherlocations_daily['dew_point_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['dew_point_min'].transform('mean')
df_weatherlocations_daily['saturation_vpd_avg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['saturation_vpd_avg'].transform('mean')
df_weatherlocations_daily['saturation_vpd_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['saturation_vpd_min'].transform('mean')


# wind speed dayly average
df_weatherlocations_daily['wind_speed_avg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['wind_speed_avg'].transform('mean')
df_weatherlocations_daily['wind_speed_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['wind_speed_max'].transform('mean')
df_weatherlocations_daily['wind_speed_gusts_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['wind_speed_gusts'].transform('mean')
df_weatherlocations_daily['wind_direction_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['wind_direction'].transform('mean')

# air temp dayly average 
df_weatherlocations_daily['air_temperature_avg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['air_temperature_avg'].transform('mean')
df_weatherlocations_daily['air_temperature_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['air_temperature_max'].transform('mean')
df_weatherlocations_daily['air_temperature_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['air_temperature_min'].transform('mean')

# soil dayly average
df_weatherlocations_daily['eag_soil_moisture_1_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['eag_soil_moisture_1'].transform('mean')
df_weatherlocations_daily['eag_soil_moisture_2_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['eag_soil_moisture_2'].transform('mean')
df_weatherlocations_daily['eag_soil_moisture_3_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['eag_soil_moisture_3'].transform('mean')
df_weatherlocations_daily['eag_soil_moisture_4_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['eag_soil_moisture_4'].transform('mean')
df_weatherlocations_daily['eag_soil_moisture_5_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['eag_soil_moisture_5'].transform('mean')
df_weatherlocations_daily['eag_soil_moisture_6_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['eag_soil_moisture_6'].transform('mean')

df_weatherlocations_daily['soil_salinity_1_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_salinity_1'].transform('mean')
df_weatherlocations_daily['soil_salinity_2_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_salinity_2'].transform('mean')
df_weatherlocations_daily['soil_salinity_3_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_salinity_3'].transform('mean')
df_weatherlocations_daily['soil_salinity_4_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_salinity_4'].transform('mean')
df_weatherlocations_daily['soil_salinity_5_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_salinity_5'].transform('mean')
df_weatherlocations_daily['soil_salinity_6_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_salinity_6'].transform('mean')

df_weatherlocations_daily['soil_temperature_1_vg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_1_vg'].transform('mean')
df_weatherlocations_daily['soil_temperature_2_vg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_2_vg'].transform('mean')
df_weatherlocations_daily['soil_temperature_3_vg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_3_vg'].transform('mean')
df_weatherlocations_daily['soil_temperature_4_vg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_4_vg'].transform('mean')
df_weatherlocations_daily['soil_temperature_5_vg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_5_vg'].transform('mean')
df_weatherlocations_daily['soil_temperature_6_vg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_6_vg'].transform('mean')

df_weatherlocations_daily['soil_temperature_1_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_1_min'].transform('mean')
df_weatherlocations_daily['soil_temperature_2_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_2_min'].transform('mean')
df_weatherlocations_daily['soil_temperature_3_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_3_min'].transform('mean')
df_weatherlocations_daily['soil_temperature_4_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_4_min'].transform('mean')
df_weatherlocations_daily['soil_temperature_5_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_5_min'].transform('mean')
df_weatherlocations_daily['soil_temperature_6_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_6_min'].transform('mean')

df_weatherlocations_daily['soil_temperature_1_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_1_max'].transform('mean')
df_weatherlocations_daily['soil_temperature_2_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_2_max'].transform('mean')
df_weatherlocations_daily['soil_temperature_3_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_3_max'].transform('mean')
df_weatherlocations_daily['soil_temperature_4_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_4_max'].transform('mean')
df_weatherlocations_daily['soil_temperature_5_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_5_max'].transform('mean')
df_weatherlocations_daily['soil_temperature_6_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['soil_temperature_6_max'].transform('mean')

#sunshine and evapotranspiration
df_weatherlocations_daily['solar_radiation_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['solar_radiation'].transform('sum')
df_weatherlocations_daily['et0_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['et0'].transform('sum')

#humidity
df_weatherlocations_daily['relative_humidity_avg_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['relative_humidity_avg'].transform('mean')
df_weatherlocations_daily['relative_humidity_max_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['relative_humidity_max'].transform('mean')
df_weatherlocations_daily['relative_humidity_min_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['relative_humidity_min'].transform('mean')
df_weatherlocations_daily['precipitation_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['precipitation'].transform('sum')
df_weatherlocations_daily['leaf_wetness_daily'] = df_weatherlocations.groupby(['country', 'station_location', 'month', 'day', 'development_category'])['leaf_wetness'].transform('sum')

#reset index of weather frame
df_weatherlocations_daily = df_weatherlocations_daily.reset_index()
#drop index column created by step above
df_weatherlocations_daily.drop(['index'], axis=1, inplace=True)

developmental stage

In [24]:
df_weatherlocations_dev_stage = df_weatherlocations.drop(['year', 'solar_panel', 'battery'], axis=1)
# development stage
# ddeltat dayly average
df_weatherlocations_dev_stage['deltat_avg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['deltat_avg'].transform('mean')
df_weatherlocations_dev_stage['deltat_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['deltat_max'].transform('mean')
df_weatherlocations_dev_stage['deltat_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['deltat_min'].transform('mean')


# dew point and saturation dayly average
df_weatherlocations_dev_stage['dew_point_avg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['dew_point_avg'].transform('mean')
df_weatherlocations_dev_stage['dew_point_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['dew_point_min'].transform('mean')
df_weatherlocations_dev_stage['saturation_vpd_avg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['saturation_vpd_avg'].transform('mean')
df_weatherlocations_dev_stage['saturation_vpd_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['saturation_vpd_min'].transform('mean')


# wind speed dayly average
df_weatherlocations_dev_stage['wind_speed_avg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['wind_speed_avg'].transform('mean')
df_weatherlocations_dev_stage['wind_speed_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['wind_speed_max'].transform('mean')
df_weatherlocations_dev_stage['wind_speed_gusts_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['wind_speed_gusts'].transform('mean')
df_weatherlocations_dev_stage['wind_direction_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['wind_direction'].transform('mean')

# air temp dayly average 
df_weatherlocations_dev_stage['air_temperature_avg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['air_temperature_avg'].transform('mean')
df_weatherlocations_dev_stage['air_temperature_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['air_temperature_max'].transform('mean')
df_weatherlocations_dev_stage['air_temperature_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['air_temperature_min'].transform('mean')

# soil dayly average
df_weatherlocations_dev_stage['eag_soil_moisture_1_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['eag_soil_moisture_1'].transform('mean')
df_weatherlocations_dev_stage['eag_soil_moisture_2_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['eag_soil_moisture_2'].transform('mean')
df_weatherlocations_dev_stage['eag_soil_moisture_3_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['eag_soil_moisture_3'].transform('mean')
df_weatherlocations_dev_stage['eag_soil_moisture_4_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['eag_soil_moisture_4'].transform('mean')
df_weatherlocations_dev_stage['eag_soil_moisture_5_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['eag_soil_moisture_5'].transform('mean')
df_weatherlocations_dev_stage['eag_soil_moisture_6_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['eag_soil_moisture_6'].transform('mean')

df_weatherlocations_dev_stage['soil_salinity_1_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_salinity_1'].transform('mean')
df_weatherlocations_dev_stage['soil_salinity_2_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_salinity_2'].transform('mean')
df_weatherlocations_dev_stage['soil_salinity_3_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_salinity_3'].transform('mean')
df_weatherlocations_dev_stage['soil_salinity_4_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_salinity_4'].transform('mean')
df_weatherlocations_dev_stage['soil_salinity_5_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_salinity_5'].transform('mean')
df_weatherlocations_dev_stage['soil_salinity_6_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_salinity_6'].transform('mean')

df_weatherlocations_dev_stage['soil_temperature_1_vg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_1_vg'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_2_vg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_2_vg'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_3_vg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_3_vg'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_4_vg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_4_vg'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_5_vg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_5_vg'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_6_vg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_6_vg'].transform('mean')

df_weatherlocations_dev_stage['soil_temperature_1_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_1_min'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_2_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_2_min'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_3_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_3_min'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_4_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_4_min'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_5_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_5_min'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_6_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_6_min'].transform('mean')

df_weatherlocations_dev_stage['soil_temperature_1_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_1_max'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_2_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_2_max'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_3_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_3_max'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_4_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_4_max'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_5_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_5_max'].transform('mean')
df_weatherlocations_dev_stage['soil_temperature_6_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['soil_temperature_6_max'].transform('mean')

#sunshine and evapotranspiration
df_weatherlocations_dev_stage['solar_radiation_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['solar_radiation'].transform('sum')
df_weatherlocations_dev_stage['et0_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['et0'].transform('sum')

#humidity
df_weatherlocations_dev_stage['relative_humidity_avg_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['relative_humidity_avg'].transform('mean')
df_weatherlocations_dev_stage['relative_humidity_max_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['relative_humidity_max'].transform('mean')
df_weatherlocations_dev_stage['relative_humidity_min_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['relative_humidity_min'].transform('mean')
df_weatherlocations_dev_stage['precipitation_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['precipitation'].transform('sum')
df_weatherlocations_dev_stage['leaf_wetness_devstage'] = df_weatherlocations.groupby(['country', 'station_location', 'development_category'])['leaf_wetness'].transform('sum')


In [25]:
dropcollist = ['air_temperature_avg', 'air_temperature_max',
       'air_temperature_min', 'dew_point_avg', 'dew_point_min',
       'solar_radiation', 'saturation_vpd_avg', 'saturation_vpd_min',
       'relative_humidity_avg', 'relative_humidity_max',
       'relative_humidity_min', 'precipitation', 'leaf_wetness',
       'wind_speed_avg', 'wind_speed_max', 'wind_speed_gusts',
       'wind_direction', 'eag_soil_moisture_1', 'eag_soil_moisture_2',
       'eag_soil_moisture_3', 'eag_soil_moisture_4', 'eag_soil_moisture_5',
       'eag_soil_moisture_6', 'soil_salinity_1', 'soil_salinity_2',
       'soil_salinity_3', 'soil_salinity_4', 'soil_salinity_5',
       'soil_salinity_6', 'soil_temperature_1_vg', 'soil_temperature_1_max',
       'soil_temperature_1_min', 'soil_temperature_2_vg',
       'soil_temperature_2_max', 'soil_temperature_2_min',
       'soil_temperature_3_vg', 'soil_temperature_3_max',
       'soil_temperature_3_min', 'soil_temperature_4_vg',
       'soil_temperature_4_max', 'soil_temperature_4_min',
       'soil_temperature_5_vg', 'soil_temperature_5_max',
       'soil_temperature_5_min', 'soil_temperature_6_vg',
       'soil_temperature_6_max', 'soil_temperature_6_min', 'deltat_avg', 'deltat_max', 'deltat_min', 'et0']

In [29]:
#df_weatherlocations_dev_stage.drop(dropcollist, axis=1, inplace=True)
#df_weatherlocations_dev_stage.drop(dropcollist, axis=1, inplace=True)
#df_weatherlocations_monthly.drop(dropcollist, axis=1, inplace=True)

In [30]:
df_weatherlocations_dev_stage.isnull().sum()

station_location                    0
country                             0
date_time                           0
day                                 0
month                               0
                                 ... 
relative_humidity_avg_devstage    900
relative_humidity_max_devstage    900
relative_humidity_min_devstage    900
precipitation_devstage              0
leaf_wetness_devstage               0
Length: 65, dtype: int64

## PIVOT!

create a pivot table to join the dataframes

In [31]:
df_weatherlocations_monthly.columns

Index(['station_location', 'country', 'date_time', 'day', 'month', 'hour',
       'air_temperature_avg', 'air_temperature_max', 'air_temperature_min',
       'dew_point_avg',
       ...
       'soil_temperature_4_max_monthly', 'soil_temperature_5_max_monthly',
       'soil_temperature_6_max_monthly', 'solar_radiation_monthly',
       'et0_monthly', 'relative_humidity_avg_monthly',
       'relative_humidity_max_monthly', 'relative_humidity_min_monthly',
       'precipitation_monthly', 'leaf_wetness_monthly'],
      dtype='object', length=116)

In [32]:
# create lists of columns to include in different pivot tables
# long list for more complicated/detailed analysis, short list for baseline model
pivotvaluemonthlylist = ['deltat_avg_monthly', 'deltat_max_monthly',
       'deltat_min_monthly', 'dew_point_avg_monthly', 'dew_point_min_monthly',
       'saturation_vpd_avg_monthly', 'saturation_vpd_min_monthly',
       'wind_speed_avg_monthly', 'wind_speed_max_monthly',
       'wind_speed_gusts_monthly', 'wind_direction_monthly',
       'air_temperature_avg_monthly', 'air_temperature_max_monthly',
       'air_temperature_min_monthly', 'eag_soil_moisture_1_monthly',
       'eag_soil_moisture_2_monthly', 'eag_soil_moisture_3_monthly',
       'eag_soil_moisture_4_monthly', 'eag_soil_moisture_5_monthly',
       'eag_soil_moisture_6_monthly', 'soil_salinity_1_monthly',
       'soil_salinity_2_monthly', 'soil_salinity_3_monthly',
       'soil_salinity_4_monthly', 'soil_salinity_5_monthly',
       'soil_salinity_6_monthly', 'soil_temperature_1_vg_monthly',
       'soil_temperature_2_vg_monthly', 'soil_temperature_3_vg_monthly',
       'soil_temperature_4_vg_monthly', 'soil_temperature_5_vg_monthly',
       'soil_temperature_6_vg_monthly', 'soil_temperature_1_min_monthly',
       'soil_temperature_2_min_monthly', 'soil_temperature_3_min_monthly',
       'soil_temperature_4_min_monthly', 'soil_temperature_5_min_monthly',
       'soil_temperature_6_min_monthly', 'soil_temperature_1_max_monthly',
       'soil_temperature_2_max_monthly', 'soil_temperature_3_max_monthly',
       'soil_temperature_4_max_monthly', 'soil_temperature_5_max_monthly',
       'soil_temperature_6_max_monthly', 'solar_radiation_monthly',
       'et0_monthly', 'relative_humidity_avg_monthly',
       'relative_humidity_max_monthly', 'relative_humidity_min_monthly',
       'precipitation_monthly', 'leaf_wetness_monthly']
pivotvaluedailylist = ['deltat_avg_daily', 'deltat_max_daily',
       'deltat_min_daily', 'dew_point_avg_daily', 'dew_point_min_daily',
       'saturation_vpd_avg_daily', 'saturation_vpd_min_daily',
       'wind_speed_avg_daily', 'wind_speed_max_daily',
       'wind_speed_gusts_daily', 'wind_direction_daily',
       'air_temperature_avg_daily', 'air_temperature_max_daily',
       'air_temperature_min_daily', 'eag_soil_moisture_1_daily',
       'eag_soil_moisture_2_daily', 'eag_soil_moisture_3_daily',
       'eag_soil_moisture_4_daily', 'eag_soil_moisture_5_daily',
       'eag_soil_moisture_6_daily', 'soil_salinity_1_daily',
       'soil_salinity_2_daily', 'soil_salinity_3_daily',
       'soil_salinity_4_daily', 'soil_salinity_5_daily',
       'soil_salinity_6_daily', 'soil_temperature_1_vg_daily',
       'soil_temperature_2_vg_daily', 'soil_temperature_3_vg_daily',
       'soil_temperature_4_vg_daily', 'soil_temperature_5_vg_daily',
       'soil_temperature_6_vg_daily', 'soil_temperature_1_min_daily',
       'soil_temperature_2_min_daily', 'soil_temperature_3_min_daily',
       'soil_temperature_4_min_daily', 'soil_temperature_5_min_daily',
       'soil_temperature_6_min_daily', 'soil_temperature_1_max_daily',
       'soil_temperature_2_max_daily', 'soil_temperature_3_max_daily',
       'soil_temperature_4_max_daily', 'soil_temperature_5_max_daily',
       'soil_temperature_6_max_daily', 'solar_radiation_daily', 'et0_daily',
       'relative_humidity_avg_daily', 'relative_humidity_max_daily',
       'relative_humidity_min_daily', 'precipitation_daily',
       'leaf_wetness_daily', 'development_category', 'latitude', 'longitude']
pivotvaluedevstagelist = ['deltat_avg_devstage', 'deltat_max_devstage',
       'deltat_min_devstage', 'dew_point_avg_devstage', 'dew_point_min_devstage',
       'saturation_vpd_avg_devstage', 'saturation_vpd_min_devstage',
       'wind_speed_avg_devstage', 'wind_speed_max_devstage',
       'wind_speed_gusts_devstage', 'wind_direction_devstage',
       'air_temperature_avg_devstage', 'air_temperature_max_devstage',
       'air_temperature_min_devstage', 'eag_soil_moisture_1_devstage',
       'eag_soil_moisture_2_devstage', 'eag_soil_moisture_3_devstage',
       'eag_soil_moisture_4_devstage', 'eag_soil_moisture_5_devstage',
       'eag_soil_moisture_6_devstage', 'soil_salinity_1_devstage',
       'soil_salinity_2_devstage', 'soil_salinity_3_devstage',
       'soil_salinity_4_devstage', 'soil_salinity_5_devstage',
       'soil_salinity_6_devstage', 'soil_temperature_1_vg_devstage',
       'soil_temperature_2_vg_devstage', 'soil_temperature_3_vg_devstage',
       'soil_temperature_4_vg_devstage', 'soil_temperature_5_vg_devstage',
       'soil_temperature_6_vg_devstage', 'soil_temperature_1_min_devstage',
       'soil_temperature_2_min_devstage', 'soil_temperature_3_min_devstage',
       'soil_temperature_4_min_devstage', 'soil_temperature_5_min_devstage',
       'soil_temperature_6_min_devstage', 'soil_temperature_1_max_devstage',
       'soil_temperature_2_max_devstage', 'soil_temperature_3_max_devstage',
       'soil_temperature_4_max_devstage', 'soil_temperature_5_max_devstage',
       'soil_temperature_6_max_devstage', 'solar_radiation_devstage', 'et0_devstage',
       'relative_humidity_avg_devstage', 'relative_humidity_max_devstage',
       'relative_humidity_min_devstage', 'precipitation_devstage',
       'leaf_wetness_devstage', 'latitude', 'longitude']

In [33]:
# reset index of weather frame
df_weatherlocations_monthly = df_weatherlocations_monthly.reset_index()
# drop index column created by step above
df_weatherlocations_monthly.drop(['index'], axis=1, inplace=True)

# reset index of weather frame
df_weatherlocations_daily = df_weatherlocations_daily.reset_index()
# drop index column created by step above
df_weatherlocations_daily.drop(['index'], axis=1, inplace=True)

# reset index of weather frame
df_weatherlocations_dev_stage = df_weatherlocations_dev_stage.reset_index()
# drop index column created by step above
df_weatherlocations_dev_stage.drop(['index'], axis=1, inplace=True)

In [34]:
# make column name to str to facilitate flattening in the pivot dataframe later
df_weatherlocations_monthly.month = df_weatherlocations_monthly.month.astype(str)
df_weatherlocations_daily.month = df_weatherlocations_daily.month.astype(str)
df_weatherlocations_daily.day = df_weatherlocations_daily.day.astype(str)
df_weatherlocations_dev_stage.development_category = df_weatherlocations_dev_stage.development_category.astype(str)

In [35]:
# change dataframe shape from long to wide with pivot
df_weatherlocations_dev_stagepiv = pd.pivot_table(df_weatherlocations_dev_stage, index='station_location', columns=['development_category'], values=pivotvaluedevstagelist)
# flatten the multi-index columns
df_weatherlocations_dev_stagepiv.columns = ['_'.join(col) for col in df_weatherlocations_dev_stagepiv.columns.values]
# flatten all columns to one level
df_weatherlocations_dev_stagepiv.reset_index()

Unnamed: 0,station_location,air_temperature_avg_devstage_1,air_temperature_avg_devstage_2,air_temperature_avg_devstage_3,air_temperature_max_devstage_1,air_temperature_max_devstage_2,air_temperature_max_devstage_3,air_temperature_min_devstage_1,air_temperature_min_devstage_2,air_temperature_min_devstage_3,...,wind_direction_devstage_3,wind_speed_avg_devstage_1,wind_speed_avg_devstage_2,wind_speed_avg_devstage_3,wind_speed_gusts_devstage_1,wind_speed_gusts_devstage_2,wind_speed_gusts_devstage_3,wind_speed_max_devstage_1,wind_speed_max_devstage_2,wind_speed_max_devstage_3
0,Anklam,8.34,16.87,11.24,8.9,17.5,11.65,7.8,16.26,10.84,...,196.92,2.93,1.56,1.9,5.12,3.25,3.64,3.57,2.01,2.36
1,Bautzen,11.67,13.58,,12.24,14.3,,11.1,12.87,,...,271.29,2.66,1.09,1.59,5.32,2.81,3.17,3.23,1.51,1.89
2,Berklingen,5.77,16.14,16.71,6.38,16.75,17.38,5.18,15.56,16.09,...,204.29,2.77,1.66,1.58,5.41,3.67,3.59,3.63,2.26,2.09
3,Emmeloord,9.86,16.99,16.64,10.31,17.48,17.16,9.43,16.51,16.13,...,162.22,2.9,1.8,1.1,5.34,3.87,2.95,3.61,2.29,1.5
4,Lamotte,8.23,16.28,14.64,8.99,16.92,15.25,7.5,15.67,14.06,...,42.69,1.57,1.31,1.9,2.71,3.21,3.36,1.83,1.68,2.22
5,Lelystad,,17.8,16.83,,18.29,17.36,,17.33,16.32,...,257.29,,1.6,1.51,,3.58,3.33,,2.03,1.88
6,Mattenkofen,6.87,16.68,14.75,7.56,17.33,15.43,6.19,16.04,14.1,...,143.92,2.26,1.61,0.82,4.49,3.41,2.19,3.07,2.22,1.21
7,Soest,10.14,17.09,12.27,10.73,17.72,12.82,9.56,16.49,11.74,...,275.86,3.57,2.18,2.79,6.02,3.88,4.57,4.29,2.7,3.31
8,Sommepy,9.55,15.58,16.97,10.49,16.23,17.63,8.68,14.95,16.31,...,53.75,3.78,2.4,2.8,5.59,4.25,4.18,4.38,2.95,3.22
9,Stadthagen,10.31,16.99,9.95,10.9,17.6,10.45,9.73,16.4,9.47,...,271.15,2.96,1.79,2.02,5.64,3.51,4.05,3.76,2.22,2.47


In [36]:
# change dataframe shape from long to wide with pivot
df_weatherlocations_dailypiv = pd.pivot_table(df_weatherlocations_daily, index='station_location', columns=['month', 'day'], values=pivotvaluedailylist)
# flatten the multi-index columns
df_weatherlocations_dailypiv.columns = ['_'.join(col) for col in df_weatherlocations_dailypiv.columns.values]
# flatten all columns to one level
df_weatherlocations_dailypiv.reset_index()


Unnamed: 0,station_location,air_temperature_avg_daily_10.0_1.0,air_temperature_avg_daily_10.0_10.0,air_temperature_avg_daily_10.0_11.0,air_temperature_avg_daily_10.0_12.0,air_temperature_avg_daily_10.0_13.0,air_temperature_avg_daily_10.0_14.0,air_temperature_avg_daily_10.0_15.0,air_temperature_avg_daily_10.0_16.0,air_temperature_avg_daily_10.0_17.0,...,wind_speed_max_daily_9.0_28.0,wind_speed_max_daily_9.0_29.0,wind_speed_max_daily_9.0_3.0,wind_speed_max_daily_9.0_30.0,wind_speed_max_daily_9.0_4.0,wind_speed_max_daily_9.0_5.0,wind_speed_max_daily_9.0_6.0,wind_speed_max_daily_9.0_7.0,wind_speed_max_daily_9.0_8.0,wind_speed_max_daily_9.0_9.0
0,Anklam,11.35,6.43,8.03,7.02,7.37,10.75,10.66,8.32,9.25,...,1.68,1.72,1.45,3.38,1.3,2.18,1.18,0.89,0.88,1.62
1,Bautzen,,,,,,,,,,...,1.98,1.79,1.29,2.42,1.34,1.57,1.22,1.43,1.63,1.08
2,Berklingen,,,,,,,,,,...,,,,,,,,,,
3,Emmeloord,,,,,,,,,,...,,,1.1,,1.53,1.34,0.64,0.74,1.47,
4,Lamotte,12.34,10.75,11.29,10.45,8.49,9.5,10.12,9.41,8.98,...,2.25,2.22,2.24,2.5,1.66,2.0,1.95,3.54,2.5,2.4
5,Lelystad,,,,,,,,,,...,,,1.88,,1.75,1.69,1.19,1.08,1.47,1.56
6,Mattenkofen,,,,,,,,,,...,0.67,0.98,1.39,,1.3,1.46,1.46,1.18,1.63,1.43
7,Soest,13.78,7.54,10.66,9.24,8.91,11.48,10.65,4.43,,...,2.94,3.77,2.35,4.07,2.2,2.28,2.26,2.43,2.48,2.09
8,Sommepy,,,,,,,,,,...,,,3.45,,3.47,3.34,2.97,3.97,3.95,2.12
9,Stadthagen,13.55,6.78,10.29,8.98,8.48,11.19,10.73,7.29,9.54,...,1.72,1.77,2.17,2.55,2.25,2.53,1.87,1.71,1.87,1.58


In [37]:
# change dataframe shape from long to wide with pivot
df_weatherlocations_monthlypiv = pd.pivot_table(df_weatherlocations_monthly, index='station_location', columns=['month'], values=pivotvaluemonthlylist)
# flatten the multi-index columns
df_weatherlocations_monthlypiv.columns = ['_'.join(col) for col in df_weatherlocations_monthlypiv.columns.values]
# flatten all columns to one level
df_weatherlocations_monthlypiv.reset_index()


Unnamed: 0,station_location,air_temperature_avg_monthly_10.0,air_temperature_avg_monthly_11.0,air_temperature_avg_monthly_3.0,air_temperature_avg_monthly_4.0,air_temperature_avg_monthly_5.0,air_temperature_avg_monthly_6.0,air_temperature_avg_monthly_7.0,air_temperature_avg_monthly_8.0,air_temperature_avg_monthly_9.0,...,wind_speed_gusts_monthly_9.0,wind_speed_max_monthly_10.0,wind_speed_max_monthly_11.0,wind_speed_max_monthly_3.0,wind_speed_max_monthly_4.0,wind_speed_max_monthly_5.0,wind_speed_max_monthly_6.0,wind_speed_max_monthly_7.0,wind_speed_max_monthly_8.0,wind_speed_max_monthly_9.0
0,Anklam,10.01,,,6.61,11.34,19.07,19.06,15.82,14.95,...,3.18,2.31,,,3.33,3.3,2.35,1.65,1.74,2.0
1,Bautzen,,,,11.12,11.61,16.59,,,,...,2.92,2.21,,,3.03,3.16,1.51,1.37,1.45,1.69
2,Berklingen,,,,5.77,11.24,18.98,18.55,16.56,13.53,...,3.24,,,,3.63,3.02,2.07,1.67,2.09,1.87
3,Emmeloord,,,,7.94,11.17,17.91,17.91,16.75,15.99,...,2.02,,,,3.43,3.89,2.37,1.59,1.62,0.94
4,Lamotte,11.11,,,9.62,11.65,17.85,18.37,17.03,16.76,...,3.48,2.13,,,1.74,1.77,1.41,1.76,1.74,2.27
5,Lelystad,,,,,,17.82,17.84,16.81,16.81,...,2.68,,,,,,2.35,1.75,1.97,1.61
6,Mattenkofen,,,12.72,7.01,11.39,19.78,18.42,16.71,14.93,...,2.02,,,1.75,3.15,3.24,2.05,1.72,1.57,1.14
7,Soest,11.36,,,8.44,11.8,19.37,18.22,16.63,15.17,...,3.75,3.41,,,3.97,4.39,2.42,2.31,2.76,2.76
8,Sommepy,,,,9.17,10.94,18.16,17.79,16.97,16.9,...,4.36,,,,4.4,3.21,2.86,2.48,2.79,3.36
9,Stadthagen,11.04,7.49,,7.95,11.43,19.02,18.45,16.69,15.26,...,3.25,2.14,2.89,,4.02,3.68,2.21,1.94,2.24,2.07
