In [25]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import missingno as msno

In [26]:
# load the pickled sugar beet and monthly weather dataframes
# NOTE: unpickling can execute arbitrary code -- only read pickle files from a trusted source
df_sugarbeet = pd.read_pickle('data_strube/pickles/sugarbeet.pkl')
df_weather_monthly = pd.read_pickle('data_strube/pickles/df_openweather_monthly.pkl')

In [27]:
# list the columns available in the monthly weather dataframe
df_weather_monthly.columns

Index(['station_location', 'month', 'lat', 'lon', 'temp', 'dew_point',
       'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'wind_deg', 'year', 'day', 'plotting_date'],
      dtype='object')

In [28]:
# Value columns to spread across months when pivoting the weather data.
# pivotvaluelist: every weather measurement, for the more detailed analysis.
# simplepivotvaluelist: temperature measurements only, for the baseline model.
pivotvaluelist = [
    'temp',
    'dew_point',
    'temp_min',
    'temp_max',
    'pressure',
    'humidity',
    'wind_speed',
    'wind_deg',
]
simplepivotvaluelist = [
    'temp',
    'temp_min',
    'temp_max',
]

In [29]:
# Cast the month values to str: they become part of the pivoted column labels,
# and string labels can be joined directly when the columns are flattened later.
df_weather_monthly['month'] = df_weather_monthly['month'].astype(str)

In [30]:
# Reshape long -> wide: one row per station, one column per (measurement, month) pair.
df_weather_piv = df_weather_monthly.pivot(
    index='station_location',
    columns=['month'],
    values=simplepivotvaluelist,
)

In [31]:
# Flatten the (measurement, month) MultiIndex columns into single-level names
# such as 'temp_max_7'.
# The isinstance guard keeps this cell idempotent: without it, a second run would
# join the characters of the already flattened strings ('temp_1' -> 't_e_m_p___1').
if isinstance(df_weather_piv.columns, pd.MultiIndex):
    df_weather_piv.columns = ['_'.join(map(str, col)) for col in df_weather_piv.columns]
# Preview with the station as a regular column. The result is NOT assigned, so
# 'station_location' stays the index of df_weather_piv.
df_weather_piv.reset_index()

Unnamed: 0,station_location,temp_1,temp_10,temp_11,temp_12,temp_2,temp_3,temp_4,temp_5,temp_6,...,temp_max_11,temp_max_12,temp_max_2,temp_max_3,temp_max_4,temp_max_5,temp_max_6,temp_max_7,temp_max_8,temp_max_9
0,Anklam,1.08629,10.668239,6.711778,1.461263,0.793199,4.675874,5.893292,11.360444,19.118833,...,7.238194,2.184839,1.603988,5.276411,6.614986,11.957876,19.9135,20.775511,16.99254,15.521389
1,Bautzen,0.35043,10.487151,5.689181,2.38043,0.588661,5.005376,6.322181,11.751667,20.137514,...,6.646625,3.426196,1.977396,6.276599,7.571431,12.98789,21.66225,21.21168,18.444758,17.232097
2,Emmeloord,3.093602,11.768656,7.617889,4.790659,3.450461,6.37172,6.757444,11.166156,18.049931,...,8.849764,6.147957,5.066131,7.640806,8.23275,12.522003,19.447611,19.62461,18.49914,17.533431
3,Goderville,5.089247,12.667003,8.230681,7.39082,6.102351,7.703804,7.248069,11.40461,15.709931,...,9.322806,8.268159,6.621607,8.307056,7.955764,12.211546,17.106583,18.71121,17.602298,18.118556
4,Hamm,2.593978,11.493374,6.368931,4.80422,3.930818,6.631398,7.032486,11.812661,19.905944,...,7.355931,6.011613,5.512396,7.978078,8.446819,12.921895,21.308403,19.986116,18.091129,17.063917
5,Herchsheim,0.942527,9.359032,4.274792,3.488575,2.766696,5.511532,7.038778,11.045632,19.638944,...,5.380792,4.536223,4.157604,6.727944,8.310097,12.231129,21.051708,19.799032,18.063454,17.347361
6,Lamotte,3.072898,11.151721,6.216202,6.055119,4.798294,7.022662,6.907458,11.224953,17.452747,...,7.171774,6.719648,5.510742,7.714093,7.805764,12.066378,18.262107,18.678823,17.365567,17.501563
7,Lelystad,3.304086,11.907984,7.848056,5.257863,3.786399,6.547702,6.945021,11.280296,18.235542,...,9.034625,6.541196,5.274509,7.783508,8.378036,12.61121,19.565625,19.581465,18.508562,17.622875
8,Mattenkofen,0.10496,9.373602,3.872347,1.777218,2.59872,5.043091,7.834819,11.640215,20.413569,...,5.296792,3.491707,4.705685,6.795605,9.356806,12.931277,21.612764,20.644704,18.551747,17.122111
9,Oberviehhausen,0.007298,9.257863,3.850375,1.697298,2.598348,4.866089,7.578917,11.553468,20.502111,...,5.067986,3.11621,4.535685,6.462016,9.040264,12.953306,21.654958,20.601855,18.466142,16.971667


In [32]:
# Attach the pivoted monthly weather columns to every sugar beet row.
# how='outer' keeps stations that appear in only one of the two frames.
# 'station_location' is a column on the left and the index name on the right;
# merge's `on` accepts either.
df_merge_monthly = pd.merge(
    df_sugarbeet,
    df_weather_piv,
    how='outer',
    on='station_location',
)

In [33]:
# compare shapes of the original and the merged dataframe
# --> we gained some rows: an outer merge also keeps station keys that exist
#     in only one of the two frames
print(f'the sugarbeet dataframe has {df_sugarbeet.shape[0]} rows and {df_sugarbeet.shape[1]} columns')
# the second line reports the merged frame, so it is labelled accordingly
print(f'the merged dataframe has {df_merge_monthly.shape[0]} rows and {df_merge_monthly.shape[1]} columns')

the sugarbeet dataframe has 16477 rows and 19 columns
the sugarbeet dataframe has 16481 rows and 55 columns


In [34]:
# Per-station mean of every numeric column.
# numeric_only=True skips non-numeric columns explicitly; without it pandas >= 2.0
# raises a TypeError instead of silently dropping them.
df_merge_monthly.groupby(['station_location']).mean(numeric_only=True)

Unnamed: 0_level_0,betaine_nir,cry_nir,csy_nir,dm_nir,invert_nir,mark_nir,ms_comp,obj,otype_comp,pollinator_comp,...,temp_max_11,temp_max_12,temp_max_2,temp_max_3,temp_max_4,temp_max_5,temp_max_6,temp_max_7,temp_max_8,temp_max_9
station_location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anklam,0.118164,60.410328,10.247223,21.789179,0.081215,4.81565,1.750357,30.583452,3.119829,567.205421,...,7.238194,2.184839,1.603988,5.276411,6.614986,11.957876,19.9135,20.775511,16.99254,15.521389
Bautzen,0.242435,68.591283,11.36155,22.157877,0.2409,5.584027,1.545833,34.495,3.335,682.448333,...,6.646625,3.426196,1.977396,6.276599,7.571431,12.98789,21.66225,21.21168,18.444758,17.232097
Emmeloord,0.120584,53.409332,8.488441,21.425727,0.171103,5.529328,1.550543,34.309942,3.33584,680.071011,...,8.849764,6.147957,5.066131,7.640806,8.23275,12.522003,19.447611,19.62461,18.49914,17.533431
Goderville,0.127166,61.337006,10.234917,22.277686,0.098934,5.639084,1.595777,34.677225,3.277526,653.582202,...,9.322806,8.268159,6.621607,8.307056,7.955764,12.211546,17.106583,18.71121,17.602298,18.118556
Hamm,,,,,,,,,,,...,7.355931,6.011613,5.512396,7.978078,8.446819,12.921895,21.308403,19.986116,18.091129,17.063917
Herchsheim,0.169137,67.267432,12.460393,23.914446,0.123959,5.409333,1.972973,28.797297,2.932432,345.77027,...,5.380792,4.536223,4.157604,6.727944,8.310097,12.231129,21.051708,19.799032,18.063454,17.347361
Lamotte,0.144019,74.656006,12.798154,23.559348,0.105824,6.406146,1.586317,33.578361,3.280032,663.757359,...,7.171774,6.719648,5.510742,7.714093,7.805764,12.066378,18.262107,18.678823,17.365567,17.501563
Lelystad,0.124812,63.954298,10.759825,22.246339,0.153983,5.413601,1.687825,32.760666,3.188345,628.330905,...,9.034625,6.541196,5.274509,7.783508,8.378036,12.61121,19.565625,19.581465,18.508562,17.622875
Mattenkofen,0.215736,77.811741,12.120567,20.601693,0.225379,5.034997,1.606965,34.491827,3.258706,639.10661,...,5.296792,3.491707,4.705685,6.795605,9.356806,12.931277,21.612764,20.644704,18.551747,17.122111
Oberviehhausen,,,,,,,,,,,...,5.067986,3.11621,4.535685,6.462016,9.040264,12.953306,21.654958,20.601855,18.466142,16.971667


In [35]:
# count the missing values in each column of the merged dataframe
df_merge_monthly.isna().sum()

betaine_nir            4
cry_nir                4
csy_nir                4
dm_nir                 4
fieldid                4
region                 4
invert_nir             4
mark_nir               4
ms_comp                4
obj                    4
otype_comp             4
pollinator_comp        4
sc_nir                 4
seednames_coded        4
seriesid               4
totaln_nir             4
x                      4
y                      4
station_location       0
temp_1              2050
temp_10             2050
temp_11             2050
temp_12             2050
temp_2              2050
temp_3              2050
temp_4              2050
temp_5              2050
temp_6              2050
temp_7              2050
temp_8              2050
temp_9              2050
temp_min_1          2050
temp_min_10         2050
temp_min_11         2050
temp_min_12         2050
temp_min_2          2050
temp_min_3          2050
temp_min_4          2050
temp_min_5          2050
temp_min_6          2050


In [36]:
# stations to remove from the baseline-model dataframe
locationdroplist = [
    'Hamm',
    'Oberviehhausen',
    'Sommepy1',
    'Sommepy2',
]

In [37]:
# Baseline-model frame: drop the temp / temp_max / temp_min columns for months
# 10, 11, 12, 1, 2 and 3, so only the month 4-9 weather columns remain.
# NOTE(review): the original comment calls months 4-9 the "core dates where we
# have weather data" -- confirm that is the intended selection criterion.
df_baseline_model = df_merge_monthly.drop(
    columns=[
        f'{measure}_{month}'
        for measure in ('temp', 'temp_max', 'temp_min')
        for month in ('10', '11', '12', '1', '2', '3')
    ]
)


In [38]:
# (rows, columns) of the baseline frame after the column drop
df_baseline_model.shape

(16481, 37)

In [39]:
# station names currently present in the baseline frame
df_baseline_model['station_location'].unique()

array(['Emmeloord', 'Lelystad', 'Rittershausen', 'Sommepy', 'Herchsheim',
       'Lamotte', 'Mattenkofen', 'Pithiviers', 'Vierhoefen', 'Bautzen',
       'Stadthagen', 'Goderville', 'Soest', 'Anklam', 'Hamm',
       'Oberviehhausen', 'Sommepy1', 'Sommepy2'], dtype=object)

In [40]:
# Remove every row whose station is listed in locationdroplist (stations with
# lots of missing values).
# A boolean mask is used instead of drop(index=..., inplace=True): it does not
# depend on the index labels being unique, and it rebinds the name rather than
# mutating the frame in place. .copy() gives an independent frame for later steps.
df_baseline_model = df_baseline_model[
    ~df_baseline_model['station_location'].isin(locationdroplist)
].copy()

In [41]:
# (rows, columns) of the baseline frame after removing the listed stations
df_baseline_model.shape

(16477, 37)

In [42]:
# check that the stations in locationdroplist no longer appear
df_baseline_model.station_location.unique()

array(['Emmeloord', 'Lelystad', 'Rittershausen', 'Sommepy', 'Herchsheim',
       'Lamotte', 'Mattenkofen', 'Pithiviers', 'Vierhoefen', 'Bautzen',
       'Stadthagen', 'Goderville', 'Soest', 'Anklam'], dtype=object)

In [43]:
# keep only complete rows: any row with at least one missing value is removed
df = df_baseline_model.dropna(axis=0, how='any')

In [45]:
# save the cleaned baseline-model dataframe as a pickle for later use
df.to_pickle('data_strube/pickles/baseline_model_openweather.pkl')
