In [1]:
# Import packages
import numpy as np
import pandas as pd
import missingno as msno

import warnings
warnings.filterwarnings('ignore')

from datetime import datetime, date, time, timedelta, timezone
import dateutil.parser as parser


import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.ticker import PercentFormatter
# Plot defaults: figure size, white axes and figure background, black axes border.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
    "figure.facecolor": "w",
})
# Register the pandas date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Show floats with two decimal places in rendered frames and series.
pd.set_option('display.float_format', lambda value: '%.2f' % value)



In [2]:
# load dataframes
# The raw-data directory defaults to the original local checkout; set the CAPSTONE_DATA_DIR
# environment variable to run this notebook on another machine without editing the cell.
import os

DATA_DIR = os.environ.get('CAPSTONE_DATA_DIR',
                          '/Users/clara/Neue_Fische/capstone_project/upload_files/data')
# semicolon-separated file that uses a decimal comma
df_sugarbeet_coded = pd.read_csv(os.path.join(DATA_DIR, 'DatenfürGina_recoded_CSV.csv'),
                                 delimiter=';', decimal=',')
# NOTE(review): unpickling can execute arbitrary code - only load pickles created by this project.
df_weatherstations = pd.read_pickle('pickles/01_df_openweather_2021.pkl')

In [3]:
def column_rename(dataframe, old_colname, new_colname):
    '''
    Rename a single column of a dataframe in place. Enter the column names as strings.
    Arguments:
    dataframe: dataframe holding the column; it is modified in place
    old_colname: current name of the column
    new_colname: name the column should carry afterwards
    Output:
    the same dataframe object with the renamed column
    '''
    name_map = {old_colname: new_colname}
    dataframe.rename(columns=name_map, inplace=True)
    return dataframe

def drop_rows(dataframe, column, row_drop_list):
    '''
    Remove, in place, every row whose value in one column appears in a list of unwanted values.
    Arguments:
    dataframe: dataframe that needs to be modified
    column: name of the column that is checked against the list, enter as a string
    row_drop_list: list of values; rows holding one of them in column are dropped
    Output:
    the same dataframe object without the matching rows (index labels of kept rows are unchanged)
    '''
    matches = dataframe[column].isin(row_drop_list)
    labels_to_drop = dataframe.loc[matches].index
    dataframe.drop(index=labels_to_drop, inplace=True)
    return dataframe

def drop_columns(dataframe, column_drop_list):
    '''
    Remove the given columns from a dataframe in place.
    Arguments:
    dataframe: dataframe that needs to be modified
    column_drop_list: column name or list of column names to be dropped
    Output:
    the same dataframe object without the dropped columns
    '''
    dataframe.drop(columns=column_drop_list, inplace=True)
    return dataframe

def merge_frames(dataframe_1, dataframe_2, merge_column, how='outer'):
    '''
    Join two dataframes on a column they share. Enter merge_column and how as strings.
    Arguments:
    dataframe_1: left dataframe of the merge
    dataframe_2: right dataframe of the merge
    merge_column: name of the common column used as join key
    how: type of merge (outer, inner, left, etc.), outer by default
    Output:
    a new, merged dataframe; neither input is modified
    '''
    return pd.merge(dataframe_1, dataframe_2, how=how, on=merge_column)

def make_datetime(dataframe, column):
    '''
    Convert a column of "year-month-day hour:minute:second" strings to datetime, in place.
    Enter column as a string.
    Arguments:
    dataframe: the dataframe containing the column to be converted
    column: name of the column to be changed to datetime format
    Output:
    the same dataframe object with the column in datetime format
    '''
    parsed = pd.to_datetime(dataframe[column], yearfirst=True, format="%Y-%m-%d %H:%M:%S")
    dataframe[column] = parsed
    return dataframe

def combine_datetime(dataframe, column_year, column_month, column_day, new_col):
    '''
    Build one datetime column out of separate year, month and day columns. Enter columns as strings.
    Arguments:
    dataframe: the dataframe holding the three date-part columns; it is modified in place
    column_year: column containing the year
    column_month: column containing the month
    column_day: column containing the day
    new_col: name of the column that receives the combined date
    Output:
    the same dataframe object with the additional date column (year-month-day)
    '''
    date_parts = {
        'year': dataframe[column_year],
        'month': dataframe[column_month],
        'day': dataframe[column_day],
    }
    dataframe[new_col] = pd.to_datetime(date_parts)
    return dataframe

def column_transform(dataframe, new_col, grouping, col_transform, how='mean', drop_old=True):
    '''
    Function to add group-wise aggregates of columns to a dataframe, in place.
    The dataframe is grouped by the grouping columns and, for each column in col_transform, the
    aggregate (how) of its group is written to every row of that group under the matching name
    in new_col. The number of rows does not change.
    new_col and col_transform are paired by position; source columns beyond the length of
    new_col are left untouched.
    Arguments:
    dataframe: dataframe with which to work (modified in place)
    new_col: list of new column names (may repeat the names in col_transform to overwrite them)
    grouping: list of columns according to which the dataframe is grouped for calculations
    col_transform: list of columns to be transformed
    how: method of transformation, given as string. Default is mean, can also be sum.
    drop_old: Default is True. Drops each source column after it has been transformed.
              A column that was overwritten under its own name is never dropped.
    Output:
    The same dataframe with the transformed values and a fresh 0..n-1 row index.
    Raises:
    ValueError if new_col holds more names than col_transform holds columns.
    '''
    # Fail before touching the frame instead of part-way through the loop.
    if len(new_col) > len(col_transform):
        raise ValueError(
            'new_col has %d names but col_transform has only %d columns'
            % (len(new_col), len(col_transform)))

    for target_name, source_name in zip(new_col, col_transform):
        dataframe[target_name] = dataframe.groupby(grouping)[source_name].transform(how)
        # Dropping the source when it equals the target would delete the result just computed.
        if drop_old and target_name != source_name:
            dataframe.drop(columns=[source_name], inplace=True)

    # drop=True discards the old index directly; a reset followed by drop(['index']) would
    # remove a real column called 'index' and fail when the index carries a name.
    dataframe.reset_index(drop=True, inplace=True)
    return dataframe

def pivot_frame(dataframe, index, column, values):
    '''
    Function to pivot a dataframe and flatten the newly generated columns. Enter index and column as string.
    The input dataframe is not modified.
    Arguments:
    dataframe = dataframe to pivot
    index = column whose values become the rows of the pivoted dataframe
    column = column whose distinct values are spread out into new columns
    values = (previously defined) list of columns whose values fill the pivoted dataframe
    Output:
    pivoted dataframe with flat column names of the form "<value column>_<column entry>"
    and the index column restored as a regular column.
    '''
    pivoted = pd.pivot_table(dataframe, index=index, columns=[column], values=values)
    # Flatten the column labels. Every level is passed through str() so that non-string pivot
    # keys (e.g. integer months) can be joined; a single-level label is kept whole instead of
    # being split into its characters.
    pivoted.columns = [
        '_'.join(str(level) for level in label) if isinstance(label, tuple) else str(label)
        for label in pivoted.columns.values
    ]
    # turn the pivot index back into an ordinary column
    pivoted.reset_index(inplace=True)
    return pivoted

In [4]:
# ---- sugar-beet frame ----
# lowercase all column names first, then give the coded columns readable names
df_sugarbeet_coded.columns = df_sugarbeet_coded.columns.str.lower()
df_sugarbeet_coded.rename(
    columns={
        'ginams!': 'ms_comp',
        'ginaotype!': 'otype_comp',
        'ginapoll!': 'pollinator_comp',
        'ginaseednames!': 'seednames_coded',
        'ped_coded': 'pollinator',
        'fieldid': 'station_location',
    },
    inplace=True,
)
# remove the numbers from the fieldid; the patterns are applied one after another, in this order
for _numbering in (r'1', r'_2', r'2'):
    df_sugarbeet_coded['station_location'] = df_sugarbeet_coded['station_location'].replace(_numbering, r'', regex=True)
# drop the columns with unnecessary information after discussion with the stakeholder
drop_columns(df_sugarbeet_coded, ['cropid','bm', 'breedid', 'locationid', 'fieldblock', 'fieldsubblock', 'filler',
                                  'labnr', 'layoutnr','plotid', 'plotindex', 'rep','spectraname', 'trial', 'year', 'anzahl',
                                  'standardind', 'pollinator_comp'])
drop_rows(df_sugarbeet_coded, 'station_location', ['Rittershausen', 'Sommepy'])
# keep rows with a non-negative betaine_nir (removes one outlier with negative value),
# then exclude rows with missing values (0.19 %)
_betaine_ok = df_sugarbeet_coded['betaine_nir'] >= 0
df_sugarbeet_coded = df_sugarbeet_coded.loc[_betaine_ok].dropna()

# ---- weather frame ----
column_rename(df_weatherstations, 'city_name', 'station_location')
drop_columns(df_weatherstations, ['dt', 'dt_iso', 'timezone', 'lat', 'lon', 'clouds_all', 'weather_id', 'weather_main',
                                  'weather_description', 'weather_icon', 'plotting_date'])
# replace location names (applied in this order)
for _raw_name, _clean_name in ((r'Herchsheim_2', r'Herchsheim'),
                               (r'VierhÃ¶fen', r'Vierhoefen'),
                               (r'Sommepy1', r'Sommepy'),
                               (r'Sommepy2', r'Sommepy')):
    df_weatherstations['station_location'] = df_weatherstations['station_location'].replace(_raw_name, _clean_name, regex=True)

In [5]:
# Count the missing values per column of the cleaned sugar-beet frame.
df_sugarbeet_coded.isna().sum()

betaine_nir         0
cry_nir             0
csy_nir             0
dm_nir              0
station_location    0
region              0
invert_nir          0
mark_nir            0
ms_comp             0
obj                 0
otype_comp          0
sc_nir              0
seednames_coded     0
seriesid            0
totaln_nir          0
x                   0
y                   0
pollinator          0
dtype: int64

In [6]:
# Adds a 'date' column to df_weatherstations in place; the returned frame is shown as cell output.
combine_datetime(
    dataframe=df_weatherstations,
    column_year='year',
    column_month='month',
    column_day='day',
    new_col='date',
)

Unnamed: 0,station_location,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,date,year,month,day
0,Anklam,-0.89,-3.12,-4.97,-1.70,-0.52,1000,83,3.46,55,2010-01-01,2010,1,1
1,Anklam,-0.81,-3.04,-4.96,-1.40,-0.45,999,83,3.58,53,2010-01-01,2010,1,1
2,Anklam,-0.70,-2.93,-4.83,-1.40,-0.33,999,83,3.58,52,2010-01-01,2010,1,1
3,Anklam,-0.72,-2.95,-4.57,-1.08,-0.33,999,83,3.23,52,2010-01-01,2010,1,1
4,Anklam,-0.73,-3.11,-4.68,-1.11,-0.33,998,82,3.35,55,2010-01-01,2010,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795030,Vierhoefen,9.67,7.61,8.78,8.71,10.34,1002,87,2.03,246,2022-09-28,2022,9,28
1795031,Vierhoefen,9.31,7.26,7.45,7.90,9.79,1004,87,3.42,213,2022-09-28,2022,9,28
1795032,Vierhoefen,9.30,7.42,7.43,7.84,9.79,1004,88,3.42,213,2022-09-28,2022,9,28
1795033,Vierhoefen,9.04,7.16,7.12,7.84,9.79,1004,88,3.42,213,2022-09-28,2022,9,28


In [7]:
# List the weather columns that remain after the clean-up and the added 'date' column.
df_weatherstations.columns

Index(['station_location', 'temp', 'dew_point', 'feels_like', 'temp_min',
       'temp_max', 'pressure', 'humidity', 'wind_speed', 'wind_deg', 'date',
       'year', 'month', 'day'],
      dtype='object')

In [8]:
# keep only months 4 to 10 (April to October); jan, feb, mar, nov and dec are excluded
monthkeep = list(range(4, 11))
df_weatherstations = df_weatherstations.loc[df_weatherstations['month'].isin(monthkeep)]

In [9]:
# Lists that drive the monthly aggregation: the grouping keys (location and month), the weather
# columns to aggregate, the names of the aggregated columns, and the columns that are dropped
# before aggregating.
grouping = ['station_location', 'month']
col_transform = ['temp', 'temp_min', 'temp_max', 'dew_point',
                 'pressure', 'humidity', 'wind_speed', 'wind_deg']
# each aggregated column is named after its source column plus the suffix '_monthly'
new_col = [f'{source}_monthly' for source in col_transform]
dropcollist3 = ['year', 'day', 'date', 'feels_like']

In [10]:
# create monthly dataframe
# DataFrame.drop without inplace returns a new frame, so the monthly frame is an independent
# object instead of an alias of df_weatherstations (the in-place helper returns its own input),
# and no in-place drop runs on the month-filtered slice.
df_weatherstations_monthly = df_weatherstations.drop(columns=dropcollist3)

In [11]:
# Replace each column in col_transform by its mean per station and month (written to the
# matching name in new_col). The frame is modified in place and keeps one row per observation.
column_transform(
    dataframe=df_weatherstations_monthly,
    new_col=new_col,
    grouping=grouping,
    col_transform=col_transform,
    how='mean',
)

Unnamed: 0,station_location,month,temp_monthly,temp_min_monthly,temp_max_monthly,dew_point_monthly,pressure_monthly,humidity_monthly,wind_speed_monthly,wind_deg_monthly
0,Anklam,4,8.11,7.04,9.09,3.44,1015.48,74.09,3.65,186.75
1,Anklam,4,8.11,7.04,9.09,3.44,1015.48,74.09,3.65,186.75
2,Anklam,4,8.11,7.04,9.09,3.44,1015.48,74.09,3.65,186.75
3,Anklam,4,8.11,7.04,9.09,3.44,1015.48,74.09,3.65,186.75
4,Anklam,4,8.11,7.04,9.09,3.44,1015.48,74.09,3.65,186.75
...,...,...,...,...,...,...,...,...,...,...
1058801,Vierhoefen,9,14.99,14.17,15.60,9.60,1017.85,74.37,2.41,196.29
1058802,Vierhoefen,9,14.99,14.17,15.60,9.60,1017.85,74.37,2.41,196.29
1058803,Vierhoefen,9,14.99,14.17,15.60,9.60,1017.85,74.37,2.41,196.29
1058804,Vierhoefen,9,14.99,14.17,15.60,9.60,1017.85,74.37,2.41,196.29


In [12]:
# After the transform every row of a (station, month) group carries identical values, so one
# row per group is enough. drop_duplicates returns a new frame - the result has to be assigned,
# otherwise the duplicated rows stay in df_weatherstations_monthly.
df_weatherstations_monthly = df_weatherstations_monthly.drop_duplicates()
df_weatherstations_monthly

Unnamed: 0,station_location,month,temp_monthly,temp_min_monthly,temp_max_monthly,dew_point_monthly,pressure_monthly,humidity_monthly,wind_speed_monthly,wind_deg_monthly
0,Anklam,4,8.11,7.04,9.09,3.44,1015.48,74.09,3.65,186.75
720,Anklam,5,12.45,11.20,13.56,7.72,1015.10,74.83,3.45,188.86
1464,Anklam,6,16.98,15.92,17.84,12.13,1015.00,75.02,3.16,201.06
2184,Anklam,7,18.71,17.85,19.42,13.88,1014.37,75.48,3.27,208.02
2928,Anklam,8,18.54,17.76,19.25,13.99,1014.93,76.77,3.11,196.87
...,...,...,...,...,...,...,...,...,...,...
994294,Vierhoefen,6,18.75,17.87,19.44,12.17,1015.79,69.27,2.45,204.74
995014,Vierhoefen,7,20.14,19.29,20.79,13.25,1016.10,68.07,2.44,214.32
995758,Vierhoefen,8,19.75,18.76,20.41,13.00,1016.55,69.35,2.26,199.23
996502,Vierhoefen,9,14.99,14.17,15.60,9.60,1017.85,74.37,2.41,196.29


In [13]:
# Check that only the grouping columns and the '*_monthly' aggregates are left.
df_weatherstations_monthly.columns

Index(['station_location', 'month', 'temp_monthly', 'temp_min_monthly',
       'temp_max_monthly', 'dew_point_monthly', 'pressure_monthly',
       'humidity_monthly', 'wind_speed_monthly', 'wind_deg_monthly'],
      dtype='object')

## PIVOT

In [14]:
# columns whose values are spread over the month columns of the pivot table
pivotlist = [
    'temp_monthly', 'temp_min_monthly', 'temp_max_monthly', 'dew_point_monthly',
    'pressure_monthly', 'humidity_monthly', 'wind_speed_monthly', 'wind_deg_monthly',
]
# month is converted to str so the flattened pivot column names can be built by string joining
df_weatherstations_monthly['month'] = df_weatherstations_monthly['month'].astype(str)

In [15]:
# One row per station, one column per (weather variable, month) pair. The month suffixes are
# strings, so the columns sort as text ('_10' comes before '_4').
df_weatherlocations_monthlypiv = pivot_frame(
    dataframe=df_weatherstations_monthly,
    index='station_location',
    column='month',
    values=pivotlist,
)
df_weatherlocations_monthlypiv

Unnamed: 0,station_location,dew_point_monthly_10,dew_point_monthly_4,dew_point_monthly_5,dew_point_monthly_6,dew_point_monthly_7,dew_point_monthly_8,dew_point_monthly_9,humidity_monthly_10,humidity_monthly_4,...,wind_deg_monthly_7,wind_deg_monthly_8,wind_deg_monthly_9,wind_speed_monthly_10,wind_speed_monthly_4,wind_speed_monthly_5,wind_speed_monthly_6,wind_speed_monthly_7,wind_speed_monthly_8,wind_speed_monthly_9
0,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,208.02,196.87,201.52,3.73,3.65,3.45,3.16,3.27,3.11,3.42
1,Bautzen,7.53,4.14,8.56,12.8,13.88,14.27,11.23,84.32,70.99,...,222.35,204.85,209.46,3.43,3.22,3.07,2.77,2.75,2.54,2.83
2,Emmeloord,8.81,4.67,7.95,11.99,13.65,13.74,11.46,85.17,73.38,...,208.36,194.73,192.99,3.76,3.99,3.75,3.33,3.04,3.08,3.16
3,Goderville,10.07,6.1,9.01,12.33,13.9,14.02,12.46,84.04,77.96,...,205.9,210.39,195.64,5.16,4.43,4.45,4.21,4.12,4.13,4.33
4,Hamm,8.55,4.79,8.47,12.53,13.73,13.9,11.44,84.32,71.5,...,209.16,198.19,195.55,3.65,2.98,2.96,2.85,2.71,2.89,2.97
5,Herchsheim,7.7,4.05,8.3,12.67,13.31,13.53,10.79,85.01,68.79,...,200.36,193.42,182.93,2.92,2.87,2.8,2.52,2.51,2.49,2.54
6,Lamotte,9.25,4.96,8.33,11.91,12.98,13.11,11.32,85.85,73.03,...,193.89,199.07,185.26,4.46,4.39,4.22,4.07,3.86,3.91,3.86
7,Lelystad,8.38,4.85,8.27,11.97,13.72,14.01,11.62,82.36,73.45,...,196.21,185.77,182.57,4.14,4.31,4.15,3.85,3.64,3.62,3.67
8,Mattenkofen,4.84,4.2,8.61,12.17,13.12,12.74,9.61,76.74,68.55,...,210.78,192.13,192.08,2.41,2.65,2.58,2.3,2.25,2.1,2.25
9,Oberviehhausen,7.12,3.97,8.76,12.94,13.86,14.01,10.96,82.36,66.62,...,212.57,196.37,196.77,2.53,2.75,2.67,2.39,2.35,2.17,2.33


In [16]:
# Attach the pivoted monthly weather values of each station to the sugar-beet rows of the same
# location (outer merge, so locations present in only one frame are kept).
df_openweather_sugar_coded = merge_frames(
    dataframe_1=df_weatherlocations_monthlypiv,
    dataframe_2=df_sugarbeet_coded,
    merge_column='station_location',
    how='outer',
)

In [33]:
# Inspect the merged weather / sugar-beet frame.
# NOTE(review): the execution counts from here on are out of order (In [33] is followed by
# In [29] and In [32]) - confirm the notebook still works with Restart Kernel -> Run All.
df_openweather_sugar_coded

Unnamed: 0,station_location,dew_point_monthly_10,dew_point_monthly_4,dew_point_monthly_5,dew_point_monthly_6,dew_point_monthly_7,dew_point_monthly_8,dew_point_monthly_9,humidity_monthly_10,humidity_monthly_4,...,ms_comp,obj,otype_comp,sc_nir,seednames_coded,seriesid,totaln_nir,x,y,pollinator
0,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,2.00,10.00,1.00,17.20,108.00,1503.00,0.15,96.00,13.00,6.00
1,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,2.00,14.00,1.00,17.68,1191.00,1503.00,0.16,96.00,14.00,16.00
2,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,2.00,13.00,1.00,17.37,103.00,1503.00,0.16,96.00,15.00,10.00
3,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,2.00,9.00,1.00,16.91,107.00,1503.00,0.15,96.00,16.00,6.00
4,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,2.00,17.00,1.00,17.20,1194.00,1503.00,0.15,97.00,13.00,17.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14423,Vierhoefen,4.82,4.07,8.63,12.17,13.25,13.00,9.60,76.70,67.98,...,1.00,50.00,3.00,14.53,628.00,1516.00,0.16,83.00,8.00,102.00
14424,Vierhoefen,4.82,4.07,8.63,12.17,13.25,13.00,9.60,76.70,67.98,...,7.00,4.00,8.00,13.70,1205.00,1516.00,0.16,83.00,9.00,113.00
14425,Vierhoefen,4.82,4.07,8.63,12.17,13.25,13.00,9.60,76.70,67.98,...,1.00,33.00,3.00,12.91,466.00,1516.00,0.14,83.00,10.00,80.00
14426,Vierhoefen,4.82,4.07,8.63,12.17,13.25,13.00,9.60,76.70,67.98,...,1.00,51.00,3.00,13.44,629.00,1516.00,0.13,83.00,11.00,102.00


In [29]:
# Drop, in place, the rows that have no seriesid.
# Presumably these are weather-only rows left by the outer merge (stations without
# sugar-beet measurements) - TODO confirm.
df_openweather_sugar_coded.dropna(subset=['seriesid'], inplace=True)

In [32]:
# Persist the merged frame; the path is relative to the notebook's working directory.
df_openweather_sugar_coded.to_pickle('pickles/df_openweather_sugar_coded.pkl')

In [34]:
# Load the table of predicted vs. actual sugar content per seed code and weather station.
# NOTE(review): absolute, user-specific path - this cell only runs on the original machine.
df_output = pd.read_csv('/Users/clara/Neue_Fische/capstone_project/upload_files/data/output_table.csv')
df_output.head(2)

Unnamed: 0.1,Unnamed: 0,seednames_coded,predicted_sugar_content,actual_sugar_content,weather_station
0,0,108.0,17.01,17.2,Anklam
1,1,1191.0,17.3,17.68,Anklam


In [35]:
# Remove the 'Unnamed: 0' column, which only repeats the row number in the preview above
# (presumably a row index written by an earlier to_csv - verify). Modifies df_output in place,
# so re-running this cell raises a KeyError.
drop_columns(df_output, 'Unnamed: 0')

Unnamed: 0,seednames_coded,predicted_sugar_content,actual_sugar_content,weather_station
0,108.00,17.01,17.20,Anklam
1,1191.00,17.30,17.68,Anklam
2,103.00,17.08,17.37,Anklam
3,107.00,17.01,16.91,Anklam
4,1194.00,16.96,17.20,Anklam
...,...,...,...,...
14422,628.00,14.12,14.53,Vierhoefen
14423,1205.00,13.29,13.70,Vierhoefen
14424,466.00,13.74,12.91,Vierhoefen
14425,629.00,14.12,13.44,Vierhoefen


In [36]:
# Number of rows per weather station in the output table.
df_output.weather_station.value_counts()

Pithiviers        2079
Stadthagen        1357
Goderville        1326
Mattenkofen       1300
Lamotte           1257
Bautzen           1200
Emmeloord         1197
Soest             1195
Lelystad           961
Vierhoefen         849
Anklam             701
Hamm               676
Herchsheim         222
Oberviehhausen     107
Name: weather_station, dtype: int64

In [38]:
# For every seed code, count its rows per weather station.
df_output.groupby('seednames_coded')['weather_station'].value_counts()

seednames_coded  weather_station
1.00             Lelystad            2
                 Mattenkofen         2
                 Pithiviers          2
                 Anklam              1
                 Goderville          1
                                    ..
1205.00          Vierhoefen         24
                 Anklam             21
                 Hamm               20
                 Herchsheim          7
                 Oberviehhausen      3
Name: weather_station, Length: 11707, dtype: int64

In [41]:
# Show all rows of the output table for seed code 1.
df_output.query('seednames_coded == 1')

Unnamed: 0,seednames_coded,predicted_sugar_content,actual_sugar_content,weather_station
190,1.0,17.48,17.48,Anklam
4087,1.0,16.85,16.9,Goderville
4437,1.0,18.55,18.83,Hamm
5242,1.0,18.57,18.56,Herchsheim
7330,1.0,17.02,17.0,Lelystad
7473,1.0,17.02,17.2,Lelystad
8641,1.0,16.54,16.4,Mattenkofen
8762,1.0,16.54,16.84,Mattenkofen
9762,1.0,19.54,19.61,Pithiviers
10928,1.0,19.54,19.42,Pithiviers


In [44]:
# Array of the distinct seed codes found in the cleaned sugar-beet frame.
seedcodes_list = df_sugarbeet_coded['seednames_coded'].unique()

In [47]:
# Number of distinct seed codes.
len(seedcodes_list)

1205

In [55]:
# Columns available in the cleaned sugar-beet frame.
df_sugarbeet_coded.columns

Index(['betaine_nir', 'cry_nir', 'csy_nir', 'dm_nir', 'station_location',
       'region', 'invert_nir', 'mark_nir', 'ms_comp', 'obj', 'otype_comp',
       'sc_nir', 'seednames_coded', 'seriesid', 'totaln_nir', 'x', 'y',
       'pollinator'],
      dtype='object')

In [56]:
# Table of the distinct (seednames_coded, pollinator, otype_comp, ms_comp) combinations.
# The duplicates are removed in the same expression that selects the columns, so no in-place
# operation runs on a slice of df_sugarbeet_coded (that pattern can raise pandas'
# SettingWithCopyWarning, which the global warnings filter would hide).
df_seedcode = df_sugarbeet_coded[['seednames_coded', 'pollinator', 'otype_comp', 'ms_comp']].drop_duplicates()
# NOTE(review): absolute, user-specific path; the row index is written as the first CSV column.
df_seedcode.to_csv('/Users/clara/Neue_Fische/capstone_project/upload_files/data/seedcode.csv')

In [50]:
# Preview the seed-code table.
# NOTE(review): the stored output comes from an earlier execution (In [50], before In [56]) and
# shows only two of the four columns selected above - re-run to refresh it.
df_seedcode.head(2)

Unnamed: 0,seednames_coded,pollinator
0,85,59
1,1204,113


In [68]:
# Load the semicolon-separated table that pairs seed-code rows with station locations.
# Several header cells of the file are empty, so pandas names those columns 'Unnamed: 2' .. 'Unnamed: 5'.
# NOTE(review): absolute, user-specific path - this cell only runs on the original machine.
df_weatherloc_prev = pd.read_csv('/Users/clara/Neue_Fische/capstone_project/upload_files/data/weatherloc_ped.csv', delimiter=';')
df_weatherloc_prev.head()

Unnamed: 0,seednames_coded,station_location,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,1.0,Anklam,85.0,59.0,2.0,2.0
1,2.0,Anklam,1204.0,113.0,7.0,6.0
2,3.0,Anklam,90.0,61.0,2.0,2.0
3,4.0,Anklam,1142.0,16.0,3.0,1.0
4,5.0,Anklam,95.0,16.0,2.0,2.0


In [69]:
# The column read as 'seednames_coded' only holds a running number in the preview above; the
# seed code and its attributes sit in the unnamed columns. Drop the former and name the latter.
# This modifies the frame in place: run the cell once only - a second run would drop the
# renamed seed-code column.
drop_columns(df_weatherloc_prev, 'seednames_coded')
df_weatherloc_prev.rename(
    columns={
        'Unnamed: 2': 'seednames_coded',
        'Unnamed: 3': 'pollinator',
        'Unnamed: 4': 'otype_comp',
        'Unnamed: 5': 'ms_comp',
    },
    inplace=True,
)
df_weatherloc_prev.head()

Unnamed: 0,station_location,seednames_coded,pollinator,otype_comp,ms_comp
0,Anklam,85.0,59.0,2.0,2.0
1,Anklam,1204.0,113.0,7.0,6.0
2,Anklam,90.0,61.0,2.0,2.0
3,Anklam,1142.0,16.0,3.0,1.0
4,Anklam,95.0,16.0,2.0,2.0


In [70]:
# Combine the monthly weather values of each station with every seed-code row listed for that
# station (outer merge on the location).
df_prediction = merge_frames(
    dataframe_1=df_weatherlocations_monthlypiv,
    dataframe_2=df_weatherloc_prev,
    merge_column='station_location',
    how='outer',
)
df_prediction

Unnamed: 0,station_location,dew_point_monthly_10,dew_point_monthly_4,dew_point_monthly_5,dew_point_monthly_6,dew_point_monthly_7,dew_point_monthly_8,dew_point_monthly_9,humidity_monthly_10,humidity_monthly_4,...,wind_speed_monthly_4,wind_speed_monthly_5,wind_speed_monthly_6,wind_speed_monthly_7,wind_speed_monthly_8,wind_speed_monthly_9,seednames_coded,pollinator,otype_comp,ms_comp
0,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,3.65,3.45,3.16,3.27,3.11,3.42,85.00,59.00,2.00,2.00
1,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,3.65,3.45,3.16,3.27,3.11,3.42,1204.00,113.00,7.00,6.00
2,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,3.65,3.45,3.16,3.27,3.11,3.42,90.00,61.00,2.00,2.00
3,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,3.65,3.45,3.16,3.27,3.11,3.42,1142.00,16.00,3.00,1.00
4,Anklam,7.81,3.44,7.72,12.13,13.88,13.99,11.19,86.31,74.09,...,3.65,3.45,3.16,3.27,3.11,3.42,95.00,16.00,2.00,2.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19275,,,,,,,,,,,...,,,,,,,,,,
19276,,,,,,,,,,,...,,,,,,,,,,
19277,,,,,,,,,,,...,,,,,,,,,,
19278,,,,,,,,,,,...,,,,,,,,,,


In [71]:
# Count the missing values per column of the prediction frame.
df_prediction.isnull().sum()

station_location        1205
dew_point_monthly_10    1205
dew_point_monthly_4     1205
dew_point_monthly_5     1205
dew_point_monthly_6     1205
                        ... 
wind_speed_monthly_9    1205
seednames_coded         1205
pollinator              1205
otype_comp              1205
ms_comp                 1205
Length: 61, dtype: int64

In [73]:
# Remove every row that contains a missing value (the null counts above report 1205 per column).
df_prediction = df_prediction.dropna()

In [74]:
# Station locations that are left in the prediction frame.
df_prediction.station_location.unique()

array(['Anklam', 'Bautzen', 'Emmeloord', 'Goderville', 'Hamm',
       'Herchsheim', 'Lamotte', 'Lelystad', 'Mattenkofen',
       'Oberviehhausen', 'Pithiviers', 'Soest', 'Sommepy', 'Stadthagen',
       'Vierhoefen'], dtype=object)

In [75]:
# (rows, columns) of the prediction frame after removing incomplete rows.
df_prediction.shape

(18075, 61)

In [76]:
# Persist the prediction frame; the path is relative to the notebook's working directory.
df_prediction.to_pickle('pickles/weatherprediction.pkl')