In [None]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from timeit import default_timer as timer

from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import  mean_squared_error, r2_score
import xgboost as xgb

# import functions
from src.modeling.modeling_functions import error_analysis

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')


def _two_decimals(value):
    """Format a float with exactly two decimal places for DataFrame display."""
    return '%.2f' % value


# Render floats in displayed DataFrames with two decimals
pd.set_option('display.float_format', _two_decimals)

# Seed constant for anything in this notebook that needs reproducible randomness
RSEED = 2

## Loading the processed dataframe (sugarbeet and openweather station data - all fields)

In [None]:
# Read the processed monthly weather / sugar beet table from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_og = pd.read_pickle(filepath_or_buffer='pickles/df_openweather_monthly_sugarbeet.pkl')
df_og

In [None]:
# Distinct values of the station_location column in the loaded data
df_og['station_location'].unique()

In [None]:
# Cast the coded columns to strings so they can be handled as categories
for coded_col in ('seednames_coded', 'pollinator_comp', 'ms_comp', 'otype_comp'):
    df_og[coded_col] = df_og[coded_col].astype(str)

In [None]:
# (rows, columns) of the loaded frame after the string casts
df_og.shape

In [None]:
# Build the modelling frame as a NEW object instead of aliasing `df_og`.
# With `df = df_og` followed by an in-place drop, the columns were also removed
# from `df_og`, and re-running the cell raised a KeyError because the columns
# were already gone. `DataFrame.drop` without `inplace` returns a new frame and
# leaves `df_og` intact, so this cell can be re-run safely.
#
# Deliberately kept (not dropped):
#   'csy_nir'          - used as the target further down
#   'pollinator_comp'  - kept as a feature
#   'seednames_coded'  - kept as a feature
df = df_og.drop(columns=[
    'betaine_nir',
    'cry_nir',
    'dm_nir',
    'invert_nir',
    'mark_nir',
    'sc_nir',
    'totaln_nir',
    'obj',
    'seriesid',
    'x',
    'y',
    'ms_comp',
    'otype_comp',
    'region',
    'station_location',
])
df.columns

In [None]:
# (rows, columns) of the modelling frame after removing the unused columns
df.shape

In [None]:
# Reset the index after dropping columns; drop=True discards the old index
# instead of keeping it as a new column.
df = df.reset_index(drop=True)

#### Selecting the features for the pipeline

In [None]:
# Categorical predictors: every column whose dtype is `object`
cat_features = [col_name for col_name, col_dtype in df.dtypes.items() if col_dtype == object]
cat_features

In [None]:
# Numerical predictors: one column per weather variable and month, named
# '<variable>_monthly_<month>'.
# The target ('csy_nir', split off further down) is intentionally not listed.
# Only the columns listed here reach the numerical branch of the preprocessor.
weather_variables = [
    'dew_point',
    'humidity',
    'pressure',
    'temp_max',
    'temp_min',
    'temp',
    'wind_deg',
    'wind_speed',
]
# Month suffixes in the order the columns were originally listed (string-sorted)
month_suffixes = ['10.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0']

num_features = [
    f'{variable}_monthly_{month}'
    for variable in weather_variables
    for month in month_suffixes
]
num_features

#### Preparing the training set

In [None]:
# Shuffle the rows before the cross-validation below (cv=5).
# The shuffle is seeded with RSEED: without a random_state every run produced a
# different row order, hence different folds and different scores.
df = df.sample(frac=1, random_state=RSEED)

In [None]:
# Split the frame into the target column and the predictor matrix
target_col = 'csy_nir'
y_train = df[target_col]
X_train = df.drop(columns=[target_col])

# Report the size of both parts
print(f"We have {X_train.shape[0]} observations in our dataset and {X_train.shape[1]} features")
print(f"Our target vector has also {y_train.shape[0]} values")

#### Preprocessing Pipeline

In [None]:
# Categorical branch: one-hot encoding, configured with handle_unknown='ignore'
one_hot_step = ('1hot', OneHotEncoder(handle_unknown='ignore'))
cat_pipeline = Pipeline(steps=[one_hot_step])

In [None]:
# Numerical branch: scaling with RobustScaler.
# The step is still called 'std_scaler' although it is not a StandardScaler.
robust_scaling_step = ('std_scaler', RobustScaler())
num_pipeline = Pipeline(steps=[robust_scaling_step])

In [None]:
# Combined preprocessor: routes the numerical columns through num_pipeline and
# the categorical columns through cat_pipeline, selecting both by column name
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])

### Predictive modeling using Pipeline and GridSearch

#### XGBoost

In [None]:
# Full model pipeline: preprocessing followed by an XGBoost regressor
model_steps = [
    ('preprocessor', preprocessor),
    ('xgb', xgb.XGBRegressor(n_estimators=100)),
]
pipe = Pipeline(steps=model_steps)

In [None]:
# Out-of-fold predictions for the training data from 5-fold cross-validation:
# each row is predicted by a pipeline fitted on the other folds
y_train_predicted = cross_val_predict(estimator=pipe, X=X_train, y=y_train, cv=5)

In [None]:
# Error metrics for the cross-validated predictions on the training data.
# The MSE is computed once and reused for the RMSE.
mse_train = mean_squared_error(y_train, y_train_predicted)

# Mean Squared Error
print('MSE XGBoost Train:\n', mse_train)

# Root Mean Squared Error, taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print('RMSE XGBoost Train:\n', np.sqrt(mse_train))

# R^2 Score
print('R^2 XGBoost Train:\n', r2_score(y_train, y_train_predicted))


#### Optimizing via GridSearch

In [None]:
# Parameter space for the grid search. The 'xgb__' prefix routes each entry to
# the pipeline step named 'xgb'.
#
# The earlier grid used random-forest parameter names (max_features,
# max_leaf_nodes, min_samples_split) that XGBRegressor does not define, so only
# max_depth was a real XGBoost hyperparameter. They are replaced by the closest
# XGBoost counterparts:
#   max_leaf_nodes    -> max_leaves
#   min_samples_split -> min_child_weight (NOTE(review): analogous, not
#                        identical — confirm the value range suits the data)
#   max_features      -> no direct equivalent; 'colsample_bynode' (a fraction in
#                        (0, 1]) can be added here if per-split feature
#                        subsampling is wanted.
param = {
    'xgb__max_depth': [10, 20, 30, 40, 50],
    'xgb__max_leaves': [500, 5000],
    'xgb__min_child_weight': [10, 50, 100],
}

# `scoring` must be a scorer name or a scorer callable (estimator, X, y).
# Passing the metric function `r2_score` itself is rejected by scikit-learn,
# so the built-in 'r2' scorer is used instead.
grid = GridSearchCV(
    pipe,
    param_grid=param,
    scoring='r2',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the training data
grid.fit(X=X_train, y=y_train)

In [None]:
# Report the best cross-validated score and the parameters that achieved it
best_cv_score = grid.best_score_
best_cv_params = grid.best_params_
print('Best score:\n{:.2f}'.format(best_cv_score))
print("Best parameters:\n{}".format(best_cv_params))

In [None]:
# Keep the best pipeline found by the grid search (preprocessing steps plus the
# XGBoost regressor) under the name best_model, then display it.
best_model = grid.best_estimator_
best_model

In [None]:
# Read the table of unseen rows to predict on from its pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_unseen = pd.read_pickle(filepath_or_buffer='pickles/weatherprediction.pkl')
df_unseen

In [None]:
# Cast the coded columns of the unseen data to strings, as was done for the
# training data.
# NOTE(review): this frame is cast on 'pollinator' whereas the training frame
# used 'pollinator_comp' — confirm the column names line up for prediction.
for coded_col in ('seednames_coded', 'pollinator', 'ms_comp', 'otype_comp'):
    df_unseen[coded_col] = df_unseen[coded_col].astype(str)

In [None]:
# Use the unseen frame as the predictor matrix as-is; no target column is split
# off here. X_test is a second name for df_unseen, not a copy.
X_test = df_unseen
print(f"We have {X_test.shape[0]} observations in our dataset and {X_test.shape[1]} features")

In [None]:
# Display the predictor matrix that is passed to the model
X_test

In [None]:
# Predict the target for the unseen rows with the tuned pipeline. No error
# metrics are computed here.
# NOTE(review): the preprocessor selects columns by the names in num_features
# and cat_features; the training frame kept 'pollinator_comp' while the unseen
# frame is cast on 'pollinator' — confirm df_unseen has every training column.
y_predicted = best_model.predict(X_test)

In [None]:
# Result table: the predictions next to the identifying columns of each row.
# NOTE(review): the column is labelled 'predicted_sugar_content' although the
# model was trained on 'csy_nir' — confirm the label is the intended one.
data = {
    'seednames_coded': df_unseen['seednames_coded'],
    'predicted_sugar_content': y_predicted,
    'weather_station': df_unseen['station_location'],
    'pollinator': df_unseen['pollinator'],
    'otype': df_unseen['otype_comp'],
    'ms': df_unseen['ms_comp'],
}
output_table = pd.DataFrame(data=data)
output_table

In [None]:
# Write the result table (including its index) to a CSV file
output_table.to_csv(path_or_buf='data/prediction_sugar_content_table_csy.csv')

In [None]:
# Count, for each seed code, how many predicted rows belong to each weather station
stations_by_seed = output_table.groupby(['seednames_coded'])['weather_station']
df_output = stations_by_seed.value_counts()
df_output