In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras import layers
from tensorflow.keras import regularizers
import tensorflow_docs as tfdocs
import tensorflow_docs.modeling
import tensorflow_docs.plots
import datetime, time, os
from tensorflow.keras.layers.experimental import preprocessing
    
# Report the TensorFlow version in use; print NumPy floats with 3 decimals
# and without scientific notation.
print('Using TensorFlow version: %s' % tf.__version__)
np.set_printoptions(precision=3, suppress=True)

import datetime, time, os
from datetime import datetime

from sklearn.pipeline import Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn import preprocessing
from sklearn import utils
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import StratifiedKFold, KFold

#!pip install -q git+https://github.com/tensorflow/docs # install first time

# One seed for the whole notebook. It is applied to NumPy and TensorFlow right
# away so that weight initialisation and Keras' batch shuffling are repeatable;
# scikit-learn helpers receive it explicitly through `random_state=RSEED`.
RSEED = 42
np.random.seed(RSEED)
tf.random.set_seed(RSEED)

In [None]:
# Delete the TensorBoard logs written by previous runs (shell command, irreversible).
# Skip this cell if you want to compare the current run with earlier ones.
!rm -rf my_logs/

## Loading the processed dataframe (sugarbeet and field weatherstation data)


In [None]:
# Location of the processed sugar-beet / weather-station pickle. The default is the
# original author's local path; set the SUGAR_DATA_PATH environment variable to run
# the notebook on another machine.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
sugar_pickle_path = os.environ.get(
    'SUGAR_DATA_PATH',
    '/Users/isabellecarinaflaig/neuefische/capstone_project/data_strube/pickles/df_openweather_sugar_coded.pkl',
)
df = pd.read_pickle(sugar_pickle_path)
# Preview the first rows only instead of dumping the whole frame into the output
df.head()

In [None]:
# Cast the coded columns to strings so that they are handled as categories
for column in ('seednames_coded', 'pollinator', 'ms_comp', 'otype_comp'):
    df[column] = df[column].astype(str)

In [None]:
# Dimensions (rows, columns) of the frame before any column is dropped
df.shape

In [None]:
# Remove every column that is not needed for modelling.
# 'csy_nir', 'pollinator' and 'seednames_coded' are not in the list and stay in the frame.
unused_columns = [
    'betaine_nir', 'cry_nir', 'dm_nir', 'invert_nir', 'mark_nir', 'sc_nir',
    'totaln_nir', 'obj', 'seriesid', 'x', 'y', 'ms_comp', 'otype_comp',
    'region', 'station_location',
]
df.drop(columns=unused_columns, inplace=True)
df.columns


In [None]:
# Dimensions (rows, columns) of the frame after the unused columns were dropped
df.shape

In [None]:
# reset index after dropping columns (the old index is discarded, not kept as a column)
df = df.reset_index(drop=True)

#### Creating the pipeline

In [None]:
# Categorical predictors: every column whose dtype is `object`
cat_features = [name for name, dtype in df.dtypes.items() if dtype == object]
cat_features

In [None]:
# Numerical predictors: one column per weather variable and month.
# The target variable 'csy_nir' is not part of this list; latitude and longitude
# are left out as well to avoid location influence on the prediction.
weather_variables = [
    'dew_point', 'humidity', 'pressure', 'temp_max',
    'temp_min', 'temp', 'wind_deg', 'wind_speed',
]
# Same month order as the original column listing (10 sorts before 4..9 as text)
months = [10, 4, 5, 6, 7, 8, 9]
num_features = [
    f'{variable}_monthly_{month}'
    for variable in weather_variables
    for month in months
]
num_features

#### Preparing training set

In [None]:
# Shuffle the rows. The seed keeps the row order identical between runs, which matters
# because Keras' `validation_split` always takes the last fraction of the data.
df = df.sample(frac=1, random_state=RSEED)

In [None]:
# Separate the target column 'csy_nir' from the predictors
y_train = df['csy_nir']
X_train = df.drop(columns=['csy_nir'])
print(f"We have {X_train.shape[0]} observations in our dataset and {X_train.shape[1]} features")
print(f"Our target vector has also {y_train.shape[0]} values")

#### Preprocessing Pipeline

In [None]:
# Categorical features are one-hot encoded; categories that were not seen
# while fitting are ignored instead of raising an error.
one_hot_step = ('1hot', OneHotEncoder(handle_unknown='ignore'))
cat_pipeline = Pipeline(steps=[one_hot_step])

In [None]:
# Numerical features are scaled with RobustScaler
scaler_step = ('scaler', RobustScaler())
num_pipeline = Pipeline(steps=[scaler_step])

In [None]:
# complete pipeline for numerical features
# apply transformers to numerical pipeline
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
], sparse_threshold=0)

#### Transform X_train

In [None]:
# Fit the preprocessor on the training features and transform them in one step
X_train_transformed = preprocessor.fit_transform(X_train)

In [None]:
# Shape of the transformed feature matrix
X_train_transformed.shape

# Convert the transformed features and the target to TensorFlow tensors
X_tf_train = tf.convert_to_tensor(X_train_transformed)
y_tf_train = tf.convert_to_tensor(y_train)


## Training
For training you need a train/validation split. Hopefully you did a train/test split beforehand; use the same split as in your ML project so that the results are comparable.

In [None]:
# define dictionary to store results (model name -> history returned by model.fit)
training_history = {}

# Training configuration.
# N_TRAIN counts all rows of X_train, i.e. including the part that model.fit later
# holds out for validation.
N_TRAIN = len(X_train)
# N_VAL is a fraction (not a row count); it is passed to `validation_split` in model.fit
N_VAL = 0.2
EPOCHS = 100
# One tenth of the data per batch, i.e. roughly ten gradient steps per epoch
BATCH_SIZE = N_TRAIN // 10
# Only needed by the (currently disabled) learning-rate schedule below
STEPS_PER_EPOCH = N_TRAIN // BATCH_SIZE
# Disabled learning-rate decay, kept for reference:
# lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
#     0.001,
#     decay_steps=STEPS_PER_EPOCH*1000,
#     decay_rate=1,
#     staircase=False)

### Build, compile and fit your model


In [None]:
# Root directory that collects the TensorBoard logs of all runs
root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir():
    """Return the log directory for a new run.

    The directory name is the current local time with second resolution. It is
    built from digits, '-' and '_' only, so it contains neither spaces nor
    colons and is a valid directory name on every common file system.

    Returns:
        str: `root_logdir` joined with the timestamp, e.g. './my_logs/2021-03-01_14-05-59'.
    """
    run_id = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    return os.path.join(root_logdir, run_id)


run_logdir = get_run_logdir()

In [None]:
# define path where checkpoints should be stored
checkpoint_path = "training_1/ML_model.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that saves the model's weights. It is not part of the list returned by
# get_callbacks(); append it there if checkpoints should be written during training.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=0) # Set verbose != 0 if you want output during training 


def get_callbacks(name):
    """Return the list of Keras callbacks used while fitting the model `name`.

    Args:
        name: Model name; used as sub-directory of `run_logdir` so that every
            model of a run writes its TensorBoard logs to its own folder.

    Returns:
        list: A single TensorBoard callback that also records weight histograms
        every epoch (`histogram_freq=1`).
    """
    # os.path.join inserts the path separator that plain string concatenation
    # (`run_logdir + name`) would leave out.
    model_logdir = os.path.join(run_logdir, name)
    return [tf.keras.callbacks.TensorBoard(model_logdir, histogram_freq=1)]

You can implement your callbacks in the `model.fit()` method below.

In [None]:
def model_compile_and_fit(model, name, optimizer=None, max_epochs=EPOCHS):
    """Compile `model` for regression and fit it on the training tensors.

    The model is compiled with MAE as loss and MSE as additional metric and
    trained on `X_tf_train` / `y_train`; the fraction `N_VAL` of the data is
    held out by Keras for validation.

    Args:
        model: Keras model to train (trained in place).
        name: Model name, forwarded to `get_callbacks` for the log directory.
        optimizer: Keras optimizer instance or name. Defaults to 'adam' when
            None is given.
        max_epochs: Number of training epochs.

    Returns:
        The History object returned by `model.fit`.
    """
    # Honour the caller's optimizer; previously the argument was ignored
    if optimizer is None:
        optimizer = 'adam'
    model.compile(optimizer=optimizer, loss='mae', metrics=['mse'])

    history = model.fit(
        X_tf_train,
        y_train,
        batch_size=BATCH_SIZE,
        validation_split=N_VAL,
        epochs=max_epochs,
        callbacks=get_callbacks(name),
    )

    return history

In [None]:
# Shape of the untransformed training features (rows, columns)
X_train.shape

#### Build your model
You can build your model by using `tf.keras.Sequential()` that helps you to sequentially define your different layers from input to output. 

## First model

In [None]:
# First architecture: two hidden ReLU layers with 100 units each and a single
# linear output unit for the regression target. The model is created on the CPU.
with tf.device('/cpu:0'):
    model = tf.keras.Sequential()
    model.add(layers.Dense(100, activation='relu'))
    model.add(layers.Dense(100, activation='relu'))
    model.add(layers.Dense(1))


#### Train your model
Train your model by using your `model_compile_and_fit()` function you defined above.

In [None]:
# 5-fold cross-validation of the first architecture.
# Each fold trains a fresh copy of the network (same layers, newly initialised
# weights) on the training indices and is scored on the held-out indices.
# NOTE: the preprocessor was fitted on all rows, so the fold scores are slightly optimistic.
kfold = KFold(n_splits=5, shuffle=True, random_state=RSEED)
cvscores = []
y_train_values = np.asarray(y_train)  # plain array, so the fold indices are applied by position

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_transformed), start=1):
    with tf.device('/cpu:0'):
        fold_model = tf.keras.models.clone_model(model)
        fold_model.compile(optimizer='adam', loss='mae', metrics=['mse'])
        fold_model.fit(X_train_transformed[train_idx], y_train_values[train_idx],
                       batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)
        fold_mae, fold_mse = fold_model.evaluate(
            X_train_transformed[val_idx], y_train_values[val_idx], verbose=0)
    print(f"fold {fold}: mae={fold_mae:.3f}, mse={fold_mse:.3f}")
    cvscores.append(fold_mse)
print(f"cross-validated mse: {np.mean(cvscores):.3f} (+/- {np.std(cvscores):.3f})")

# Final fit on the complete training data; this is the model and the history
# used by the plots and predictions below.
with tf.device('/cpu:0'):
    training_history["first_model"] = model_compile_and_fit(model, "first_model")

model.summary()

#### Evaluate your model training
TensorFlow now offers a simple history plotter (this used to be more cumbersome) that you can use to plot training histories and see how the model performed on the training and validation data sets.

In [None]:
# Show the histories collected so far (keyed by model name)
training_history

In [None]:
def plot_metric(history, metric='mse', ylim=(0, 2.5)):
    """Plot a metric of the training and the validation data per epoch.

    Args:
        history: Object with a `history` dict (as returned by `model.fit`) that
            contains the keys `metric` and `'val_' + metric`.
        metric: Name of the metric to plot. Defaults to 'mse'.
        ylim: (lower, upper) limits of the y-axis; None keeps matplotlib's
            automatic limits. Defaults to (0, 2.5).
    """
    label = metric.upper()
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    if ylim is not None:
        plt.ylim(ylim)
    plt.title(f'Model {label}')
    plt.ylabel(label)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()

In [None]:
# MSE of the first model on training and validation data per epoch
plot_metric(training_history['first_model'])

In [None]:
def plot_loss(history, ylim=(0, 5)):
    """Plot the training and the validation loss per epoch.

    Args:
        history: Object with a `history` dict (as returned by `model.fit`) that
            contains the keys 'loss' and 'val_loss'.
        ylim: (lower, upper) limits of the y-axis; None keeps matplotlib's
            automatic limits. Defaults to (0, 5).
    """
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    if ylim is not None:
        plt.ylim(ylim)
    plt.title('Model loss')
    plt.xlabel('Epoch')
    plt.ylabel('Error')
    plt.legend()
    plt.grid(True)
    # Render the figure here, like plot_metric does, so that consecutive calls
    # in one cell do not draw into the same axes.
    plt.show()

In [None]:
# Loss of the first model on training and validation data per epoch
plot_loss(training_history['first_model'])

In [None]:
# Plot the 'mse' histories of all models stored in training_history with the tensorflow_docs plotter
history_plotter = tfdocs.plots.HistoryPlotter(metric = 'mse', smoothing_std=10)
history_plotter.plot(training_history)

In [None]:
# Show the histories collected so far (keyed by model name)
training_history

In [None]:
# Location of the unseen data used for prediction. The default is the original
# author's local path; set the UNSEEN_DATA_PATH environment variable to run the
# notebook on another machine.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
unseen_pickle_path = os.environ.get(
    'UNSEEN_DATA_PATH',
    '/Users/isabellecarinaflaig/neuefische/capstone_project/data_strube/pickles/weatherprediction.pkl',
)
df_unseen = pd.read_pickle(unseen_pickle_path)
# Preview the first rows only instead of dumping the whole frame into the output
df_unseen.head()

In [None]:
# Same string casts as for the training data, so both frames use identical dtypes
for column in ('seednames_coded', 'pollinator', 'ms_comp', 'otype_comp'):
    df_unseen[column] = df_unseen[column].astype(str)

In [None]:
# Use the complete unseen frame as prediction input (no target column is split off here).
# X_test is another name for df_unseen, not a copy.
X_test = df_unseen
print(f"We have {X_test.shape[0]} observations in our dataset and {X_test.shape[1]} features")

In [None]:
# Apply the preprocessor fitted on the training data (transform only, no refit)
# and convert the result to a tensor.
X_test_transformed = preprocessor.transform(X_test)
X_tf_test = tf.convert_to_tensor(X_test_transformed)

In [None]:
# Predict with the current `model` (the first model at this point) and flatten the result to 1-D
y_predicted = model.predict(X_tf_test).flatten()

In [None]:
# Result table: the prediction next to the identifying columns of the unseen data
data = {
    'seednames_coded': df_unseen['seednames_coded'],
    'predicted_sugar_content': y_predicted,
    'weather_station': df_unseen['station_location'],
    'pollinator': df_unseen['pollinator'],
    'otype': df_unseen['otype_comp'],
    'ms': df_unseen['ms_comp'],
}
output_table = pd.DataFrame(data)
output_table

In [None]:
# Save the predictions of the first model to the working directory
output_table.to_csv('prediction_sugar_content_dnn_model_1_csy.csv')

## Model tuning

## Second model

In [None]:
#===========#
# Second Model with more nodes
#===========#

# Two hidden ReLU layers with 500 units each and one linear output unit
with tf.device('/cpu:0'):
    model = tf.keras.Sequential([
        layers.Dense(500, activation='relu'),
        layers.Dense(500, activation='relu'),
        layers.Dense(1)
    ])

# 5-fold cross-validation: each fold trains a fresh copy of the network (same
# layers, newly initialised weights) on the training indices and is scored on
# the held-out indices.
# NOTE: the preprocessor was fitted on all rows, so the fold scores are slightly optimistic.
kfold = KFold(n_splits=5, shuffle=True, random_state=RSEED)
cvscores = []
y_train_values = np.asarray(y_train)  # plain array, so the fold indices are applied by position

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_transformed), start=1):
    with tf.device('/cpu:0'):
        fold_model = tf.keras.models.clone_model(model)
        fold_model.compile(optimizer='adam', loss='mae', metrics=['mse'])
        fold_model.fit(X_train_transformed[train_idx], y_train_values[train_idx],
                       batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)
        fold_mae, fold_mse = fold_model.evaluate(
            X_train_transformed[val_idx], y_train_values[val_idx], verbose=0)
    print(f"fold {fold}: mae={fold_mae:.3f}, mse={fold_mse:.3f}")
    cvscores.append(fold_mse)
print(f"cross-validated mse: {np.mean(cvscores):.3f} (+/- {np.std(cvscores):.3f})")

# Final fit on the complete training data; this is the model and the history
# used by the plots and predictions below.
with tf.device('/cpu:0'):
    training_history["second_model"] = model_compile_and_fit(model, "second_model")

model.summary()

In [None]:
# MSE of the second model on training and validation data per epoch
plot_metric(training_history['second_model'])

In [None]:
# Loss of the second model on training and validation data per epoch
plot_loss(training_history['second_model'])

In [None]:
# Plot the 'mse' histories of all models stored in training_history with the tensorflow_docs plotter
history_plotter = tfdocs.plots.HistoryPlotter(metric = 'mse', smoothing_std=10)
history_plotter.plot(training_history)

In [None]:
# Show the histories collected so far (keyed by model name)
training_history

In [None]:
# Predict with the current `model` (the second model at this point) and flatten the result to 1-D
y_predicted = model.predict(X_tf_test).flatten()

In [None]:
# Result table: the prediction next to the identifying columns of the unseen data
data = {
    'seednames_coded': df_unseen['seednames_coded'],
    'predicted_sugar_content': y_predicted,
    'weather_station': df_unseen['station_location'],
    'pollinator': df_unseen['pollinator'],
    'otype': df_unseen['otype_comp'],
    'ms': df_unseen['ms_comp'],
}
output_table = pd.DataFrame(data)
output_table

In [None]:
# Save the predictions of the second model to the working directory
output_table.to_csv('prediction_sugar_content_dnn_model_2_csy.csv')

## Third model

In [None]:
#===========#
# Third Model with more layers
#===========#

# Ten hidden ReLU layers with 500 units each and one linear output unit.
# Change N_HIDDEN_LAYERS to experiment with deeper or shallower networks.
N_HIDDEN_LAYERS = 10
with tf.device('/cpu:0'):
    model = tf.keras.Sequential(
        [layers.Dense(500, activation='relu') for _ in range(N_HIDDEN_LAYERS)]
        + [layers.Dense(1)]
    )

# 5-fold cross-validation: each fold trains a fresh copy of the network (same
# layers, newly initialised weights) on the training indices and is scored on
# the held-out indices.
# NOTE: the preprocessor was fitted on all rows, so the fold scores are slightly optimistic.
kfold = KFold(n_splits=5, shuffle=True, random_state=RSEED)
cvscores = []
y_train_values = np.asarray(y_train)  # plain array, so the fold indices are applied by position

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_transformed), start=1):
    with tf.device('/cpu:0'):
        fold_model = tf.keras.models.clone_model(model)
        fold_model.compile(optimizer='adam', loss='mae', metrics=['mse'])
        fold_model.fit(X_train_transformed[train_idx], y_train_values[train_idx],
                       batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)
        fold_mae, fold_mse = fold_model.evaluate(
            X_train_transformed[val_idx], y_train_values[val_idx], verbose=0)
    print(f"fold {fold}: mae={fold_mae:.3f}, mse={fold_mse:.3f}")
    cvscores.append(fold_mse)
print(f"cross-validated mse: {np.mean(cvscores):.3f} (+/- {np.std(cvscores):.3f})")

# Final fit on the complete training data; this is the model and the history
# used by the plots and predictions below.
with tf.device('/cpu:0'):
    training_history["third_model"] = model_compile_and_fit(model, "third_model")

model.summary()

In [None]:
# MSE of the third model on training and validation data per epoch
plot_metric(training_history['third_model'])

In [None]:
# Loss of the third model on training and validation data per epoch
plot_loss(training_history['third_model'])

In [None]:
# Plot the 'mse' histories of all models stored in training_history with the tensorflow_docs plotter
history_plotter = tfdocs.plots.HistoryPlotter(metric = 'mse', smoothing_std=10)
history_plotter.plot(training_history)

In [None]:
# Show the histories collected so far (keyed by model name)
training_history

In [None]:
# Predict with the current `model` (the third model at this point) and flatten the result to 1-D
y_predicted = model.predict(X_tf_test).flatten()

In [None]:
# Result table: the prediction next to the identifying columns of the unseen data
data = {
    'seednames_coded': df_unseen['seednames_coded'],
    'predicted_sugar_content': y_predicted,
    'weather_station': df_unseen['station_location'],
    'pollinator': df_unseen['pollinator'],
    'otype': df_unseen['otype_comp'],
    'ms': df_unseen['ms_comp'],
}
output_table = pd.DataFrame(data)
output_table

In [None]:
# Save the predictions of the third model to the working directory
output_table.to_csv('prediction_sugar_content_dnn_model_3_csy.csv')

## Fourth model

In [None]:
#===========#
# Fourth Model with dropout
#===========#

# Two hidden ReLU layers with 500 units, each followed by 20 % dropout,
# and one linear output unit
with tf.device('/cpu:0'):
    model = tf.keras.Sequential([
        layers.Dense(500, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        layers.Dense(500, activation='relu'),
        tf.keras.layers.Dropout(0.2),
        layers.Dense(1)
    ])

# 5-fold cross-validation: each fold trains a fresh copy of the network (same
# layers, newly initialised weights) on the training indices and is scored on
# the held-out indices.
# NOTE: the preprocessor was fitted on all rows, so the fold scores are slightly optimistic.
kfold = KFold(n_splits=5, shuffle=True, random_state=RSEED)
cvscores = []
y_train_values = np.asarray(y_train)  # plain array, so the fold indices are applied by position

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train_transformed), start=1):
    with tf.device('/cpu:0'):
        fold_model = tf.keras.models.clone_model(model)
        fold_model.compile(optimizer='adam', loss='mae', metrics=['mse'])
        fold_model.fit(X_train_transformed[train_idx], y_train_values[train_idx],
                       batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=0)
        fold_mae, fold_mse = fold_model.evaluate(
            X_train_transformed[val_idx], y_train_values[val_idx], verbose=0)
    print(f"fold {fold}: mae={fold_mae:.3f}, mse={fold_mse:.3f}")
    cvscores.append(fold_mse)
print(f"cross-validated mse: {np.mean(cvscores):.3f} (+/- {np.std(cvscores):.3f})")

# Final fit on the complete training data; this is the model and the history
# used by the plots and predictions below.
with tf.device('/cpu:0'):
    training_history["fourth_model"] = model_compile_and_fit(model, "fourth_model")

model.summary()

In [None]:
# MSE of the fourth model on training and validation data per epoch
plot_metric(training_history['fourth_model'])

In [None]:
# Loss of the fourth model on training and validation data per epoch
plot_loss(training_history['fourth_model'])

In [None]:
# Plot the 'mse' histories of all models stored in training_history with the tensorflow_docs plotter
history_plotter = tfdocs.plots.HistoryPlotter(metric = 'mse', smoothing_std=10)
history_plotter.plot(training_history)

In [None]:
# Show the histories collected so far (keyed by model name)
training_history

In [None]:
# Predict with the current `model` (the fourth model at this point) and flatten the result to 1-D
y_predicted = model.predict(X_tf_test).flatten()

In [None]:
# Result table: the prediction next to the identifying columns of the unseen data
data = {
    'seednames_coded': df_unseen['seednames_coded'],
    'predicted_sugar_content': y_predicted,
    'weather_station': df_unseen['station_location'],
    'pollinator': df_unseen['pollinator'],
    'otype': df_unseen['otype_comp'],
    'ms': df_unseen['ms_comp'],
}
output_table = pd.DataFrame(data)
output_table

In [None]:
# Save the predictions of the fourth model to the working directory
output_table.to_csv('prediction_sugar_content_dnn_model_4_csy.csv')

In [None]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic)
%load_ext tensorboard

In [None]:
%tensorboard --logdir=./my_logs --port=6005