## Neural Network Training

* In this Jupyter Notebook both the dynamics model and the reward model are trained and
later exported as a parameter dictionary (state dict) for later use with PyTorch
* We use a neural network for both dynamics and reward
* You can select one of the two environments via `env_name` (set in the first code cell, right before the `Settings` block)

In [None]:
import logging

import gym
import matplotlib.pyplot as plt
import numpy as np
import quanser_robots

import sys
sys.path.insert(0,'../')
from Challenge_1.Algorithms.PolicyIteration import PolicyIteration
from Challenge_1.Algorithms.ValueIteration import ValueIteration
from Challenge_1.Models.NNModelPendulum import NNModelPendulum
from Challenge_1.Models.NNModelQube import NNModelQube
from Challenge_1.Models.SklearnModel import SklearnModel
from Challenge_1.util.ColorLogger import enable_color_logging
from Challenge_1.util.DataGenerator import DataGenerator
from Challenge_1.util.Discretizer import Discretizer
from Challenge_1.util.state_preprocessing import reconvert_state_to_angle, normalize_input, get_feature_space_boundaries, convert_state_to_sin_cos
import itertools
from torch.optim.lr_scheduler import *
enable_color_logging(debug_lvl=logging.INFO)
import matplotlib.pyplot as plt
%matplotlib inline
import torch.nn as nn
import torch
import torch.optim as optim

# Seed passed to the DataGenerator for the training set (the test set uses seed + 1)
seed = 1234
# Bare reference to the module: avoids auto removal of the otherwise unused import by PyCharm
quanser_robots

# Environment to train on; switch by swapping which of the two lines is commented out
env_name = "Pendulum-v2"
#env_name = "Qube-v0"

## Settings

In [None]:
# Number of transitions requested from the DataGenerator for each dataset
n_samples = 10000
# NOTE(review): not referenced by any cell visible in this notebook
n_steps = 500 #10000
# Mini-batch sizes used when training the dynamics and the reward network
batch_size_dynamics = 64
batch_size_reward = 256
# Initial learning rate for RMSprop/Adam (the 'sgd' branch hard-codes its own)
lr = 1e-3
# NOTE(review): not referenced by any cell visible in this notebook
path = "./NN-state_dict"
# One of 'rmsprop', 'adam' or 'sgd'
optimizer = 'rmsprop'
# When True the loss curves are also written to the Plots/ directory
export_plots = True

# index list of angle features
if env_name == 'Pendulum-v2':
    angle_features = [0]
elif env_name == "Qube-v0":
    angle_features = [0, 1]
else:
    # Fail here with a clear message instead of a NameError in a later cell
    raise ValueError("Unsupported environment: %r" % (env_name,))

## Create the gym-environment

In [None]:
# Instantiate the environment selected via `env_name`; its observation and
# action spaces determine the network input/output sizes further down
env = gym.make(env_name)

## Create both neural net models

In [None]:
# Lower/upper bounds of the network input features; used below to normalize
# the inputs and (X_high without its last entry) to scale the dynamics output
X_low, X_high = get_feature_space_boundaries(env, angle_features)

In [None]:
# `scaling` defines how the network outputs are scaled after the tanh function.
# It covers every state feature, i.e. all of X_high except the last entry,
# which belongs to the action feature.
scaling = X_high[:-1]

In [None]:
# Input: observation features + action features + one extra feature per angle
# (presumably because each angle is expanded into a sin/cos pair -- confirm
# against convert_state_to_sin_cos). Output: the same without the action.
n_inputs = env.observation_space.shape[0] + env.action_space.shape[0] + len(angle_features)
n_outputs = env.observation_space.shape[0] + len(angle_features)

# Pick the network class once; both models of an environment share it.
if env_name == 'Pendulum-v2':
    model_cls = NNModelPendulum
elif env_name == 'Qube-v0':
    model_cls = NNModelQube
else:
    # Fail here with a clear message instead of a NameError in a later cell
    raise ValueError("Unsupported environment: %r" % (env_name,))

# NOTE(review): the models are always given optimizer='adam', independent of
# the `optimizer` setting used for the torch optimizers -- confirm intent.
dynamics_model = model_cls(n_inputs=n_inputs,
                           n_outputs=n_outputs,
                           scaling=scaling, optimizer='adam')

# The reward model predicts a single scalar and uses no output scaling.
reward_model = model_cls(n_inputs=n_inputs,
                         n_outputs=1,
                         scaling=None, optimizer='adam')

In [None]:
# Mean-squared-error criterion; train() reads this module-level name directly
lossfunction = nn.MSELoss()

## Create the training data

In [None]:
def create_dataset(env_name, seed, n_samples):
    """
    Sample transitions and arrange them as supervised training data.

    Relies on the module-level names `angle_features` and `env`.

    :param env_name: name of the environment handed to the DataGenerator
    :param seed: seed handed to the DataGenerator
    :param n_samples: number of samples requested from the DataGenerator
    :return: tuple (s_a_pairs, state_prime, reward) where s_a_pairs holds the
             sin/cos-converted state with the action appended as last column,
             state_prime is the sin/cos-converted successor state and reward
             is reshaped to a single column
    """
    generator = DataGenerator(env_name=env_name, seed=seed)

    # the successor state comes first in the tuple returned by get_samples
    next_state, current_state, action, reward = generator.get_samples(n_samples)

    current_sincos = convert_state_to_sin_cos(current_state, angle_features)
    next_sincos = convert_state_to_sin_cos(next_state, angle_features)

    # network input: converted state features followed by the action
    n_columns = current_sincos.shape[1] + env.action_space.shape[0]
    stacked = np.concatenate([current_sincos, action[:, np.newaxis]], axis=1)
    s_a_pairs = stacked.reshape(-1, n_columns)

    return s_a_pairs, next_sincos, reward.reshape(-1, 1)

In [None]:
# Training set: network inputs, successor-state targets and reward targets
s_a_pairs_train, state_prime_train, reward_train = create_dataset(env_name, seed, n_samples)

### Create test input pairs

In [None]:
# Test set of the same size, generated with a different seed (seed + 1)
s_a_pairs_test, state_prime_test, reward_test = create_dataset(env_name, seed+1, n_samples)

## Normalize the input X for the neural network

In [None]:
# Network inputs (state features + action) are normalized with the full bounds
s_a_pairs_train = normalize_input(s_a_pairs_train, X_low, X_high)
s_a_pairs_test = normalize_input(s_a_pairs_test, X_low, X_high)

# Successor states carry no action column, so the last bound entry is dropped
state_prime_train = normalize_input(state_prime_train, X_low[:-1], X_high[:-1])
state_prime_test = normalize_input(state_prime_test, X_low[:-1], X_high[:-1])

## Define the optimizer

In [None]:
# Reject unknown choices before any optimizer is built
if optimizer not in ('rmsprop', 'adam', 'sgd'):
    raise Exception('Unsupported optimizer')

# One optimizer per network, both of the kind chosen in the settings
if optimizer == 'sgd':
    # SGD uses its own fixed learning rate instead of the `lr` setting
    optimizer_dynamics = optim.SGD(dynamics_model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
    optimizer_reward = optim.SGD(reward_model.parameters(), lr=0.01, momentum=0.9, nesterov=True)
elif optimizer == 'adam':
    optimizer_dynamics = optim.Adam(dynamics_model.parameters(), lr=lr)
    optimizer_reward = optim.Adam(reward_model.parameters(), lr=lr)
else:
    # only 'rmsprop' is left at this point
    optimizer_dynamics = optim.RMSprop(dynamics_model.parameters(), lr=lr)
    optimizer_reward = optim.RMSprop(reward_model.parameters(), lr=lr)

In [None]:
def validate_model(model, X, y):
    """
    Evaluate `model` on a held-out set and print its mean squared error.

    The model is switched to eval mode and is left in that mode afterwards.

    :param model: callable torch module, applied to X as a whole
    :param X: input tensor handed to the model
    :param y: numpy array of targets, compared against the model output
    :return: mean of the per-output-column MSE values
    """
    model.eval()

    # plain forward pass, no gradient bookkeeping needed
    with torch.no_grad():
        predictions = model(X).detach().numpy()

    squared_error = (predictions - y) ** 2
    # one MSE value per output column
    per_column_mse = squared_error.mean(axis=0)
    overall_mse = per_column_mse.mean()

    print("Test MSE: {}".format(per_column_mse))
    print("Test MSE (mean): {}".format(overall_mse))

    return overall_mse

In [None]:
def train(model, optimizer, X, Y, X_val, Y_val, n_epochs=150, batch_size=32):
    """
    Train `model` with shuffled mini-batches and validate after every epoch.

    The loss is computed with the module-level `lossfunction`; validation is
    delegated to `validate_model`.

    :param model: torch module to train
    :param optimizer: torch optimizer holding the parameters of `model`
    :param X: numpy array with the training inputs
    :param Y: numpy array with the training targets
    :param X_val: numpy array with the validation inputs
    :param Y_val: validation targets, passed to validate_model unchanged
    :param n_epochs: number of passes over the training data
    :param batch_size: number of samples per mini-batch
    :return: tuple (train_loss, val_loss) of lists with one entry per epoch;
             train_loss holds the loss of the LAST mini-batch of each epoch,
             val_loss holds the value returned by validate_model
    """
    X = torch.from_numpy(X).float()
    Y = torch.from_numpy(Y).float()

    # cast like the training data so the dtype fed to the model is the same
    X_val = torch.from_numpy(X_val).float()

    # https://stackoverflow.com/questions/45113245/how-to-get-mini-batches-in-pytorch-in-a-clean-and-efficient-way

    n_train = X.size()[0]
    train_loss = []
    val_loss = []
    for epoch in range(n_epochs):

        # validate_model() leaves the model in eval mode at the end of every
        # epoch, so training mode has to be re-enabled before the next one
        model.train()

        # new random sample order for every epoch
        permutation = torch.randperm(n_train)

        for i in range(0, n_train, batch_size):
            optimizer.zero_grad()

            indices = permutation[i:i+batch_size]
            batch_x, batch_y = X[indices], Y[indices]

            # call the module itself (not .forward) as validate_model does
            outputs = model(batch_x)
            loss = lossfunction(outputs, batch_y)

            loss.backward()
            optimizer.step()

        # halve the learning rate every 50 epochs; the condition also holds
        # for epoch 0, so the first halving happens right after the first epoch
        if epoch % 50 == 0:
            for g in optimizer.param_groups:
                g['lr'] /= 2

        # `loss` is the loss of the last mini-batch of this epoch
        print("Epoch: {:d} -- total loss: {:3.8f}".format(epoch+1, loss.item()))
        train_loss.append(loss.item())
        val_loss.append(validate_model(model, X_val, Y_val))

    return train_loss, val_loss

## Start the training process

## Train the Dynamics Model

In [None]:
# Fit the dynamics model: (state, action) inputs -> normalized successor state.
# The test set doubles as validation set for the per-epoch validation loss.
train_loss_dynamics, val_loss_dynamics = train(dynamics_model, optimizer=optimizer_dynamics,
                             X=s_a_pairs_train, Y=state_prime_train, X_val=s_a_pairs_test, Y_val=state_prime_test,batch_size=batch_size_dynamics, n_epochs=150)

## Visualize the training process

In [None]:
import os

# Loss curves of the dynamics model, one point per epoch
plt.title('%s: Learning Dynamics\n Batch-Size=%d, lr=%f, optimizer=%s' %
          (env_name, batch_size_dynamics, lr, optimizer))
plt.plot(train_loss_dynamics, label='train-loss')
plt.plot(val_loss_dynamics, label='val-loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()
if export_plots is True:
    # savefig does not create missing directories
    os.makedirs('Plots', exist_ok=True)
    plt.savefig('Plots/%s_Dynamics.png' % env_name)

### Save the trained weights for later usage

In [None]:
import os

# torch.save does not create missing directories; make sure the target exists
# so the trained weights are not lost after a full training run
os.makedirs("./Weights", exist_ok=True)

# The final validation MSE is encoded in the file name
export_name = "./Weights/model_dynamics_%s_mse_%.8f.params" % (env_name, val_loss_dynamics[-1])
torch.save(dynamics_model.state_dict(), export_name)
print('Your weights have been saved to %s successfully!' % export_name)

### Train the reward model

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the model
reward_model

In [None]:
# Fit the reward model: (state, action) inputs -> reward.
# n_epochs is not passed, so train() runs with its default of 150 epochs.
train_loss_reward, val_loss_reward = train(reward_model, optimizer=optimizer_reward,
                             X=s_a_pairs_train, Y=reward_train, X_val=s_a_pairs_test, Y_val=reward_test, batch_size=batch_size_reward)

## Visualize the training process

In [None]:
import os

# Loss curves of the reward model, one point per epoch.
# The title reports batch_size_reward, the batch size this model was trained with.
plt.title('%s: Learning Rewards\n Batch-Size=%d, lr=%f, optimizer=%s' %
          (env_name, batch_size_reward, lr, optimizer))
plt.plot(train_loss_reward, label='train-loss')
plt.plot(val_loss_reward, label='val-loss')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()
if export_plots is True:
    # savefig does not create missing directories
    os.makedirs('Plots', exist_ok=True)
    plt.savefig('Plots/%s_Reward.png' % env_name)

### Save the weights of the trained model

In [None]:
import os

# torch.save does not create missing directories; make sure the target exists
# so the trained weights are not lost after a full training run
os.makedirs("./Weights", exist_ok=True)

# The final validation MSE is encoded in the file name
export_name = "./Weights/model_reward_%s_mse_%.8f.params" % (env_name, val_loss_reward[-1])
torch.save(reward_model.state_dict(), export_name)
print('Your weights have been saved to %s successfully!' % export_name)