## Data Replication

In [None]:
import numpy as np
import pandas as pd

## Create a function which takes in a dataset and replicates it
def replicate_data(data, replications, noise):
    """Create noisy copies of a dataset (data augmentation).

    The first four columns of ``data`` are perturbed independently: every
    value ``v`` is replaced by a draw from the uniform distribution between
    ``v - v * noise`` and ``v + v * noise``.  The fifth column is copied
    unchanged into every replica.

    Args:
        data: pandas DataFrame with at least five columns.  Columns 0-3 are
            perturbed, column 4 is carried over verbatim.
        replications: number of noisy copies to generate.
        noise: relative half-width of the perturbation (0.03 means +/- 3 %).

    Returns:
        A DataFrame with the same columns as ``data`` holding
        ``replications * len(data)`` rows and a fresh ``RangeIndex``.  The
        original rows are NOT included.  Columns of ``data`` beyond the fifth
        are present but empty (NaN).  With ``replications <= 0`` an empty
        frame with the same columns is returned.
    """
    cols = list(data.columns)
    noisy_cols = cols[0:4]
    fixed_col = cols[4]

    # Work on plain arrays so the result never depends on the index of `data`.
    base = data[noisy_cols].to_numpy(dtype=float)
    spread = base * noise
    fixed_values = data[fixed_col].to_numpy()

    replicas = []
    for _ in range(replications):
        replica = pd.DataFrame(
            np.random.uniform(base - spread, base + spread),
            columns=noisy_cols,
        )
        # Assign positionally: index-aligned assignment would produce NaN
        # whenever `data` does not carry a default 0..n-1 index.
        replica[fixed_col] = fixed_values
        replicas.append(replica)

    if not replicas:
        return pd.DataFrame(columns=data.columns)

    # DataFrame.append was removed in pandas 2.0; a single concat is also
    # cheaper than growing a frame inside the loop.
    stacked = pd.concat(replicas, ignore_index=True, sort=False)
    return stacked.reindex(columns=data.columns)


## Test Function

In [None]:
import torch
import numpy as np
import pandas as pd
from torch.autograd import Variable

def test(test_inputs, test_labels, net, BATCH_SIZE):
    """Evaluate a trained network on held-out sequences.

    Two kinds of prediction are produced for every test sequence:

    * online  - the network sees the true input at every time step; the
      predicted state is ``input + network output`` for the first four
      features.
    * offline - only the first row of the sequence is given; every later
      input is the network's own previous prediction, fed one step at a time
      while carrying the hidden state forward.

    In both cases the fifth feature (LI) is held at the value found in the
    first row of the sequence.

    Args:
        test_inputs: sequence of equally sized 2-D arrays, one per test
            sequence, each row holding the five input features
            (B, N, F, NIC, LI).
        test_labels: matching sequence of 2-D arrays whose first four columns
            are the targets for the four network outputs.
        net: model exposing ``eval()``, ``init_hidden(x)``, a writable
            ``sequence_length`` attribute and ``net(x, hidden)`` returning
            ``(output, hidden)``.
        BATCH_SIZE: unused; kept so existing callers keep working.

    Returns:
        Tuple ``(AVG_MSE, predictions_online, predictions_offline)``:
        ``AVG_MSE`` is a 0-dim torch tensor with the mean squared error over
        all sequences, steps and the four outputs (for two sequences of 14
        steps this equals the average of the eight per-sequence, per-output
        MSEs); both prediction arrays are NumPy arrays with one row
        ``[B, N, F, NIC, LI]`` per time step, sequences stacked in order.
    """
    n_states = 4                      # B, N, F, NIC are predicted; LI is only an input

    net.eval()
    test_X = torch.Tensor(test_inputs)
    test_y = torch.Tensor(test_labels)

    # ---- Online pass: whole sequences, true inputs at every step ----------
    with torch.no_grad():
        net_out, _ = net(test_X, net.init_hidden(test_X))

    AVG_MSE = ((test_y[..., :n_states] - net_out[..., :n_states]) ** 2).mean()

    seq_len = test_X.size(1)
    first_li = test_X[:, :1, n_states:n_states + 1]            # (n_seq, 1, 1)
    online = torch.cat(
        [
            test_X[..., :n_states] + net_out[..., :n_states],
            first_li.expand(-1, seq_len, -1),
        ],
        dim=-1,
    )
    predictions_online = online.reshape(-1, n_states + 1).numpy()

    # ---- Offline pass: closed loop, one time step per forward call --------
    predictions_offline = []
    # The network reshapes its input using `sequence_length`, so it has to be
    # 1 while single steps are fed.  Remember the configured value and put it
    # back afterwards so the model stays usable for full sequences.
    configured_seq_len = getattr(net, "sequence_length", 1)
    net.sequence_length = 1
    try:
        # No gradients are needed for evaluation; without this the hidden
        # state would drag an ever-growing autograd graph along.
        with torch.no_grad():
            for sequence in test_X:
                # Fresh hidden state with a batch size of 1 for each sequence.
                hidden = net.init_hidden(torch.Tensor([[[]]]))
                state = sequence[0, :n_states].clone()
                li = sequence[0, n_states]

                for _ in range(sequence.size(0)):
                    # Batch size 1, sequence length 1, five features.
                    step_input = torch.cat([state, li.reshape(1)]).reshape(1, 1, -1)
                    step_out, hidden = net(step_input, hidden)
                    state = state + step_out[0, 0, :n_states]
                    predictions_offline.append(state.tolist() + [float(li)])
    finally:
        net.sequence_length = configured_seq_len
    predictions_offline = np.array(predictions_offline)

    return AVG_MSE, predictions_online, predictions_offline


In [None]:
#from train import train

import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.autograd import Variable

def train(net, inputs, labels, EPOCHS, l_rate, BATCH_SIZE):
    """Fit ``net`` to ``inputs``/``labels`` with Adam and a mean-squared-error loss.

    The data is converted to tensors once and then walked in consecutive
    mini-batches of ``BATCH_SIZE`` samples (the last batch may be smaller),
    in the same order on every epoch.  A new hidden state is requested from
    ``net.init_hidden`` for every batch.

    Args:
        net: model exposing ``train()``, ``parameters()``, ``init_hidden(x)``
            and ``net(x, hidden)`` returning ``(output, hidden)``.
        inputs: training inputs, convertible with ``torch.Tensor``.
        labels: training targets, convertible with ``torch.Tensor``.
        EPOCHS: number of passes over the data.
        l_rate: learning rate handed to Adam.
        BATCH_SIZE: number of samples per optimisation step.

    Returns:
        None. ``net`` is updated in place.
    """
    net.train()
    criterion = nn.MSELoss(reduction='mean')
    # Adam adjusts every trainable parameter of the network; l_rate scales the step.
    adam = optim.Adam(net.parameters(), lr=l_rate)

    features = Variable(torch.Tensor(inputs))
    targets = Variable(torch.Tensor(labels))
    n_samples = len(features)

    for _ in range(EPOCHS):
        # Progress bar is kept but silenced.
        batch_starts = tqdm(range(0, n_samples, BATCH_SIZE), disable=True)
        for start in batch_starts:
            stop = start + BATCH_SIZE
            x_batch, y_batch = features[start:stop], targets[start:stop]

            adam.zero_grad()
            initial_hidden = net.init_hidden(x_batch)
            predictions, _ = net(x_batch, initial_hidden)
            batch_loss = criterion(predictions, y_batch)
            batch_loss.backward()
            adam.step()


## RNN Structure

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable

torch.manual_seed(777)  # Fixed seed so weight initialisation is repeatable


### Model ###
class RNN(nn.Module):
    """Vanilla (Elman) RNN followed by a linear read-out applied at every time step.

    Args:
        num_outputs: size of the output vector produced per time step.
        input_size: number of features per time step.
        sequence_length: number of time steps ``forward`` reshapes its input
            to.  Stored as a plain attribute and read on every call.
        hidden_size: width of the recurrent hidden state.
        num_layers: number of stacked recurrent layers.
    """

    def __init__(self, num_outputs, input_size, sequence_length, hidden_size, num_layers):
        super().__init__()

        self.num_outputs = num_outputs
        self.input_size = input_size
        self.sequence_length = sequence_length
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.rnn = nn.RNN(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(self.hidden_size, self.num_outputs)

    def forward(self, x, hidden):
        """Run the recurrent layers and the read-out over a batch of sequences.

        Args:
            x: input tensor that can be viewed as
                ``(batch, sequence_length, input_size)``.
            hidden: initial hidden state of shape
                ``(num_layers, batch, hidden_size)``.

        Returns:
            Tuple ``(output, hidden)``: ``output`` has shape
            ``(batch, sequence_length, num_outputs)`` and ``hidden`` is the
            hidden state after the last time step, ready to be fed into the
            next call.
        """
        # Reshape input to (batch_size, sequence_length, input_size)
        x = x.view(x.size(0), self.sequence_length, self.input_size)

        rnn_out, hidden = self.rnn(x, hidden)
        return self.fc(rnn_out), hidden

    def init_hidden(self, x):
        """Return an all-zero hidden state sized for the batch in ``x``.

        Only ``x.size(0)`` (the batch size), the device and - for floating
        point inputs - the dtype of ``x`` are used, so the state can be
        passed straight to ``forward`` together with ``x``.

        Returns:
            Tensor of shape ``(num_layers, batch_size, hidden_size)``.
        """
        # A plain nn.RNN has a hidden state only (no cell state).
        dtype = x.dtype if x.is_floating_point() else torch.get_default_dtype()
        return torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            dtype=dtype,
            device=x.device,
        )


## Data Sets Import

In [None]:
# Clone the entire repo.
!git clone -l -s git://github.com/Arymega/FAME_Bioprocess_Simulation_with_RNN_and_FNN.git cloned-repo
%cd cloned-repo
!ls

## Perform Training

In [None]:
import time
start_time = time.time()
import torch
import pandas as pd
import numpy as np 
import os
from sklearn.preprocessing import StandardScaler

# Load training and testing data as pd dataframe
training_data = pd.read_excel('/content/cloned-repo/Common Files/Datasets.xlsx', sheet_name='Train')
testing_data = pd.read_excel('/content/cloned-repo/Common Files/Datasets.xlsx', sheet_name='Test')

# Standardise training and testing data.
# Each scaler is fitted on its own raw sheet; the training scaler is fitted
# BEFORE augmentation, so the noisy replicas do not influence mean/variance.
scaler_train = StandardScaler()
scaler_test = StandardScaler()

scaler_train.fit(training_data)
scaler_test.fit(testing_data)

testing_data = scaler_test.transform(testing_data)

# Convert training data to pd dataframe (keeps exactly these five columns, in this order)
columns = "B N F NIC LI".split()
training_data = pd.DataFrame(data=training_data, index=None, columns=columns)

# Replicate the training data: 50 noisy copies at +/-3 % and 50 at +/-5 %
replicated_data1 = replicate_data(training_data, 50, 0.03)
replicated_data2 = replicate_data(training_data, 50, 0.05)

# Original rows first, then both sets of replicas.
# (DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.)
training_data = pd.concat(
    [training_data, replicated_data1, replicated_data2],
    ignore_index=True,
    sort=False,
)

training_data = scaler_train.transform(training_data)
training_data = np.array(training_data)

# Calculate training and testing labels
def _append_rate_labels(data):
    """Append one-step-ahead changes of the first four columns as label columns.

    For every row ``i`` except the last, the label is
    ``data[i + 1, 0:4] - data[i, 0:4]``; the last row has no successor and
    gets ``[0, 0, 0, 0]``.  Differences are taken between consecutive rows of
    the whole array, irrespective of where one sequence ends and the next
    begins.

    Args:
        data: 2-D array with at least four columns.

    Returns:
        A new array equal to ``data`` with four extra columns on the right.
    """
    data = np.asarray(data)
    rates = np.zeros((data.shape[0], 4))
    rates[:-1] = np.diff(data[:, 0:4], axis=0)
    return np.append(data, rates, axis=1)


training_data = _append_rate_labels(training_data)
testing_data = _append_rate_labels(testing_data)

# Remove 15th datapoints from all corresponding training and testing sets.
# Rows 14, 29, 44, ... (0-based) are dropped in a single call each, which
# leaves 14 rows out of every consecutive group of 15.
# NOTE(review): presumably each experiment spans 15 rows and the last one has
# no valid "next row" label - confirm against the data sheets.
training_data = np.delete(training_data, np.arange(14, len(training_data), 15), axis=0)
testing_data = np.delete(testing_data, np.arange(14, len(testing_data), 15), axis=0)

# ---- Hyper-parameters -----------------------------------------------------
HL = 1              # number of recurrent layers
HN = 17             # hidden units per layer
EPOCHS = 260
LR = 0.004
BATCH_SIZE = 10
avg_mse = 1

SEQ_LEN = 14        # time steps per sequence once every 15th row has been removed
N_RUNS = 50         # independent trainings from fresh networks
FORCED_SAVE_RUN = 49  # this run is written to disk even if it is not a new minimum

xcl_dir = '/content/drive/My Drive/Colab Notebooks/GitHub/MSc/RNN/Results/1HL/' #create a new folder for prediction rsults
try:
    os.mkdir(xcl_dir)
except FileExistsError:
    # Re-running the notebook is fine. Any other failure (e.g. the parent
    # folder is missing) is raised here instead of after a full training run.
    pass

# The splits do not change between runs, so build them once.
# Columns 0-4 are the inputs, the remaining columns are the labels.
training_inputs = np.split(training_data[:, 0:5], len(training_data) // SEQ_LEN)
training_labels = np.split(training_data[:, 5:], len(training_data) // SEQ_LEN)
test_inputs = np.split(testing_data[:, 0:5], len(testing_data) // SEQ_LEN)
test_labels = np.split(testing_data[:, 5:], len(testing_data) // SEQ_LEN)

min_mse = 1         # only runs scoring at or below this are stored as a new minimum
count_min = None    # run number of the best model so far (None until one is found)

for count in range(1, N_RUNS + 1):
    rnn = RNN(4, 5, SEQ_LEN, HN, HL)

    train(rnn, training_inputs, training_labels, EPOCHS, LR, BATCH_SIZE)
    avg_mse, predictions_online, predictions_offline = test(test_inputs, test_labels, rnn, BATCH_SIZE)

    is_new_min = bool(min_mse >= avg_mse)
    if is_new_min:
        min_mse = avg_mse
        count_min = count

    if is_new_min or count == FORCED_SAVE_RUN:
        # Save file every minimum found (and for the forced run)
        predictions_online_inverse_transform = scaler_test.inverse_transform(predictions_online)
        predictions_offline_inverse_transform = scaler_test.inverse_transform(predictions_offline)

        online = pd.DataFrame(predictions_online_inverse_transform)
        offline = pd.DataFrame(predictions_offline_inverse_transform)
        avg_mse_frame = pd.DataFrame([avg_mse.item(), 0])

        # File names carry this run's MSE followed by the hyper-parameters and run number.
        f = round(avg_mse.item(), 5)
        tag = f'{f}_{HL}_{HN}_{EPOCHS}_{LR}_{BATCH_SIZE}_{count}'
        with pd.ExcelWriter(f'{xcl_dir}Predictions {tag}.xlsx') as writer:
            offline.to_excel(writer, sheet_name='Offline', startrow=1, startcol=1)
            online.to_excel(writer, sheet_name='Online', startrow=1, startcol=1)
            avg_mse_frame.to_excel(writer, sheet_name='Avg_MSE', startrow=1, startcol=1)
        torch.save(rnn.state_dict(), f'{xcl_dir}Model {tag}.pt')

    print(avg_mse, min_mse, count, count_min)
print(f'\nDuration: {time.time() - start_time:.0f} seconds')