# Baseline Model - Deep Neural Network Reinforcement Learning

The baseline model trains an agent to manipulate two DMCs in series to maximize production. Instead of building a Q-table, the agent learns a policy by using a neural network to approximate the reward function.

In [None]:
# Row layout: index, next, fung (label string), goal, input (T, P, Keq)
# NOTE(review): row 0 lists its own index (0) in its "next" field — confirm this is intended.
DMCarr = [
    [0, [0, 1], "DMC1", 400, [350, 5, 1]],
    [1, [2], "DMC2", 500, [350, 5, 1]],
    [2, [], "Dummy", 0, [0, 0, 0]],
]

print("DMC array:", DMCarr)

# Wrap the raw table in the project's DMC structure object.
struct = DMC_structure(DMCarr)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Define the neural network: it is fitted on (DMC1, DMC2) pairs as inputs and
# rewards as targets, so it has two input features and one regression output.
# The input shape is declared with an explicit keras.Input entry rather than
# the `input_shape=` layer argument, which current Keras deprecates.
model = keras.Sequential([
    keras.Input(shape=(2,)),  # Two input features (one per DMC value)
    layers.Dense(16, activation='relu'),  # First hidden layer
    layers.Dense(16, activation='relu'),  # Second hidden layer
    layers.Dense(1, activation='linear'),  # Output layer (regression task)
])

# Compile the model: Adam optimizer, mean-squared-error loss
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01), loss='mse')

# Reward function for reinforcement learning
def reward_function(struct, output, valid_range=(-1, 1)):
    """
    Computes the reward from the output and penalizes every DMC constraint
    whose value is outside, or within the outer 10% margin of, its bounds.

    struct: the DMC structure. If it is callable it is called to obtain the
        DMCs (as the original code did); otherwise it is iterated directly.
        Each DMC must provide getConstraints(), whose items are indexable as
        (value, lower bound, upper bound).
    output: the produced quantity to reward (last value of the last DMC).
    valid_range: kept so existing calls keep working; currently unused because
        the limits are taken from each DMC's own constraints.

    Returns p_reward_scale * output minus 100 for each violated constraint.
    """
    p_reward_scale = 1.0
    buffer_fraction = 0.1  # Share of each bound's width kept clear at both ends
    violation_penalty = 100  # Heavy penalty applied per violated constraint

    # Positive part: reward = m * output value
    p_reward = p_reward_scale * output

    dmcs = struct() if callable(struct) else struct

    # Negative part: accumulate a penalty for every constraint that is not
    # strictly inside its buffered band.
    n_reward = 0
    for dmc in dmcs:
        for bound in dmc.getConstraints():
            value, lower, upper = bound[0], bound[1], bound[2]
            buffer = (upper - lower) * buffer_fraction
            lower_buffered = lower + buffer
            upper_buffered = upper - buffer
            # Both limits must hold at once; an "or" here would accept any value.
            if not (lower_buffered < value < upper_buffered):
                n_reward += violation_penalty

    return p_reward - n_reward

# Function to generate training data
def generate_data(env_function, num_samples=500, valid_range=(-1, 1)):
    """
    Samples random DMC value pairs and computes the reward for each of them.

    env_function: A function that takes in (DMC1, DMC2) and returns the output quantity.
    num_samples: Number of data points to collect.
    valid_range: (low, high) interval the DMC values are drawn from uniformly.

    Returns (dmc_values, rewards): an array of shape (num_samples, 2) and an
    array with one reward per sample.
    """
    dmc_values = np.random.uniform(low=valid_range[0], high=valid_range[1], size=(num_samples, 2))

    rewards = []
    for dmc1, dmc2 in dmc_values:
        output = env_function(dmc1, dmc2)
        # reward_function takes (struct, output, valid_range), so the
        # module-level DMC structure is passed instead of the raw DMC values.
        # The reward is computed right after the environment call so that any
        # state env_function leaves on `struct` belongs to this sample.
        # NOTE(review): assumes env_function updates `struct` for the sampled
        # values — confirm against the environment implementation.
        rewards.append(reward_function(struct, output, valid_range))

    return dmc_values, np.array(rewards)

# Train the model
def train_model(env_function, num_samples=500, epochs=100, valid_range=(-1, 1)):
    """
    Fits the module-level model on freshly generated samples and returns it.

    env_function, num_samples, valid_range: forwarded unchanged to generate_data.
    epochs: Number of training epochs passed to model.fit.
    """
    features, targets = generate_data(env_function, num_samples, valid_range)
    model.fit(
        features,
        targets,
        epochs=epochs,
        batch_size=32,
        verbose=1,
    )
    return model

# Finding the optimal policy
def find_optimal_dmc(valid_range=(-1, 1), num_candidates=1000):
    """
    Optimizes DMC1 and DMC2 values to maximize the reward using the trained model.

    valid_range: (low, high) interval the random candidates are drawn from.
    num_candidates: Number of random (DMC1, DMC2) pairs to evaluate.

    Returns (best_dmc, best_reward): the best candidate as an array of shape
    (1, 2) and its predicted reward. Returns (None, -np.inf) when no candidate
    has a prediction greater than -inf (including num_candidates <= 0).
    """
    if num_candidates <= 0:
        return None, -np.inf

    # Draw every candidate up front and score them in a single batched
    # predict call instead of one model.predict call per candidate.
    candidates = np.random.uniform(valid_range[0], valid_range[1], size=(num_candidates, 2))
    predicted = model.predict(candidates)[:, 0]

    # NaN predictions can never win a ">" comparison, so rank them last.
    scores = np.where(np.isnan(predicted), -np.inf, predicted)
    best_index = int(np.argmax(scores))  # First maximum wins on ties

    if scores[best_index] == -np.inf:
        return None, -np.inf

    best_dmc = candidates[best_index:best_index + 1]  # Keep the (1, 2) shape
    best_reward = predicted[best_index]
    return best_dmc, best_reward

# Example usage:
# Define your environment function: env_function(DMC1, DMC2) -> output
# Train the model and find optimal values
# trained_model = train_model(env_function)
# optimal_dmc, optimal_reward = find_optimal_dmc()
