In [33]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import csv
import math
import time

# Preprocess the data
def preprocess_data(file_path, train_ratio=0.8, test_ratio=0.1, val_ratio=0.1):
    data = np.genfromtxt(file_path, delimiter=',', skip_header=1)
    states = data[:, :7]  # Extract the state information (pose, yaw, and laser data)
    actions = data[:, 7:9]  # Extract the action information (linear and angular velocities)
    timestamps = data[:, 11]  # Extract the timestamp information
    key_presses = data[:, 10]  # Extract the key pressed information
    target_distances = data[:, 9]  # Extract the target distance information

    # Convert key presses to action types
    action_types = []
    for key in key_presses:
        if key == 'w':
            action_types.append(0)  # Move forward
        elif key == 'a':
            action_types.append(1)  # Turn left
        elif key == 'd':
            action_types.append(2)  # Turn right
        elif key == 's':
            action_types.append(3)  # Move backward
        else:
            action_types.append(4)  # Stop

    # Calculate action durations based on timestamps
    action_durations = []
    for i in range(len(timestamps) - 1):
        duration = timestamps[i + 1] - timestamps[i]
        action_durations.append(duration)
    action_durations.append(0)  # Set the duration of the last action as 0

    max_laser_range = 3.5  # Assuming the maximum range is 3.5
    states[:, 3:] = np.where(states[:, 3:] == float('inf'), max_laser_range, states[:, 3:])

    # Normalize the states (positions, yaw, and laser data)
    states_normalized = states.copy()
    states_normalized[:, :2] /= np.max(states_normalized[:, :2])  # Normalize positions
    states_normalized[:, 2] /= 2 * np.pi  # Normalize yaw angle
    states_normalized[:, 3:] /= max_laser_range  # Normalize laser data (assuming maximum range is 3.5)

    # Normalize the target distances
    target_distances_normalized = target_distances / np.max(target_distances)

    # Convert action types and action durations to numpy arrays
    action_types = np.array(action_types)
    action_durations = np.array(action_durations)

    # Split the data into train, test, and validation sets
    train_size = int(len(states_normalized) * train_ratio)
    test_size = int(len(states_normalized) * test_ratio)

    train_states, test_states, val_states = states_normalized[:train_size], states_normalized[train_size:train_size+test_size], states_normalized[train_size+test_size:]
    train_actions, test_actions, val_actions = actions[:train_size], actions[train_size:train_size+test_size], actions[train_size+test_size:]
    train_action_types, test_action_types, val_action_types = action_types[:train_size], action_types[train_size:train_size+test_size], action_types[train_size+test_size:]
    train_action_durations, test_action_durations, val_action_durations = action_durations[:train_size], action_durations[train_size:train_size+test_size], action_durations[train_size+test_size:]
    train_target_distances, test_target_distances, val_target_distances = target_distances_normalized[:train_size], target_distances_normalized[train_size:train_size+test_size], target_distances_normalized[train_size+test_size:]

    return train_states, train_actions, train_action_types, train_action_durations, train_target_distances, \
        test_states, test_actions, test_action_types, test_action_durations, test_target_distances, \
        val_states, val_actions, val_action_types, val_action_durations, val_target_distances

# Create the policy network
def create_policy_network(state_dim, num_action_types):
    inputs = layers.Input(shape=(state_dim,))
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dense(128, activation='relu')(x)
    action_type_output = layers.Dense(num_action_types, activation='softmax')(x)
    action_duration_output = layers.Dense(1, activation='relu')(x)
    model = tf.keras.Model(inputs=inputs, outputs=[action_type_output, action_duration_output])
    return model

# Create the discriminator network
def create_discriminator_network(state_dim, num_action_types):
    state_inputs = layers.Input(shape=(state_dim,))
    action_type_inputs = layers.Input(shape=(num_action_types,))  # Update the input shape
    action_duration_inputs = layers.Input(shape=(1,))
    x = layers.Concatenate()([state_inputs, action_type_inputs, action_duration_inputs])
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs=[state_inputs, action_type_inputs, action_duration_inputs], outputs=outputs)
    return model

# Create the dynamics model
def create_dynamics_model(state_dim, num_action_types):
    state_inputs = layers.Input(shape=(state_dim,))
    action_type_inputs = layers.Input(shape=(num_action_types,))
    action_duration_inputs = layers.Input(shape=(1,))
    x = layers.Concatenate()([state_inputs, action_type_inputs, action_duration_inputs])
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(128, activation='relu')(x)
    next_state_outputs = layers.Dense(state_dim)(x)
    model = tf.keras.Model(inputs=[state_inputs, action_type_inputs, action_duration_inputs], outputs=next_state_outputs)
    return model

# Create the value network
def create_value_network(state_dim):
    inputs = layers.Input(shape=(state_dim,))
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(1)(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

# Create the Q-networks
def create_q_networks(state_dim, num_action_types):
    state_inputs = layers.Input(shape=(state_dim,))
    action_type_inputs = layers.Input(shape=(num_action_types,))
    action_duration_inputs = layers.Input(shape=(1,))
    x = layers.Concatenate()([state_inputs, action_type_inputs, action_duration_inputs])
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dense(128, activation='relu')(x)
    outputs = layers.Dense(1)(x)
    model = tf.keras.Model(inputs=[state_inputs, action_type_inputs, action_duration_inputs], outputs=outputs)
    return model, model

# Train the V-MAIL algorithm with SAC
def train_v_mail_with_sac(train_states, train_actions, train_action_types, train_action_durations, train_target_distances,
                          val_states, val_actions, val_target_distances,
                          num_epochs, batch_size, state_dim, num_action_types, gamma=0.99, alpha=0.2, target_entropy=-2):
    # Create the policy, discriminator, dynamics, value, and Q-networks
    policy_model = create_policy_network(state_dim, num_action_types)
    discriminator_model = create_discriminator_network(state_dim, num_action_types)
    dynamics_model = create_dynamics_model(state_dim, num_action_types)
    value_model = create_value_network(state_dim)
    q_model1, q_model2 = create_q_networks(state_dim, num_action_types)

    # Define the optimizers
    policy_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
    discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
    dynamics_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
    value_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
    q_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
    alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)

    # Entropy regularization coefficient
    log_alpha = tf.Variable(tf.math.log(tf.cast(alpha, tf.float32)), trainable=True)
    # Training loop
    for epoch in range(num_epochs):
        # Sample a batch of expert states, action types, action durations, and target distances
        batch_indices = np.random.choice(len(train_states), size=batch_size).astype(int)
        batch_expert_states = np.array(train_states)[batch_indices]
        batch_expert_action_types = np.array(train_action_types)[batch_indices]
        batch_expert_action_durations = np.array(train_action_durations)[batch_indices]
        batch_expert_target_distances = np.array(train_target_distances)[batch_indices]


        # Generate simulated action types and durations using the policy model
        with tf.GradientTape() as tape:
            batch_simulated_action_types, batch_simulated_action_durations = policy_model(batch_expert_states)
            batch_simulated_states = dynamics_model([tf.convert_to_tensor(batch_expert_states), tf.convert_to_tensor(batch_simulated_action_types), tf.convert_to_tensor(batch_simulated_action_durations)])

            # Convert batch_expert_action_types to one-hot encoding
            batch_expert_action_types_one_hot = tf.one_hot(batch_expert_action_types, num_action_types)

            # Convert batch_simulated_action_types to one-hot encoding
            batch_simulated_action_types_one_hot = tf.one_hot(tf.argmax(batch_simulated_action_types, axis=1), num_action_types)

            # Calculate the discriminator loss
            expert_logits = discriminator_model([tf.convert_to_tensor(batch_expert_states), tf.convert_to_tensor(batch_expert_action_types_one_hot), tf.convert_to_tensor(batch_expert_action_durations)])
            simulated_logits = discriminator_model([tf.convert_to_tensor(batch_simulated_states), tf.convert_to_tensor(batch_simulated_action_types_one_hot), tf.convert_to_tensor(batch_simulated_action_durations)])
            discriminator_loss = tf.reduce_mean(tf.math.log(expert_logits) + tf.math.log(1 - simulated_logits))

        # Update the discriminator model
        discriminator_gradients = tape.gradient(discriminator_loss, discriminator_model.trainable_variables)
        discriminator_optimizer.apply_gradients(zip(discriminator_gradients, discriminator_model.trainable_variables))

        # Update the policy, dynamics, value, and Q-networks
        with tf.GradientTape(persistent=True) as tape:
            batch_simulated_action_types, batch_simulated_action_durations = policy_model(batch_expert_states)
            batch_simulated_states = dynamics_model([tf.convert_to_tensor(batch_expert_states), tf.convert_to_tensor(batch_simulated_action_types), tf.convert_to_tensor(batch_simulated_action_durations)])


            # Calculate the Q-values
            q_value1 = q_model1([tf.convert_to_tensor(batch_expert_states), tf.convert_to_tensor(batch_simulated_action_types), tf.convert_to_tensor(batch_simulated_action_durations)])
            q_value2 = q_model2([tf.convert_to_tensor(batch_expert_states), tf.convert_to_tensor(batch_simulated_action_types), tf.convert_to_tensor(batch_simulated_action_durations)])
            q_value = tf.minimum(q_value1, q_value2)

            # Calculate the value targets
            value_targets = q_value - tf.exp(log_alpha) * tf.reduce_sum(batch_simulated_action_types * tf.math.log(batch_simulated_action_types + 1e-8), axis=1, keepdims=True)

            # Calculate the value loss
            value_predictions = value_model(batch_expert_states)
            value_loss = tf.reduce_mean(tf.square(value_predictions - value_targets))

            # Calculate the Q-value targets
            rewards = -batch_expert_target_distances  # Negative target distances as rewards
            next_values = value_model(batch_simulated_states)
            q_value_targets = rewards + gamma * next_values

            # Calculate the Q-value loss
            q_value_loss = tf.reduce_mean(tf.square(q_value1 - q_value_targets) + tf.square(q_value2 - q_value_targets))

            # Calculate the policy loss
            policy_loss = tf.reduce_mean(tf.exp(log_alpha) * tf.reduce_sum(batch_simulated_action_types * tf.math.log(batch_simulated_action_types + 1e-8), axis=1, keepdims=True) - q_value)

            # Calculate the alpha loss
            alpha_loss = -tf.reduce_mean(log_alpha * tf.stop_gradient(tf.reduce_sum(batch_simulated_action_types * tf.math.log(batch_simulated_action_types + 1e-8), axis=1, keepdims=True) - target_entropy))

        # Update the policy model
        policy_gradients = tape.gradient(policy_loss, policy_model.trainable_variables)
        # print("Policy gradients:", policy_gradients)
        policy_optimizer.apply_gradients(zip(policy_gradients, policy_model.trainable_variables))

        # Update the dynamics model
        dynamics_gradients = tape.gradient(policy_loss, dynamics_model.trainable_variables)
        # print("Dynamics gradients:", dynamics_gradients)
        if any(grad is not None for grad in dynamics_gradients):
            dynamics_optimizer.apply_gradients(zip(dynamics_gradients, dynamics_model.trainable_variables))

        # Update the value model
        value_gradients = tape.gradient(value_loss, value_model.trainable_variables)
        # print("Value gradients:", value_gradients)
        value_optimizer.apply_gradients(zip(value_gradients, value_model.trainable_variables))

        # Update the Q-networks
        q_gradients = tape.gradient(q_value_loss, q_model1.trainable_variables + q_model2.trainable_variables)
        # print("Q-network gradients:", q_gradients)
        q_optimizer.apply_gradients(zip(q_gradients, q_model1.trainable_variables + q_model2.trainable_variables))

        # Update the alpha value
        alpha_gradients = tape.gradient(alpha_loss, [log_alpha])
        # print("Alpha gradients:", alpha_gradients)
        alpha_optimizer.apply_gradients(zip(alpha_gradients, [log_alpha]))

        # Print the training progress
        if epoch % 100 == 0:
            # Evaluate the model on the validation set
            val_action_types, val_action_durations = policy_model.predict(val_states)
            val_simulated_states = dynamics_model.predict([tf.convert_to_tensor(val_states), tf.convert_to_tensor(val_action_types), tf.convert_to_tensor(val_action_durations)])
            val_rewards = -val_target_distances
        
            # Convert val_action_types to one-hot encoding
            val_action_types_one_hot = tf.one_hot(tf.argmax(val_action_types, axis=1), num_action_types)
        
            val_q_values = q_model1.predict([tf.convert_to_tensor(val_states), tf.convert_to_tensor(val_action_types_one_hot), tf.convert_to_tensor(val_action_durations)])
            val_value_predictions = value_model.predict(val_states)
            val_loss = tf.reduce_mean(tf.square(val_value_predictions - val_rewards))
        
            print(f"Epoch {epoch}: Discriminator Loss = {discriminator_loss:.4f}, Policy Loss = {policy_loss:.4f}, Value Loss = {value_loss:.4f}, Q-Value Loss = {q_value_loss:.4f}, Alpha Loss = {alpha_loss:.4f}, Validation Loss = {val_loss:.4f}")
        
            # Convert val_action_types to numerical representations
            val_action_types_numerical = tf.argmax(val_action_types, axis=1)
        
            # Evaluate the trained policy on the validation set
            val_action_diff = np.mean(np.square(val_action_types_numerical - val_actions[:, 0]))  # Compare with the first dimension of val_actions
            print(f"Validation Action Difference: {val_action_diff:.4f}")

    return policy_model, dynamics_model, value_model, q_model1, q_model2

In [34]:
np.random.seed(42)
tf.random.set_seed(42)

# Set the file path for the collected data
file_path = 'imitation_learning_dataset.csv'

# Preprocess the data
train_states, train_actions, train_action_types, train_action_durations, train_target_distances, \
    test_states, test_actions, test_action_types, test_action_durations, test_target_distances, \
    val_states, val_actions, val_action_types, val_action_durations, val_target_distances = preprocess_data(file_path)

# Set the training parameters
num_epochs = 1000
batch_size = 64
state_dim = 7  # Number of state variables (pose, yaw, and laser data)
num_action_types = 5  # Number of action types (e.g., 'forward', 'left', 'right', 'backward', 'stop')

# Train the V-MAIL algorithm with SAC
policy_model, dynamics_model, value_model, q_model1, q_model2 = train_v_mail_with_sac(
    train_states, train_actions, train_action_types, train_action_durations, train_target_distances,
    val_states, val_actions, val_target_distances,
    num_epochs, batch_size, state_dim, num_action_types
)
# Save the trained models
policy_model.save('policy_model.h5')
dynamics_model.save('dynamics_model.h5')
value_model.save('value_model.h5')
q_model1.save('q_model1.h5')
q_model2.save('q_model2.h5')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Epoch 0: Discriminator Loss = -1.5992, Policy Loss = -0.3273, Value Loss = 0.1173, Q-Value Loss = 0.8648, Alpha Loss = 0.6312, Validation Loss = 0.0253
Validation Action Difference: 7.1429
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Epoch 100: Discriminator Loss = -12.6469, Policy Loss = 0.3220, Value Loss = 0.0006, Q-Value Loss = 0.0796, Alpha Loss = 0.6770, Validation Loss = 0.0973
Validation Action Difference: 8.0000
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m



In [35]:
# Evaluate the trained models on the test set
test_action_types, test_action_durations = policy_model.predict(test_states)
test_simulated_states = dynamics_model.predict([test_states, test_action_types, test_action_durations])
test_rewards = -np.array(test_target_distances)
test_q_values = q_model1.predict([test_states, test_action_types, test_action_durations])
test_value_predictions = value_model.predict(test_states)
test_loss = tf.reduce_mean(tf.square(test_value_predictions - test_rewards))

print(f"Test Loss: {test_loss:.4f}")

# Use the trained policy model for inference
def predict_action(state):
    action_type, action_duration = policy_model.predict(np.array([state]))
    action_type = np.argmax(action_type[0])
    action_duration = action_duration[0][0]
    return action_type, action_duration

# Example usage of the predict_action function
current_state = test_states[0]  # Replace with the current state of the robot
predicted_action_type, predicted_action_duration = predict_action(current_state)

# Map the predicted action type to the corresponding action
action_mapping = {
    0: 'Move forward',
    1: 'Turn left',
    2: 'Turn right',
    3: 'Move backward',
    4: 'Stop'
}

predicted_action = action_mapping[predicted_action_type]
print(f"Predicted Action: {predicted_action}")
print(f"Predicted Action Duration: {predicted_action_duration:.2f} seconds")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 658ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Test Loss: 0.2291
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 303ms/step
Predicted Action: Move forward
Predicted Action Duration: 0.00 seconds
