# Q-Learning Pneumatic Teleoperation — Training Results

In [None]:
import os
import sys

# Absolute location of the project checkout on the author's machine.
# It is put first on the import path and also made the working directory,
# so both the project-local imports and the relative file paths used by
# later cells resolve against it.
PROJECT_ROOT = r"C:\Users\assaa\OneDrive - American University of Beirut\projects\Teleop\TeleopWithRL"
sys.path.insert(0, PROJECT_ROOT)
os.chdir(PROJECT_ROOT)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Parse evaluation checkpoints from training log
def _parse_checkpoint(line):
    """Parse one evaluation-checkpoint line of the training log.

    The line is expected to be a row of fields separated by ``|`` or the
    box-drawing bar ``\u2502``, for example::

        Ep    500 | eps 0.8607 | R  -234.85 | TE  49.57 mm | 206s | 45.2% coverage

    Fields 0-3 are read positionally (episode, epsilon, reward, tracking
    error).  Coverage is taken from the first later field that contains a
    ``%`` sign, so the line parses both with and without an elapsed-time
    column in front of it.

    Returns:
        ``(episode, epsilon, reward, tracking_error, coverage)``, or ``None``
        when the line does not match the expected layout.
    """
    fields = [field.strip() for field in line.replace('\u2502', '|').split('|')]
    try:
        ep = int(fields[0].split()[-1])
        epsilon = float(fields[1].split()[-1])
        reward = float(fields[2].split()[-1])
        # "TE 49.57 mm": the number is the token just before the unit.
        te = float(fields[3].split()[-2])
        # Do not hard-code the coverage column: in the 6-field layout index 4
        # is the elapsed time ("206s"), not the coverage.
        cov_field = next(field for field in fields[4:] if '%' in field)
        cov = float(cov_field.split('%')[0].split()[-1])
    except (IndexError, ValueError, StopIteration):
        # Only layout/number-format problems are treated as "not a checkpoint".
        return None
    return ep, epsilon, reward, te, cov


eps, epsilons, rewards, rmses, coverages = [], [], [], [], []
skipped = 0  # checkpoint-looking lines that failed to parse
with open("results/old/logs/training_output.log", 'r', encoding='utf-8', errors='replace') as f:
    for line in f:
        if 'Ep ' in line and 'TE' in line:
            record = _parse_checkpoint(line)
            if record is None:
                skipped += 1
                continue
            ep, epsilon, r, te, cov = record
            eps.append(ep)
            epsilons.append(epsilon)
            rewards.append(r)
            rmses.append(te)
            coverages.append(cov)

print(f"Found {len(eps)} evaluation checkpoints (up to episode {eps[-1] if eps else '?'})")
if skipped:
    # Surface parse failures instead of silently producing empty plots.
    print(f"Skipped {skipped} checkpoint-like line(s) that could not be parsed")

In [2]:
# Debug: show the raw form of the first checkpoint line in the log.
# A line counts as a checkpoint when it contains both "Ep " and "TE";
# only its first 120 characters are printed, via repr() so that separators
# and whitespace are visible. Nothing is printed if no line matches.
with open("training_output.log", 'r', encoding='utf-8', errors='replace') as log_file:
    first_match = next(
        (text for text in log_file if 'Ep ' in text and 'TE' in text),
        None,
    )
    if first_match is not None:
        print(repr(first_match[:120]))

'  Ep    500 │ ε 0.8607 │ R  -234.85 │ TE  49.57 mm │ 206s  │  45.2% coverage\n'


In [None]:
# Training Progress Curves
# 2x2 grid, every panel plotted against the checkpoint episode numbers (eps).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle("Q-Learning Training Progress — Pneumatic Teleoperation", fontsize=14, fontweight='bold')

# One row per panel: (grid position, series, line format, y-label, title)
panel_specs = [
    ((0, 0), rmses, 'ro-', 'Tracking RMSE [mm]', 'Tracking Error (lower = better)'),
    ((0, 1), rewards, 'bo-', 'Mean Reward', 'Evaluation Reward (higher = better)'),
    ((1, 0), epsilons, 'gs-', 'Epsilon', 'Exploration Rate'),
    ((1, 1), coverages, 'ms-', 'Coverage [%]', 'Q-table State-Action Coverage'),
]

for position, series, line_fmt, y_label, panel_title in panel_specs:
    ax = axes[position]
    is_rmse_panel = position == (0, 0)

    ax.plot(eps, series, line_fmt, lw=2, markersize=8)
    if is_rmse_panel:
        # Fixed reference level drawn only on the tracking-error panel.
        ax.axhline(22.2, color='gray', ls='--', lw=1.5, label='Baseline (no valve) = 22.2 mm')
    ax.set_xlabel('Episode')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    if is_rmse_panel:
        # Only this panel has a labelled artist, so only it gets a legend.
        ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Run RL episode and Baseline episode for comparison
import config as cfg
from teleop_env import TeleopEnv
from q_learning_agent import QLearningAgent


def _reset_with_fixed_conditions(environment):
    """Reset *environment* and pin the episode conditions used for both runs.

    After ``reset()`` the human-force attributes (``fh_amp``, ``fh_freq``,
    ``fh_phase``) and the environment parameters (``Be``, ``Ke``) are
    overwritten with fixed values so the RL and baseline episodes run under
    the same conditions.  Returns the observation produced by ``reset()``.
    """
    first_obs, _ = environment.reset()
    environment.fh_amp = 10.0
    environment.fh_freq = 0.5
    environment.fh_phase = 0.0
    environment.Be, environment.Ke = cfg.SKIN_BE, cfg.SKIN_KE
    return first_obs


def _rmse(errors):
    """Root-mean-square of a NumPy array of errors."""
    return np.sqrt(np.mean(errors**2))


env = TeleopEnv()
state_dims = env.get_state_dims()
agent = QLearningAgent(state_dims, cfg.N_ACTIONS)
agent.load("results/old/models/q_table.npy")
# NOTE(review): presumably epsilon = 0 disables exploration — confirm in QLearningAgent.
agent.epsilon = 0.0
print(f"Agent: {agent}")

# RL episode (deterministic env): the agent chooses every action.
obs = _reset_with_fixed_conditions(env)
state = env.discretise_obs(obs)
while True:
    obs, _, terminated, truncated, _ = env.step(agent.select_action(state))
    state = env.discretise_obs(obs)
    if terminated or truncated:
        break
h_rl = env.render()

# Baseline episode (same conditions): always apply the action whose entry in
# cfg.V_LEVELS has the smallest magnitude.
_reset_with_fixed_conditions(env)
zero_act = int(np.argmin(np.abs(cfg.V_LEVELS)))
while True:
    _, _, terminated, truncated, _ = env.step(zero_act)
    if terminated or truncated:
        break
h_bl = env.render()

# Position errors scaled by 1000 (reported as mm below; assumes the recorded
# history is in metres — TODO confirm against TeleopEnv).
pe_rl = np.array(h_rl["pos_error"])*1000
pe_bl = np.array(h_bl["pos_error"])*1000

rmse_rl = _rmse(pe_rl)
rmse_bl = _rmse(pe_bl)
print(f"RL RMSE:       {rmse_rl:.2f} mm")
print(f"Baseline RMSE: {rmse_bl:.2f} mm")
print(f"Improvement:   {(1 - rmse_rl/rmse_bl)*100:.1f}%")

In [None]:
# Episode Comparison: RL vs Baseline
# Five stacked panels sharing the time axis, built from the two recorded
# episode histories (h_rl, h_bl) and the error arrays (pe_rl, pe_bl).
t_rl = np.array(h_rl["time"])
t_bl = np.array(h_bl["time"])


def _times_1000(history, key):
    """Recorded series *key* multiplied by 1000 (plotted on the mm axis)."""
    return np.array(history[key])*1000


def _over_1000(history, key):
    """Recorded series *key* divided by 1000 (plotted on the kPa axis)."""
    return np.array(history[key])/1000


fig, axes = plt.subplots(5, 1, figsize=(14, 16), sharex=True)
fig.suptitle("RL Valve Controller vs Baseline (no valve) — Skin Environment", fontsize=14, fontweight='bold')
ax_pos, ax_press, ax_force, ax_valve, ax_err = axes

# 1. Position tracking
ax_pos.plot(t_rl, _times_1000(h_rl, "x_m"), 'b-', lw=1.5, label='x_m (master)')
ax_pos.plot(t_rl, _times_1000(h_rl, "x_s"), 'r-', lw=1.5, label='x_s (slave, RL)')
ax_pos.plot(t_bl, _times_1000(h_bl, "x_s"), 'r--', lw=1, alpha=0.5, label='x_s (slave, baseline)')
ax_pos.set_ylabel('Position [mm]')
ax_pos.set_title('Position Tracking')
ax_pos.legend(fontsize=9)
ax_pos.grid(True, alpha=0.3)

# 2. Chamber pressures (RL): P_s* drawn solid, P_m* dashed and faded
pressure_traces = [
    ("P_s1", 'b-', {"lw": 1}),
    ("P_s2", 'r-', {"lw": 1}),
    ("P_m1", 'b--', {"lw": 0.7, "alpha": 0.5}),
    ("P_m2", 'r--', {"lw": 0.7, "alpha": 0.5}),
]
for key, line_fmt, line_style in pressure_traces:
    ax_press.plot(t_rl, _over_1000(h_rl, key), line_fmt, label=key, **line_style)
ax_press.set_ylabel('Pressure [kPa]')
ax_press.set_title('Chamber Pressures (RL Episode)')
ax_press.legend(fontsize=8)
ax_press.grid(True, alpha=0.3)

# 3. Forces
ax_force.plot(t_rl, h_rl["F_h"], 'b-', lw=1, label='F_h (human)')
ax_force.plot(t_rl, h_rl["F_e"], 'r-', lw=1, label='F_e (env, RL)')
ax_force.plot(t_bl, h_bl["F_e"], 'r--', lw=0.8, alpha=0.5, label='F_e (env, baseline)')
ax_force.set_ylabel('Force [N]')
ax_force.set_title('Forces')
ax_force.legend(fontsize=9)
ax_force.grid(True, alpha=0.3)

# 4. Valve control: voltage on the left axis, spool position on a twin axis
ax_valve.plot(t_rl, h_rl["u_v"], 'g-', lw=1, label='u_v [V]')
ax_spool = ax_valve.twinx()
ax_spool.plot(t_rl, h_rl["x_v"], 'm-', lw=0.8, alpha=0.7, label='x_v (spool)')
ax_valve.set_ylabel('Valve Voltage [V]')
ax_spool.set_ylabel('Spool Position')
ax_valve.set_title('RL Valve Control Signal')
# Merge the handles of both y-axes so a single legend lists both traces.
volt_handles, volt_labels = ax_valve.get_legend_handles_labels()
spool_handles, spool_labels = ax_spool.get_legend_handles_labels()
ax_valve.legend(volt_handles+spool_handles, volt_labels+spool_labels, fontsize=9)
ax_valve.grid(True, alpha=0.3)

# 5. Tracking error comparison, with each episode's RMSE in its legend entry
rmse_rl_label = f'RL (RMSE={np.sqrt(np.mean(pe_rl**2)):.1f} mm)'
rmse_bl_label = f'Baseline (RMSE={np.sqrt(np.mean(pe_bl**2)):.1f} mm)'
ax_err.plot(t_rl, pe_rl, 'b-', lw=1.5, label=rmse_rl_label)
ax_err.plot(t_bl, pe_bl, 'r--', lw=1, alpha=0.7, label=rmse_bl_label)
ax_err.axhline(0, color='gray', lw=0.5)
ax_err.set_ylabel('Tracking Error [mm]')
ax_err.set_xlabel('Time [s]')
ax_err.set_title('Tracking Error (x_m − x_s)')
ax_err.legend(fontsize=9)
ax_err.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()