In [1]:
def plot_metrics_comparison(train_metrics, test_metrics):
    """Plot a grouped bar chart comparing training and testing metrics.

    Args:
        train_metrics: Mapping with an 'episode_accuracies' sequence; the mean
            of that sequence is used as the training accuracy.
        test_metrics: Mapping handed unchanged to ``calculate_metrics`` to
            obtain the testing accuracy, precision, recall and F1 score.

    Note:
        Only accuracy is recorded during training, so the training bars for
        precision, recall and F1 score are drawn with a height of 0. They
        mean "not available", not a measured score of zero.
    """
    metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

    # We can only compare accuracy across train and test since we have limited metrics for training
    train_values = [np.mean(train_metrics['episode_accuracies']), 0, 0, 0]

    # Testing values, listed in the same order as metrics_names
    test_performance = calculate_metrics(test_metrics)
    test_values = [
        test_performance[key]
        for key in ('accuracy', 'precision', 'recall', 'f1')
    ]

    # Two bars per metric, centred on the tick and offset by half a bar width
    plt.figure(figsize=(10, 6))
    x = np.arange(len(metrics_names))
    width = 0.35

    plt.bar(x - width/2, train_values, width, label='Training', alpha=0.7)
    plt.bar(x + width/2, test_values, width, label='Testing', alpha=0.7)

    plt.xlabel('Metrics')
    plt.ylabel('Score')
    plt.title('Performance Metrics Comparison')
    plt.xticks(x, metrics_names)
    plt.ylim(0, 1)
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    plt.show()
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
from environments import SBEOS_Environment
from tqdm import tqdm

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Each observation is binned component by component, and the resulting
    tuple keys a dictionary of per-action value arrays. Rows are created
    lazily, filled with zeros, the first time a state is looked up.
    """

    def __init__(self, state_size, action_size, learning_rate=0.1, discount_factor=0.95,
                 exploration_rate=1.0, exploration_decay=0.995, min_exploration_rate=0.01):
        """Store the hyperparameters and start with an empty Q-table."""
        # Problem dimensions
        self.state_size = state_size
        self.action_size = action_size

        # Learning hyperparameters
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor

        # Epsilon-greedy schedule
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.min_exploration_rate = min_exploration_rate

        # Discretization granularity (bins per state component) and the
        # lazily populated table: {discretized state tuple: array of Q-values}
        self.n_bins = 10
        self.q_table = {}

    def _q_row(self, key):
        """Return the Q-value array stored under *key*, adding a zero row if absent."""
        if key not in self.q_table:
            self.q_table[key] = np.zeros(self.action_size)
        return self.q_table[key]

    def discretize_state(self, state):
        """Map every component of *state* to a bin index in [0, n_bins - 1].

        Each component is scaled by ``n_bins``, clipped to the valid index
        range and truncated to an int; the indices are returned as a tuple
        so the result can be used as a dictionary key.
        """
        top_bin = self.n_bins - 1
        return tuple(
            int(np.clip(component * self.n_bins, 0, top_bin))
            for component in state
        )

    def get_action(self, state):
        """Choose an action for *state* with the epsilon-greedy rule."""
        key = self.discretize_state(state)

        # Explore: uniformly random action with probability exploration_rate
        if np.random.random() < self.exploration_rate:
            return np.random.randint(self.action_size)

        # Exploit: highest-valued action currently known for this state
        return np.argmax(self._q_row(key))

    def update_q_table(self, state, action, reward, next_state, done):
        """Apply one Q-learning update for the observed transition."""
        key = self.discretize_state(state)
        next_key = self.discretize_state(next_state)

        # Both rows must exist before they are read or written
        current_row = self._q_row(key)
        next_row = self._q_row(next_key)

        # Terminal transitions do not bootstrap from the next state
        bootstrap = 0 if done else self.discount_factor * next_row.max()
        td_target = reward + bootstrap
        td_error = td_target - current_row[action]

        current_row[action] += self.learning_rate * td_error

    def decay_exploration_rate(self):
        """Shrink the exploration rate by one decay step, respecting the floor."""
        decayed = self.exploration_rate * self.exploration_decay
        self.exploration_rate = max(self.min_exploration_rate, decayed)

def train_q_learning(env, episodes=1000):
    """Train a Q-learning agent on the given environment.

    Args:
        env: Environment object. This function reads ``env.window_size``,
            calls ``env.reset()`` to obtain the initial state, and calls
            ``env.step(action)``, which must return
            ``(next_state, reward, done, info)`` with the true state for the
            timestep stored under ``info["state"]``.
        episodes: Number of training episodes to run.

    Returns:
        A tuple ``(agent, metrics)`` where ``agent`` is the trained
        ``QLearningAgent`` and ``metrics`` is a dict with:
            'episode_rewards': total reward of each episode,
            'all_rewards': reward of every individual timestep,
            'episode_accuracies': per-episode prediction accuracy.
    """
    # Initialize Q-learning agent
    state_size = env.window_size + 6  # Based on observation space
    action_size = 4  # 0: predict 0, 1: predict 1, 2: predict 0 with high energy, 3: predict 1 with high energy

    agent = QLearningAgent(state_size, action_size)

    # Track metrics
    all_rewards = []
    episode_rewards = []
    episode_accuracies = []  # Track accuracy per episode

    # Training loop
    for episode in tqdm(range(episodes)):
        state = env.reset()
        done = False
        episode_reward = 0

        # For tracking accuracy within each episode
        episode_true_states = []
        episode_predicted_states = []

        while not done:
            # Select action (epsilon-greedy) and apply it
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)

            # Learn from the transition; this also creates any missing
            # Q-table rows for the current and next state.
            agent.update_q_table(state, action, reward, next_state, done)

            # Track metrics
            episode_reward += reward
            all_rewards.append(reward)
            episode_true_states.append(info["state"])
            # Actions 0/2 predict state 0 and actions 1/3 predict state 1
            episode_predicted_states.append(action % 2)

            # Update state
            state = next_state

        # Calculate episode accuracy (0 when the episode produced no steps)
        if episode_true_states:
            episode_accuracies.append(
                accuracy_score(episode_true_states, episode_predicted_states)
            )
        else:
            episode_accuracies.append(0)

        # Decay exploration rate once per episode
        agent.decay_exploration_rate()

        # Track episode rewards
        episode_rewards.append(episode_reward)

        if episode % 100 == 0:
            print(f"Episode: {episode}, Reward: {episode_reward}, Accuracy: {episode_accuracies[-1]:.4f}, Exploration rate: {agent.exploration_rate:.4f}")

    return agent, {
        'episode_rewards': episode_rewards,
        'all_rewards': all_rewards,
        'episode_accuracies': episode_accuracies
    }

def test_q_learning(agent, episodes=100, seed=None):
    """Evaluate a trained Q-learning agent greedily on a fresh environment.

    Args:
        agent: Trained agent providing ``discretize_state``, ``q_table`` and
            ``action_size``. Its Q-table is only read, never modified.
        episodes: Number of evaluation episodes to run.
        seed: Optional seed for NumPy's global random generator. When given,
            it is applied before the environment is constructed.

    Returns:
        A dict with:
            'episode_rewards': total reward of each episode,
            'episode_accuracies': per-episode prediction accuracy,
            'true_states': true state at every timestep of every episode,
            'predicted_states': predicted state (``action % 2``) at every
                timestep of every episode.
    """
    # Seed before creating the environment so that any NumPy-based randomness
    # in its construction is covered as well as the episodes themselves.
    # Note: SBEOS_Environment doesn't accept a seed parameter.
    if seed is not None:
        np.random.seed(seed)

    # Create a new environment instance
    test_env = SBEOS_Environment(max_timesteps=100)

    # Q-values assumed for states never visited in training. Looking them up
    # with .get() keeps evaluation from adding rows to the trained Q-table.
    unseen_q_values = np.zeros(agent.action_size)

    # Track metrics
    episode_rewards = []
    episode_accuracies = []  # Track accuracy per episode
    all_true_states = []
    all_predicted_states = []

    # Testing loop
    for episode in tqdm(range(episodes)):
        state = test_env.reset()
        done = False
        episode_reward = 0

        # For tracking accuracy within each episode
        episode_true_states = []
        episode_predicted_states = []

        while not done:
            # Select best action (no exploration)
            discretized_state = agent.discretize_state(state)
            q_values = agent.q_table.get(discretized_state, unseen_q_values)
            action = np.argmax(q_values)

            # Take action
            next_state, reward, done, info = test_env.step(action)

            # Track metrics
            episode_reward += reward
            true_state = info["state"]
            predicted_state = action % 2  # Extract the binary prediction (0 or 1)

            episode_true_states.append(true_state)
            episode_predicted_states.append(predicted_state)
            all_true_states.append(true_state)
            all_predicted_states.append(predicted_state)

            # Update state
            state = next_state

        # Calculate episode accuracy (0 when the episode produced no steps)
        if episode_true_states:
            episode_accuracies.append(
                accuracy_score(episode_true_states, episode_predicted_states)
            )
        else:
            episode_accuracies.append(0)

        # Track episode rewards
        episode_rewards.append(episode_reward)

    return {
        'episode_rewards': episode_rewards,
        'episode_accuracies': episode_accuracies,
        'true_states': all_true_states,
        'predicted_states': all_predicted_states
    }

def plot_rewards_and_accuracy(train_metrics, test_metrics):
    """Draw a 2x2 grid of per-episode curves for training and testing.

    The top row shows 'episode_rewards' and the bottom row shows
    'episode_accuracies'; the left column is training and the right column
    is testing. Accuracy panels use a fixed y-range of 0 to 1.
    """
    # (metrics mapping, key to plot, panel title, y-axis label, clamp y to [0, 1])
    # listed in subplot order: left to right, top to bottom
    panels = [
        (train_metrics, 'episode_rewards', 'Training Rewards per Episode', 'Reward', False),
        (test_metrics, 'episode_rewards', 'Testing Rewards per Episode', 'Reward', False),
        (train_metrics, 'episode_accuracies', 'Training Accuracy per Episode', 'Accuracy', True),
        (test_metrics, 'episode_accuracies', 'Testing Accuracy per Episode', 'Accuracy', True),
    ]

    plt.figure(figsize=(15, 10))

    for position, (source, key, title, y_label, unit_range) in enumerate(panels, start=1):
        plt.subplot(2, 2, position)
        plt.plot(source[key])
        plt.title(title)
        plt.xlabel('Episode')
        plt.ylabel(y_label)
        if unit_range:
            plt.ylim(0, 1)
        plt.grid(True)

    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(test_metrics):
    """Plot the confusion matrix of the testing results as a heatmap.

    Args:
        test_metrics: Mapping with 'true_states' and 'predicted_states'
            sequences of binary (0/1) labels.
    """
    plt.figure(figsize=(8, 6))

    # Pin the label set so the matrix is always 2x2. Without it, a run in
    # which only one class occurs yields a 1x1 matrix that no longer matches
    # the two tick labels below.
    test_cm = confusion_matrix(
        test_metrics['true_states'],
        test_metrics['predicted_states'],
        labels=[0, 1],
    )
    sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Predicted 0', 'Predicted 1'],
                yticklabels=['Actual 0', 'Actual 1'])
    plt.title('Testing Confusion Matrix')

    plt.tight_layout()
    plt.show()

def calculate_metrics(metrics):
    """Summarize prediction quality as accuracy, precision, recall and F1.

    Reads the 'true_states' and 'predicted_states' entries of *metrics* and
    returns a dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    Precision, recall and F1 are computed with ``zero_division=0``.
    """
    y_true = metrics['true_states']
    y_pred = metrics['predicted_states']

    summary = {'accuracy': accuracy_score(y_true, y_pred)}

    # The remaining scorers share one call signature
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for key, scorer in scorers:
        summary[key] = scorer(y_true, y_pred, zero_division=0)

    return summary

# Example usage
if __name__ == "__main__":
    # --- Training -----------------------------------------------------------
    # Seed NumPy's global generator so the training run is repeatable
    train_seed = 42
    np.random.seed(train_seed)

    print("Training on SBEOS environment...")
    # SBEOS_Environment doesn't accept a seed parameter
    train_env = SBEOS_Environment(max_timesteps=100)
    agent, train_metrics = train_q_learning(train_env, episodes=500)

    # --- Testing ------------------------------------------------------------
    # Evaluate on an independent environment under a different seed
    test_seed = 84
    print("Testing on independent SBEOS environment...")
    test_metrics = test_q_learning(agent, episodes=100, seed=test_seed)

    # --- Report -------------------------------------------------------------
    test_performance = calculate_metrics(test_metrics)
    report_lines = ["Test Performance Metrics:"] + [
        f"{label}{test_performance[name]:.4f}"
        for label, name in (
            ("  Accuracy:  ", 'accuracy'),
            ("  Precision: ", 'precision'),
            ("  Recall:    ", 'recall'),
            ("  F1 Score:  ", 'f1'),
        )
    ]
    print("\n".join(report_lines))

    # --- Plots --------------------------------------------------------------
    # Train/test comparison first, then the detailed curves and confusion matrix
    plot_metrics_comparison(train_metrics, test_metrics)
    plot_rewards_and_accuracy(train_metrics, test_metrics)
    plot_confusion_matrix(test_metrics)

SyntaxError: invalid syntax (799512573.py, line 45)