# Multimodal

In [16]:
import gymnasium as gym
import numpy as np
import cv2
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv


In [17]:
# List of games to train on
game_list = ["CartPole-v1", "LunarLander-v2", "Acrobot-v1", "MountainCar-v0"]

In [18]:
class FutureStepWrapper(gym.Wrapper):
    def __init__(self, env, num_future_steps=3):
        """Wrapper to add future steps instead of padding smaller observations."""
        super(FutureStepWrapper, self).__init__(env)
        self.num_future_steps = num_future_steps

        # Define a target observation size (7056 for flattened 84x84 image)
        self.target_obs_size = 7056
        self.future_obs_queue = []

        # Update observation space to match the new shape
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(self.target_obs_size,), dtype=np.float32
        )

    def preprocess_observation(self, obs):
        """Convert images to flattened grayscale and handle vector observations."""
        if isinstance(obs, np.ndarray) and len(obs.shape) == 3:  # Image observation
            obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)  # Convert to grayscale
            obs = cv2.resize(obs, (84, 84))  # Resize to 84x84
            obs = obs.flatten()  # Flatten to 1D (7056,)
        else:  # Vector observation
            obs = np.array(obs, dtype=np.float32).flatten()

        # Debugging: Print observation shape
        print(f"Preprocessed Observation Shape: {obs.shape}")

        # Ensure future queue is initialized
        while len(self.future_obs_queue) < self.num_future_steps:
            self.future_obs_queue.append(np.zeros_like(obs))  # Start with zeros

        # Append current obs and remove the oldest
        self.future_obs_queue.append(obs)
        self.future_obs_queue.pop(0)

        # Concatenate future steps
        stacked_obs = np.concatenate(self.future_obs_queue, axis=0)

        # Ensure final observation fits the target shape
        padded_obs = np.zeros(self.target_obs_size, dtype=np.float32)
        padded_obs[: len(stacked_obs)] = stacked_obs  # Fill with actual values

        return padded_obs

    def step(self, action):
        """Take a step in the environment and process the observation."""
        obs, reward, done, truncated, info = self.env.step(action)
        return self.preprocess_observation(obs), reward, done, truncated, info

    def reset(self, **kwargs):
        """Reset the environment and initialize the future queue."""
        obs, info = self.env.reset(**kwargs)
        self.future_obs_queue = [np.zeros_like(obs) for _ in range(self.num_future_steps)]  # Reset buffer
        return self.preprocess_observation(obs), info


In [19]:
# ---- Environment Creation ----
def make_env(game):
    """
    Creates an environment and applies the FutureStepWrapper.
    """
    env = gym.make(game)
    env = FutureStepWrapper(env)  # Apply preprocessing
    return env

# Use DummyVecEnv for stability on CPU
envs = DummyVecEnv([lambda: make_env(game) for game in game_list])

# Create PPO model
model = PPO("MlpPolicy", envs, verbose=1, tensorboard_log="./ppo_multigame/")

# Train the model
model.learn(total_timesteps=1_000_000)

# Save trained model
model.save("ppo_multigame")

# Test trained model
obs = envs.reset()
for _ in range(500):
    action, _ = model.predict(obs)
    obs, rewards, done, info = envs.step(action)
    envs.render()

Using cpu device
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Logging to ./ppo_multigame/PPO_1
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation Shape: (2,)
Preprocessed Observation 

Process ForkServerProcess-15:
Process ForkServerProcess-14:
Process ForkServerProcess-6:
Process ForkServerProcess-8:
Process ForkServerProcess-7:
Process ForkServerProcess-11:
Process ForkServerProcess-5:
Process ForkServerProcess-4:
Process ForkServerProcess-12:
Process ForkServerProcess-9:
Traceback (most recent call last):
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/site-packages/stable_baselines3/common/vec_env/subproc_vec_env.py", line 47, in _worker
    remote.send((observation, reset_info))
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/connection.py", line 206, in send
    self._send_bytes(_ForkingPickler.dumps(obj))
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/connection

KeyboardInterrupt: 

# New MOe mtheod

In [21]:
import numpy as np
import tensorflow as tf
import gym
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense, Concatenate, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

# --- Hyperparameters ---
num_actions = 5  # Change this as per your environment
num_experts = 3  # Number of expert sub-networks
gamma = 0.99  # Discount factor
clip_ratio = 0.2  # PPO Clipping ratio
learning_rate = 3e-4
epochs = 10
batch_size = 64
entropy_coef = 0.01

# --- PPO Mixture of Experts Policy Model ---
def build_model():
    # Image Input (e.g., CarRacing)
    img_input = Input(shape=(84, 84, 1), name="img_input")
    x_img = Conv2D(32, kernel_size=8, strides=4, activation='relu')(img_input)
    x_img = Conv2D(64, kernel_size=4, strides=2, activation='relu')(x_img)
    x_img = Conv2D(64, kernel_size=3, strides=1, activation='relu')(x_img)
    x_img = Flatten()(x_img)
    x_img = Dense(128, activation='relu')(x_img)

    # Vector Input (e.g., LunarLander or BipedalWalker)
    vec_input = Input(shape=(24,), name="vec_input")
    x_vec = Dense(64, activation='relu')(vec_input)
    x_vec = Dense(64, activation='relu')(x_vec)

    # Combine both inputs
    combined = Concatenate()([x_img, x_vec])

    # Shared Processing
    shared = Dense(256, activation='relu')(combined)

    # Mixture of Experts
    experts = [Dense(128, activation='relu')(shared) for _ in range(num_experts)]
    experts_tensor = tf.stack(experts, axis=1)  # Shape: (batch_size, num_experts, 128)
    gate = Dense(num_experts, activation='softmax')(shared)
    gate_expanded = tf.expand_dims(gate, axis=-1)
    moe_output = tf.reduce_sum(experts_tensor * gate_expanded, axis=1)  # Weighted sum of expert outputs

    # Policy Head
    policy_hidden = Dense(64, activation='relu')(moe_output)
    policy_logits = Dense(num_actions, activation=None, name="policy_logits")(policy_hidden)

    # Value Head
    value_hidden = Dense(64, activation='relu')(moe_output)
    value = Dense(1, activation=None, name="value")(value_hidden)

    model = Model(inputs=[img_input, vec_input], outputs=[policy_logits, value])
    return model

# --- PPO Agent ---
class PPOAgent:
    def __init__(self):
        self.model = build_model()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    
    def get_action(self, img_obs, vec_obs):
        logits, value = self.model.predict([np.array([img_obs]), np.array([vec_obs])])
        action = tf.random.categorical(logits, 1).numpy()[0, 0]
        return action, value[0, 0]

    def train_step(self, obs_img, obs_vec, actions, advantages, old_probs, returns):
        with tf.GradientTape() as tape:
            logits, values = self.model([obs_img, obs_vec])
            values = tf.squeeze(values)
            
            # Compute probabilities
            action_probs = tf.nn.softmax(logits)
            selected_action_probs = tf.reduce_sum(action_probs * tf.one_hot(actions, num_actions), axis=1)

            # Compute PPO loss
            ratio = selected_action_probs / old_probs
            clipped_ratio = tf.clip_by_value(ratio, 1 - clip_ratio, 1 + clip_ratio)
            policy_loss = -tf.reduce_mean(tf.minimum(ratio * advantages, clipped_ratio * advantages))
            
            value_loss = tf.keras.losses.mean_squared_error(returns, values)
            entropy_loss = -tf.reduce_mean(action_probs * tf.math.log(action_probs + 1e-10))

            total_loss = policy_loss + 0.5 * value_loss - entropy_coef * entropy_loss
        
        gradients = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

# --- Environment Interaction ---
def collect_data(env, agent, steps=1024):
    obs_imgs, obs_vecs, actions, rewards, dones, values, probs = [], [], [], [], [], [], []
    
    obs = env.reset()
    img_obs, vec_obs = preprocess_observation(obs)
    
    for _ in range(steps):
        action, value = agent.get_action(img_obs, vec_obs)
        obs_new, reward, done, _ = env.step(action)
        
        obs_imgs.append(img_obs)
        obs_vecs.append(vec_obs)
        actions.append(action)
        rewards.append(reward)
        dones.append(done)
        values.append(value)

        # Get new observation
        img_obs, vec_obs = preprocess_observation(obs_new)
        
        if done:
            obs = env.reset()
            img_obs, vec_obs = preprocess_observation(obs)

    return (np.array(obs_imgs), np.array(obs_vecs), np.array(actions), 
            np.array(rewards), np.array(dones), np.array(values))

def compute_advantages(rewards, values, dones, gamma=0.99, lambda_=0.95):
    advantages = np.zeros_like(rewards)
    last_advantage = 0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + (1 - dones[t]) * gamma * values[t + 1] - values[t]
        advantages[t] = last_advantage = delta + (1 - dones[t]) * gamma * lambda_ * last_advantage
    returns = advantages + values[:-1]
    return advantages, returns

# --- Preprocessing ---
def preprocess_observation(obs):
    if isinstance(obs, tuple):
        img_obs, vec_obs = obs
    else:
        img_obs, vec_obs = obs, np.zeros(24)  # Default vector obs
    img_obs = np.expand_dims(img_obs, axis=-1)  # Ensure shape is (84, 84, 1)
    return img_obs, vec_obs

# --- Main Training Loop ---
env = gym.make("CarRacing-v2")  # Change to other environments
agent = PPOAgent()

for iteration in range(1000):
    # Collect data from the environment
    obs_imgs, obs_vecs, actions, rewards, dones, values = collect_data(env, agent)
    
    # Compute advantages
    advantages, returns = compute_advantages(rewards, values, dones)

    # Train agent
    for _ in range(epochs):
        agent.train_step(obs_imgs, obs_vecs, actions, advantages, returns, values)

    print(f"Iteration {iteration} complete")

    # Save model every 10 iterations
    if iteration % 10 == 0:
        agent.model.save("ppo_moe_model.keras")




ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type dict).

In [22]:
# Kuch to kia hai

In [26]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import multiprocessing
import random
from collections import deque

# Set multiprocessing method explicitly
try:
    multiprocessing.set_start_method("spawn", force=True)
except RuntimeError:
    pass

# Experience Replay Buffer
class ExperienceReplay:
    def __init__(self, capacity=50000):
        self.buffer = deque(maxlen=capacity)

    def add(self, experience):
        self.buffer.append(experience)

    def sample(self, batch_size):
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))

    def size(self):
        return len(self.buffer)

# PPO Agent with Mixture of Experts (MoE)
class PPOAgent:
    def __init__(self, obs_shape, action_size, is_image=False, buffer_size=50000):
        self.is_image = is_image
        self.replay_buffer = ExperienceReplay(buffer_size)
        self.model = self.build_model(obs_shape, action_size)
        self.optimizer = keras.optimizers.Adam(learning_rate=3e-4)

    def build_model(self, obs_shape, action_size):
        inputs = keras.Input(shape=obs_shape)

        # Experts
        expert1 = self.build_cnn_expert(inputs) if self.is_image else self.build_mlp_expert(inputs)
        expert2 = self.build_mlp_expert(inputs)

        # Gating Network
        gate = layers.Dense(2, activation="softmax")(inputs if not self.is_image else layers.Flatten()(inputs))
        gate1 = layers.Lambda(lambda x: x[:, 0:1])(gate)
        gate2 = layers.Lambda(lambda x: x[:, 1:2])(gate)

        # Weighted sum of experts
        mixed_output = layers.Add()([layers.Multiply()([expert1, gate1]), layers.Multiply()([expert2, gate2])])

        # PPO Output Layers
        policy = layers.Dense(action_size, activation="softmax")(mixed_output)
        value = layers.Dense(1)(mixed_output)

        return keras.Model(inputs=inputs, outputs=[policy, value])

    def build_cnn_expert(self, inputs):
        x = layers.Conv2D(32, (3, 3), activation="relu")(inputs)
        x = layers.MaxPooling2D()(x)
        x = layers.Conv2D(64, (3, 3), activation="relu")(x)
        x = layers.MaxPooling2D()(x)
        x = layers.Flatten()(x)
        return layers.Dense(128, activation="relu")(x)

    def build_mlp_expert(self, inputs):
        x = layers.Dense(64, activation="relu")(inputs)
        x = layers.Dense(64, activation="relu")(x)
        return layers.Dense(128, activation="relu")(x)

    def predict(self, obs):
        return self.model(obs)

    def store_experience(self, obs, action, reward, done, next_obs):
        self.replay_buffer.add((obs, action, reward, done, next_obs))

    def train_from_buffer(self, batch_size=64, gamma=0.99, lam=0.95):
        if self.replay_buffer.size() < batch_size:
            return  # Skip if not enough data

        batch = self.replay_buffer.sample(batch_size)
        obs, actions, rewards, dones, next_obs = zip(*batch)

        obs = np.array(obs)
        actions = np.array(actions)
        rewards = np.array(rewards)
        dones = np.array(dones)
        next_obs = np.array(next_obs)

        # Compute advantages
        _, values = self.model(obs)
        _, next_values = self.model(next_obs)
        advantages, returns = self.compute_advantages(rewards, values.numpy(), next_values.numpy(), dones, gamma, lam)

        # Train PPO
        for _ in range(10):  # 10 epochs per batch
            self.train_step(obs, actions, advantages, returns)

    def compute_advantages(self, rewards, values, next_values, dones, gamma=0.99, lam=0.95):
        advantages = np.zeros_like(rewards)
        last_adv = 0
        for t in reversed(range(len(rewards))):
            mask = 1 - dones[t]
            delta = rewards[t] + gamma * next_values[t] * mask - values[t]
            advantages[t] = last_adv = delta + gamma * lam * mask * last_adv
        returns = advantages + values
        return advantages, returns

    def train_step(self, obs, actions, advantages, returns):
        with tf.GradientTape() as tape:
            policy, value = self.model(obs, training=True)
            action_probs = tf.reduce_sum(policy * tf.one_hot(actions, depth=policy.shape[-1]), axis=-1)
            advantage_loss = -tf.reduce_mean(tf.math.log(action_probs) * advantages)
            value_loss = tf.reduce_mean(tf.square(returns - value))
            loss = advantage_loss + 0.5 * value_loss

        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

# Environment Setup
def create_env(env_name):
    env = gym.make(env_name)
    obs_shape = env.observation_space.shape
    action_size = env.action_space.n if isinstance(env.action_space, gym.spaces.Discrete) else env.action_space.shape[0]
    is_image = len(obs_shape) == 3
    return env, PPOAgent(obs_shape, action_size, is_image)

# Data Collection
def collect_data(env, agent):
    obs = env.reset()
    for _ in range(2048):  # Collect 2048 steps
        obs_input = np.expand_dims(obs, axis=0).astype(np.float32) / 255.0 if agent.is_image else np.expand_dims(obs, axis=0)
        policy, _ = agent.predict(obs_input)
        action = np.random.choice(len(policy[0]), p=policy[0].numpy())

        new_obs, reward, done, _ = env.step(action)
        agent.store_experience(obs, action, reward, done, new_obs)
        obs = new_obs if not done else env.reset()

# Training Function (must be defined globally to avoid pickling issues)
def train_agent(env_name):
    env, agent = create_env(env_name)

    for iteration in range(1000):  # Train for 1000 iterations
        collect_data(env, agent)
        agent.train_from_buffer(batch_size=64)

        if iteration % 10 == 0:
            agent.model.save(f"ppo_moe_{env_name}.keras")
            print(f"{env_name} model saved at iteration {iteration}")

# 🛠 Ensure Multiprocessing Works Correctly
if __name__ == "__main__":
    env_names = ["CarRacing-v2", "LunarLander-v2", "BipedalWalker-v3", "Breakout-v0"]
    processes = []

    for env_name in env_names:
        p = multiprocessing.Process(target=train_agent, args=(env_name,))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'train_agent' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/miniconda3/envs/myenv38/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'train_agent' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/miniconda3/envs/myenv38