In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque


In [3]:
class DinoNet(nn.Module):
    """Fully connected network: two 64-unit ReLU layers followed by a linear head.

    The head produces ``action_size`` outputs for each input state vector.
    """

    def __init__(self, state_size, action_size):
        """Create the three linear layers.

        Args:
            state_size: Width of the input layer.
            action_size: Width of the output layer.
        """
        super().__init__()
        hidden_units = 64
        # The attribute names are also the state_dict keys, so they must stay
        # fc1 / fc2 / fc3 for saved checkpoints to remain loadable.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, state):
        """Run ``state`` through both ReLU layers and return the raw head output."""
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        return self.fc3(hidden)


In [4]:
class DinoAgent:
    """Epsilon-greedy Q-learning agent with a ``DinoNet`` and a replay memory."""

    def __init__(self, state_size, action_size):
        """Build the network, optimizer, loss and an empty replay memory.

        Args:
            state_size: Input width passed to ``DinoNet``.
            action_size: Number of discrete actions (output width of ``DinoNet``).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.model = DinoNet(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.criterion = nn.MSELoss()
        self.memory = deque(maxlen=2000)  # oldest transitions are evicted once full
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action index is returned;
        otherwise the index of the largest network output.

        Returns:
            int: An action index in ``range(action_size)``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Pure inference: no autograd graph is needed. as_tensor also accepts an
        # existing tensor without the copy warning torch.tensor() would raise.
        with torch.no_grad():
            q_values = self.model(torch.as_tensor(state, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def replay(self, batch_size):
        """Train on ``batch_size`` randomly sampled transitions, one step per sample.

        Does nothing (and does not decay epsilon) while the memory holds fewer
        than ``batch_size`` transitions. After training, epsilon is decayed once
        and never allowed to fall below ``epsilon_min``.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.as_tensor(state, dtype=torch.float32)
            next_state = torch.as_tensor(next_state, dtype=torch.float32)

            # Single tracked forward pass; this is the only path gradients use.
            q_pred = self.model(state)

            # The regression target is a constant: build it outside the graph.
            with torch.no_grad():
                target = float(reward)
                if not done:
                    target += self.gamma * torch.max(self.model(next_state)).item()
                # Copy the prediction so only the taken action contributes error.
                target_f = q_pred.detach().clone()
                target_f[action] = target

            self.optimizer.zero_grad()
            loss = self.criterion(q_pred, target_f)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            # Clamp so a decay step cannot undershoot the configured floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


In [5]:
class SimpleDinoEnv:
    """Placeholder environment whose states, rewards and terminations are random."""

    def __init__(self):
        """Fix the state vector length (5) and the number of actions (4)."""
        self.state_size, self.action_size = 5, 4

    def reset(self):
        """Return a fresh random state vector of length ``state_size``."""
        return np.random.random(size=self.state_size)

    def step(self, action):
        """Advance one step and return ``(next_state, reward, done)``.

        ``action`` is accepted but not used: the next state is random, the
        reward is a random integer in [0, 10), and ``done`` is True with
        probability 0.1.
        """
        observation = np.random.random(size=self.state_size)
        payoff = np.random.randint(low=0, high=10)
        finished = 0.1 > random.random()  # 10% chance of termination
        return observation, payoff, finished


In [6]:
def train_dino_agent(episodes=1000, batch_size=32):
    """Train a ``DinoAgent`` on ``SimpleDinoEnv`` and return it.

    Each episode is played to termination while every transition is stored in
    the agent's memory; ``agent.replay(batch_size)`` is then called once and a
    one-line summary is printed.

    Args:
        episodes: Number of episodes to play.
        batch_size: Sample count handed to ``agent.replay`` after each episode.

    Returns:
        The trained ``DinoAgent``.
    """
    env = SimpleDinoEnv()
    agent = DinoAgent(state_size=env.state_size, action_size=env.action_size)

    for episode_number in range(1, episodes + 1):
        observation = env.reset()
        episode_return = 0

        # At least one step is always taken; stop once the env signals done.
        while True:
            chosen = agent.act(observation)
            following, reward, finished = env.step(chosen)
            agent.remember(observation, chosen, reward, following, finished)
            observation = following
            episode_return += reward
            if finished:
                break

        agent.replay(batch_size)
        print(f"Episode {episode_number}/{episodes}, Total Reward: {episode_return}, Epsilon: {agent.epsilon:.2f}")

    return agent


In [7]:
# Seed every RNG the training run draws from (Python `random`, NumPy and
# PyTorch) so that Restart & Run All reproduces the same episodes and weights.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

agent = train_dino_agent(episodes=500, batch_size=32)


Episode 1/500, Total Reward: 11, Epsilon: 1.00
Episode 2/500, Total Reward: 23, Epsilon: 1.00
Episode 3/500, Total Reward: 13, Epsilon: 1.00
Episode 4/500, Total Reward: 43, Epsilon: 1.00
Episode 5/500, Total Reward: 7, Epsilon: 1.00
Episode 6/500, Total Reward: 20, Epsilon: 1.00
Episode 7/500, Total Reward: 48, Epsilon: 0.99
Episode 8/500, Total Reward: 113, Epsilon: 0.99
Episode 9/500, Total Reward: 23, Epsilon: 0.99
Episode 10/500, Total Reward: 13, Epsilon: 0.98
Episode 11/500, Total Reward: 9, Epsilon: 0.98
Episode 12/500, Total Reward: 86, Epsilon: 0.97
Episode 13/500, Total Reward: 11, Epsilon: 0.97
Episode 14/500, Total Reward: 104, Epsilon: 0.96
Episode 15/500, Total Reward: 49, Epsilon: 0.96
Episode 16/500, Total Reward: 64, Epsilon: 0.95
Episode 17/500, Total Reward: 3, Epsilon: 0.95
Episode 18/500, Total Reward: 18, Epsilon: 0.94
Episode 19/500, Total Reward: 11, Epsilon: 0.94
Episode 20/500, Total Reward: 70, Epsilon: 0.93
Episode 21/500, Total Reward: 109, Epsilon: 0.93
E

In [8]:
# Persist only the network parameters (state_dict), not the whole agent.
trained_weights = agent.model.state_dict()
torch.save(trained_weights, "dino_dqn.pth")
print("Model saved as dino_dqn.pth")


Model saved as dino_dqn.pth
