In [1]:
import numpy as np
import gym
from gym import spaces

class AdaptiveLearning(gym.Env):
    def __init__(self, D=3, threshold=1e-3, max_steps=200):
        super(AdaptiveLearning, self).__init__()
        self.state_size = D
        self.action_size = 4 
        self.observation_space = spaces.Box(low=0, high=1, shape=(D,), dtype=np.float32)
        self.action_space = spaces.Discrete(self.action_size)
        self.state = np.zeros(D)
        self.target = np.ones(D)  
        self.threshold = threshold  
        self.steps = 0
        self.max_steps = max_steps  

    def step(self, action):
        assert self.action_space.contains(action), "Invalid Resource"
        if action == 0:
            self.state[0] += np.random.uniform(0.1, 0.4)
        elif action == 1:
            self.state[1] += np.random.uniform(0.1, 0.4)
        elif action == 2:
            self.state[2] += np.random.uniform(0.1, 0.4)
        else:
            self.state += np.random.uniform(0.05, 0.2, size=self.state_size)
        self.state = np.clip(self.state, 0, 1)
        
        infinity_norm = np.max(np.abs(self.state - self.target))

        
        if infinity_norm < self.threshold:
            reward = 0  
            done = True  
            print('successfull learning! Took only {} steps!'.format(self.steps))
        else:
            reward = -1  
            done = False
       
        self.steps += 1

        
        if self.steps >= self.max_steps:
            done = True

        return self.state, reward, done, {}

    def reset(self):
        self.state = np.zeros(self.state_size)  
        self.steps = 0
        return self.state

    def render(self, mode='human'):
        print(f"Current Traits: {self.state}")


In [2]:
env = AdaptiveLearning()

In [3]:
state_size = env.observation_space.shape[0]
state_size

3

In [4]:
action_size = env.action_space.n
action_size

4

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque
import os

In [6]:
class DQN(nn.Module):
    def __init__(self, state_size, action_size, gamma=0.95, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01, learning_rate=0.001, n_layers=24):
        super(DQN, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.n_layers = n_layers
        self.model = self._build_model()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    
    def _build_model(self):
        model = nn.Sequential(
            nn.Linear(self.state_size, self.n_layers),
            nn.ReLU(),
            nn.Linear(self.n_layers, self.n_layers),
            nn.ReLU(),
            nn.Linear(self.n_layers, self.action_size)
        )
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = torch.FloatTensor(state)
        with torch.no_grad():
            act_values = self.model(state)
        return torch.argmax(act_values).item()

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state)
            next_state = torch.FloatTensor(next_state)
            reward = torch.FloatTensor([reward])
            action = torch.LongTensor([action])
            
            target = reward
            if not done:
                target = reward + self.gamma * torch.max(self.model(next_state)).item()

            current_q_value = self.model(state)[0][action]
            loss = F.mse_loss(current_q_value, target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        self.model.load_state_dict(torch.load(name))

    def save(self, name):
        torch.save(self.model.state_dict(), name)

    def fit(self, env, n_episodes=1000, batch_size=32, penalty=0):
        output_dir = 'model_output/weights/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for e in range(n_episodes):
            state = env.reset()
            state = np.reshape(state, [1, self.state_size])
            cumulative_reward = 0  
            done = False
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                reward = reward if not done else penalty
                
                cumulative_reward += reward  

                next_state = np.reshape(next_state, [1, self.state_size])
                self.remember(state, action, reward, next_state, done)
                state = next_state
                if done:
                    print(f"episode: {e}/{n_episodes} reward: {cumulative_reward}")
                    break
            if len(self.memory) > batch_size:
                self.replay(batch_size)
            if e % 100 == 0:
                self.save(output_dir + f"weights_{e:04d}.pth")


In [7]:
agent = DQN(state_size, action_size,gamma=0.9)

In [8]:
agent.fit(env,n_episodes=1000)

successfull learning! Took only 28 steps!
episode: 0/1000 reward: -28
successfull learning! Took only 21 steps!
episode: 1/1000 reward: -21
successfull learning! Took only 16 steps!
episode: 2/1000 reward: -16
successfull learning! Took only 18 steps!
episode: 3/1000 reward: -18
successfull learning! Took only 26 steps!
episode: 4/1000 reward: -26
successfull learning! Took only 27 steps!
episode: 5/1000 reward: -27
successfull learning! Took only 20 steps!
episode: 6/1000 reward: -20
successfull learning! Took only 21 steps!
episode: 7/1000 reward: -21
successfull learning! Took only 17 steps!
episode: 8/1000 reward: -17
successfull learning! Took only 24 steps!
episode: 9/1000 reward: -24
successfull learning! Took only 47 steps!
episode: 10/1000 reward: -47
successfull learning! Took only 22 steps!
episode: 11/1000 reward: -22
successfull learning! Took only 24 steps!
episode: 12/1000 reward: -24
successfull learning! Took only 17 steps!
episode: 13/1000 reward: -17
successfull lear