In [6]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import random
import torch.nn.functional as F
import matplotlib.pyplot as plt
from collections import deque
from sklearn.random_projection import GaussianRandomProjection
import torch.amp as amp

data_path = "C:/Users/ksw00/mycode/data"
# ✅ GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 128

def get_dataloader(batch_size, train):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])
    
    if train:
        # MNIST 데이터셋 로드
        mnist_dataset = datasets.MNIST(root=data_path, train=True, transform=transform, download=True)
        data_loader = DataLoader(mnist_dataset, batch_size=len(mnist_dataset), shuffle=False)
    else:
        # MNIST 데이터셋 로드
        test_dataset = datasets.MNIST(root=data_path, train=False, transform=transform, download=True)
        data_loader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)
    
    # # RP
    # data_iter = iter(data_loader)
    # images, labels = next(data_iter)
    # images = images.view(images.size(0), -1).numpy()
    
    # rp = GaussianRandomProjection(n_components=784)
    # images_reduced = rp.fit_transform(images)

    # eps = 1e-8
    # images_reduced = (images_reduced - np.mean(images_reduced, axis=0)) / (np.std(images_reduced, axis=0) + eps)

    # images_reduced = torch.tensor(images_reduced, dtype=torch.float32)
    # labels = labels.clone().detach().long()

    # dataset = TensorDataset(images_reduced, labels)
    # data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=train)


    # No RP
    data_iter = iter(data_loader)
    images, labels = next(data_iter)
    
    dataset = TensorDataset(images, labels)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle = train)
    
    return data_loader

testloader = get_dataloader(batch_size, train=False)

# 모델 평가 함수
def evaluate_model(model, testloader):
    model.eval()  # 평가 모드 활성화
    correct = 0
    total = 0

    with torch.no_grad():  # 그래디언트 계산 비활성화
        for images, labels in testloader:
            images, labels = images.view(images.size(0), -1).to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)  # 가장 높은 확률의 클래스 선택
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

class MLP(nn.Module):
    def __init__(self, input_dim= 48, hidden_dim= 48, output_dim=10):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):  
        with amp.autocast(device_type=str(device)):   
            x = self.fc1(x)
            x = self.relu(x)
            x = self.fc2(x)
        return x
    
class MLP2(nn.Module):
    def __init__(self, input_dim= 28*28, hidden_dim= 128, output_dim=10):
        super(MLP2, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):    
        with amp.autocast(device_type=str(device)): 
            x = self.fc1(x)
            x = self.relu(x)
            x = self.fc2(x)
        return x

In [7]:
class OptimizerEnv(gym.Env):
    def __init__(self, device, batch_size):
        super(OptimizerEnv, self).__init__()
        self.device = device
        self.train_loader = get_dataloader(batch_size, train=True)
        
        self.action_space = gym.spaces.Discrete(5)
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(2,), dtype=np.float32)
        
        # 2층 MLP 모델
        # self.model = MLP().to(self.device)
        self.model = MLP2().to(self.device)

        self.criterion = nn.CrossEntropyLoss()
        self.episode_step = 0
        self.init_loss = None  
        self.optimizer = None  
        self.loss_history = []
        # 🔥 AMP GradScaler 추가
        self.scaler = amp.GradScaler()
        # 🔥 데이터 로더 이터레이터 설정 (배치 단위 처리를 위해)
        self.train_loader_iter = iter(self.train_loader)

    def step(self, action):
        self._set_optimizer(action)
        """🔥 배치 단위로 옵티마이저 선택"""
    # def step(self):     
    #     self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        # self.optimizer = optim.RMSprop(self.model.parameters(), lr=0.001)
        # self.optimizer = optim.Adagrad(self.model.parameters(), lr=0.001)
        # self.optimizer = optim.SGD(self.model.parameters(), lr=0.001)
        # self.optimizer = optim.SGD(self.model.parameters(), lr=0.001, momentum=0.9)
        
        try:
            images, labels = next(self.train_loader_iter)  # 🔥 다음 배치 가져오기
        except StopIteration:
            # 🔥 배치 다 돌면 새로운 epoch 시작
            self.train_loader_iter = iter(self.train_loader)
            images, labels = next(self.train_loader_iter)

        images, labels = images.view(images.size(0), -1).to(self.device), labels.to(self.device)

        # ✅ 자동 혼합 정밀도 적용
        with amp.autocast(device_type=str(self.device)):
            outputs = self.model(images)
            loss = self.criterion(outputs, labels)

        loss_value = loss.item()
        self.loss_history.append(loss_value)
        moving_avg_loss = np.mean(self.loss_history[-10:]) if self.loss_history else 0

        self.optimizer.zero_grad()

        # ✅ Scaler를 사용하여 역전파
        self.scaler.scale(loss).backward()

        # 🔥 MLP 모델의 가중치를 벡터로 변환
        model_params = torch.cat([p.flatten() for p in self.model.parameters()]).detach().cpu().numpy()

        # 🔥 MLP 모델의 그라디언트를 벡터로 변환 (없으면 0으로 채움)
        model_grads = []
        for p in self.model.parameters():
            if p.grad is not None:
                model_grads.append(p.grad.flatten())
            else:
                model_grads.append(torch.zeros_like(p.flatten()))  # 🔥 None이면 0으로 채움
        model_grads = torch.cat(model_grads).detach().cpu().numpy()
        model_grads = np.nan_to_num(model_grads, nan=0.0, posinf=0.0, neginf=0.0)
        
        self.scaler.step(self.optimizer)
        self.scaler.update()
        
        # 🔥 state에 축소된 모델 가중치 및 그라디언트 추가
        state = np.concatenate(([self.episode_step, moving_avg_loss], model_params, model_grads)).astype(np.float32)
        self.episode_step += 1

        if self.init_loss is None:
            self.init_loss = loss_value # 🔥 초기 손실값 저장

        reward = -np.log(loss_value / self.init_loss) / (self.episode_step - 1) if self.episode_step > 1 else 0
            
        done = self.episode_step >= 10 * len(self.train_loader)  # 🔥 전체 배치 수 고려한 종료 조건
        
        return state, reward, done, {}

    def reset(self):
        """환경 초기화"""
        self.episode_step = 0
        self.init_loss = None
        self.loss_history = []
        self.model.apply(self._reset_weights)
        self.train_loader_iter = iter(self.train_loader)

        # 🔥 초기 가중치 및 그라디언트를 0으로 설정 (완전 초기화)
        # model_params = np.zeros(2842, dtype=np.float32)  # 가중치 초기값을 0으로 설정
        # model_grads = np.zeros(2842, dtype=np.float32)  # 초기에는 그라디언트 없음

        model_params = np.zeros(101770, dtype=np.float32)  # 가중치 초기값을 0으로 설정
        model_grads = np.zeros(101770, dtype=np.float32)  # 초기에는 그라디언트 없음

        # 🔥 state에 축소된 가중치와 초기 그라디언트 포함
        state = np.concatenate(([self.episode_step, 0.0], model_params, model_grads)).astype(np.float32)
        return state

    def _set_optimizer(self, action):
        """매 배치마다 옵티마이저 변경"""
        optimizers = [
            optim.Adam(self.model.parameters(), lr=0.001),
            optim.RMSprop(self.model.parameters(), lr=0.001),
            optim.Adagrad(self.model.parameters(), lr=0.001),
            optim.SGD(self.model.parameters(), lr=0.001),
            optim.SGD(self.model.parameters(), lr=0.001, momentum=0.9)
        ]
        self.optimizer = optimizers[action]
    
    def _reset_weights(self, m):
        """모델 가중치 초기화"""
        if isinstance(m, nn.Linear):
            m.reset_parameters()

In [8]:
class DQNAgent:
    def __init__(self, state_size, action_size, device):
        self.state_size = state_size
        self.action_size = action_size
        self.device = device

        self.epsilon = 1.0  
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.99  # 🔥 탐색 비율 감소 조정
        self.gamma = 0.9  # 🔥 단기적 보상 고려 강화
        self.lr = 0.001
        
        self.model = nn.Sequential(
            nn.Linear(state_size, 512),
            nn.ReLU(),
            nn.Linear(512, 64),
            nn.ReLU(),
            nn.Linear(64, action_size)
        ).to(self.device)

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.memory = []
        self.memory_size = 5000  # 🔥 경험 저장 크기 설정
        self.memory = deque(maxlen=self.memory_size)  # 🔥 가장 오래된 데이터를 자동 삭제

        self.scaler = amp.GradScaler()

    def act(self, state):
        """ε-greedy 정책으로 행동 선택"""
        if random.random() < self.epsilon:
            return random.randint(0, self.action_size - 1)
        else:
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            with torch.no_grad():
                return torch.argmax(self.model(state)).item()

    def remember(self, state, action, reward, next_state, done):
        """경험 저장"""
        self.memory.append((state, action, reward, next_state, done))

    def train(self, batch_size=128):
        """DQN 학습"""
        if len(self.memory) < batch_size:
            return
        
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.FloatTensor(np.array(states)).to(self.device)
        actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)
        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)

        with amp.autocast(device_type=str(self.device)):
            q_values = self.model(states).gather(1, actions)
            next_q_values = self.model(next_states).max(1, keepdim=True)[0]
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

            loss = F.mse_loss(q_values, target_q_values.detach())

        # ✅ Scaler 적용
        self.scaler.scale(loss).backward()
        self.scaler.step(self.optimizer)
        self.scaler.update()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = OptimizerEnv(device, batch_size)
# agent = DQNAgent(state_size=5856, action_size=5, device=device)
agent = DQNAgent(state_size=203542, action_size=5, device=device)

loss_history = []
num_episodes = 1000  

# 🔥 옵티마이저 선택 횟수 기록
optimizer_names = ["Adam", "RMSprop", "Adagrad", "SGD", "Momentum"]

reward_history = []  # 🔥 각 episode의 총 reward 저장 리스트

for episode in range(num_episodes):
    state = env.reset()
    done = False
    episode_losses = []
    episode_trajectory = []
    total_reward = 0  # 🔥 각 episode의 총 reward 초기화
    
    optimizer_count = {i: 0 for i in range(5)}  # 🔥 옵티마이저 선택 횟수 초기화

    while not done:
        action = agent.act(state)  # 🔥 매 batch마다 옵티마이저 선택
        optimizer_count[action] += 1  # 🔥 선택된 옵티마이저 카운트 증가
        next_state, reward, done, _ = env.step(action)
        # next_state, reward, done, _ = env.step()

        total_reward += reward  # 🔥 reward를 episode 단위로 누적
        episode_losses.append(state[1])  
        episode_trajectory.append((state, action, reward, next_state, done))

        state = next_state
    
    for transition in episode_trajectory:
        agent.remember(*transition)

    avg_loss = sum(episode_losses) / len(episode_losses)
    loss_history.append(avg_loss)

    avg_reward = total_reward / len(episode_trajectory) if episode_trajectory else 0
    reward_history.append(avg_reward)  # 🔥 episode 종료 후 총 reward 저장

    agent.train(batch_size)

    print(f"Episode {episode+1}, Avg Loss: {avg_loss}, Episode mean Reward: {avg_reward}")
    print(f"Optimizer Counts: {', '.join([f'{optimizer_names[i]}: {optimizer_count[i]}' for i in range(5)])}")

    model = env.model
    # 모델 정확성 평가
    evaluate_model(model, testloader)


Episode 1, Avg Loss: 1.231890397234512, Episode mean Reward: 0.0006057805069635702
Optimizer Counts: Adam: 980, RMSprop: 928, Adagrad: 967, SGD: 936, Momentum: 879
Test Accuracy: 92.22%
Episode 2, Avg Loss: 1.2956901462665245, Episode mean Reward: 0.0004323564197591837
Optimizer Counts: Adam: 906, RMSprop: 983, Adagrad: 952, SGD: 921, Momentum: 928
Test Accuracy: 94.09%
Episode 3, Avg Loss: 1.1941080491680072, Episode mean Reward: 0.0005806761317231226
Optimizer Counts: Adam: 915, RMSprop: 987, Adagrad: 907, SGD: 960, Momentum: 921
Test Accuracy: 92.13%
Episode 4, Avg Loss: 1.3232380460002529, Episode mean Reward: 0.0004644261116003281
Optimizer Counts: Adam: 887, RMSprop: 1028, Adagrad: 911, SGD: 935, Momentum: 929
Test Accuracy: 93.68%
Episode 5, Avg Loss: 1.2564257536425012, Episode mean Reward: 0.000668385652896342
Optimizer Counts: Adam: 908, RMSprop: 1056, Adagrad: 879, SGD: 911, Momentum: 936
Test Accuracy: 93.21%
Episode 6, Avg Loss: 1.1656318811401885, Episode mean Reward: 0.0

In [None]:
plt.plot(loss_history, label="L2O Optimizer Loss")
plt.xlabel("Episode")
plt.ylabel("Loss")
plt.title("Loss Reduction Over Training Episodes")
plt.legend()
plt.grid()
plt.show()

NameError: name 'plt' is not defined

reward 수정
- discount rate 적용해보기 --> 이게 gamma값
- moving average 방식
- 배치단위 리워드 -> 에폭 단위 리워드로 변경?


state 수정
- 파라미터 값 추가
