In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 1) Imports 및 디렉터리 설정
# ──────────────────────────────────────────────────────────────────────────────
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import heapq
import logging
from collections import deque
from enum import Enum
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Output directory layout: every artefact lives somewhere under BASE_DIR.
BASE_DIR = './AICC_Project_0608_v2'
RESULTS_DIR, PLOTS_DIR, MODELS_DIR = (
    os.path.join(BASE_DIR, sub_dir) for sub_dir in ('results', 'plots', 'models')
)

# Create the base directory first, then its children (no-op when they exist).
for dir_path in (BASE_DIR, RESULTS_DIR, PLOTS_DIR, MODELS_DIR):
    os.makedirs(dir_path, exist_ok=True)

# Root logging configuration plus the module-level logger used below.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
logger.info(f"Base Directory: {BASE_DIR}")
logger.info(f"Results will be saved to: {RESULTS_DIR}")

# Pick the compute device and report which one is in use.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("CUDA 사용 가능 여부:", torch.cuda.is_available())
if not torch.cuda.is_available():
    print("사용 중인 디바이스: CPU")
else:
    print("사용 중인 디바이스:", torch.cuda.get_device_name(0))


CUDA 사용 가능 여부: False
사용 중인 디바이스: CPU


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 2) Config 클래스 정의 (FCR 보너스 포함)
# ──────────────────────────────────────────────────────────────────────────────
class AICCConfig:
    """Bundle of hyperparameters and constants used by the simulation."""

    def __init__(self):
        # --- 1) Reproducibility ------------------------------------------
        self.random_seed = 42

        # --- 2) Customer model -------------------------------------------
        # Value of one call = lifetime value spread over lifetime inquiries.
        self.customer_lifetime_value = 890.18
        self.customer_lifetime_inquiries = 122
        self.call_value = self.customer_lifetime_value / self.customer_lifetime_inquiries

        # Arrival rate (Poisson process parameter, per minute).
        self.arrival_rate = 1.0

        # Probability of each problem type.
        self.problem_distribution = dict(easy=0.4, hard=0.6)

        # Mean customer patience (minutes).
        self.customer_patience_mean = 2.0

        # Cost charged per minute a customer spends waiting.
        self.customer_wait_cost_per_minute = 2.87

        # Fallback penalty, as a fraction of call_value.
        self.fallback_penalty_rate = 0.16

        # Value weight applied per problem type.
        self.problem_value_multiplier = dict(easy=1.0, hard=1.5)

        # FCR (first-call-resolution) bonus, as a fraction of call_value.
        self.fcr_bonus_rate = 0.47

        # --- 3) Call center ----------------------------------------------
        # Operating time in minutes (e.g. 8 hours = 480 minutes).
        self.operating_hours = 480

        # --- 4) AI agents ------------------------------------------------
        self.num_ai_agents = 100
        self.ai_cost_per_minute = 0.08
        self.ai_success_rate = dict(easy=0.7, hard=0.2)
        # Factor multiplied into both talk time and ACW for AI agents.
        self.ai_time_multiplier = 0.6

        # --- 5) Human agents ---------------------------------------------
        self.num_human_agents = 3
        self.human_cost_per_minute = 0.33
        self.human_success_rate = dict(easy=1.0, hard=1.0)

        # --- 6) Handling-time ranges (minutes, as (min, max)) ------------
        self.talk_time_range = dict(easy=(3, 6), hard=(6, 10))
        self.acw_time_range = dict(
            easy=(30 / 60, 50 / 60),
            hard=(50 / 60, 70 / 60),
        )

        # --- 7) Heuristic baseline wait time (minutes) -------------------
        self.baseline_wait_time = dict(easy=5.0, hard=8.0)

        # --- 8) Output directories ---------------------------------------
        self.results_dir = RESULTS_DIR
        self.plots_dir = PLOTS_DIR
        self.models_dir = MODELS_DIR

# Create the module-level config instance.
config = AICCConfig()

# Make cuDNN deterministic (accepting the speed penalty).
torch.backends.cudnn.deterministic = True

# 3) Seeding helper and its initial call
def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available the CUDA generators are seeded as well and the
    cuDNN benchmark mode is switched off.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False

# Seed every RNG once at import time with the configured seed.
set_random_seed(config.random_seed)
logger.info(f"Global random seed set to: {config.random_seed}")


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 4) Domain Model: Customer 및 Agent 클래스
# ──────────────────────────────────────────────────────────────────────────────
class Customer:
    """Customer record: immutable arrival data plus mutable tracking state."""

    def __init__(self, customer_id: int, arrival_time: float, problem_type: str, patience: float):
        # Arrival data supplied by the caller.
        self.customer_id = customer_id
        self.arrival_time = arrival_time
        self.problem_type = problem_type  # 'easy' or 'hard'
        self.patience = patience

        # Timestamps; None until the corresponding moment is recorded.
        self.wait_start_time = self.service_start_time = self.service_end_time = None

        # Accumulated durations.
        self.total_wait_time = self.ai_service_time = self.human_service_time = 0.0

        # Outcome flags.
        self.resolved = False
        self.dropped_out = False
        self.is_fcr = False  # resolved on the first call
        self.had_fallback = False

        # Routing decisions: 0=AI, 1=Human; final_routing may also be -1 (failed).
        self.initial_routing = None
        self.final_routing = None

        # Id of the scheduled dropout event, if any.
        self.dropout_event_id = None
        # Final reward assigned to this customer.
        self.reward = 0.0

class Agent:
    """Base class shared by AI and human agents."""

    def __init__(self, agent_id: str, agent_type: str):
        self.agent_id = agent_id
        self.agent_type = agent_type  # 'ai' or 'human'

        # Current assignment.
        self.is_busy = False
        self.current_customer = None
        self.service_end_time = None

        # Lifetime statistics.
        self.total_service_time = 0.0
        self.total_customers_served = 0

    def start_service(self, customer: Customer, current_time: float, duration: float):
        """Assign *customer* to this agent for *duration* starting at *current_time*."""
        customer.service_start_time = current_time
        self.current_customer = customer
        self.service_end_time = current_time + duration
        self.is_busy = True

    def finish_service(self):
        """Book the elapsed service time and release the agent.

        The duration (never negative) is added to the agent totals and to the
        customer's AI or human service time, depending on ``agent_type``.
        """
        served = self.current_customer
        if served:
            elapsed = max(0.0, self.service_end_time - served.service_start_time)
            self.total_service_time += elapsed
            self.total_customers_served += 1

            if self.agent_type != 'ai':
                served.human_service_time += elapsed
            else:
                served.ai_service_time += elapsed

        self.current_customer = None
        self.service_end_time = None
        self.is_busy = False

class AIAgent(Agent):
    """AI agent: service times are scaled by the AI multiplier and success is random."""

    def __init__(self, agent_id: int, config: AICCConfig):
        super().__init__(f"AI_{agent_id}", 'ai')
        self.config = config

    def get_service_time(self, problem_type: str) -> float:
        """Draw talk time and ACW uniformly, each scaled by ``ai_time_multiplier``."""
        cfg = self.config
        talk_lo, talk_hi = cfg.talk_time_range[problem_type]
        acw_lo, acw_hi = cfg.acw_time_range[problem_type]
        # Talk time is drawn before ACW; the draw order fixes the RNG stream.
        talk_part = np.random.uniform(talk_lo, talk_hi) * cfg.ai_time_multiplier
        acw_part = np.random.uniform(acw_lo, acw_hi) * cfg.ai_time_multiplier
        return talk_part + acw_part

    def can_resolve(self, problem_type: str) -> bool:
        """Return True with probability ``ai_success_rate[problem_type]``."""
        success_probability = self.config.ai_success_rate[problem_type]
        return np.random.random() < success_probability

class HumanAgent(Agent):
    """Human agent: unscaled service times and guaranteed resolution."""

    def __init__(self, agent_id: int, config: AICCConfig):
        super().__init__(f"Human_{agent_id}", 'human')
        self.config = config

    def get_service_time(self, problem_type: str) -> float:
        """Draw talk time and ACW uniformly from the configured ranges and sum them."""
        talk_lo, talk_hi = self.config.talk_time_range[problem_type]
        acw_lo, acw_hi = self.config.acw_time_range[problem_type]
        # Talk time is drawn before ACW; the draw order fixes the RNG stream.
        talk_part = np.random.uniform(talk_lo, talk_hi)
        acw_part = np.random.uniform(acw_lo, acw_hi)
        return talk_part + acw_part

    def can_resolve(self, problem_type: str) -> bool:
        """Always True; ``config.human_success_rate`` is not consulted here."""
        return True


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 5) Event System: EventType 및 Event 클래스 정의
# ──────────────────────────────────────────────────────────────────────────────
class EventType(Enum):
    """Kinds of simulation events.

    The numeric values double as a tie-break priority: ``Event.__lt__``
    orders events with (nearly) equal times by ascending ``value``.
    """
    SERVICE_COMPLETION = 1  # an agent finished handling a customer
    DROPOUT            = 2  # a queued customer ran out of patience
    ARRIVAL            = 3  # a new customer arrives
    CLOSING            = 4  # end of operating hours

class Event:
    """Event object scheduled through a priority queue."""

    def __init__(self, event_type: EventType, time: float, customer=None, agent=None, event_id=None):
        self.event_id = event_id
        self.event_type = event_type
        self.time = time
        self.customer = customer
        self.agent = agent
        # Set to True to make the consumer skip this event.
        self.cancelled = False

    def __lt__(self, other):
        """Order by time; times within 1e-6 of each other fall back to event-type value."""
        tolerance = 1e-6
        time_gap = abs(self.time - other.time)
        if time_gap > tolerance:
            return self.time < other.time
        own_rank = self.event_type.value
        other_rank = other.event_type.value
        return own_rank < other_rank


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 6) Routing Policy: 다양한 정책 클래스 정의
# ──────────────────────────────────────────────────────────────────────────────
class RoutingPolicy:
    """Interface for routing policies; subclasses implement :meth:`route`."""

    def __init__(self, name: str):
        # Human-readable policy label.
        self.name = name

    def route(self, customer: Customer, simulator_state: dict) -> int:
        """Return the routing action (0=AI, 1=Human); must be overridden."""
        raise NotImplementedError

class RandomRouting(RoutingPolicy):
    """Picks AI (0) or Human (1) uniformly at random."""

    def __init__(self):
        super().__init__("Random")

    def route(self, customer: Customer, simulator_state: dict) -> int:
        """Ignore the inputs and draw the action from NumPy's global RNG."""
        drawn = np.random.choice([0, 1])
        return int(drawn)

class TypeBasedRouting(RoutingPolicy):
    """Sends 'easy' problems to AI and everything else to a human."""

    def __init__(self):
        super().__init__("Type-based")

    def route(self, customer: Customer, simulator_state: dict) -> int:
        """Return 0 (AI) for an 'easy' problem, otherwise 1 (Human)."""
        if customer.problem_type == 'easy':
            return 0
        return 1

class AIFirstRouting(RoutingPolicy):
    """Always routes to AI first."""

    def __init__(self):
        super().__init__("AI-First")

    def route(self, customer: Customer, simulator_state: dict) -> int:
        """Return 0 (AI) regardless of customer or state."""
        ai_action = 0
        return ai_action

class TimeThresholdRouting(RoutingPolicy):
    """Routes to a human while the estimated human-queue wait stays within a threshold."""

    def __init__(self, threshold: float, config: AICCConfig):
        if config is None:
            raise ValueError("TimeThresholdRouting requires a valid config")
        super().__init__(f"Time-Threshold({threshold:.1f})")
        self.config = config
        self.threshold = threshold

    def route(self, customer: Customer, simulator_state: dict) -> int:
        """Return 1 (Human) if a human is free or the estimated wait fits the threshold.

        The wait estimate is ``queue_length`` times 80% of the configured
        baseline wait for the customer's problem type (7.5 when the type has
        no entry).
        """
        if simulator_state.get('free_humans', 0) > 0:
            return 1

        waiting = simulator_state.get('queue_length', 0)
        baseline = self.config.baseline_wait_time.get(customer.problem_type, 7.5)
        per_customer_estimate = baseline * 0.8
        estimated_wait = waiting * per_customer_estimate

        if estimated_wait <= self.threshold:
            return 1
        return 0

class CostBasedRouting(RoutingPolicy):
    """Routes to whichever channel has the lower expected cost."""

    def __init__(self, config: AICCConfig):
        super().__init__("Cost-Based")
        self.config = config

    def route(self, customer: Customer, simulator_state: dict) -> int:
        """Return 0 (AI) when its expected cost is strictly lower, else 1 (Human).

        AI cost   = AI handling cost + P(fail) * (human handling cost + fallback penalty).
        Human cost = estimated queue wait cost + human handling cost.
        """
        cfg = self.config
        kind = customer.problem_type
        waiting = simulator_state.get('queue_length', 0)
        idle_humans = simulator_state.get('free_humans', 0)

        # Mean handling time = midpoint of talk range + midpoint of ACW range.
        talk_lo, talk_hi = cfg.talk_time_range[kind]
        acw_lo, acw_hi = cfg.acw_time_range[kind]
        mean_talk = (talk_lo + talk_hi) / 2
        mean_acw = (acw_lo + acw_hi) / 2
        human_minutes = mean_talk + mean_acw
        ai_minutes = (mean_talk + mean_acw) * cfg.ai_time_multiplier

        # AI path: its own cost plus the expected cost of a fallback to a human.
        ai_handling_cost = ai_minutes * cfg.ai_cost_per_minute
        penalty = cfg.call_value * cfg.fallback_penalty_rate
        failure_probability = 1 - cfg.ai_success_rate[kind]
        ai_total = ai_handling_cost + failure_probability * (
            human_minutes * cfg.human_cost_per_minute + penalty
        )

        # Human path: no wait when someone is free, else one handling time per queued customer.
        if idle_humans > 0:
            estimated_wait = 0
        else:
            estimated_wait = waiting * human_minutes
        human_total = (
            estimated_wait * cfg.customer_wait_cost_per_minute
            + human_minutes * cfg.human_cost_per_minute
        )

        if ai_total < human_total:
            return 0
        return 1


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 7) UnifiedRewardCalculator: FCR 보너스 반영
# ──────────────────────────────────────────────────────────────────────────────
class UnifiedRewardCalculator:
    """Computes the per-customer reward, including the FCR bonus."""

    def __init__(self, config: AICCConfig):
        self.config = config

    def calculate_reward(self, customer: Customer) -> float:
        """Compute the reward for *customer* once and cache it on the customer.

        A second call for the same customer returns the stored
        ``customer.reward`` without recomputing.
        """
        if getattr(customer, '_reward_calculated', False):
            return customer.reward

        cfg = self.config
        reward = 0.0

        # Resolution value (only for a resolved customer whose final route is AI or Human),
        # plus the first-call-resolution bonus.
        if customer.resolved and customer.final_routing in [0, 1]:
            weight = cfg.problem_value_multiplier.get(customer.problem_type, 1.0)
            reward += cfg.call_value * weight
            if customer.is_fcr:
                reward += cfg.call_value * cfg.fcr_bonus_rate

        # Per-minute charges, in order: waiting, AI service, human service.
        per_minute_charges = (
            (customer.total_wait_time, cfg.customer_wait_cost_per_minute),
            (customer.ai_service_time, cfg.ai_cost_per_minute),
            (customer.human_service_time, cfg.human_cost_per_minute),
        )
        for minutes, rate in per_minute_charges:
            if minutes > 0:
                reward -= minutes * rate

        # Penalty for an AI failure that fell back to a human.
        if customer.had_fallback:
            reward -= cfg.call_value * cfg.fallback_penalty_rate

        customer.reward = reward
        customer._reward_calculated = True
        return reward

# Module-level calculator; AICCSimulator._finalize_customer and
# AICCSimulator.calculate_reward both go through this instance.
unified_reward_calculator = UnifiedRewardCalculator(config)


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 8) Simulator Core: AICCSimulator 클래스 정의 (get_state 최적화 반영)
# ──────────────────────────────────────────────────────────────────────────────
class AICCSimulator:
    """Discrete-event simulator of a contact center with AI and human agents.

    Events (arrival, service completion, dropout, closing) live in a heap
    ordered by ``Event.__lt__`` and are consumed one at a time by
    :meth:`step`. The caller routes every arriving customer through
    :meth:`route_customer`; all other state changes are event driven.

    NOTE: a customer who is still in service at closing is finalized by
    ``_handle_closing``; the service-completion event that fires afterwards
    only releases the agent (see ``_handle_service_completion``).
    """

    def __init__(self, config: AICCConfig):
        """Store *config* and build the initial simulation state."""
        self.config = config
        self.reset()

    # ------------------------------------------------------------------
    # Setup / scheduling
    # ------------------------------------------------------------------
    def reset(self):
        """Reinitialise clock, agents, queues and counters, then seed the event queue."""
        self.current_time     = 0.0
        self.customer_counter = 0
        self.event_counter    = 0

        # Agents
        self.ai_agents        = [AIAgent(i, self.config) for i in range(self.config.num_ai_agents)]
        self.human_agents     = [HumanAgent(i, self.config) for i in range(self.config.num_human_agents)]
        self.idle_human_count = self.config.num_human_agents

        # Waiting line, event heap and per-customer records
        self.human_queue              = deque()
        self.event_queue              = []
        self.all_customers            = []
        self.completed_customer_stats = []

        # Aggregate statistics
        self.total_arrivals   = 0
        self.total_resolved   = 0
        self.total_dropouts   = 0
        self.total_fcr        = 0
        self.total_fallbacks  = 0
        self.total_wait_time  = 0.0

        # First arrival is scheduled before the closing event so event ids
        # (and the RNG stream) keep their established order.
        self._schedule_next_arrival()
        self._push_event(EventType.CLOSING, max(0.0, self.config.operating_hours))

        logger.debug("Simulator reset and initialized")

    def _get_next_event_id(self) -> str:
        """Return a fresh, sequential event id of the form ``event_<n>``."""
        self.event_counter += 1
        return f"event_{self.event_counter}"

    def _push_event(self, event_type, time, customer=None, agent=None) -> Event:
        """Create an event with a fresh id, push it onto the heap and return it."""
        event = Event(event_type, time, customer=customer, agent=agent,
                      event_id=self._get_next_event_id())
        heapq.heappush(self.event_queue, event)
        return event

    def _schedule_next_arrival(self):
        """Schedule the next arrival if it falls strictly before closing."""
        if self.current_time >= self.config.operating_hours:
            return
        inter_arrival = np.random.exponential(1.0 / self.config.arrival_rate)
        next_time = self.current_time + inter_arrival
        if next_time < self.config.operating_hours:
            self._push_event(EventType.ARRIVAL, next_time)

    def _create_customer(self) -> Customer:
        """Create, register and return a customer arriving at the current time."""
        self.customer_counter += 1
        distribution = self.config.problem_distribution
        # Problem type is drawn before patience; the order fixes the RNG stream.
        problem_type = np.random.choice(
            ['easy', 'hard'],
            p=[distribution['easy'], distribution['hard']]
        )
        patience = np.random.exponential(self.config.customer_patience_mean)
        customer = Customer(self.customer_counter, self.current_time, problem_type, patience)
        self.all_customers.append(customer)
        self.total_arrivals += 1
        return customer

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------
    def _get_available_human_agent(self):
        """Return the first idle human agent, or None."""
        return next((agent for agent in self.human_agents if not agent.is_busy), None)

    def _get_available_ai_agent(self):
        """Return the first idle AI agent, or None."""
        return next((agent for agent in self.ai_agents if not agent.is_busy), None)

    def _free_human_count(self) -> int:
        """Number of human agents that are not currently busy."""
        return self.config.num_human_agents - sum(a.is_busy for a in self.human_agents)

    def _cancel_dropout_event(self, customer: Customer):
        """Mark the customer's pending dropout event (if any) as cancelled."""
        if customer.dropout_event_id is None:
            return
        for ev in self.event_queue:
            if (ev.event_id == customer.dropout_event_id and
                    ev.event_type == EventType.DROPOUT and
                    ev.customer == customer):
                ev.cancelled = True
                break
        customer.dropout_event_id = None

    def _accumulate_wait(self, customer: Customer):
        """Close the customer's open waiting interval and book it."""
        if customer.wait_start_time is None:
            return
        waited = self.current_time - customer.wait_start_time
        customer.total_wait_time += waited
        self.total_wait_time += waited
        customer.wait_start_time = None

    def _mark_resolved(self, customer: Customer, routing: int):
        """Record a successful resolution via *routing* (0=AI, 1=Human)."""
        customer.resolved = True
        customer.final_routing = routing
        # FCR only when the resolving channel was the first one tried.
        if customer.initial_routing == routing and not customer.had_fallback:
            customer.is_fcr = True
            self.total_fcr += 1
        self.total_resolved += 1

    def _mark_dropped(self, customer: Customer):
        """Record that the customer left without a resolution."""
        customer.dropped_out = True
        customer.resolved = False
        customer.final_routing = -1
        self.total_dropouts += 1

    # ------------------------------------------------------------------
    # Routing
    # ------------------------------------------------------------------
    def route_customer(self, customer: Customer, action: int):
        """Route *customer* to a human (action == 1) or to AI (any other action)."""
        if customer.resolved or customer.dropped_out:
            return
        customer.initial_routing = action
        if action == 1:
            self._route_to_human(customer)
        else:
            self._route_to_ai(customer)

    def _route_to_human(self, customer: Customer):
        """Start human service immediately, or queue the customer with a dropout timer."""
        available_agent = self._get_available_human_agent()
        if available_agent is not None:
            # Served at once: counts as first-call unless this is an AI fallback.
            customer.is_fcr = (not customer.had_fallback)
            self._start_human_service(customer, available_agent)
            return

        customer.wait_start_time = self.current_time
        self.human_queue.append(customer)
        dropout_time = self.current_time + customer.patience
        # Dropouts at or after closing are not scheduled; closing handles them.
        if dropout_time < self.config.operating_hours:
            event = self._push_event(EventType.DROPOUT, dropout_time, customer=customer)
            customer.dropout_event_id = event.event_id

    def _route_to_ai(self, customer: Customer):
        """Start AI service, adding a new AI agent when none is idle."""
        agent = self._get_available_ai_agent()
        if agent is None:
            # AI capacity is treated as unlimited: grow the pool on demand.
            agent = AIAgent(len(self.ai_agents), self.config)
            self.ai_agents.append(agent)
        self._start_ai_service(customer, agent)

    def _start_human_service(self, customer: Customer, agent: HumanAgent):
        """Begin human service and schedule its completion event."""
        self._cancel_dropout_event(customer)
        service_time = agent.get_service_time(customer.problem_type)
        agent.start_service(customer, self.current_time, service_time)
        self.idle_human_count -= 1
        self._accumulate_wait(customer)
        self._push_event(EventType.SERVICE_COMPLETION, agent.service_end_time,
                         customer=customer, agent=agent)

    def _start_ai_service(self, customer: Customer, agent: AIAgent):
        """Begin AI service and schedule its completion event."""
        service_time = agent.get_service_time(customer.problem_type)
        agent.start_service(customer, self.current_time, service_time)
        self._push_event(EventType.SERVICE_COMPLETION, agent.service_end_time,
                         customer=customer, agent=agent)

    # ------------------------------------------------------------------
    # Event handlers
    # ------------------------------------------------------------------
    def _handle_service_completion(self, event: Event):
        """Release the agent and settle the customer's outcome.

        Customers that were in service at closing have already been finalized
        by ``_handle_closing``. For those, only the agent is released here;
        settling them again would double-count resolutions, FCRs and
        fallbacks and append a duplicate summary row.
        """
        agent = event.agent
        customer = event.customer
        already_finalized = customer is not None and (customer.resolved or customer.dropped_out)

        agent.finish_service()

        if agent.agent_type == 'ai':
            if not already_finalized:
                self._handle_ai_service_completion(customer, agent)
            return

        if not already_finalized:
            self._handle_human_service_completion(customer, agent)

        # The human agent is free again: hand over the next waiting customer.
        self.idle_human_count += 1
        while self.human_queue:
            next_customer = self.human_queue.popleft()
            if not next_customer.dropped_out:
                self._start_human_service(next_customer, agent)
                break

    def _handle_ai_service_completion(self, customer: Customer, agent: AIAgent):
        """Resolve via AI, or fall back to a human (or fail if already closed)."""
        if agent.can_resolve(customer.problem_type):
            self._mark_resolved(customer, 0)
            self._finalize_customer(customer)
            return

        customer.had_fallback = True
        customer.is_fcr = False  # an AI failure rules out first-call resolution
        self.total_fallbacks += 1
        if self.current_time >= self.config.operating_hours:
            customer.resolved = False
            customer.final_routing = -1
            self._finalize_customer(customer)
        else:
            self._route_to_human(customer)

    def _handle_human_service_completion(self, customer: Customer, agent: HumanAgent):
        """A human always resolves the problem."""
        self._mark_resolved(customer, 1)
        self._finalize_customer(customer)

    def _handle_dropout(self, event: Event):
        """Drop a customer whose patience ran out while still queued."""
        if event.cancelled:
            return
        customer = event.customer
        if customer.dropped_out or customer not in self.human_queue:
            return
        self._mark_dropped(customer)
        self._accumulate_wait(customer)
        self.human_queue.remove(customer)
        self._finalize_customer(customer)

    def _handle_closing(self, event: Event):
        """Settle every customer still queued or in service at closing time."""
        # 1) Queued customers leave unresolved. Their wait up to closing is
        #    booked first, as it is for a patience-based dropout.
        while self.human_queue:
            customer = self.human_queue.popleft()
            if customer.dropped_out:
                continue
            self._accumulate_wait(customer)
            self._mark_dropped(customer)
            self._finalize_customer(customer)

        # 2) Customers in AI service: success is drawn now; a failure is a
        #    drop, with no fallback to a human.
        for agent in self.ai_agents:
            customer = agent.current_customer
            if not (agent.is_busy and customer):
                continue
            if customer.resolved or customer.dropped_out:
                continue
            if agent.can_resolve(customer.problem_type):
                self._mark_resolved(customer, 0)
            else:
                self._mark_dropped(customer)
            self._finalize_customer(customer)

        # 3) Customers in human service are treated as resolved.
        for agent in self.human_agents:
            customer = agent.current_customer
            if not (agent.is_busy and customer):
                continue
            if customer.resolved or customer.dropped_out:
                continue
            self._mark_resolved(customer, 1)
            self._finalize_customer(customer)

    def _finalize_customer(self, customer: Customer):
        """Normalise outcome flags, compute the reward and store a summary row.

        Raises:
            AssertionError: if the customer is flagged both FCR and fallback.
        """
        # Exactly one of resolved / dropped_out holds; anything unresolved
        # counts as dropped.
        customer.dropped_out = not customer.resolved

        if customer.is_fcr and customer.had_fallback:
            raise AssertionError(
                f"is_fcr={customer.is_fcr}, had_fallback={customer.had_fallback} for customer_id={customer.customer_id}"
            )

        unified_reward_calculator.calculate_reward(customer)

        self.completed_customer_stats.append({
            'customer_id':        customer.customer_id,
            'problem_type':       customer.problem_type,
            'resolved':           customer.resolved,
            'dropped_out':        customer.dropped_out,
            'is_fcr':             customer.is_fcr,
            'had_fallback':       customer.had_fallback,
            'total_wait_time':    customer.total_wait_time,
            'ai_service_time':    customer.ai_service_time,
            'human_service_time': customer.human_service_time,
            'initial_routing':    customer.initial_routing,
            'final_routing':      customer.final_routing,
            'reward':             customer.reward
        })

    # ------------------------------------------------------------------
    # Public stepping / observation API
    # ------------------------------------------------------------------
    def step(self):
        """Process the next non-cancelled event.

        Returns:
            ``(customer, False)`` for an arrival (the caller must route the
            customer), ``(None, False)`` for any other event, and
            ``(None, True)`` once no events remain.
        """
        event = None
        while self.event_queue:
            candidate = heapq.heappop(self.event_queue)
            if not candidate.cancelled:
                event = candidate
                break
        if event is None:
            return None, True

        self.current_time = event.time

        if event.event_type == EventType.ARRIVAL:
            customer = self._create_customer()
            self._schedule_next_arrival()
            return customer, False

        if event.event_type == EventType.SERVICE_COMPLETION:
            self._handle_service_completion(event)
        elif event.event_type == EventType.DROPOUT:
            self._handle_dropout(event)
        elif event.event_type == EventType.CLOSING:
            self._handle_closing(event)
        return None, False

    def get_state(self, customer: Customer) -> np.ndarray:
        """Return the 4-dimensional float32 state vector.

        Components: queue length / 10 capped at 1, share of free human
        agents, problem difficulty (1.0 for 'hard', else 0.0) and remaining
        share of operating time. The two ratios are 0.0 when their
        denominator is zero.
        """
        num_humans = self.config.num_human_agents
        hours = self.config.operating_hours

        free_ratio = self._free_human_count() / num_humans if num_humans > 0 else 0.0
        remaining_ratio = max(0.0, hours - self.current_time) / hours if hours > 0 else 0.0

        return np.array([
            min(len(self.human_queue) / 10.0, 1.0),
            free_ratio,
            1.0 if customer.problem_type == 'hard' else 0.0,
            remaining_ratio
        ], dtype=np.float32)

    def get_state_dict(self, customer: Customer) -> dict:
        """Return the state as a dict with raw counts (used by rule-based policies)."""
        state_array = self.get_state(customer)
        return {
            'queue_length': len(self.human_queue),
            'free_humans': self._free_human_count(),
            'problem_difficulty': 1.0 if customer.problem_type == 'hard' else 0.0,
            'remaining_time_ratio': state_array[3]
        }

    def calculate_reward(self, customer: Customer) -> float:
        """Return the customer's reward from the shared reward calculator."""
        return unified_reward_calculator.calculate_reward(customer)

    def get_simulation_results(self):
        """Return the KPI dict for the run (all zeros when nobody arrived)."""
        total_customers = self.total_arrivals
        if total_customers == 0:
            return {
                'total_customers':   0,
                'resolution_rate':   0.0,
                'dropout_rate':      0.0,
                'fcr_rate':          0.0,
                'fallback_rate':     0.0,
                'avg_wait_time':     0.0,
                'ai_success_rate':   0.0,
                'human_utilization': 0.0,
                'asa':               0.0,
                'aht':               0.0,
                'abandonment_rate':  0.0
            }

        stats = self.completed_customer_stats

        resolution_rate = self.total_resolved / total_customers
        # dropout_rate counts everyone not resolved; abandonment_rate only recorded dropouts.
        dropout_rate = (total_customers - self.total_resolved) / total_customers
        fcr_rate = self.total_fcr / total_customers
        fallback_rate = self.total_fallbacks / total_customers
        abandonment_rate = self.total_dropouts / total_customers

        # Average wait among customers who actually waited.
        wait_times = [s['total_wait_time'] for s in stats if s['total_wait_time'] > 0]
        avg_wait_time = np.mean(wait_times) if wait_times else 0.0

        ai_customers = [s for s in stats if s['initial_routing'] == 0]
        ai_success_count = sum(1 for s in ai_customers if s['final_routing'] == 0)
        ai_success_rate = ai_success_count / len(ai_customers) if ai_customers else 0.0

        total_human_work_time = sum(a.total_service_time for a in self.human_agents)
        human_capacity = self.config.num_human_agents * self.config.operating_hours
        human_utilization = total_human_work_time / human_capacity if human_capacity > 0 else 0.0

        answered_customers = [s for s in stats if s['resolved']]
        asa = (
            np.mean([s['total_wait_time'] for s in answered_customers])
            if answered_customers else 0.0
        )
        handle_times = [
            s['ai_service_time'] + s['human_service_time'] for s in answered_customers
        ]
        aht = np.mean(handle_times) if handle_times else 0.0

        return {
            'total_customers':   total_customers,
            'resolution_rate':   resolution_rate,
            'dropout_rate':      dropout_rate,
            'fcr_rate':          fcr_rate,
            'fallback_rate':     fallback_rate,
            'avg_wait_time':     avg_wait_time,
            'ai_success_rate':   ai_success_rate,
            'human_utilization': human_utilization,
            'asa':               asa,
            'aht':               aht,
            'abandonment_rate':  abandonment_rate
        }


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 9) run_simulation_with_policy 및 run_simulation_with_rl 함수 정의
# ──────────────────────────────────────────────────────────────────────────────
def run_simulation_with_policy(policy, seed=42):
    """
    Run one full episode under a baseline routing policy and return its KPIs.

    Args:
        policy: object exposing ``route(customer, state_dict)``; its return value is
            handed to ``AICCSimulator.route_customer`` as the routing action.
        seed: passed to ``set_random_seed`` before the run; ``None`` skips reseeding.

    Returns:
        The dict produced by ``get_simulation_results()`` with one extra key,
        ``'total_reward'`` (sum of ``reward`` over ``simulator.all_customers``).
    """
    if seed is not None:
        set_random_seed(seed)

    sim = AICCSimulator(config)
    finished = False

    while not finished:
        arrival, finished = sim.step()
        if arrival is None:
            # This step produced no customer to route.
            continue

        # Baseline policies only see a compact snapshot, not the simulator itself.
        snapshot = {
            'queue_length': len(sim.human_queue),
            'free_humans':  config.num_human_agents - sum(agent.is_busy for agent in sim.human_agents)
        }
        sim.route_customer(arrival, policy.route(arrival, snapshot))

    # Rewards are summed only after the episode has ended.
    reward_sum = sum(cust.reward for cust in sim.all_customers)
    kpis = sim.get_simulation_results()
    kpis['total_reward'] = reward_sum
    return kpis

def run_simulation_with_rl(policy_obj, seed=42):
    """
    Run one full episode under an RL routing policy and return its KPIs.

    Args:
        policy_obj: object exposing ``route(customer, simulator)``; unlike the baseline
            policies it receives the simulator itself rather than a state dict.
        seed: passed to ``set_random_seed`` before the run; ``None`` skips reseeding.

    Returns:
        The dict produced by ``get_simulation_results()`` with one extra key,
        ``'total_reward'`` (sum of ``reward`` over ``simulator.all_customers``).
    """
    if seed is not None:
        set_random_seed(seed)

    sim = AICCSimulator(config)

    while True:
        arrival, finished = sim.step()
        if finished:
            # A customer returned together with the end-of-episode flag is not routed.
            # NOTE(review): run_simulation_with_policy does route in that case - confirm
            # which behaviour is intended.
            break
        if arrival is None:
            # Nothing to route yet; keep stepping until a customer shows up.
            continue

        sim.route_customer(arrival, policy_obj.route(arrival, sim))

    # Rewards are summed in one pass once the episode has ended.
    reward_sum = sum(cust.reward for cust in sim.all_customers)

    kpis = sim.get_simulation_results()
    kpis['total_reward'] = reward_sum
    return kpis


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 10) Baseline 정책별 평가 및 보상 분해 정보
# ──────────────────────────────────────────────────────────────────────────────
baseline_policies = {
    "Random":      RandomRouting(),
    "Type-based":  TypeBasedRouting(),
    "AI-First":    AIFirstRouting(),
    "Cost-Based":  CostBasedRouting(config)
}

# Grid search for the TimeThreshold cutoff: five candidates centred on `center`,
# spaced `step` apart.
center = 2.0
step   = 0.2
time_threshold_values = [center + offset * step for offset in range(-2, 3)]

# Score every candidate by its mean resolution rate over 10 unseeded runs.
tt_results = []
for thr in time_threshold_values:
    res_rates = [
        run_simulation_with_policy(
            TimeThresholdRouting(threshold=thr, config=config), seed=None
        )['resolution_rate']
        for _ in range(10)
    ]
    tt_results.append((thr, np.mean(res_rates)))

# Keep the best-scoring threshold and register it alongside the other baselines.
best_time_threshold = max(tt_results, key=lambda scored: scored[1])[0]
baseline_policies[f"TimeThreshold({best_time_threshold})"] = TimeThresholdRouting(best_time_threshold, config)

# Evaluate every baseline with 50 unseeded runs each.
results_detailed = []
for name, policy in tqdm(baseline_policies.items(), desc="Baseline Policies Evaluation"):
    for run in range(50):
        metrics = run_simulation_with_policy(policy, seed=None)
        metrics.update(policy=name, run=run + 1)
        results_detailed.append(metrics)

# One row per (policy, run).
df_baseline_detailed = pd.DataFrame(results_detailed)

# One row per policy: mean of the selected KPIs across its runs.
summary_metrics = [
    'total_customers',
    'resolution_rate',
    'fcr_rate',
    'avg_wait_time',
    'total_reward',
    'ai_success_rate',
    'human_utilization',
    'abandonment_rate',
]
df_baseline_summary = (
    df_baseline_detailed
    .groupby('policy')
    .agg({metric: 'mean' for metric in summary_metrics})
    .round(4)
    .reset_index()
)

# Show the per-policy summary table.
print("=== Baseline 정책별 평균 성능 (50회 평균) ===")
display(df_baseline_summary)

Baseline Policies Evaluation: 100%|██████████| 5/5 [00:10<00:00,  2.07s/it]

=== Baseline 정책별 평균 성능 (50회 평균) ===





Unnamed: 0,policy,total_customers,resolution_rate,fcr_rate,avg_wait_time,total_reward,ai_success_rate,human_utilization,abandonment_rate
0,AI-First,480.02,0.7276,0.4029,1.311,2165.6549,0.397,0.8736,0.2808
1,Cost-Based,482.4,0.6833,0.5289,1.2524,2202.9847,0.4837,0.9416,0.3247
2,Random,479.64,0.5651,0.4276,1.3194,1295.826,0.3913,0.9288,0.4421
3,TimeThreshold(2.2),481.26,0.6187,0.4763,1.2064,1683.1912,0.3973,0.9686,0.3889
4,Type-based,480.18,0.6199,0.5643,1.336,1947.4806,0.6983,0.9183,0.3875


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 보상 분해 정보 - 50회 반복 평가
# ──────────────────────────────────────────────────────────────────────────────

# 1) 베이스라인 정책 정의(생략)

# 2) Per-customer reward components plus routing/timing info, gathered over 50 runs per policy
breakdown_records = []

for name, policy in tqdm(baseline_policies.items(), desc="보상 분해 분석"):
    for run in range(50):
        sim = AICCSimulator(config)
        done = False

        # Play one full episode with this policy.
        while not done:
            cust, done = sim.step()
            if cust is None:
                continue
            state = {
                "queue_length": len(sim.human_queue),
                "free_humans": config.num_human_agents - sum(a.is_busy for a in sim.human_agents)
            }
            sim.route_customer(cust, policy.route(cust, state))

        # Rebuild each reward component from the per-customer stats and config rates.
        for c in sim.completed_customer_stats:
            total_wait   = c["total_wait_time"]
            ai_time      = c["ai_service_time"]
            human_time   = c["human_service_time"]
            resolved     = c["resolved"]
            is_fcr       = c["is_fcr"]
            had_fallback = c["had_fallback"]
            problem_type = c["problem_type"]

            # Value-side terms apply only when their condition holds.
            if resolved:
                base_value = config.call_value * config.problem_value_multiplier[problem_type]
            else:
                base_value = 0.0

            if is_fcr:
                fcr_bonus = config.call_value * config.fcr_bonus_rate
            else:
                fcr_bonus = 0.0

            if had_fallback:
                fallback_penalty = config.call_value * config.fallback_penalty_rate
            else:
                fallback_penalty = 0.0

            # Cost-side terms scale linearly with elapsed minutes.
            wait_penalty = total_wait * config.customer_wait_cost_per_minute
            ai_cost      = ai_time * config.ai_cost_per_minute
            human_cost   = human_time * config.human_cost_per_minute

            record = {
                "policy": name,
                "run": run + 1,
                "customer_id": c["customer_id"],
            }
            # --- reward components ---
            record.update({
                "base_reward": base_value,
                "fcr_bonus": fcr_bonus,
                "wait_penalty": wait_penalty,
                "ai_cost": ai_cost,
                "human_cost": human_cost,
                "fallback_penalty": fallback_penalty,
                "total_reward": c["reward"],
            })
            # --- routing & state info ---
            record.update({
                "initial_routing":    c["initial_routing"],    # 0=AI, 1=Human
                "final_routing":      c["final_routing"],      # 0=AI, 1=Human, -1=Failed
                "is_fcr":             c["is_fcr"],             # resolved on first contact
                "total_wait_time":    total_wait,
                "ai_service_time":    ai_time,
                "human_service_time": human_time,
            })
            breakdown_records.append(record)

# 3) Convert to a DataFrame
df_breakdown = pd.DataFrame(breakdown_records)

# 4) Show the first 20 rows
print("=== 보상 분해 정보 (첫 20개 행) ===")
display(df_breakdown.head(20))


보상 분해 분석: 100%|██████████| 5/5 [00:13<00:00,  2.70s/it]


=== 보상 분해 정보 (첫 20개 행) ===


Unnamed: 0,policy,run,customer_id,base_reward,fcr_bonus,wait_penalty,ai_cost,human_cost,fallback_penalty,total_reward,initial_routing,final_routing,is_fcr,total_wait_time,ai_service_time,human_service_time
0,Random,1,7,0.0,0.0,1.02812,0.0,0.0,0.0,-1.02812,1,-1,False,0.35823,0.0,0.0
1,Random,1,3,7.29656,3.42938,0.0,0.23694,0.0,0.0,10.489,0,0,True,0.0,2.96179,0.0
2,Random,1,6,0.0,0.0,6.07145,0.0,0.0,0.0,-6.07145,1,-1,False,2.11549,0.0,0.0
3,Random,1,1,7.29656,3.42938,0.0,0.0,1.98754,0.0,8.7384,1,1,True,0.0,0.0,6.02283
4,Random,1,8,7.29656,3.42938,0.0,0.24941,0.0,0.0,10.47653,0,0,True,0.0,3.11766,0.0
5,Random,1,2,10.94484,3.42938,0.0,0.0,2.7364,0.0,11.63782,1,1,True,0.0,0.0,8.29212
6,Random,1,4,10.94484,3.42938,0.0,0.0,2.68275,0.0,11.69147,1,1,True,0.0,0.0,8.12955
7,Random,1,9,10.94484,3.42938,0.0,0.43458,0.0,0.0,13.93964,0,0,True,0.0,5.43226,0.0
8,Random,1,10,7.29656,3.42938,0.0,0.0,1.78939,0.0,8.93655,1,1,True,0.0,0.0,5.4224
9,Random,1,5,10.94484,0.0,0.0,0.49971,2.43604,1.16745,6.84164,0,1,False,0.0,6.24643,7.38193


In [None]:
# 5) Per-policy statistics of the reward breakdown.
#    Statistics are taken over every customer row pooled across a policy's 50 runs.
#
# The flattened column labels are generated from the SAME ordered lists that build
# the aggregation spec. A hand-written label list previously started with
# "total_reward_*" while the spec starts with "base_reward", so every reward column
# carried the wrong label.
reward_columns = [
    "base_reward",
    "fcr_bonus",
    "wait_penalty",
    "ai_cost",
    "human_cost",
    "fallback_penalty",
    "total_reward",
]
reward_stats = ["mean", "std", "sum"]

# Source column -> label prefix used in the flattened output.
time_columns = {
    "total_wait_time":    "avg_wait_time",
    "ai_service_time":    "avg_ai_service_time",
    "human_service_time": "avg_human_service_time",
}
time_stats = ["mean", "std"]

# Dict insertion order determines the column order of the aggregated frame.
agg_spec = {col: list(reward_stats) for col in reward_columns}
agg_spec["initial_routing"] = lambda x: (x == 0).sum()  # customers first routed to AI
agg_spec["final_routing"]   = lambda x: (x == 0).sum()  # customers finally handled by AI
agg_spec["is_fcr"]          = "sum"                     # customers with is_fcr == True
agg_spec.update({col: list(time_stats) for col in time_columns})

agg = df_breakdown.groupby("policy").agg(agg_spec).reset_index()

# 5-1) Flatten the MultiIndex columns, in the same order as agg_spec.
agg.columns = (
    ["policy"]
    + [f"{col}_{stat}" for col in reward_columns for stat in reward_stats]
    + ["num_initial_to_ai", "num_final_by_ai", "num_fcr"]
    + [f"{prefix}_{stat}" for prefix in time_columns.values() for stat in time_stats]
)

# 6) Show the per-policy table
print("\n=== 정책별 보상 분해 통계 (50회 평균) ===")
display(agg)



=== 정책별 보상 분해 통계 (50회 평균) ===


Unnamed: 0,policy,total_reward_mean,total_reward_std,total_reward_sum,base_reward_mean,base_reward_std,base_reward_sum,fcr_bonus_mean,fcr_bonus_std,fcr_bonus_sum,...,fallback_penalty_sum,num_initial_to_ai,num_final_by_ai,num_fcr,avg_wait_time_mean,avg_wait_time_std,avg_ai_service_time_mean,avg_ai_service_time_std,avg_human_service_time_mean,avg_human_service_time_std
0,AI-First,6.57205,4.43988,161100.69033,1.36459,1.6786,33450.19171,1.56557,2.82136,38376.81735,...,110483.27564,24513,9754,9754,0.54549,0.98305,4.43762,1.35411,2.55252,3.94725
1,Cost-Based,6.1995,4.55089,149984.38516,1.79074,1.71304,43323.38239,1.85935,2.95806,44983.30782,...,110768.42038,16802,8096,12633,0.64786,1.03068,2.81763,2.17752,2.81036,4.09375
2,Random,5.19231,4.81101,126562.43598,1.4594,1.69562,35572.97915,2.51287,3.27553,61251.17763,...,66448.90544,12256,4845,10373,0.87556,1.1413,2.24675,2.43234,2.7479,3.91531
3,TimeThreshold(2.2),5.6113,4.74276,136792.20943,1.60187,1.71101,39050.37246,2.20427,3.0192,53735.67429,...,84255.24106,14487,5648,11387,0.76804,1.05198,2.64468,2.42448,2.86113,3.95229
4,Type-based,5.515,4.57242,133727.65533,1.92995,1.70116,46797.34632,2.17948,3.17271,52847.94511,...,100673.56411,9765,6917,13646,0.7594,1.10547,1.24029,1.55429,2.72683,4.0515


In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 11) Training Module: ReplayBuffer 및 DQN 계열 네트워크 정의 (풀버전)
# ──────────────────────────────────────────────────────────────────────────────
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
import numpy as np
from tqdm import tqdm

# CUDNN 설정: 비결정론적 + 벤치마크 모드 활성화
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark     = True

class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity: int = 5000):
        self.capacity = capacity
        # deque(maxlen=...) drops the oldest transition once the buffer is full.
        self.buffer   = deque(maxlen=capacity)

    def push(self, state: np.ndarray, action: int, reward: float, next_state: np.ndarray, done: float):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        """
        Draw ``batch_size`` transitions without replacement and return them as tensors
        on the module-level ``device``.

        Returns a 5-tuple (states, actions, rewards, next_states, dones): actions are
        int64, everything else float32; actions, rewards and dones get a trailing
        dimension of size 1.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one stacked array per field.
        states, actions, rewards, next_states, dones = (
            np.stack(field) for field in zip(*picked)
        )

        def _tensor(values, dtype, as_column=False):
            out = torch.tensor(values, dtype=dtype, device=device)
            return out.unsqueeze(1) if as_column else out

        return (
            _tensor(states, torch.float32),
            _tensor(actions, torch.long, as_column=True),
            _tensor(rewards, torch.float32, as_column=True),
            _tensor(next_states, torch.float32),
            _tensor(dones, torch.float32, as_column=True),
        )

    def __len__(self) -> int:
        return len(self.buffer)

class DQNNetwork(nn.Module):
    """
    MLP Q-network: two hidden blocks (Linear -> LayerNorm -> ReLU -> Dropout)
    followed by a linear head that emits one Q-value per action.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int = 64, dropout_p: float = 0.1):
        super().__init__()
        # Hidden block 1
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.do1 = nn.Dropout(p=dropout_p)

        # Hidden block 2
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)
        self.do2 = nn.Dropout(p=dropout_p)

        # Output head
        self.fc3 = nn.Linear(hidden_dim, action_dim)

        # Kaiming-uniform weights (ReLU gain for hidden layers, linear gain for the head).
        init_plan = (
            (self.fc1, 'relu'),
            (self.fc2, 'relu'),
            (self.fc3, 'linear'),
        )
        for layer, mode in init_plan:
            nn.init.kaiming_uniform_(layer.weight, nonlinearity=mode)
        # Zero biases.
        for layer, _ in init_plan:
            if layer.bias is not None:
                nn.init.constant_(layer.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map input ``x`` through both hidden blocks and return the head's Q-values."""
        hidden = self.do1(F.relu(self.ln1(self.fc1(x))))
        hidden = self.do2(F.relu(self.ln2(self.fc2(hidden))))
        return self.fc3(hidden)

class DuelingDQNNetwork(nn.Module):
    """
    Dueling Q-network: a shared hidden block feeds separate value and advantage
    streams, recombined as ``V + (A - mean(A))``.
    """

    def __init__(self, state_dim: int, action_dim: int, hidden_dim: int = 64, dropout_p: float = 0.1):
        super().__init__()
        # Shared trunk
        self.fc1      = nn.Linear(state_dim, hidden_dim)
        self.ln1      = nn.LayerNorm(hidden_dim)
        self.do1      = nn.Dropout(p=dropout_p)

        # Value stream -> one scalar per sample
        self.fc_value = nn.Linear(hidden_dim, hidden_dim)
        self.ln_v     = nn.LayerNorm(hidden_dim)
        self.do_v     = nn.Dropout(p=dropout_p)
        self.value_out= nn.Linear(hidden_dim, 1)

        # Advantage stream -> one value per action
        self.fc_adv   = nn.Linear(hidden_dim, hidden_dim)
        self.ln_a     = nn.LayerNorm(hidden_dim)
        self.do_a     = nn.Dropout(p=dropout_p)
        self.adv_out  = nn.Linear(hidden_dim, action_dim)

        # Kaiming-uniform (ReLU gain) weights and zero biases on every Linear layer.
        linear_layers = (self.fc1, self.fc_value, self.fc_adv, self.value_out, self.adv_out)
        for lin in linear_layers:
            nn.init.kaiming_uniform_(lin.weight, nonlinearity='relu')
            if lin.bias is None:
                continue
            nn.init.constant_(lin.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Q-values built from the value and mean-centred advantage streams."""
        trunk = self.do1(F.relu(self.ln1(self.fc1(x))))

        value = self.value_out(self.do_v(F.relu(self.ln_v(self.fc_value(trunk)))))
        advantage = self.adv_out(self.do_a(F.relu(self.ln_a(self.fc_adv(trunk)))))

        # Centre the advantages over the action dimension (dim=1) before adding V.
        centred = advantage - advantage.mean(dim=1, keepdim=True)
        return value + centred

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 13) DQN / Double DQN / Dueling DQN 학습 및 저장 (에피소드 단위 처리)
# ──────────────────────────────────────────────────────────────────────────────
import os
import matplotlib.pyplot as plt

num_episodes = 7000

def train_dqn_episode_based(
    env_class,
    config,
    network_type: str = "DQN",
    num_episodes: int = num_episodes,
    batch_size: int = 32,
    gamma: float = 0.98,
    lr: float = 5e-4,
    buffer_capacity: int = 8000,
    target_update_freq: int = 20,
    epsilon_start: float = 1.0,
    epsilon_final: float = 0.05,
    epsilon_decay: int = 700,
    hidden_dim: int = 128,
    dropout_p: float = 0.1
):
    """
    Train a DQN-family routing agent, one simulator episode at a time.

    Each episode is played to completion with an epsilon-greedy policy. Customer
    rewards are read only after the episode ends, the transitions are pushed to the
    replay buffer, and a single gradient step is taken on one sampled mini-batch.

    Args:
        env_class: simulator class; ``env_class(config)`` must provide ``step()``
            returning ``(customer_or_None, done)``, ``get_state(customer)`` and
            ``route_customer(customer, action)``. Customers must expose ``reward``.
        config: passed unchanged to ``env_class``.
        network_type: ``"DuelingDQN"`` selects the dueling architecture;
            ``"DoubleDQN"`` selects the double-Q target rule; any other value
            trains a plain DQN.
        num_episodes: number of episodes (and at most that many gradient steps).
        batch_size: mini-batch size; training starts once the buffer holds this many.
        gamma: discount factor.
        lr: Adam learning rate.
        buffer_capacity: replay buffer size.
        target_update_freq: episodes between hard target-network syncs.
        epsilon_start, epsilon_final, epsilon_decay: exponential epsilon schedule,
            evaluated once per episode.
        hidden_dim, dropout_p: network width and dropout probability.

    Returns:
        ``(losses, all_rewards, policy_net)``: per-update losses, per-episode summed
        rewards, and the trained online network.
    """
    state_dim = 4
    action_dim = 2

    net_cls = DuelingDQNNetwork if network_type == "DuelingDQN" else DQNNetwork
    policy_net = net_cls(state_dim, action_dim, hidden_dim=hidden_dim, dropout_p=dropout_p).to(device)
    target_net = net_cls(state_dim, action_dim, hidden_dim=hidden_dim, dropout_p=dropout_p).to(device)

    target_net.load_state_dict(policy_net.state_dict())
    # The target network only ever produces bootstrap targets, so keep dropout off.
    # load_state_dict() does not change the train/eval mode, so this holds across syncs.
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    replay_buffer = ReplayBuffer(capacity=buffer_capacity)
    loss_fn = nn.SmoothL1Loss()

    def epsilon_by_episode(ep):
        return epsilon_final + (epsilon_start - epsilon_final) * np.exp(-1.0 * ep / epsilon_decay)

    # Placeholder successor for the last transition of an episode. Its Q-value is
    # masked out by the done flag in the target, so only its shape matters.
    # Layout per the original note: [queue=0, free_humans=1, problem=0, time=0].
    terminal_state = np.array([0.0, 1.0, 0.0, 0.0], dtype=np.float32)

    losses = []
    all_rewards = []

    for episode in tqdm(range(1, num_episodes + 1), desc=f"{network_type} Training"):
        env = env_class(config)
        episode_transitions = []
        done = False

        # Epsilon depends only on the episode index. Computing it here also keeps
        # it defined for the progress log if an episode yields no customers.
        eps = epsilon_by_episode(episode)

        # Phase 1: play the whole episode. Greedy choices use the network with
        # dropout disabled so exploration comes from the epsilon schedule alone.
        policy_net.eval()
        while not done:
            customer, done = env.step()
            if customer is None:
                continue

            state = env.get_state(customer)

            if random.random() < eps:
                action = random.randrange(action_dim)
            else:
                state_tensor = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
                with torch.no_grad():
                    q_vals = policy_net(state_tensor)
                    action = int(q_vals.argmax(dim=1).item())

            env.route_customer(customer, action)

            # The reward is not known yet; keep the customer to read it later.
            episode_transitions.append({
                'state': state,
                'action': action,
                'customer': customer
            })

        # Phase 2: rewards are read now that the episode is over; build transitions.
        total_episode_reward = 0.0
        last_index = len(episode_transitions) - 1
        for i, trans in enumerate(episode_transitions):
            reward = trans['customer'].reward
            total_episode_reward += reward

            # The successor state is the state observed at the next routing decision.
            if i < last_index:
                next_state = episode_transitions[i + 1]['state']
                done_flag = False
            else:
                next_state = terminal_state
                done_flag = True

            replay_buffer.push(
                trans['state'],
                trans['action'],
                reward,
                next_state,
                done_flag
            )

        all_rewards.append(total_episode_reward)

        # Phase 3: one gradient step per episode once enough data is buffered.
        if len(replay_buffer) >= batch_size:
            states_b, actions_b, rewards_b, next_states_b, dones_b = replay_buffer.sample(batch_size)

            # Targets are computed with dropout off in both networks
            # (policy_net is still in eval mode from the rollout).
            with torch.no_grad():
                if network_type == "DoubleDQN":
                    next_actions = policy_net(next_states_b).argmax(dim=1, keepdim=True)
                    next_q = target_net(next_states_b).gather(1, next_actions)
                else:
                    next_q = target_net(next_states_b).max(dim=1, keepdim=True)[0]
                target_q = rewards_b + gamma * next_q * (1 - dones_b)

            # Dropout is active only for the forward pass that is back-propagated.
            policy_net.train()
            q_vals = policy_net(states_b).gather(1, actions_b)
            loss = loss_fn(q_vals, target_q)

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(policy_net.parameters(), max_norm=1.0)
            optimizer.step()

            losses.append(loss.item())

        # Hard-sync the target network.
        if episode % target_update_freq == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Progress log every 50 episodes.
        if episode % 50 == 0:
            avg_loss = np.mean(losses[-50:]) if losses else 0.0
            avg_reward = np.mean(all_rewards[-50:])
            tqdm.write(f"Episode {episode} | AvgLoss: {avg_loss:.4f} | AvgRew: {avg_reward:.4f} | Eps: {eps:.3f}")

    return losses, all_rewards, policy_net

In [None]:
# DQN 학습
# ──────────────────────────────────────────────────────────────────────────────
# 13) DQN / Double DQN / Dueling DQN 학습 및 저장 (에피소드 단위 처리)
# ──────────────────────────────────────────────────────────────────────────────
import os
import matplotlib.pyplot as plt

num_episodes = 20000

# Train the plain DQN agent.
losses_dqn, rewards_dqn, model_dqn = train_dqn_episode_based(
    AICCSimulator,
    config,
    network_type="DQN",
    num_episodes=num_episodes,
)

# Persist the trained weights.
dqn_path = os.path.join(config.models_dir, "dqn_model.pth")
torch.save(model_dqn.state_dict(), dqn_path)

# Save the loss curve and the reward curve as two small figures.
for series, legend_label, x_label, y_label, png_name in (
    (losses_dqn,  "DQN Loss",   "Step",    "Loss",   "dqn_loss.png"),
    (rewards_dqn, "DQN Reward", "Episode", "Reward", "dqn_reward.png"),
):
    plt.figure(figsize=(5,2.5))
    plt.plot(series, label=legend_label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.savefig(os.path.join(config.plots_dir, png_name))
    plt.close()


In [None]:
# Train the Double DQN agent.
losses_ddqn, rewards_ddqn, model_ddqn = train_dqn_episode_based(
    AICCSimulator,
    config,
    network_type="DoubleDQN",
    num_episodes=num_episodes,
)

# Persist the trained weights.
ddqn_path = os.path.join(config.models_dir, "ddqn_model.pth")
torch.save(model_ddqn.state_dict(), ddqn_path)

# Save the loss curve and the reward curve as two small figures.
for series, legend_label, x_label, y_label, png_name in (
    (losses_ddqn,  "DoubleDQN Loss",   "Step",    "Loss",   "ddqn_loss.png"),
    (rewards_ddqn, "DoubleDQN Reward", "Episode", "Reward", "ddqn_reward.png"),
):
    plt.figure(figsize=(5,2.5))
    plt.plot(series, label=legend_label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.savefig(os.path.join(config.plots_dir, png_name))
    plt.close()

In [None]:
# Train the Dueling DQN agent.
losses_dueling, rewards_dueling, model_dueling = train_dqn_episode_based(
    AICCSimulator,
    config,
    network_type="DuelingDQN",
    num_episodes=num_episodes,
)

# Persist the trained weights.
dueling_path = os.path.join(config.models_dir, "dueling_dqn_model.pth")
torch.save(model_dueling.state_dict(), dueling_path)

# Save the loss curve and the reward curve as two small figures.
for series, legend_label, x_label, y_label, png_name in (
    (losses_dueling,  "DuelingDQN Loss",   "Step",    "Loss",   "dueling_loss.png"),
    (rewards_dueling, "DuelingDQN Reward", "Episode", "Reward", "dueling_reward.png"),
):
    plt.figure(figsize=(5,2.5))
    plt.plot(series, label=legend_label)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.savefig(os.path.join(config.plots_dir, png_name))
    plt.close()

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# 14) RLPolicy 래퍼 정의 및 최종 평가 실행 (풀버전)
# ──────────────────────────────────────────────────────────────────────────────
class RLPolicy(RoutingPolicy):
    """Routing policy that acts greedily on a trained Q-network loaded from disk."""

    def __init__(self, model_path: str, network_type: str):
        """
        Args:
            model_path: path of a ``state_dict`` file saved with ``torch.save``.
            network_type: ``"DuelingDQN"`` builds the dueling architecture, any other
                value the plain DQN network; also forwarded to ``RoutingPolicy.__init__``.
        """
        super().__init__(network_type)
        # Dimensions and width are fixed here; they must match the saved checkpoint
        # or load_state_dict will fail.
        state_dim, action_dim = 4, 2
        net_cls = DuelingDQNNetwork if network_type == "DuelingDQN" else DQNNetwork
        self.policy_net = net_cls(state_dim, action_dim, hidden_dim=128).to(device)

        weights = torch.load(model_path, map_location=device)
        self.policy_net.load_state_dict(weights)
        self.policy_net.eval()  # inference only: disables dropout

    def route(self, customer: Customer, simulator: AICCSimulator) -> int:
        """Return the index of the action with the highest predicted Q-value."""
        features = simulator.get_state(customer)
        batch = torch.tensor(features, dtype=torch.float32, device=device).unsqueeze(0)
        with torch.no_grad():
            best_action = self.policy_net(batch).argmax(dim=1)
            return int(best_action.item())

# 최종 평가
# Wrap each saved checkpoint in an RLPolicy, keyed by network type.
rl_policies = {
    label: RLPolicy(os.path.join(config.models_dir, checkpoint), label)
    for label, checkpoint in (
        ("DQN",        "dqn_model.pth"),
        ("DoubleDQN",  "ddqn_model.pth"),
        ("DuelingDQN", "dueling_dqn_model.pth"),
    )
}

# Final evaluation: every baseline and RL policy gets n_runs unseeded episodes.
all_policies = {**baseline_policies, **rl_policies}
n_runs       = 200
records      = []
for name, policy in tqdm(all_policies.items(), desc="Policies Eval"):
    # RL policies take the simulator itself; baselines take a state dict.
    runner = run_simulation_with_rl if name in rl_policies else run_simulation_with_policy
    for _ in range(n_runs):
        metrics = runner(policy, seed=None)
        metrics['policy'] = name
        records.append(metrics)

# Mean and standard deviation of every KPI per policy.
df_records = pd.DataFrame(records)
df_stats   = df_records.groupby('policy').agg(['mean','std']).reset_index()
print(df_stats)

In [None]:
df_stats

Unnamed: 0_level_0,policy,total_customers,total_customers,resolution_rate,resolution_rate,dropout_rate,dropout_rate,fcr_rate,fcr_rate,fallback_rate,...,human_utilization,human_utilization,asa,asa,aht,aht,abandonment_rate,abandonment_rate,total_reward,total_reward
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std,mean,...,mean,std,mean,std,mean,std,mean,std,mean,std
0,AI-First,479.7,21.79692,0.7219,0.02341,0.2781,0.02341,0.39865,0.02178,0.60459,...,0.87037,0.02246,0.33699,0.05693,7.92377,0.19059,0.28696,0.02337,2113.23236,193.30012
1,Cost-Based,482.805,21.61558,0.6826,0.02255,0.3174,0.02255,0.52969,0.02907,0.35742,...,0.94407,0.01428,0.41803,0.06443,6.90224,0.19244,0.32553,0.02258,2203.491,185.06562
2,DQN,479.195,22.1699,0.72913,0.02308,0.27087,0.02308,0.42117,0.02544,0.58233,...,0.88946,0.01823,0.32332,0.05999,7.808,0.20374,0.27973,0.02298,2210.11415,209.94952
3,DoubleDQN,479.17,19.96507,0.72073,0.0221,0.27927,0.0221,0.48472,0.0262,0.52006,...,0.95328,0.00956,0.34351,0.05733,7.44863,0.19196,0.2882,0.02213,2271.67721,187.4313
4,DuelingDQN,482.23,21.32112,0.71596,0.02029,0.28404,0.02029,0.47921,0.02417,0.52511,...,0.9579,0.00908,0.36228,0.05887,7.47146,0.18919,0.29287,0.01973,2225.0465,182.15035
5,Random,479.61,21.21742,0.56655,0.02285,0.43345,0.02285,0.4307,0.02484,0.30089,...,0.92865,0.01563,0.64445,0.0853,7.44638,0.22022,0.4408,0.02253,1299.94735,177.55309
6,TimeThreshold(2.4),481.335,20.53586,0.62043,0.01789,0.37957,0.01789,0.47968,0.02444,0.35368,...,0.96837,0.00997,0.55651,0.06579,7.25038,0.20684,0.38746,0.01753,1705.63953,152.84774
7,Type-based,481.01,19.27896,0.61987,0.02314,0.38013,0.02314,0.56402,0.02241,0.1217,...,0.92095,0.01596,0.51511,0.07589,6.14536,0.13267,0.38798,0.02301,1933.54232,160.12619
