# In [1]:
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Tuple, Optional, Iterable
import math
import random
import statistics
import itertools


# In [2]:
class Environment:
    """
    Base interface for an environment that an agent interacts with.

    Vocabulary used throughout:
    - state  (S_t):     representation of the current situation
    - action (A_t):     decision taken by the agent
    - reward (R_{t+1}): scalar feedback returned by the environment
    - transition function P(s', r | s, a): realised by step()

    Every method here raises NotImplementedError; concrete environments
    are expected to override them.
    """

    def reset(self) -> Any:
        """Begin a new episode and return the initial state s0."""
        raise NotImplementedError()

    def step(self, action: Any) -> Tuple[Any, float, bool, Dict]:
        """
        Apply ``action`` in the current state.

        Returns a 4-tuple ``(next_state, reward, done, info)`` where
        ``done`` signals that the episode has terminated. This single
        call stands in for both the transition and the reward function.
        """
        raise NotImplementedError()

    def action_space(self) -> Iterable[Any]:
        """Return the actions available in the current state (or a fixed set)."""
        raise NotImplementedError()


class Agent:
    """
    Base interface for an agent, i.e. a mapping from states to actions.

    - act(state) implements the policy π(a|s)
    - observe(...) is an optional hook for updating internal knowledge
    """

    def act(self, state: Any) -> Any:
        """Choose an action for ``state``; subclasses must override this."""
        raise NotImplementedError()

    def observe(self, s: Any, a: Any, r: float, s_next: Any, done: bool):
        """
        Learning update invoked after each transition.

        The base implementation is a deliberate no-op.
        """
        return None

