# In [6]:
import numpy as np
from numpy import random
import torch
import torch.nn as nn
import torch.nn.functional as F

class NeuralNetwork(nn.Module):
    """Two-layer fully connected network: 1 input -> 16 hidden (ReLU) -> 2 outputs."""

    def __init__(self):
        """Create the hidden layer ``fc1`` and the output layer ``out``."""
        super().__init__()
        # Creation order is kept (fc1 first, then out) so that seeded
        # parameter initialisation stays reproducible.
        self.fc1 = nn.Linear(in_features=1, out_features=16)
        self.out = nn.Linear(in_features=16, out_features=2)

    def forward(self, x):
        """Map inputs whose last dimension is 1 to outputs whose last dimension is 2."""
        hidden = torch.relu(self.fc1(x))
        scores = self.out(hidden)
        return scores

# Q-network plus the loss and optimizer used to train it.
model = NeuralNetwork()
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Hyperparameters.
epsilon = 0.2    # probability of picking a random action in select_action
gamma = 0.9      # discount applied to the bootstrapped value in agent_step
episodes = 5000  # number of training episodes run by the loop below

# Environment description: a chain of seven cells, indexed 0..6.
states = list(range(7))
actions = [0, 1]      # agent_step moves +1 for action 1, otherwise -1
r = [-1] * 6 + [10]   # reward received on entering each cell; only cell 6 pays +10

def in_bounds(current_state):
    """Return True when `current_state` lies strictly between 0 and 6."""
    if current_state <= 0:
        return False
    return current_state < 6

def get_q(state):
    """Run `model` on a single state and return its output with size-1 dims removed."""
    # Wrap the scalar as a (1, 1) float tensor: one sample, one feature.
    batch = torch.tensor([[state]], dtype=torch.float32)
    q_values = model(batch)
    return q_values.squeeze()

def select_action(state):
    """Choose an action for `state` epsilon-greedily.

    With probability `epsilon` a uniformly random action (0 or 1) is returned;
    otherwise the index of the largest value from ``get_q(state)``.
    """
    explore = np.random.random() < epsilon
    if explore:
        return np.random.randint(2)
    greedy_action = get_q(state).argmax().item()
    return greedy_action

def agent_step(state, action):
    """Take `action` from `state`, apply one Q-learning update, and advance.

    Args:
        state: Index of the current state. The visible training loop only
            calls this while ``in_bounds(state)`` holds, which keeps
            ``next_state`` a valid index into ``r``.
        action: 1 moves to ``state + 1``; any other value moves to
            ``state - 1``.

    Returns:
        A ``(next_state, next_action)`` tuple, where ``next_action`` comes
        from ``select_action(next_state)``.
    """
    next_state = state + 1 if action == 1 else state - 1

    # TD target: reward + gamma * max_a' Q(next_state, a'). The bootstrap term
    # must be taken from the *next* state; bootstrapping from the current
    # state would make the target chase the very estimate being updated.
    reward = r[next_state]
    if in_bounds(next_state):
        # The target is a constant for this update, so no graph is needed.
        with torch.no_grad():
            bootstrap = get_q(next_state).max().item()
        td_target = torch.tensor(reward + gamma * bootstrap, dtype=torch.float32)
    else:
        # Out-of-bounds (terminal) cell: nothing to bootstrap from.
        td_target = torch.tensor(reward, dtype=torch.float32)

    # Only the Q-value of the action actually taken receives a gradient.
    current_q = get_q(state)[action]

    loss = loss_fn(current_q, td_target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    next_action = select_action(next_state)
    return next_state, next_action

# Training: run `episodes` independent episodes of the chain task.
for episode in range(episodes):
    # Each episode restarts in cell 3 with a freshly selected action.
    current_state, current_action = 3, select_action(3)

    # Keep stepping (and learning) until the agent leaves the in-bounds cells.
    while in_bounds(current_state):
        current_state, current_action = agent_step(current_state, current_action)


3