In [10]:
import os

# Get and print the current working directory
current_working_directory = os.getcwd()
print(f"The current working directory is: {current_working_directory}")

The current working directory is: /content/sample_data/Flappy_birds/src


In [11]:
import os

# Create directories if they don't exist
if not os.path.exists("/content/sample_data/Flappy_birds/src/tensorboard"):
    os.makedirs("/content/sample_data/Flappy_birds/src/tensorboard")

if not os.path.exists("/content/sample_data/Flappy_birds/src/trained_models"):
    os.makedirs("/content/sample_data/Flappy_birds/src/trained_models")

# Change the working directory to a new directory (replace with the path you want)
new_working_directory = "/content/sample_data/Flappy_birds/src/"
os.chdir(new_working_directory)

# Rest of your code...


In [12]:
import os
import shutil
from random import random, randint, sample
import numpy as np
import torch
import torch.nn as nn
from tensorboardX import SummaryWriter

from deep_q_network import DeepQNetwork
from flappy_bird import FlappyBird
from utils import pre_processing

# Define parameters directly
class Options:
    def __init__(self):
        self.image_size = 84
        self.batch_size = 32
        self.optimizer = "adam"
        self.lr = 1e-6
        self.gamma = 0.99
        self.initial_epsilon = 0.1
        self.final_epsilon = 1e-4
        self.num_iters = 10_000  # Adjusted from num_iterations to 10,000
        self.replay_memory_size = 50000
        self.log_path = "tensorboard"
        self.saved_path = "trained_models"

# Create an instance of Options
opt = Options()

# Define the train function
def train(opt):
    if torch.cuda.is_available():
        torch.cuda.manual_seed(123)
    else:
        torch.manual_seed(123)
    model = DeepQNetwork()
    if os.path.isdir(opt.log_path):
        shutil.rmtree(opt.log_path)
    os.makedirs(opt.log_path)
    writer = SummaryWriter(opt.log_path)
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = nn.MSELoss()
    game_state = FlappyBird()
    image, reward, terminal = game_state.next_frame(0)
    image = pre_processing(image[:game_state.screen_width, :int(game_state.base_y)], opt.image_size, opt.image_size)
    image = torch.from_numpy(image)
    if torch.cuda.is_available():
        model.cuda()
        image = image.cuda()
    state = torch.cat(tuple(image for _ in range(4)))[None, :, :, :]

    replay_memory = []
    iter = 0
    while iter < opt.num_iters:
        prediction = model(state)[0]
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
                (opt.num_iters - iter) * (opt.initial_epsilon - opt.final_epsilon) / opt.num_iters)
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            action = torch.argmax(prediction) #[0]

        next_image, reward, terminal = game_state.next_frame(action)
        next_image = pre_processing(next_image[:game_state.screen_width, :int(game_state.base_y)], opt.image_size,
                                    opt.image_size)
        next_image = torch.from_numpy(next_image)
        if torch.cuda.is_available():
            next_image = next_image.cuda()
        next_state = torch.cat((state[0, 1:, :, :], next_image))[None, :, :, :]
        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(*batch)

        state_batch = torch.cat(tuple(state for state in state_batch))
        action_batch = torch.from_numpy(
            np.array([[1, 0] if action == 0 else [0, 1] for action in action_batch], dtype=np.float32))
        reward_batch = torch.from_numpy(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = torch.cat(tuple(state for state in next_state_batch))

        if torch.cuda.is_available():
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            next_state_batch = next_state_batch.cuda()
        current_prediction_batch = model(state_batch)
        next_prediction_batch = model(next_state_batch)

        y_batch = torch.cat(
            tuple(reward if terminal else reward + opt.gamma * torch.max(prediction) for reward, terminal, prediction in
                  zip(reward_batch, terminal_batch, next_prediction_batch)))

        q_value = torch.sum(current_prediction_batch * action_batch, dim=1)
        optimizer.zero_grad()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state
        iter += 1
        print("Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}".format(
            iter + 1,
            opt.num_iters,
            action,
            loss,
            epsilon, reward, torch.max(prediction)))
        writer.add_scalar('Train/Loss', loss, iter)
        writer.add_scalar('Train/Epsilon', epsilon, iter)
        writer.add_scalar('Train/Reward', reward, iter)
        writer.add_scalar('Train/Q-value', torch.max(prediction), iter)
        if (iter+1) % 1000000 == 0:
            torch.save(model, "{}/flappy_bird_{}".format(opt.saved_path, iter+1))
    torch.save(model, "{}/flappy_bird".format(opt.saved_path))

# Call the train function with the created Options instance
train(opt)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Perform a random action
Iteration: 5121/10000, Action: 1, Loss: 0.006171687506139278, Epsilon 0.04886119, Reward: 0.1, Q-value: -0.1603391021490097
Iteration: 5122/10000, Action: 0, Loss: 0.0052381339482963085, Epsilon 0.048851200000000004, Reward: 0.1, Q-value: -0.16457432508468628
Iteration: 5123/10000, Action: 0, Loss: 0.005852661095559597, Epsilon 0.04884121, Reward: 0.1, Q-value: -0.17548204958438873
Iteration: 5124/10000, Action: 0, Loss: 0.006603860296308994, Epsilon 0.04883122, Reward: 0.1, Q-value: -0.19049426913261414
Iteration: 5125/10000, Action: 0, Loss: 0.005988342687487602, Epsilon 0.04882123000000001, Reward: 0.1, Q-value: -0.2097787857055664
Iteration: 5126/10000, Action: 0, Loss: 0.005825432948768139, Epsilon 0.048811240000000006, Reward: 0.1, Q-value: -0.2349800169467926
Iteration: 5127/10000, Action: 0, Loss: 0.005346036981791258, Epsilon 0.048801250000000004, Reward: 0.1, Q-value: -0.2595840394496918
