In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np


In [2]:
import pandas as pd

print(pd.__version__)

import matplotlib.pyplot as plt
import numpy as np
import gym

def plotLearning(x, scores, epsilons, filename, lines=None):
    fig=plt.figure()
    ax=fig.add_subplot(111, label="1")
    ax2=fig.add_subplot(111, label="2", frame_on=False)

    ax.plot(x, epsilons, color="C0")
    ax.set_xlabel("Game", color="C0")
    ax.set_ylabel("Epsilon", color="C0")
    ax.tick_params(axis='x', colors="C0")
    ax.tick_params(axis='y', colors="C0")

    N = len(scores)
    running_avg = np.empty(N)
    for t in range(N):
	    running_avg[t] = np.mean(scores[max(0, t-20):(t+1)])
    
    #ax2.xaxis.tick_top()
    ax2.scatter(x, running_avg, color="C1")

    ax2.axes.get_xaxis().set_visible(False)
    ax2.yaxis.tick_right()
    #ax2.set_xlabel('x label 2', color="C1")
    ax2.set_ylabel('Score', color="C1")
    #ax2.xaxis.set_label_position('top')
    ax2.yaxis.set_label_position('right')
    #ax2.tick_params(axis='x', colors="C1")
    ax2.tick_params(axis='y', colors="C1")

    if lines is not None:
        for line in lines:
            plt.axvline(x=line)

    plt.savefig(filename)

class SkipEnv(gym.Wrapper):
    def __init__(self, env=None, skip=4):
        super(SkipEnv, self).__init__(env)
        self._skip = skip

    def step(self, action):
        t_reward = 0.0
        done = False
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            t_reward += reward
            if done:
                break
        return obs, t_reward, done, info

    def reset(self):
        self._obs_buffer = []
        obs = self.env.reset()
        self._obs_buffer.append(obs)
        return obs

class PreProcessFrame(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(PreProcessFrame, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255,
                                                shape=(80,80,1), dtype=np.uint8)
    def observation(self, obs):
        return PreProcessFrame.process(obs)

    @staticmethod
    def process(frame):

        new_frame = np.reshape(frame, frame.shape).astype(np.float32)

        new_frame = 0.299*new_frame[:,:,0] + 0.587*new_frame[:,:,1] + \
                    0.114*new_frame[:,:,2]

        new_frame = new_frame[35:195:2, ::2].reshape(80,80,1)

        return new_frame.astype(np.uint8)

class MoveImgChannel(gym.ObservationWrapper):
    def __init__(self, env):
        super(MoveImgChannel, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
                            shape=(self.observation_space.shape[-1],
                                   self.observation_space.shape[0],
                                   self.observation_space.shape[1]),
                            dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)

class ScaleFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0

class BufferWrapper(gym.ObservationWrapper):
    def __init__(self, env, n_steps):
        super(BufferWrapper, self).__init__(env)
        self.observation_space = gym.spaces.Box(
                             env.observation_space.low.repeat(n_steps, axis=0),
                             env.observation_space.high.repeat(n_steps, axis=0),
                             dtype=np.float32)

    def reset(self):
        self.buffer = np.zeros_like(self.observation_space.low, dtype=np.float32)
        return self.observation(self.env.reset())

    def observation(self, observation):
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer

def make_env(env_name):
    env = gym.make(env_name)
    env = SkipEnv(env)
    env = PreProcessFrame(env)
    env = MoveImgChannel(env)
    env = BufferWrapper(env, 4)
    return ScaleFrame(env)



2.2.1


In [4]:
df = pd.read_csv('./data/TAMA_7D_graph_coinmarketcap.csv')
df.head(5)

Unnamed: 0,name,open,high,low,close,volume,marketCap,timestamp
0,2781,0.009883,0.009953,0.009156,0.009623,961067.1,13407557.82,2024-03-23T00:00:00.000Z
1,2781,0.009621,0.010043,0.009453,0.009845,974010.65,13717864.69,2024-03-23T03:00:00.000Z
2,2781,0.009842,0.01004,0.009706,0.010039,1030381.52,13987644.47,2024-03-23T06:00:00.000Z
3,2781,0.010002,0.010065,0.009477,0.009968,838887.07,13875271.43,2024-03-23T09:00:00.000Z
4,2781,0.009958,0.009991,0.009369,0.009386,646186.26,13078427.75,2024-03-23T12:00:00.000Z


In [12]:
class DQN(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(DQN, self).__init__() #constructor for base class
        self.input_dims = input_dims
        self.lr = lr
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims) #* unpacks list, input observation vectors
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss=nn.MSELoss() #Q learning is like linear regression

        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu') 
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        actions = self.fc3(x)

        return actions

class Agent():
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions, max_mem_size=100000, eps_end = 0.01, eps_dec=5e-4):
        # gamma = awaiting future rewards, epsilon: how often to explore or take action
        self.gamma = gamma
        self.epsilon = epsilon #porp of time taking random or greedy option policy
        self.lr = lr
        self.input_dims= input_dims
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.mem_size = max_mem_size
        self.action_space = [ i for i in range(n_actions)]
        self.mem_cntr = 0

        self.Q_eval = DQN(self.lr, n_actions=self.n_actions, input_dims=self.input_dims, 
                          fc1_dims=256, fc2_dims =256)
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        #deep q doesn't work for continuous action spaces
        self.reward_memory = np.zeros(self.mem_size, dtype = np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool) #mask for setting

    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index]  = reward
        self.action_memory[index] = action #one hot encoding for actions? action to int one hot back to int one hot = [0,1,0,0]
        self.terminal_memory[index] = done

        self.mem_cntr+=1
    
    def choose_action(self, obs):
        if np.random.random() > self.epsilon:
            state = T.tensor([obs]).to(self.Q_eval.device) #bracket around obs because setup
            actions = self.Q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action
    
    def learn(self):
        #dilemma: mem filled with zero how to deal with that cant learn from zero
        # 1. play games rand until fill memory then learn? 2. start learning as soon as filled batch size
        if self.mem_cntr < self.batch_size:
            return # dont bother learning if not big enough
        
        self.Q_eval.optimizer.zero_grad() #zero gradient, 
        #select up to max filled memory
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem,self.batch_size, replace=False ) #not problem if a huge batch size/memory storage
        
        batch_index = np.arange(self.batch_size, dtype=np.int32) #need this?
        state_batch = T.tensor(self.state_memory[batch]).to(self.Q_eval.device)#subset of agent mem 
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(self.Q_eval.device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(self.Q_eval.device)
        terminal_batch= T.tensor(self.terminal_memory[batch]).to(self.Q_eval.device)

        action_batch = self.action_memory[batch]

        q_eval = self.Q_eval.forward(state_batch)[batch_index, action_batch] #values for action we actually took for each set
        q_next = self.Q_eval.forward(new_state_batch)
        #target network?
        q_next[terminal_batch] = 0.0

        q_target = reward_batch+self.gamma*T.max(q_next, dim=1)[0] # discount factor gamma, max value of next state, greedy action
        loss = self.Q_eval.loss(q_target, q_eval).to(self.Q_eval.device)
        loss.backward()
        self.Q_eval.optimzer.step()

        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.min


In [17]:
import gym

env = gym.make('LunarLander-v2')
agent = Agent(gamma = 0.99, epsilon = 1.0, batch_size=64, n_actions=4, 
              eps_end=0.01, input_dims=[8], lr=0.03)
scores, eps_history = [], []
n_games=500

for i in range(n_games):
    score = 0
    done = False
    obs = env.reset()
    while not done:
        action = agent.choose_action(obs)
        obs_ , reward, done, info = env.step(action)
        score += reward
        agent.store_transition(obs, action, reward, obs_, done)
        agent.learn()
        obs = obs_

    
    scores.append(score)
    eps_history.append(agent.epsilon)
    avg_score = np.mean(score[-100:]) #avg of previous 100 games

    print('episode', i, 'score %.2f' % score,
          'average score %.2f' % avg_score,
          'epsilon %.2f' % agent.epsilon)
    
x = [i+1 for i in range(n_games)]
fn = 'lunar_lander.png'
plotLearning(x, scores, eps_history, fn)



  if not isinstance(terminated, (bool, np.bool8)):


ValueError: setting an array element with a sequence. The requested array would exceed the maximum number of dimension of 1.