In [None]:

#Policy gradient
# use R_t -> Return (REINFORCE Algorithm)
# delta TD error 
# loss ascent

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim as optim
from torch.distributions import Categorical

class Policy(nn.Module):
    pass

def main():
    env=gym.make('CartPole-v1')
    pi=Policy()# policy 클래스의 instance
    avg_t=0
    for n_episode in range(10000):
        obs=env.reset() # 시작에 reset
        done=False
        for t in range(500):
            obs=torch.tensor(obs,dtype=torch.float)# obs자체는 np.array였음
            out=pi(obs)
            m=Categorical(out) #torch pi의 확률 분포 출력
            action=m.sample()# 확률기반 탐색후 action결정
            obs,reward,done,info=env.step(action.item()) # state transition
            # item 함수는 scalar로 뽑아서 전송
            pi.put_data((reward,torch.log(out[action])))
            if done:
                break
        avg_t+=t
        pi.train()
        if n_episode %100==0 and n_episode!=0:
            print("# of episode :{},Avgtimestep:{}".format(n_episode,avg_t/20.0))
            avg_t=0
    env.close()


class Policy(nn.Module):#nn.Module 상속
    def __init__(self):
        super(Policy,self).__init__()#super가 Policy의 조상클래스
        self.data=[]
        self.gamma=0.99
        
        self.fc1=nn.Linear(4,128)
        self.fc2=nn.Linear(128,2)
        self.optimizer=optim.Adam(self.parameters(),lr=0.0005)

    def forward(self, x):#네트워크 정의
        x=F.relu(self.fc1(x))
        x=F.softmax(self.fc2(x),dim=0)
        return x
    
    def put_data(self,item):
        self.data.append(item)
    
    def train(self):
        Return=0
        self.optimizer.zero_grad()
        for reward,log_prob in self.data[::-1]: # 맨뒤부터
            Return=reward+Return*self.gamma
            loss = -torch.log(log_prob)*Return #REINFORCE
            loss.backward() #gradient 계산하면서 쭉 back propagation진행
        self.optimizer.step() # Weight update 
        self.data=[ ]

if __name__=='__main__':
  main()

In [18]:
  
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

#Hyperparameters
learning_rate = 0.0002
gamma         = 0.98

class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.data = []
        
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=0)
        return x
      
    def put_data(self, item):
        self.data.append(item)
        
    def train_net(self):
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + gamma * R
            loss = -torch.log(prob) * R
            loss.backward()
        self.optimizer.step()
        self.data = []

def main():
    env = gym.make('CartPole-v1')
    pi = Policy()
    score = 0.0
    print_interval = 100
    
    
    for n_epi in range(10000):
        s = env.reset()
        done = False
        
        while not done: # CartPole-v1 forced to terminates at 500 step.
            prob = pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = env.step(a.item())
            pi.put_data((r,prob[a]))
            s = s_prime
            score += r
            
        pi.train_net()
        
        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
            score = 0.0
    env.close()
    
if __name__ == '__main__':
    main()

# of episode :100, avg score : 22.76
# of episode :200, avg score : 29.25
# of episode :300, avg score : 35.19
# of episode :400, avg score : 42.54
# of episode :500, avg score : 49.04
# of episode :600, avg score : 57.69
# of episode :700, avg score : 66.99
# of episode :800, avg score : 73.58
# of episode :900, avg score : 98.28
# of episode :1000, avg score : 114.51
# of episode :1100, avg score : 115.81
# of episode :1200, avg score : 168.81
# of episode :1300, avg score : 197.53
# of episode :1400, avg score : 206.02
# of episode :1500, avg score : 200.93
# of episode :1600, avg score : 239.88
# of episode :1700, avg score : 278.41
# of episode :1800, avg score : 273.35
# of episode :1900, avg score : 287.31
# of episode :2000, avg score : 278.82
# of episode :2100, avg score : 260.72
# of episode :2200, avg score : 326.19
# of episode :2300, avg score : 319.1
# of episode :2400, avg score : 357.87
# of episode :2500, avg score : 351.13
# of episode :2600, avg score : 398.65
# of 

In [24]:
  
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

#Hyperparameters
learning_rate = 0.002
gamma         = 0.99

class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.loss_lst = [] #빈 list 생성
        
        self.fc1 = nn.Linear(4, 128)
        self.fc_pi = nn.Linear(128, 2)
        self.fc_v=nn.Linear(128,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        
    def forward(self, x):
        x = F.relu(self.fc1(x)) # 처음에 relu랑 fcnet 통과
        pol=self.fc_pi(x) # pi 분사 거치는 단계
        pi=F.softmax(pol,dim=0)  # 분사
        v=self.fc_v(x) # valuefunction으로 분사
        return pi,v
      
    def gather_loss(self,loss):
        self.loss_lst.append(loss.unsqueeze(0))

        
    def train(self):
      loss=torch.cat(self.loss_lst).sum()
      loss=loss/len(self.loss_lst) #안정적 학습을 위한 트릭
      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()
      self.loss_lst=[]

        

def main():
    env = gym.make('CartPole-v1')
    model=ActorCritic()
    score = 0.0
    print_interval = 100
    
    
    for n_epi in range(10000):
        obs = env.reset()
        done = False
        
        while not done: # CartPole-v1 forced to terminates at 500 step.
            obs=torch.from_numpy(obs).float() # transformation to tensor
            pi,v=model(obs)
            m = Categorical(pi)
            action = m.sample()

            obs, r, done, info = env.step(action.item())
            _,next_v=model(torch.from_numpy(obs).float())
            delta=r+gamma*next_v-v # TD error
            loss= - torch.log(pi[action])*delta.item()+delta*delta #delta^2은 value loss
            # item()은 backpropagation때 주소 참조 업데이트 방지
            # value loss와 policy loss 동시 update
            model.gather_loss(loss)
            score += r
            
        model.train()
        
        if n_epi%print_interval==0 and n_epi!=0:
            print("# of episode :{}, avg score : {}".format(n_epi, score/print_interval))
            score = 0.0
        if score/print_interval>400:
          obs=env.reset()
          done=False
          while not done:
            env.render()
            obs=torch.from_numpy(obs).float()
            pi,v=model(obs)
            m=Categorical(pi)
            action=m.sample()
            obs, r, done, info = env.step(action.item())
            _,next_v=model(torch.from_numpy(obs).float())
            delta=r+gamma*next_v-v # TD error
            loss= - torch.log(pi[action])*delta.item()+delta*delta #delta^2은 value loss
            # item()은 backpropagation때 주소 참조 업데이트 방지
            # value loss와 policy loss 동시 update
            model.gather_loss(loss)
            score += r
    env.close()
    
if __name__ == '__main__':
    main()

# of episode :100, avg score : 25.79
# of episode :200, avg score : 36.77
# of episode :300, avg score : 46.0
# of episode :400, avg score : 60.02
# of episode :500, avg score : 56.61
# of episode :600, avg score : 49.8
# of episode :700, avg score : 162.92
# of episode :800, avg score : 91.68
# of episode :900, avg score : 191.91
# of episode :1000, avg score : 196.43
# of episode :1100, avg score : 256.7
# of episode :1200, avg score : 239.57
# of episode :1300, avg score : 273.14
# of episode :1400, avg score : 316.82
# of episode :1500, avg score : 319.9
# of episode :1600, avg score : 352.79


NoSuchDisplayException: ignored