### Reinforcement Learning Tutorial

Some common terms
- Agent
- Environment
- Action,rewards and classification

In [1]:
!pip install gym




In [2]:
import gym

In [3]:
# Create environment

env = gym.make('CartPole-v0')


### Comes with certain import methods and attributes
- action_space
- Observation_space
- reset()
- step()
- render()

In [4]:
env.reset()     # returns the game to initial state

array([-0.01873481,  0.03078699, -0.01476466,  0.0064216 ])

In [5]:
# Atari game --> usual size 210x160x3 Tensor (RGB)

In [6]:
for _ in range(1000):
    env.step(env.action_space.sample())
    env.render()
env.close()



In [7]:
env.action_space.n

2

In [8]:
env.observation_space.shape[0]

4

In [9]:
for _ in range(1000):
    env.step(env.action_space.sample())
    env.render()
env.close()

### 2. Playing games with a Random strategy
- Game Episode
- step() function in more details
- game over?

**Game Episode** --> Entire game play 
- starting till end(game over)
 


- We will play the game with random steps and then compute score
- We will also learn how to play multiple game episode

In [10]:
for e in range(20): # episode
    # Play 20 ep.
    observation = env.reset()
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation,reward,done,other_info = env.step(action)
        
        if done:
            # Game ep. is over
            print("Game episode:{}/{} High score :{}".format(e,20,t))
            break

env.close()
print("All 20 ep.s are over!!")

Game episode:0/20 High score :13
Game episode:1/20 High score :14
Game episode:2/20 High score :20
Game episode:3/20 High score :27
Game episode:4/20 High score :11
Game episode:5/20 High score :11
Game episode:6/20 High score :15
Game episode:7/20 High score :27
Game episode:8/20 High score :15
Game episode:10/20 High score :28
Game episode:11/20 High score :41
Game episode:12/20 High score :23
Game episode:13/20 High score :9
Game episode:14/20 High score :26
Game episode:15/20 High score :37
Game episode:16/20 High score :33
Game episode:17/20 High score :27
Game episode:18/20 High score :12
Game episode:19/20 High score :20
All 20 ep.s are over!!


##### Step()

step function returns -->

- observation
- reward
- done
- info

## 3. Q-Learning


In [60]:
from keras.layers import Dense
from matplotlib import pyplot as plt
from keras.models import Sequential
from keras.optimizers import Adam
import numpy as np
import os
from collections import deque
import random

In [None]:
class Agent:
    def __init__(self,state_size,action_size):
        self.state_size = state_size;
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 # Discount factor
        # Exploration  and exploitation tradeoff
        # Exploration : Good in the beginning --> helps you to try various random things
        # exploitation : sample good experiences from the past(memory) --> good in the end
        # To maintain the tradeoff 
        self.epsilon = 1.0 # 100% Random exploration in the beginning
        self.epsilon_decay = 0.995 
        self.epsilon_min = 0.01 # this means 1% random actions will be taken all the times no matter the past experience
        self.learning_rate = 0.001
        self.model = self._create_model()
        
    def _create_model(self):
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu'))
        model.add(Dense(24,activation='relu'))
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=0.001))
        
        return model

    def remember(self,state,action,reward,next_state,done):
        # Remember past experience
        self.memory.append((state,action,reward,next_state,done))
    
    def act(self,state):
        #Sampling according to the Epsilon greedy Method
        if np.random.rand() <= self.epsilon:
            # Take a random action
            return random.randrange(self.action_size)
        
        # Ask the model to give me the suitable action
        return np.argmax(model.predict(state)[0])
    
    def train(self,batch_size):
        # Training using a replay buffer
        minibatch = random.sample(self.memory,batch_size)
        for experience in minibatch:
            state,action,reward,next_state,done = experience
            # X,Y : state, expected reward
            if not done :
                # Game is not done yet, bellman eqn to approx the target_value of the reward
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
            else:
                target = reward
                
            target_f = self.model.predict(state)
            target_f[0][action] = target
            # X= state, Y= target
            self.model.fit(state,target_f,epochs=1,verbose=0)
            
        if self.epsilon > self.epsilon_min :
            self.epsilon *= self.epsilon_decay
            

    def load(self,name):
        self.model.load_weights(name)
    
    def save(self,name):
        self.model.save_weights(name)   

In [None]:
model = Sequential()
model.add(Dense(24,input_dim=4,activation='relu'))
model.add(Dense(24,activation='relu'))
model.add(Dense(2,activation='linear'))
model.compile(loss='mse',optimizer=Adam(lr=0.001))
        

In [None]:
x = np.random.randn(1,4)  # Input
model.predict(x)  # Model rewards on the action

Training tht DQN Agent ( deep Q-leaner )


In [None]:
n_episodes = 1000

output_dir = 'cartpole_model/'


In [None]:
agent = Agent(state_size=4,action_size=2)
done = False
state_size = 4
action_size = 2

In [None]:
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state,[1,agent.state_size])
    batch_size = 32
    
    for time in range(500):
        env.render()
        action = agent.act(state) # action is 0 or 1
        next_state, reward, done,other_info = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)  # Experience for the agent
        
        if done :
            print("Game episode:{}/{} High score :{} Exploration rate:{:.2}".format(e,20,t,agent.epsilon))
            break
        
    if len(agent.memory)>batch_size:
        agent.train(batch_size)
        
    if e%50 == 0:
        agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
    
print("Deep Q-Learner Model Trained")
env.close()