# REINFORCEMENT LEARNING 
in this notebook we use Open-AI gym to implement a RL problem

## Imports

***mount drive***

In [1]:
from google.colab import drive 
drive.mount("/content/drive")

Mounted at /content/drive


***general imports***

In [2]:
import numpy as np 
import random 
import gym
from collections import deque
import os 

***tensorflow imports***

In [3]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense 
from tensorflow.keras.optimizers import Adam

***making a dummy video player driver***

In [4]:
os.environ["SDL_VIDEODRIVER"] = "dummy"

***installing pygame***

In [5]:
!pip install gym[classic_control]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pygame
Successfully installed pygame-2.1.0


## RL Environment 

***initializing the cartpole environment***

In [6]:
env = gym.make("CartPole-v1", new_step_api=True)

states = env.observation_space.shape[0]
actions = env.action_space.n
batch_size = 32
n_episodes = 200

print("this is number of actions:", actions)
print("this is number of states:", states)

this is number of actions: 2
this is number of states: 4


***saving the model***

In [7]:
output_dir = "/content/drive/MyDrive/Youtube/9 - RL/model/"

if not os.path.exists(output_dir):
  os.makedirs(output_dir)

## Agent

***defining the DQG-Agent Class***

In [8]:
class DQNAgent:

    def __init__(self, states, actions):
        self.states = states
        self.actions = actions
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 
        self.epsilon = 1.0
        self.epsilon_decay = 0.995 
        self.epsilon_min = 0.01 
        self.learning_rate = 0.001
        self.model = self._build_model()
    
    def _build_model(self):
        model = Sequential()
        model.add(Dense(32, activation='relu', input_dim=self.states))
        model.add(Dense(32, activation='relu')) 
        model.add(Dense(self.actions, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate)) 
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, 
                            reward, next_state, done))
    
    def train(self, batch_size): 
        minibatch = random.sample(self.memory, batch_size) 
        for state, action, reward, next_state, done in minibatch: 
            target = reward 
            if not done: 
                target = (reward + self.gamma * np.amax(self.model.predict(next_state)[0])) 
            target_f = self.model.predict(state) 
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)  
        if self.epsilon > self.epsilon_min: 
            self.epsilon *= self.epsilon_decay 

    def act(self, state):
        if np.random.rand() <= self.epsilon: 
            return random.randrange(self.actions) 
        act_values = self.model.predict(state) 
        return np.argmax(act_values[0]) 
        
    def save(self, name):
        self.model.save_weights(name)

    def load(self, name):
        self.model.load_weights(name)

***instantiating the DQN-Agent***

In [9]:
agent = DQNAgent(states, actions) 

## Train the DQN-Agent

In [10]:
for e in range(n_episodes):
    
    state = env.reset()
    state = np.reshape(state, [1, states]) 
    
    done = False
    time = 0 
    while not done: 
        action = agent.act(state) 
        next_state, reward, done, _, _ = env.step(action) 
        reward = reward if not done else -10 
        next_state = np.reshape(next_state, [1, states]) 
        agent.remember(state, action, reward, next_state, done)       
        state = next_state        
        if done:  
            print( "episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, n_episodes-1, time, agent.epsilon))
        time += 1
        
        env.render()
        
    if len(agent.memory) > batch_size:
        agent.train(batch_size) 
    
    if e % 50 == 0:
        agent.save(output_dir + "weights_" + '{:04d}'.format(e) + ".hdf5")

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
episode: 150/199, score: 67, e: 0.47
episode: 151/199, score: 30, e: 0.47
episode: 152/199, score: 47, e: 0.47
episode: 153/199, score: 82, e: 0.47
episode: 154/199, score: 69, e: 0.46
episode: 155/199, score: 58, e: 0.46
episode: 156/199, score: 79, e: 0.46
episode: 157/199, score: 45, e: 0.46
episode: 158/199, score: 17, e: 0.46
episode: 159/199, score: 39, e: 0.45
episode: 160/199, score: 28, e: 0.45
episode: 161/199, score: 67, e: 0.45
episode: 162/199, score: 48, e: 0.45
episode: 163/199, score: 81, e: 0.44
episode: 164/199, score: 47, e: 0.44
episode: 165/199, score: 46, e: 0.44
episode: 166/199, score: 16, e: 0.44
episode: 167/199, score: 51, e: 0.44
episode: 168/199, score: 54, e: 0.43
episode: 169/199, score: 140, e: 0.43
episode: 170/199, score: 72, e: 0.43
episode: 171/199, score: 86, e: 0.43
episode: 172/199, score: 71, e: 0.42
episode: 173/199, score: 186, e: 0.42
episode: 174/199, score: 27, e: 0.42
episode: