#### Select the CUDA Device (GPU)

In [None]:
import os

# Restrict this process to a single GPU. Both variables are set in one call.
# NOTE(review): presumably PCI_BUS_ID ordering is used so that index "2"
# matches the numbering reported by nvidia-smi, and these must be set before
# the deep-learning framework initialises CUDA -- confirm against NVIDIA docs.
os.environ.update(
    CUDA_DEVICE_ORDER="PCI_BUS_ID",
    CUDA_VISIBLE_DEVICES="2",
)

In [None]:
!nvcc --version

In [None]:
#from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())

In [None]:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
!nvidia-smi

#### Set Parameters

In [None]:
# Create the 'CartPole-v0' environment through gym's registry; `env` is the
# object the rest of the notebook resets, steps and renders.
env = gym.make('CartPole-v0')

In [None]:
# Length of the observation vector (first entry of the observation space shape).
state_size = env.observation_space.shape[0] 
# Number of actions, read from the action space's `n` attribute.
action_size = env.action_space.n

print(f"State {state_size}   Actions {action_size}")

In [None]:
# batch_size: number of stored transitions sampled per replay call.
# n_episodes: number of times the agent will play the game.
batch_size, n_episodes = 32, 1000

In [None]:
# Directory that receives the saved model weights.
output_dir = "model_output/cartpole"

# exist_ok=True creates the directory (and any missing parents) if needed and
# is a no-op when it already exists, avoiding the check-then-create race of
# a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

#### Defining Agent

In [None]:
class DQNAgent:
    """Q-learning agent backed by a small Keras network.

    The agent picks actions epsilon-greedily, stores every transition in a
    bounded replay memory and trains its network on random samples drawn
    from that memory.
    """

    def __init__(self, state_size, action_size):
        """Set the hyper-parameters and build the network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of actions, i.e. number of network outputs.
        """
        self.state_size = state_size
        self.action_size = action_size

        # Replay memory: a deque capped at 2000 entries, so the oldest
        # transitions are dropped automatically once it is full.
        self.memory = deque(maxlen=2000)

        # Discount factor applied to the estimated value of the next state.
        self.gamma = 0.95

        # Exploration schedule: start fully random and decay multiplicatively
        # after each replay() call, never going below epsilon_min.
        self.epsilon = 1
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

        self.learning_rate = 0.001

        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the network that maps a state to action values.

        Input: a state of ``state_size`` features.
        Output: one estimated value per action (``action_size`` linear units).

        Returns:
            The compiled Keras ``Sequential`` model.
        """
        model = Sequential([
            Dense(24, input_dim=self.state_size, activation='relu'),
            Dense(24, activation='relu'),
            Dense(self.action_size, activation='linear'),
        ])

        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        Args:
            state: State the action was taken from.
            action: Action that was taken.
            reward: Reward received for the action.
            next_state: State reached after the action.
            done: Whether the episode ended with this transition.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is returned
        (exploration); otherwise the action with the highest predicted value
        for ``state`` is returned (exploitation).

        Args:
            state: Model input for the current state (a batch of one row).

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # verbose=0 suppresses the per-call progress bar Keras would print.
        action_values = self.model.predict(state, verbose=0)
        # Return whichever action has the larger predicted value.
        return int(np.argmax(action_values[0]))

    def replay(self, batch_size):
        """Train the model on a random sample of remembered transitions.

        Each sampled transition is fitted individually, so later samples in
        the same call are evaluated with the already-updated model. After
        training, epsilon is decayed once (clamped at ``epsilon_min``).

        If fewer than ``batch_size`` transitions are stored, nothing is
        trained and epsilon is left untouched.

        Args:
            batch_size: Number of transitions to sample from memory.
        """
        # random.sample raises ValueError when asked for more items than
        # are available; treat "not enough experience yet" as a no-op.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            # Terminal transition: the target is just the received reward.
            target = reward

            if not done:
                # Non-terminal: add the discounted best value predicted for
                # the next state.
                best_next = np.amax(
                    self.model.predict(next_state, verbose=0)[0]
                )
                target = reward + self.gamma * best_next

            # Keep the current predictions for all other actions and replace
            # only the entry of the action that was actually taken.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target

            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the decay step cannot undershoot epsilon_min.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save model weights to the file ``name``."""
        self.model.save_weights(name)
            
            

        

In [None]:
# Build the agent from the state/action sizes; constructing it also builds
# and compiles its Keras model.
agent = DQNAgent(state_size, action_size)

#### Interaction with environment

In [None]:
done = False

# Play n_episodes episodes. Every transition is stored in the agent's memory,
# the agent is trained once at the end of each episode, and the weights are
# checkpointed every 50 episodes.
for e in range(n_episodes):

    state = env.reset()
    # The model expects a batch dimension: reshape to (1, state_size).
    state = np.reshape(state, [1, state_size])

    # Hard cap of 5000 steps per episode.
    for step in range(5000):

        env.render()
        action = agent.act(state)

        next_state, reward, done, _ = env.step(action)

        # Penalise the transition that ends the episode; otherwise keep the
        # environment's reward unchanged. The `if done` check further down
        # then leaves the step loop.
        if done:
            reward = -10

        next_state = np.reshape(next_state, [1, state_size])

        agent.remember(state, action, reward, next_state, done)

        state = next_state

        if done:
            # The score is the index of the step at which the episode ended.
            print(f"episode: {e}/{n_episodes},  score:{step}.  epsilon:{agent.epsilon} ")
            break

    # Train only once memory holds more than one batch of transitions.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    if e % 50 == 0:
        # os.path.join puts the file inside output_dir; plain string
        # concatenation produced "model_output/cartpoleweights.hdf5".
        agent.save(os.path.join(output_dir, "weights.hdf5"))
    