<a href="https://colab.research.google.com/github/Federico6419/MachineLearningProject/blob/main/MachineLearningProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install libraries


In [1]:
!pip install gymnasium
!pip install swig     #This solves the errori in the installation of gymnasium[box2d]
!pip install gymnasium[box2d]
!pip install gym-notebook-wrapper   #This installs Gym-Notebook-Wrapper, that provides small wrappers for running and rendering OpenAI Gym

#To solve the xvfb missing file problem
!sudo apt-get install xvfb
!pip install xvfbwrapper

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/953.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.6/953.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting swig
  Downloading swig-4.1.1-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed s

## Import libraries

In [2]:
# Fetch the project repository and make it the working directory.
# This must run before the imports below: `config` and `model` are imported by bare
# name right after the %cd, so they are presumably modules shipped in the cloned
# repository and resolved relative to the current directory.
!git clone https://github.com/Federico6419/MachineLearningProject          #It clones my github repository
%cd MachineLearningProject

import gymnasium as gym
import gnwrapper
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision.utils import save_image
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import random
import cv2
import config
from model import Model
from collections import deque

# Use the GPU when one is available, otherwise fall back to the CPU.
# NOTE(review): the training cell shown in this notebook places tensors on config.DEVICE
# rather than on this variable - confirm whether `device` is still needed.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Cloning into 'MachineLearningProject'...
remote: Enumerating objects: 226, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (92/92), done.[K
remote: Total 226 (delta 62), reused 4 (delta 4), pack-reused 130[K
Receiving objects: 100% (226/226), 132.98 KiB | 7.39 MiB/s, done.
Resolving deltas: 100% (134/134), done.
/content/MachineLearningProject


# F

In [None]:
# Mount Google Drive. The mount is active (not commented out): the training loop
# writes a checkpoint under ../drive/MyDrive, so Drive has to be available.
from google.colab import drive
drive.mount('/content/drive')

# Per-episode bookkeeping used by the training loop
episode_reward = 0            # cumulative reward of the episode in progress
tot_negative_reward = 0       # consecutive negative-reward steps (used for early termination)
time_frame_counter = 1        # number of agent steps taken in the current episode
buffer = deque([], config.BUFFER_SIZE)             # Bounded queue holding past experience (replay buffer)
epsilon = config.MAX_EPSILON  # current exploration rate
alpha = config.ALPHA          # learning rate of the tabular Q-learning update
decay = config.EPSILON_DECAY  # multiplicative epsilon decay used by update_epsilon_nn

# For the plotting: one cumulative-reward entry per episode and per method
cum_reward_table = np.zeros(config.NUM_EPISODES)
cum_reward_nn = np.zeros(config.NUM_EPISODES)

# Initialize the Model (online network, the one being optimised)
model = Model().to(config.DEVICE)

# Initialize the Target Model (provides the bootstrap values of the TD target)
target_model = Model().to(config.DEVICE)

optimizer = optim.Adam(model.parameters(), lr=config.LR)
# The target network is never optimised directly; this optimizer only exists because
# config.load_model is called with an optimizer argument.
optimizer_target = optim.Adam(target_model.parameters(), lr=config.LR)

if(config.LOAD_MODEL):
    config.load_model(config.LOAD_CHECKPOINT_FOLDER,model,optimizer)
    config.load_model(config.LOAD_CHECKPOINT_FOLDER,target_model,optimizer_target)

# Start the target network as an exact copy of the online network. Without this the two
# networks begin with independent random weights and the first TD targets are computed
# from a network unrelated to the one being trained.
target_model.load_state_dict(model.state_dict())

# Loss between predicted and target Q-values (a Huber loss, nn.HuberLoss(delta=1.0), was tried as an alternative)
mean_squared_error = torch.nn.MSELoss()

# Define the Action Space: the discrete set of controls the network chooses from
action_space = [
                (-1, 1, 0.2), (0, 1, 0.2), (1, 1, 0.2),
                (-1, 1,   0), (0, 1,   0), (1, 1,   0),               #(Steering Wheel, Gas, Brake)
                (-1, 0, 0.2), (0, 0, 0.2), (1, 0, 0.2),               #Range -1~1 0~1 0~1
                (-1, 0,   0), (0, 0,   0), (1, 0,   0)
              ]

#Define the policy to know how chose the action
#Q-Table
def select_action(state, epsilon):
    """Epsilon-greedy action choice for the tabular (Q-table) agent.

    Args:
        state: index of the current state, used as the row index into the global `Q`.
        epsilon: probability threshold for exploring instead of exploiting.

    Returns:
        A sample from `env.action_space` when exploring, otherwise the index of the
        largest entry of `Q[state]`.
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Exploit: pick the action with the highest stored Q-value for this state
        return np.argmax(Q[state])
    # Explore: let the environment draw a random action
    return env.action_space.sample()

#Neural Network
def select_action_nn(state, epsilon):
    """Epsilon-greedy action choice for the neural-network agent.

    Args:
        state: NumPy array describing the current state; it is cast to float32 and
            passed to the global `model` as a tensor on `config.DEVICE`.
        epsilon: probability threshold for exploring instead of exploiting.

    Returns:
        One entry of the global `action_space`: a uniformly random one when exploring,
        otherwise the entry whose index has the largest predicted Q-value.
    """
    if random.uniform(0, 1) < epsilon:
        return action_space[random.randrange(len(action_space))]          #We sample a random action

    state_tensor = torch.from_numpy(state.astype('float32')).to(config.DEVICE)
    # Action selection is pure inference: disabling autograd avoids building a graph
    # that would be thrown away immediately.
    with torch.no_grad():
        q_values = model(state_tensor).detach().cpu().numpy()

    return action_space[np.argmax(q_values)]              #Select the action with the maximum predicted Q-Value


## update the epsilon value along the iteration until converges to MIN_EPSILON
def update_epsilon(epsilon):
    """Shrink epsilon by one hundredth of its value, never going below config.MIN_EPSILON.

    Args:
        epsilon: current exploration rate.

    Returns:
        The reduced exploration rate, or config.MIN_EPSILON once the reduced value
        is at or below that floor.
    """
    reduced = epsilon - epsilon / 100
    floor_reached = reduced <= config.MIN_EPSILON
    return config.MIN_EPSILON if floor_reached else reduced

## update the epsilon every episode by epsilon decay variable
def update_epsilon_nn(epsilon):
    """Multiply epsilon by the global `decay` factor, never going below config.MIN_EPSILON.

    Args:
        epsilon: current exploration rate.

    Returns:
        The decayed exploration rate, or config.MIN_EPSILON once the decayed value
        is at or below that floor.
    """
    decayed = epsilon * decay
    floor_reached = decayed <= config.MIN_EPSILON
    return config.MIN_EPSILON if floor_reached else decayed


def _train_on_batch(batch):
    """Perform one gradient step on the online network for every transition in `batch`.

    Args:
        batch: iterable of (frames, action_index, reward, next_frames, terminal) tuples,
            in the layout in which the training loop appends them to the replay buffer.

    For each transition the regression target equals the network's own prediction,
    except at `action_index`, where it is the TD target:
    `reward` for terminal transitions, `reward + GAMMA * max(target_model(next_frames))`
    otherwise. Uses the globals `model`, `target_model`, `optimizer`,
    `mean_squared_error` and `config`.
    """
    for frames, action_index, step_reward, next_frames, terminal in batch:
        frames_t = torch.from_numpy(frames.astype('float32')).to(config.DEVICE)
        next_frames_t = torch.from_numpy(next_frames.astype('float32')).to(config.DEVICE)

        if terminal:
            # No future return after a terminal state: do not bootstrap
            td_target = float(step_reward)
        else:
            # The target network only supplies a number; no gradient must flow through it.
            # .item() also makes this independent of the device the tensor lives on.
            with torch.no_grad():
                best_next_q = target_model(next_frames_t).max().item()
            td_target = float(step_reward) + config.GAMMA * best_next_q

        # Predicted Q-values of the online network (this is what gets optimised)
        q_values = model(frames_t)

        # Target = current prediction with only the taken action's value replaced,
        # so the loss is zero for every other action.
        target_q = q_values.detach().clone()
        target_q[action_index] = td_target

        loss = mean_squared_error(q_values, target_q)

        #Train network
        optimizer.zero_grad()   # clear existing gradient
        loss.backward()         # backpropagate the error
        optimizer.step()        # update weights


env = gym.make("CarRacing-v2", render_mode="human")

try:
    if(config.USE_QTABLE): # use a q table to reach the goal
        # Define the Q table (big discretization: 19051200 states;
        # the little discretization used 27684 states together with config.discretize).
        # NOTE(review): env.action_space.n only exists for discrete action spaces, while the
        # environment above is created with its default arguments - confirm that this branch
        # is run against a discrete-action environment.
        Q = np.zeros([19051200, env.action_space.n])

        for i in range(config.NUM_EPISODES):
            observation, info = env.reset()
            state = config.big_discretize(observation)

            for j in range(500):
                action = select_action(state,epsilon)
                obv, reward, done, truncated, info = env.step(action)
                next_state = config.big_discretize(obv)

                next_max = np.max(Q[next_state])

                # Tabular Q-learning update
                Q[state,action] += alpha*(reward+config.GAMMA*next_max-Q[state,action])
                state = next_state

                episode_reward += reward

                if done or truncated:
                    break

            print("episode: ", i)
            print("episode cumulative reward : ", episode_reward)
            print("epsilon: ",epsilon)
            epsilon = update_epsilon(epsilon)
            cum_reward_table[i]=episode_reward
            episode_reward = 0 #reset the total reward each episode

        #save the q table for testing (full precision)
        np.savetxt('q_table_big_discretization1000.csv', Q, delimiter=',')

    else:             #Use a Neural Network to approximate the Q Function
        for i in range(config.NUM_EPISODES):
            state, info = env.reset()               #The state is a 96x96 Matrix, that contains elements composed by 3 Colours RGB
            # Convert the state into a Grayscale Image, that is a Matrix 96x96 composed by Integer values.
            # NOTE(review): the frame is described as RGB but converted with COLOR_BGR2GRAY, which swaps
            # the red/blue weights; left unchanged so existing checkpoints keep seeing the same input.
            state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)

            # The network input is the stack of the 3 most recent frames
            frames_queue = deque([state]*3, maxlen = 3)

            while(True):

                current_frame = np.array(frames_queue)

                action = select_action_nn(current_frame, epsilon)                      #The Action is composed by 3 Values, that are the steering, gas and braking

                # Skip Frames: repeat the chosen action for up to 3 environment steps and
                # accumulate the reward collected over them
                rew = 0
                for tot in range(3):
                    next_state, reward, done, truncated, info = env.step(action)
                    rew += reward
                    if done or truncated:
                        break

                # Count consecutive agent steps with negative accumulated reward once the first
                # 100 steps of the episode have passed; the episode is ended below when the
                # count exceeds 25. The accumulated `rew` is used, not the last frame's reward.
                tot_negative_reward = tot_negative_reward + 1 if time_frame_counter > 100 and rew < 0 else 0

                # Extra bonus for the model if it uses full gas
                if action[1] == 1 and action[2] == 0:
                    rew *= 1.5

                episode_reward += rew

                next_state = cv2.cvtColor(next_state, cv2.COLOR_BGR2GRAY)

                frames_queue.append(next_state)
                next_frame = np.array(frames_queue)

                # Store the transition with the reward of the whole skipped-frame step (bonus
                # included), i.e. the same quantity that is summed into episode_reward.
                # `buffer` is a deque created with a maximum length, so the oldest item is
                # dropped automatically when it is full.
                buffer.append((current_frame, action_space.index(action), rew, next_frame, done))

                if done or truncated or tot_negative_reward > 25 or episode_reward < 0:
                    epsilon = update_epsilon_nn(epsilon)
                    print("episode ", i)
                    print("episode cumulative reward: ", episode_reward)
                    print("current epsilon: ", epsilon)
                    print("#---------------------------------------------#")
                    break

                #Let's train the Neural Network as soon as the buffer has at least BATCH_SIZE elements
                if(len(buffer) >= config.BATCH_SIZE):
                    _train_on_batch(random.sample(buffer, config.BATCH_SIZE))

                time_frame_counter += 1


            if (i+1) % 5 == 0:
                #save the weight of the network every 5 episodes
                config.save_model(model,optimizer,i+1)
                torch.save({
                    "state_dict": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                }, "../drive/MyDrive/Checkpoint")
                print("Save weigths in: "+ config.CHECKPOINT_FOLDER)

                #update weights of target network every 5 episodes, copying them straight
                #from the online network instead of re-reading a checkpoint from disk
                print("Target network updated")
                target_model.load_state_dict(model.state_dict())


            cum_reward_nn[i]=episode_reward
            # Reset the per-episode bookkeeping
            episode_reward = 0
            tot_negative_reward = 0
            time_frame_counter = 1

finally:
    # Release the environment (and its render window) even if training fails or is interrupted
    env.close()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
episode  0
episode cumulative reward:  -0.21622073578593437
current epsilon:  0.99
#---------------------------------------------#


## Example