# ZMUMiG projekt

Autorzy:
- Jarosław Kołodziej
- Przemysław Kożuch


## Importy bibliotek

In [32]:
import sys
path_nb = r'/zmumig/project/'
sys.path.append(path_nb)

In [33]:
#!pip install "gym[atari, accept-rom-license]"

In [34]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
from IPython import display as ipythondisplay
import time

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, Conv2D, Dense, MaxPool2D, Flatten

## Inicjalizacja

In [35]:
env = gym.make("ALE/Pong-v5")
seed = 1
env.reset(seed=seed)

(array([[[  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0],
         ...,
         [109, 118,  43],
         [109, 118,  43],
         [109, 118,  43]],
 
        [[109, 118,  43],
         [109, 118,  43],
         [109, 118,  43],
         ...,
         [109, 118,  43],
         [109, 118,  43],
         [109, 118,  43]],
 
        [[109, 118,  43],
         [109, 118,  43],
         [109, 118,  43],
         ...,
         [109, 118,  43],
         [109, 118,  43],
         [109, 118,  43]],
 
        ...,
 
        [[ 53,  95,  24],
         [ 53,  95,  24],
         [ 53,  95,  24],
         ...,
         [ 53,  95,  24],
         [ 53,  95,  24],
         [ 53,  95,  24]],
 
        [[ 53,  95,  24],
         [ 53,  95,  24],
         [ 53,  95,  24],
         ...,
         [ 53,  95,  24],
         [ 53,  95,  24],
         [ 53,  95,  24]],
 
        [[ 53,  95,  24],
         [ 53,  95,  24],
         [ 53,  95,  24],
         ...,
         [ 53,  95,  24],
  

In [36]:
print("Observation space: {}".format(env.observation_space))
n_actions = env.action_space.n
print("Possible actions: {}".format(n_actions))

Observation space: Box(0, 255, (210, 160, 3), uint8)
Possible actions: 6


## Definicja modelu

In [37]:
def create_test_model():
    x = Input(shape=(210, 160, 3), batch_size=1)
    p = MaxPool2D((7,8))(x)
    c1 = Conv2D(4, (3,3),)(p)
    c2 = Conv2D(6, (3,3),)(c1)
    f = Flatten()(c2)
    d = Dense(units=16, activation='relu')(f)
    d = Dense(units=16, activation='relu')(d)
    out = Dense(units=n_actions, activation='softmax')(d)
    return tf.keras.models.Model(inputs=x, outputs=out)

test_model = create_test_model()
test_model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(1, 210, 160, 3)]        0         
                                                                 
 max_pooling2d_4 (MaxPoolin  (1, 30, 20, 3)            0         
 g2D)                                                            
                                                                 
 conv2d_8 (Conv2D)           (1, 28, 18, 4)            112       
                                                                 
 conv2d_9 (Conv2D)           (1, 26, 16, 6)            222       
                                                                 
 flatten_4 (Flatten)         (1, 2496)                 0         
                                                                 
 dense_12 (Dense)            (1, 16)                   39952     
                                                           

In [38]:
def choose_action(model, observation):
  observation = observation.reshape((1, 210, 160, 3))
  prob_weights = model.predict(observation)
  action = np.random.choice(n_actions, size=1, p=prob_weights.flatten())[0]
  #print("output: {}  decided action: {}".format(prob_weights,action))
  return action

In [39]:
class Memory:
  def __init__(self):
      self.clear()

  def clear(self):
      self.observations = []
      self.actions = []
      self.rewards = []

  def add_to_memory(self, new_observation, new_action, new_reward):
      self.observations.append(new_observation)
      self.actions.append(new_action)
      self.rewards.append(new_reward)

memory = Memory()

In [40]:
def normalize(x):
  x -= np.mean(x)
  x /= np.std(x)
  return x

def discount_rewards(rewards, gamma=0.95):
  discounted_rewards = np.zeros_like(rewards)
  R = 0
  for t in reversed(range(0, len(rewards))):
      # update the total discounted reward
      R = R * gamma + rewards[t]
      discounted_rewards[t] = R
  return normalize(discounted_rewards)

In [41]:
learning_rate = 1e-3
optimizer = Adam(learning_rate)

## Uczenie modelu

In [42]:
test_model = create_test_model()
test_model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')

In [43]:

for i_episode in range(10):
    print('episode:', i_episode)
    # Restart the environment
    observation, reset_info = env.reset(seed=seed)
    print("reset info:",reset_info)
    print("observation:",observation.shape)
    iter = 0
    while True:
        if(iter%10==0):
            print("iter {}".format(iter))
        # using our observation, take an action
        action = choose_action(test_model, observation)
        next_observation, reward, terminated, truncated, info = env.step(action)
        # add to memory
        memory.add_to_memory(observation, action, reward)
        # is the episode over? did you crash or do so well that you're done?
        if terminated or truncated:
            # determine total reward and keep a record of this
            total_reward = sum(memory.rewards)
            print(total_reward, len(memory.actions))
            # initiate training - remember we don't know anything about how the agent is doing until it's crashed!
            test_model.fit(np.stack(memory.observations), np.stack(memory.actions), epochs=1, batch_size=len(memory.observations), sample_weight=discount_rewards(memory.rewards), verbose=0)
            memory.clear()
            break
        # update our observatons
        observation = next_observation
        iter += 1

episode: 0
reset info: {'lives': 0, 'episode_frame_number': 0, 'frame_number': 0, 'seeds': (1835504127, 1731038949)}
observation: (210, 160, 3)
output: [[2.7021549e-14 1.4223623e-01 0.0000000e+00 8.5738707e-01 3.7671495e-04
  0.0000000e+00]]  decided action: 3
output: [[4.2789981e-08 9.8899370e-01 0.0000000e+00 1.1003002e-02 3.3446404e-06
  0.0000000e+00]]  decided action: 1
output: [[3.2149977e-08 9.9808240e-01 0.0000000e+00 1.9171843e-03 5.2278637e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1


  if not isinstance(terminated, (bool, np.bool8)):


output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-03 1.9909254e-07
  0.0000000e+00]]  decided action: 1
output: [[1.2041865e-09 9.9327528e-01 0.0000000e+00 6.7245229e-0

In [47]:
def save_video_of_model(model, env_name='ALE/Pong-v5', filename='pong_seesion.mp4'):
  import skvideo.io

  env = gym.make(env_name, render_mode='rgb_array')
  env = env.unwrapped
  obs, _ = env.reset()

  out = skvideo.io.FFmpegWriter(filename)

  done = False
  while not done:

      frame = env.render()
      out.writeFrame(frame)
      action = model.predict(obs.reshape((1, 210, 160, 3))).argmax()
      obs, reward, terminated, truncated, info = env.step(action)
      done = terminated or truncated
  out.close()
  print("Successfully saved into {}!".format(filename))

filename = "pong_session_" + time.strftime("%Y-%m-%d %H-%M-%S") + ".mp4"
save_video_of_model(test_model, filename=filename)

Successfully saved into pong_session_2024-01-02 20-13-10.mp4!
