This Jupyter notebook shows how to create and train a DQN model for a Four-in-a-Row-Evolution environment.

To use this notebook in Colab follow these instructions:
1. Upload and open this notebook on Google Colab.
2. Download the `train_4inarowEvolution` directory from the corresponding GitHub repository. Compress it into a .zip file named **train_4inarowEvolution.zip** (do NOT change the name).
3. Upload the `train_4inarowEvolution.zip` file to Colab.
4. Run the cells in this notebook.

Due to Colab time limits, it is recommended to set a training timeout of at most 5–6 hours per session.

Once the training stops, a new `train_4inarowEvolution.zip` will be created with the current training status. 

To resume training, download the new zip file, open this notebook in a new Colab session, and upload the `train_4inarowEvolution.zip` you just downloaded. The training state will be restored and training will resume from where it stopped. In this way it is possible to train the model for a large number of episodes across several sessions.

In [1]:
import os, shutil, time, random, zipfile, json
import matplotlib.pyplot as plt
# Tensorflow framework
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
def unzip_directory(zip_path, extract_to):
  """
  Extract every member of the archive at `zip_path` into `extract_to`.

  Args:
    zip_path: path of the zip archive to read.
    extract_to: directory that receives the extracted members.

  A confirmation line is printed once the extraction has finished.
  """
  with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)
  print(f"Extracted {zip_path} to {extract_to}")

def zip_directory(folder_path, zip_path):
  """
  Zips a directory into a zip file.

  Every file found under `folder_path` (recursively) is stored with a name
  relative to `folder_path`, so the archive has no extra top-level folder.

  Args:
    folder_path: directory whose files are archived.
    zip_path: path of the archive to create; an existing file is overwritten.

  If `zip_path` itself lies inside `folder_path`, the archive being written
  is skipped so that it is never stored inside itself.
  """
  # Resolved once so the output archive can be recognised during the walk.
  archive_abspath = os.path.normcase(os.path.abspath(zip_path))
  with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zip_ref:
    for root, _dirs, files in os.walk(folder_path):
      for file in files:
        file_path = os.path.join(root, file)
        if os.path.normcase(os.path.abspath(file_path)) == archive_abspath:
          # The archive already exists on disk (opened in 'w' mode above);
          # adding it would embed a partial copy of itself.
          continue
        arcname = os.path.relpath(file_path, folder_path)
        zip_ref.write(file_path, arcname)
  print(f"Zipped {folder_path} into {zip_path}")

def save_dict_to_json_file(dictionary, filename):
  """
  Saves a dictionary to a JSON file.

  The dictionary is serialized in memory before the target file is opened,
  so a value that cannot be encoded raises without truncating a file that
  already exists at `filename`.

  Args:
    dictionary: JSON-serializable object to store.
    filename: path of the file to (over)write.

  Raises:
    TypeError: if `dictionary` contains a value `json` cannot encode.
  """
  # Serialize first: opening with 'w' truncates the file immediately, and a
  # failure halfway through a streamed dump would leave it corrupt.
  serialized = json.dumps(dictionary, indent=4)
  with open(filename, 'w') as json_file:
    json_file.write(serialized)
  print(f"Dictionary saved to {filename}")

def read_json_file_to_dict(filename):
  """
  Parse the JSON document stored at `filename` and return the result.

  Args:
    filename: path of the JSON file to read.

  Returns:
    The parsed content (a dictionary when the top-level JSON value is an object).
  """
  with open(filename, 'r') as handle:
    parsed = json.loads(handle.read())
  print(f"JSON file {filename} loaded into a dictionary")
  return parsed

In [None]:
# Locations of the training bundle: the folder it unpacks into and the uploaded archive
base_path = "train_4inarowEvolution"
zip_filename = "train_4inarowEvolution.zip"

# First attempt: unpack into a folder named after the bundle
unzip_directory(zip_filename, f"./{base_path}")
# When the archive already carries a top-level folder with that name, the content
# ends up nested one level too deep: discard it and unpack into the current directory
nested_bundle_path = f"{base_path}/{base_path}"
if os.path.exists(nested_bundle_path):
  shutil.rmtree(base_path)
  unzip_directory(zip_filename, "./")

# Settings saved by a previous session take precedence; defaults are used otherwise
train_settings_path = f"{base_path}/train_settings.json"
if not os.path.exists(train_settings_path):
  train_settings = {
    "notes": "3x(32conv3x3)x128+128+128x128 ConfB3: smaller target update interval + increased episodes + conv layers",
    "map_width": 7,
    "map_height": 6,
    "moves_at_time": 3,
    "DQN": "DoubleDuelingDQN", # one of "DQN", "DoubleDQN", "DuelingDQN", "DoubleDuelingDQN"
    "gamma": 0.95,
    "episodes": 50_000,
    "buffer_size": 200_000,
    "min_episodes_before_eps_update": 2_000,
    "eps_start": 1.0,
    "eps_end": 0.05,
    "episodes_after_eps_end": 1_000,
    "max_random_skips": 5, # exploration aid: randomly skip the action with the highest Q value when playing
    "target_update_interval": 5_000, # raise it if the loss is thrashing
    "strategy": "mixed", # one of "random", "target", "mixed"
    "batch_size": 64,
    "min_buffer_size_before_training": 3_000,
    "network_update_per_episode": 5,
    "evaluate_vs_random_interval": 2_000,
    "learning_rate": 0.00001, # or 0.001
    "loss": "huber", # "mse" or "huber"

    # bookkeeping used to restore the conditions of the last training session
    "episodes_done": 0,
    "current_eps": None,
    "training_executions_count": 0,
    "best_win_rate_vs_random": 0.0
  }
else:
  train_settings = read_json_file_to_dict(train_settings_path)

Extracted train_4inarowEvolution.zip to ./train_4inarowEvolution
JSON file train_4inarowEvolution/train_settings.json loaded into a dictionary


In [None]:
from train_4inarowEvolution.DQN import DQN
from train_4inarowEvolution.DoubleDQN import DoubleDQN
from train_4inarowEvolution.DuelingDQN import DuelingDQN
from train_4inarowEvolution.DoubleDuelingDQN import DoubleDuelingDQN
from train_4inarowEvolution.FourInARowEnv import FourInARowEnv

# Environment setup
map_width = train_settings["map_width"]
map_height = train_settings["map_height"]
moves_at_time = train_settings["moves_at_time"]
# Size of the network output layer
# (presumably one Q-value per sequence of `moves_at_time` column choices - TODO confirm)
output_classes = map_width ** moves_at_time
print(f'Map shape is ({map_width}, {map_height}), {moves_at_time=}, {output_classes=}')

state_space = (map_height, map_width)
# NOTE: a plain int, not a 1-tuple (the original `(output_classes)` had no trailing comma)
action_space = output_classes

# Initialize the environment
env = FourInARowEnv(
    map_width=map_width, map_height=map_height,
    moves_at_time=moves_at_time)

# Initialize the chosen DQN model.
# A lookup table replaces the if/elif chain so that an unknown setting fails
# here with a clear message instead of leaving `dqn` undefined (or stale from
# a previous run of this cell).
dqn_classes = {
  "DQN": DQN,
  "DoubleDQN": DoubleDQN,
  "DuelingDQN": DuelingDQN,
  "DoubleDuelingDQN": DoubleDuelingDQN,
}
dqn_type = train_settings["DQN"]
if dqn_type not in dqn_classes:
  raise ValueError(
    f"Unknown DQN type {dqn_type!r} in train_settings['DQN']; "
    f"expected one of {sorted(dqn_classes)}")
dqn = dqn_classes[dqn_type](state_space,
                            action_space,
                            env,
                            gamma=train_settings["gamma"])

# Set loss function ("mse" selects mean squared error, any other value selects Huber)
loss = keras.losses.MeanSquaredError() if train_settings["loss"] == "mse" else keras.losses.Huber()

# Model paths
best_model_path = f"{base_path}/model_checkpoints/best.keras"
last_model_path = f"{base_path}/model_checkpoints/trained_model.keras"
model_to_load_path = last_model_path

# Create or load the model
network_type = "CNN" # "NN" or "CNN"; only consulted for the non-dueling variants
with tf.device('/GPU:0'):
  if os.path.exists(model_to_load_path):
    # A checkpoint from a previous session exists: resume from it
    dqn.load_model(model_to_load_path)
  else:
    # Compile arguments shared by every freshly created architecture
    compile_args = dict(
      optimizer=tf.keras.optimizers.Adam(learning_rate=train_settings["learning_rate"]),
      loss=loss,
      metrics=["accuracy"])
    if "dueling" in dqn_type.lower():
      # Dueling variants receive layer specifications instead of a ready-made network
      dqn.create_model(
        shared_cnn_layers=[(64, (3,3)), (64, (3,3))],
        shared_dense_layers=[256],
        value_stream_layers=[256],
        advantage_stream_layers=[256],
        padding="same",
        **compile_args)
    elif network_type == "CNN":
      padding = "same"
      dqn.create_model(
        keras.Sequential([
          layers.Input(shape=(*state_space, 1)),
          layers.Conv2D(
              filters=32,
              kernel_size=3,
              strides=1,
              padding=padding,
              activation="relu"
          ),
          layers.Flatten(),
          layers.Dense(128, activation='relu'),
          layers.Dense(output_classes)
        ]),
        **compile_args)
    else:
      dqn.create_model(
        keras.Sequential([
          layers.Input(shape=(*state_space, 1)),
          layers.Flatten(),
          layers.Dense(128, activation='relu'),
          layers.Dense(128, activation='relu'),
          layers.Dense(output_classes)
        ]),
        **compile_args)

dqn.model.summary()

Map shape is (7, 6), moves_at_time=3, output_classes=343


In [None]:
# Baseline before (further) training: play 1000 evaluation episodes against a random opponent
baseline_stats = dqn.compute_win_rate_vs_random_opponent(episodes=1000)
initial_win_rate, initial_tie_rate, action_count = baseline_stats
print(f"{initial_win_rate=}, {initial_tie_rate=}")

In [None]:
# Training setup: work out how much of the episode budget is still to be played
episodes = train_settings["episodes"]
episodes_done = train_settings["episodes_done"]
episodes_left = episodes - episodes_done

# Resume from the epsilon stored by a previous session, if there is one
if train_settings["current_eps"] is not None:
  eps_start = train_settings["current_eps"]
else:
  eps_start = train_settings["eps_start"]

# Episodes already played count towards the delay before epsilon starts changing
min_episodes_before_eps_update = max(0, train_settings["min_episodes_before_eps_update"] - episodes_done)
min_buffer_size_before_training = train_settings["min_buffer_size_before_training"]

round_per_episode = dqn.env.game.rounds
# 0 while no training execution has been recorded, 1 once at least one has
resumed_run_factor = min(train_settings["training_executions_count"], 1)
# Number of episodes whose rounds add up to 80% of the replay buffer capacity
buffer_refill_episodes = int((train_settings["buffer_size"]*0.8)//round_per_episode)
warmup_episodes = resumed_run_factor * buffer_refill_episodes

# Arguments for the training call, gathered in one place
train_kwargs = {
  "episodes": episodes_left,
  "buffer_size": train_settings["buffer_size"],
  "min_episodes_before_eps_update": min_episodes_before_eps_update,
  "eps_start": eps_start,
  "eps_end": train_settings["eps_end"],
  "episodes_after_eps_end": train_settings["episodes_after_eps_end"],
  "max_random_skips": train_settings["max_random_skips"],
  "target_update_interval": train_settings["target_update_interval"],
  "strategy": train_settings["strategy"],
  "batch_size": train_settings["batch_size"],
  "min_buffer_size_before_training": min_buffer_size_before_training,
  "network_update_per_episode": train_settings["network_update_per_episode"],
  "evaluate_vs_random_interval": train_settings["evaluate_vs_random_interval"],
  "warmup_episodes": warmup_episodes,
  "best_win_rate_vs_random": train_settings["best_win_rate_vs_random"],
  "checkpoints_pathname": f"{base_path}/model_checkpoints",
  "timeout": 5*60*60,  # 5 hours, expressed in seconds
}

# Train the model
training_completed, curr_parameters_values = dqn.train(**train_kwargs)

warmup_episodes=22857


100%|██████████| 22857/22857 [38:30<00:00,  9.89it/s]


Initial Replay Buffer size: 159999

 * Start training * 



  2%|▏         | 2000/80895 [16:39<355:38:36, 16.23s/it]

 * Train step 2000 ✅ *
Epsilon value: 0.4246
Loss: 0.0025097269099205732
Accuracy: 0.859375
Win rate vs random opponent: 79.4% (tie_rate=0.078)
Replay Buffer size: 173999/200000


  5%|▍         | 4000/80895 [32:34<323:31:48, 15.15s/it]

 * Train step 4000 ✅ *
Epsilon value: 0.4149
Loss: 0.003209829330444336
Accuracy: 0.8359375
Win rate vs random opponent: 79.4% (tie_rate=0.11)
Replay Buffer size: 187999/200000


  7%|▋         | 6000/80895 [48:09<318:10:14, 15.29s/it]

 * Train step 6000 ✅ *
Epsilon value: 0.4051
Loss: 0.004564732313156128
Accuracy: 0.8125
Win rate vs random opponent: 76.0% (tie_rate=0.1)
Replay Buffer size: 200000/200000


 10%|▉         | 8000/80895 [1:03:57<308:55:34, 15.26s/it]

 * Train step 8000 ✅ *
Epsilon value: 0.3954
Loss: 0.004584756214171648
Accuracy: 0.7578125
Win rate vs random opponent: 78.8% (tie_rate=0.082)
Replay Buffer size: 200000/200000


 12%|█▏        | 10000/80895 [1:19:37<298:26:15, 15.15s/it]

 * Train step 10000 ✅ *
Epsilon value: 0.3856
Loss: 0.0030948014464229345
Accuracy: 0.75
Win rate vs random opponent: 72.8% (tie_rate=0.114)
Replay Buffer size: 200000/200000


 15%|█▍        | 12000/80895 [1:35:12<287:35:28, 15.03s/it]

 * Train step 12000 ✅ *
Epsilon value: 0.3759
Loss: 0.004357491619884968
Accuracy: 0.7890625
Win rate vs random opponent: 71.39999999999999% (tie_rate=0.116)
Replay Buffer size: 200000/200000


 17%|█▋        | 14000/80895 [1:50:48<289:07:49, 15.56s/it]

 * Train step 14000 ✅ *
Epsilon value: 0.3662
Loss: 0.0024267600383609533
Accuracy: 0.8046875
Win rate vs random opponent: 72.8% (tie_rate=0.108)
Replay Buffer size: 200000/200000


 20%|█▉        | 16000/80895 [2:07:51<281:07:35, 15.60s/it]

 * Train step 16000 ✅ *
Epsilon value: 0.3564
Loss: 0.0032514859922230244
Accuracy: 0.7421875
Win rate vs random opponent: 65.4% (tie_rate=0.142)
Replay Buffer size: 200000/200000


 22%|██▏       | 18000/80895 [2:24:01<273:56:18, 15.68s/it]

 * Train step 18000 ✅ *
Epsilon value: 0.3467
Loss: 0.003210956696420908
Accuracy: 0.796875
Win rate vs random opponent: 69.8% (tie_rate=0.124)
Replay Buffer size: 200000/200000


 25%|██▍       | 20000/80895 [2:39:54<266:54:19, 15.78s/it]

 * Train step 20000 ✅ *
Epsilon value: 0.3369
Loss: 0.0023622422013431787
Accuracy: 0.8203125
Win rate vs random opponent: 74.0% (tie_rate=0.094)
Replay Buffer size: 200000/200000


 27%|██▋       | 22000/80895 [2:57:24<272:37:41, 16.66s/it]

 * Train step 22000 ✅ *
Epsilon value: 0.3272
Loss: 0.0026873969472944736
Accuracy: 0.7578125
Win rate vs random opponent: 68.8% (tie_rate=0.134)
Replay Buffer size: 200000/200000


 30%|██▉       | 24000/80895 [3:14:11<238:02:08, 15.06s/it]

 * Train step 24000 ✅ *
Epsilon value: 0.3174
Loss: 0.003241804428398609
Accuracy: 0.7421875
Win rate vs random opponent: 62.4% (tie_rate=0.156)
Replay Buffer size: 200000/200000


 32%|███▏      | 26000/80895 [3:30:01<225:13:38, 14.77s/it]

 * Train step 26000 ✅ *
Epsilon value: 0.3077
Loss: 0.002263268455862999
Accuracy: 0.7734375
Win rate vs random opponent: 65.4% (tie_rate=0.134)
Replay Buffer size: 200000/200000


 35%|███▍      | 28000/80895 [3:45:48<218:03:02, 14.84s/it]

 * Train step 28000 ✅ *
Epsilon value: 0.2980
Loss: 0.0018981453031301498
Accuracy: 0.6875
Win rate vs random opponent: 67.80000000000001% (tie_rate=0.108)
Replay Buffer size: 200000/200000


 37%|███▋      | 30000/80895 [4:01:49<207:15:45, 14.66s/it]

 * Train step 30000 ✅ *
Epsilon value: 0.2882
Loss: 0.002871972508728504
Accuracy: 0.7734375
Win rate vs random opponent: 65.2% (tie_rate=0.162)
Replay Buffer size: 200000/200000


 40%|███▉      | 32000/80895 [4:17:42<205:08:15, 15.10s/it]

 * Train step 32000 ✅ *
Epsilon value: 0.2785
Loss: 0.0025002937763929367
Accuracy: 0.734375
Win rate vs random opponent: 64.60000000000001% (tie_rate=0.13)
Replay Buffer size: 200000/200000


 40%|████      | 32491/80895 [4:22:18<6:30:46,  2.06it/s]


 * Timeout * 
Last model Win rate vs random opponent: 64.8%





In [None]:
# Update and save training progress
tot_episodes_done = episodes_done + curr_parameters_values["episodes_done"]
best_win_rate = curr_parameters_values["best_win_rate_vs_random"]
win_rate_history = curr_parameters_values["win_rate_history"]

# Progression recorded by earlier sessions, or an empty record on the first one
win_rate_progression_path = f"{base_path}/win_rate_progression.json"
if os.path.exists(win_rate_progression_path):
  win_rate_progression = read_json_file_to_dict(win_rate_progression_path)
else:
  win_rate_progression = {"win_rate_progression": [], "best_win_rate_vs_random": [], "episodes_count": []}

# NOTE: the stored file is read, extended and written back, so running this cell
# twice for the same training session appends the session's entries twice.
save_dict_to_json_file({
    "win_rate_progression": win_rate_progression["win_rate_progression"] + win_rate_history,
    "best_win_rate_vs_random": win_rate_progression["best_win_rate_vs_random"] + [best_win_rate],
    "episodes_count": win_rate_progression["episodes_count"] + [tot_episodes_done]
}, win_rate_progression_path)

# The history is empty when the session recorded no evaluation; indexing [-1]
# would then raise IndexError and stop "Run all" before the training state
# is stored by the following cells.
final_win_rate = win_rate_history[-1]['win_rate'] if len(win_rate_history) > 0 else None
print(f"final_win_rate={final_win_rate}\t{best_win_rate=}")

JSON file train_4inarowEvolution/win_rate_progression.json loaded into a dictionary
Dictionary saved to train_4inarowEvolution/win_rate_progression.json
final_win_rate=0.646	best_win_rate=0.816


In [None]:
# Record where this session stopped so that the next one can resume from it
train_settings["episodes_done"] = tot_episodes_done
train_settings["current_eps"] = curr_parameters_values["eps"]
train_settings["training_executions_count"] += 1
train_settings["best_win_rate_vs_random"] = best_win_rate
save_dict_to_json_file(train_settings, train_settings_path)
# Zip the updated training files.
# The old archive may already be gone (e.g. this cell is being re-run after a
# failure), so it is removed only when present instead of raising FileNotFoundError.
if os.path.exists(zip_filename):
  os.remove(zip_filename)
zip_directory(base_path, zip_filename)
print("Stored training conditions")

Dictionary saved to train_4inarowEvolution/train_settings.json
Zipped train_4inarowEvolution into train_4inarowEvolution.zip
Stored training conditions


In [None]:
# Automatically start downloading the updated zip file.
# `google.colab` exists only on Colab; elsewhere report where the archive is
# instead of failing on the import.
try:
  from google.colab import files
except ImportError:
  print(f"google.colab is not available; {zip_filename} is stored at {os.path.abspath(zip_filename)}")
else:
  files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>