In [2]:
from core import game
import random
from env import TetrisEnv
import numpy as np
import gym
from keras.layers import Conv2D,Dense, Flatten, Input, concatenate, Reshape, MaxPooling2D
from keras.models import Sequential 
from keras.optimizers import Adam
from keras.models import Model
import keras
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import datetime
from IPython.display import display, clear_output

In [3]:
# --- Run configuration and hyperparameters ---
seed = 42
discount_factor = 0.95  # Discount applied to estimated future rewards
epsilon = 1.0  # Epsilon-greedy parameter: probability of taking a random action
epsilon_min = 0.01  # Minimum epsilon greedy parameter
epsilon_max = 1.0  # Maximum epsilon greedy parameter
epsilon_interval = 0.993  # Multiplicative decay factor for epsilon (< 1 shrinks it)
batch_size = 64  # Size of batch taken from replay buffer
max_steps_per_episode = 500000  # NOTE(review): not referenced in the visible cells

# Apply the seed to every RNG this notebook imports; previously `seed` was
# declared but never used, so no run was reproducible.
# NOTE(review): whether TetrisEnv draws pieces from `random` or `np.random`
# is not visible here -- confirm the environment is covered by these seeds.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Project-local Tetris environment (imported from `env`).
env = TetrisEnv()

In [4]:
# NOTE(review): these two constants are not referenced anywhere in the visible
# cells. Presumably the number of placement columns and piece rotations --
# confirm against the environment or remove if unused.
num_action_loc = 12
num_action_rot = 4

def create_q_model():
    """Build the value network used to score candidate board states.

    Architecture: 4 input features -> Dense(32, relu) -> Dense(32, relu)
    -> Dense(1, linear).

    Returns:
        An uncompiled ``keras.models.Model`` mapping a ``(batch, 4)`` input
        to a ``(batch, 1)`` scalar value estimate.
    """
    # `shape` must be a tuple: `(4)` is just the int 4, which newer Keras
    # releases reject. `(4,)` is accepted by every version.
    input_all = Input(shape=(4,), name='Input')

    hidden = Dense(32, activation='relu')(input_all)
    hidden = Dense(32, activation='relu')(hidden)

    # Single linear unit: one unbounded value per input row.
    output = Dense(1, activation='linear')(hidden)

    return Model(inputs=input_all, outputs=output)

q_model = create_q_model()

# Pass the configured optimizer *instance* to compile(). The previous
# `optimizer='Adam'` string built a fresh default Adam, so the clipnorm
# setting below was silently ignored.
optimizer = keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1.0)
q_model.compile(loss='mse', optimizer=optimizer)

# Experience replay buffers (parallel lists: index i in each list describes
# the same transition).
current_state_memory = []
next_state_memory = []
reward_memory = []
done_memory = []

# Total reward collected in each finished episode.
cumm_reward_history = []

# Train once every this many episodes.
episode_train_interval = 2

episode_count = 0
frame_count = 0
# Number of frames to take random action and observe output
epsilon_random_frames = 1
# Number of frames for exploration
epsilon_greedy_frames = 10000
# Train the model after 4 actions
update_after_actions = 4

In [5]:
import time

# Directory holding one numbered "<n>.txt" reward log per run.
OUTPUT_DIR = os.path.join("..", "Tetris AI", "Outputs")
# Maximum number of transitions kept in the replay buffers.
REPLAY_CAPACITY = 512


def _next_log_path(output_dir):
    """Return the path of a new '<n>.txt' log, one past the highest existing number.

    Creates ``output_dir`` if it is missing, ignores files that are not
    '<digits>.txt', and starts at 0 when no numbered log exists yet.
    """
    os.makedirs(output_dir, exist_ok=True)
    numbers = [
        int(stem)
        for stem, ext in (os.path.splitext(name) for name in os.listdir(output_dir))
        if ext == ".txt" and stem.isdigit()
    ]
    return os.path.join(output_dir, "%s.txt" % (max(numbers, default=-1) + 1))


def _select_move(model, candidates, explore_prob):
    """Pick one row of ``candidates`` (shape ``(n, 4)``).

    With probability ``explore_prob`` a uniformly random row is returned;
    otherwise the row with the highest predicted value (first one on ties).
    """
    if np.random.uniform() < explore_prob:
        return candidates[np.random.choice(candidates.shape[0])]
    # One batched forward pass instead of one predict() call per candidate.
    values = model.predict(candidates, verbose=0).reshape(-1)
    return candidates[int(np.argmax(values))]


time.sleep(1)  # NOTE(review): purpose of this pause is not evident; kept as-is
# Negative epsilon: the uniform draw in [0, 1) is never below it, so every
# move is greedy (pure evaluation of the loaded weights).
epsilon = -1
q_model.load_weights('Model_Weights.h5')

# Choose the log file once per run. The old `episode_count == 0` check ran
# after the counter was incremented, so a new file was never started.
log_path = _next_log_path(OUTPUT_DIR)

while True:
    state = env.reset()
    cumm_reward = 0
    done = False
    while not done:
        frame_count += 1

        # Mapping: candidate feature tuple -> action that produces it.
        next_states = env.game.getNextStates()
        candidates = np.array([np.array(features) for features in next_states.keys()])

        best_move = _select_move(q_model, candidates, epsilon)
        best_action = next_states[tuple(best_move.flatten())]
        next_state, reward, done, _ = env.step(best_action)

        cumm_reward += reward

        # `state` is the environment state before the step; it is no longer
        # overwritten by a loop variable while scanning candidates.
        current_state_memory.append(state)
        next_state_memory.append(next_state)
        reward_memory.append(reward)
        done_memory.append(done)

        state = next_state

    cumm_reward_history.append(cumm_reward)
    episode_count += 1

    # Training step -- disabled for this evaluation run.
    # if episode_count % episode_train_interval == 0:
    #     x = []
    #     y = []

    #     batch_current_memory = current_state_memory[-batch_size:]
    #     batch_next_memory = next_state_memory[-batch_size:]
    #     batch_reward_memory = reward_memory[-batch_size:]
    #     batch_done_memory = done_memory[-batch_size:]

    #     next_predicted_q_values = q_model.predict(np.array(batch_next_memory))
    #     for current, reward, done, next_q in zip(batch_current_memory, batch_reward_memory, batch_done_memory, next_predicted_q_values):
    #         if not done:
    #             new_q_value = (reward + discount_factor * next_q)[0]
    #         else:
    #             new_q_value = reward

    #         x.append(current)
    #         y.append(new_q_value)

    #     q_model.fit(np.array(x), np.array(y), batch_size=batch_size)

    #     if epsilon > epsilon_min:
    #         epsilon *= epsilon_interval

    # Append this episode's result; `with` closes the file even on error.
    with open(log_path, "a") as log_file:
        log_file.write("Currently on episode: %s with reward %s\n" % (episode_count, cumm_reward))

    # Keep only the most recent transitions in the replay buffers.
    if len(current_state_memory) > REPLAY_CAPACITY:
        current_state_memory = current_state_memory[-REPLAY_CAPACITY:]
        next_state_memory = next_state_memory[-REPLAY_CAPACITY:]
        reward_memory = reward_memory[-REPLAY_CAPACITY:]
        done_memory = done_memory[-REPLAY_CAPACITY:]

    if cumm_reward > 10000:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

[4]
2
[4]
2
[1]
15
[1]
15
[5]
11
[5]
11
[7]
9
[7]
9
[1]
15
[1]
15
[3]
12
[3]
12
[4]
4
[4]
4
[7]
9
[7]
9
[5]
12
[5]
12
[6]
23
[6]
23
[7]
12
[7]
12
[2]
28
[2]
28
[1]
25
[1]
25
[5]
15
[5]
15
[6]
23
[6]
23
[2]
21
[2]
21
[1]
16
[1]
16
[6]
18
[6]
18
[6]
20
[6]
20
[5]
12
[5]
12
[6]
20
[6]
20
[4]
5
[4]
5
[6]
19
[6]
19
[3]
12
[3]
12
[2]
22
[2]
22
[3]
12
[3]
12
[5]
15
[5]
15
[7]
10
[7]
10
[5]
14
[5]
14
[1]
18
[1]
18
[7]
10
[7]
10
[1]
7
[1]
7
[2]
15
[2]
15
[3]
9
[3]
9
[4]
5
[4]
5
[7]
11
[7]
11
[3]
9
[3]
9
[2]
19
[2]
19
[7]
9
[7]
9
[3]
12
[3]
12
[2]
21
[2]
21
[6]
23
[6]
23
[4]
7
[4]
7
[2]
22
[2]
22
[5]
11
[5]
11
[6]
16
[6]
16
[3]
11
[3]
11
[4]
6
[4]
6
[2]
20
[2]
20
[5]
7
[5]
7
[6]
22
[6]
22
[1]
22
[1]
22
[3]
10
[3]
10
[7]
11
[7]
11
[5]
8
[5]
8
[4]
5
[4]
5
[7]
12
[7]
12
[6]
19
[6]
19
[5]
11
[5]
11
[7]
10
[7]
10
[4]
6
[4]
6
[7]
11
[7]
11
[5]
9
[5]
9
[6]
17
[6]
17
[7]
11
[7]
11
[3]
11
[3]
11
[7]
11
[7]
11
[6]
25
[6]
25
[5]
11
[5]
11
[1]
23
[1]
23
[5]
11
[5]
11
[2]
15
[2]
15
[6]
19
[6]
19
[1]
16
[1]
1

84