In [None]:
import numpy as np
import copy
from collections import Counter
from multiprocessing import Pool
import time

In [None]:
from multiprocessing import cpu_count
# Display the number of CPUs reported by multiprocessing for this machine.
cpu_count()

In [None]:
# this class plays the games
class conny4:
    """Connect-Four game state on a 6x7 board.

    Cells hold 1 for Red, -1 for Yellow and 0 when empty. Row 0 is the top of
    the board and row 5 the bottom, so a dropped piece lands on the largest
    free row index of its column. Red moves first.
    """

    def __init__(self, start_board = None):
        """Create a game, optionally resuming from `start_board`.

        Args:
            start_board: 6x7 array to copy as the initial position, or None for
                an empty board. The side to move is inferred from the sum of
                the board: 0 means Red to move, 1 means Yellow to move. Any
                other sum is treated as an unsupported position and the game
                restarts from an empty board with Red to move.
        """
        # A fresh array per call avoids sharing one default array between games.
        if start_board is None:
            start_board = np.zeros((6,7))
        self.board = copy.deepcopy(start_board)
        # The instance attribute deliberately shadows the nested `player` class.
        self.player = self.player()

        total = np.sum(self.board)
        if total == 1:
            self.player.player = 'Y'
        elif total == 0:
            self.player.player = 'R'
        else:
            # Unsupported position: restart, and make sure a side to move is set
            # (leaving it as None made every later turn fail with a KeyError).
            self.board = np.zeros((6,7))
            self.player.player = 'R'

    class player:
        """Side-to-move bookkeeping plus the lookup tables for both colours."""

        def __init__(self):
            self.player = None                                # 'R' or 'Y'
            self.player_swap = {'R':'Y', 'Y':'R'}             # who moves next
            self.player_score = {'R':1, 'Y':-1}               # board value of a piece
            self.player_name = {'R':'Red', 'Y':'Yellow'}      # display names

    def legal(self):
        """Return the list of columns that can still be played ([] once the game is over)."""
        if self.game_end()[0]:
            return []
        return [col for col in range(7) if self.board[0,col] == 0]

    def turn(self, pos):
        """Drop the current player's piece into column `pos`.

        Illegal moves (full column, unknown column, finished game) are ignored.
        The side to move only changes when the move did not end the game.
        """
        if pos not in self.legal():
            return
        # Number of empty cells in the column, minus one, is the landing row.
        row = int(np.count_nonzero(self.board[:,pos] == 0)) - 1
        self.board[row,pos] = self.player.player_score[self.player.player]
        if not self.game_end()[0]:
            self.player.player = self.player.player_swap[self.player.player]

    def game_end(self):
        """Report whether the game is over.

        Returns:
            (True, 1) if Red has four in a row, (True, -1) if Yellow has,
            (True, 0) if the board is full without a winner, otherwise (False,).
        """
        # The winner is read from the winning line itself, so the answer is
        # right even for a board that was loaded in an already-won state.
        winner = self._winner()
        if winner != 0:
            return(True, winner)
        if not np.any(self.board == 0):
            return(True, 0)
        return(False,)

    def score_update(self):
        """Return the largest absolute line sum on the board (4 means four in a row)."""
        best = 0
        for total in self._line_sums():
            best = max(best, abs(total))
            if best == 4:
                return(best)
        return(best)

    def _line_sums(self):
        """Yield the signed sum of every 4-cell line in every 4x4 window of the board."""
        for i in range(3):
            for j in range(4):
                x4 = self.board[i:4+i,j:4+j]
                for k in range(4):
                    yield np.sum(x4[:,k])   # vertical line
                    yield np.sum(x4[k,:])   # horizontal line
                yield np.trace(x4)
                yield np.trace(np.flip(x4,0))

    def _winner(self):
        """Return 1 or -1 for the owner of a completed line, or 0 if there is none."""
        for total in self._line_sums():
            if abs(total) == 4:
                return 1 if total > 0 else -1
        return 0


In [None]:
# lookup-table model for the initial games when nothing is known
class default_model:
    """Value model backed by a plain dictionary of known board values."""

    def __init__(self, st_va):
        """Store `st_va`, a dict keyed by `board.tobytes()`."""
        self.st_va = st_va

    def predict(self, state):
        """Return the stored value of the board `state`, or 0 if it is unknown.

        The membership test must use the same bytes key as the lookup: a NumPy
        array is unhashable, so testing the array itself raises a TypeError.
        """
        return self.st_va.get(state.tobytes(), 0)

# this function repeatedly plays the conny4 class against itself using a model, then outputs the dictionary of states visited and the values determined through value iteration.
def q_l(model = default_model(st_va = {}), games = 100, learn_rate = 0.1, discount = 0.9, exploit = 0.5, seed = 0):
    """Play `games` self-play games of conny4 and return the learned state values.

    Args:
        model: object with a `predict(board)` method returning the estimated
            value of a board, on the same scale as the rewards (+1 Red win,
            -1 Yellow win, 0 draw).
        games: number of games to play.
        learn_rate: step size of the incremental value update.
        discount: factor applied to the reward for every move further away
            from the end of the game.
        exploit: threshold compared with a uniform random draw on every move.
            The model-guided move is taken only when the draw is GREATER than
            this value, so 1 means "always move at random" and 0 means
            "(almost) always follow the model".
        seed: seed for NumPy's global random generator.

    Returns:
        dict mapping `board.tobytes()` to the value estimate of that board.
    """
    np.random.seed(seed)
    st_va = {}
    for _ in range(games):

        game = conny4()
        states = [np.copy(game.board)]
        # play until end game
        while not game.game_end()[0]:
            use_model = np.random.rand() > exploit
            legal = game.legal()
            # Shuffling makes ties between equally scored moves break at random.
            np.random.shuffle(legal)

            if use_model:
                # +1 when Red is choosing, -1 when Yellow is: turns board values
                # into "good for the player about to move".
                mover_sign = game.player.player_score[game.player.player]
                scores = {}
                for candidate in legal:
                    # conny4 copies the board it is given, so the real game is
                    # never touched while candidate moves are tried out.
                    trial = conny4(start_board = game.board)
                    trial.turn(candidate)
                    outcome = trial.game_end()
                    if outcome[0]:
                        scores[candidate] = outcome[1] * mover_sign
                    else:
                        scores[candidate] = model.predict(trial.board) * mover_sign
                move = max(scores, key = scores.get)
            else:
                move = legal[0]

            game.turn(move)
            states.append(np.copy(game.board))

        # game ends, perform value iteration
        reward = game.game_end()[1]
        st_va[states[-1].tobytes()] = reward
        for board in reversed(states[:-1]):
            target = discount * reward
            key = board.tobytes()
            mirror_key = np.fliplr(board).tobytes()
            # A left/right-symmetric board is its own mirror image; update it
            # once only, otherwise it would learn twice as fast as other boards.
            keys = (key,) if mirror_key == key else (key, mirror_key)
            for k in keys:
                st_va[k] = ((1 - learn_rate) * st_va.get(k, 0)) + (learn_rate * target)
            reward = target

    return(st_va)

def merge_dicts(dict_arr, st_va = None):
    """Average the values of several state-value dictionaries key by key.

    Args:
        dict_arr: iterable of dicts mapping a state key to a numeric value.
        st_va: optional existing dict that takes part in the average exactly
            like one more entry of `dict_arr`.

    Returns:
        dict mapping every key seen in any input to the mean of its values
        over the inputs that contain it.
    """
    sources = list(dict_arr)
    if st_va:
        # Count st_va in the denominator too; summing it without counting it
        # skews the mean and divides by zero for keys only st_va contains.
        sources.append(st_va)

    totals = {}
    counts = {}
    for dic in sources:
        for key, value in dic.items():
            totals[key] = totals.get(key, 0) + value
            counts[key] = counts.get(key, 0) + 1
    return {key: totals[key] / counts[key] for key in totals}

def time_taken(x=3665):
    """Print the duration `x` (in seconds) split into whole hours, minutes and seconds."""
    hours = np.floor(x / 3600)
    minutes = np.floor((x % 3600) / 60)
    seconds = np.floor(x % 60)
    print(f'Time Taken: {hours}H: {minutes}M: {seconds}S')


In [None]:
start = time.time()
cores = 5

# One argument tuple per worker process, in q_l's parameter order:
# (model, games, learn_rate, discount, exploit, seed). Each worker gets its own random seed.
args = [
    (default_model({}), 1000000, 0.1, 0.9, 1, np.random.randint(2147483647))
    for _ in range(cores)
]
# Run the workers in parallel and average their state-value dictionaries.
with Pool(processes = cores) as p:
    st_va = merge_dicts(p.starmap(q_l, args))

time_taken(time.time() - start)

# 2.7M states (from 1M games) use roughly 12 GB of memory

In [None]:
# Every key of st_va is the raw buffer of a board (written with `tobytes()`); decode it
# with np.frombuffer's default float dtype and give it the (6, 7, 1) shape the CNN input uses.
# The ndarray.reshape method is used because the `newshape=` keyword of np.reshape is
# deprecated in recent NumPy releases.
X = np.array([np.frombuffer(i).reshape(6, 7, 1) for i in st_va])
# Targets in the same key order as X.
y = np.array([st_va[i] for i in st_va])
# The dictionary is no longer needed once it has been converted to arrays.
del(st_va)

In [None]:
# Sanity check: y should have one target per board along X's first axis.
print(y.shape, X.shape)

# Construct and tune CNN on base data which will be updated with incremental training


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, InputLayer, Conv2D, Concatenate, Dropout, Flatten
from tensorflow.keras import Model, Input
import keras_tuner as kt

In [None]:
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Convolutional front end: three parallel Conv2D towers with different kernel sizes
# read the same 6x7x1 board, and their feature maps are concatenated and flattened.
input_shape = Input(shape=(6, 7, 1))
tower_1, tower_2, tower_3 = [
    Conv2D(32, kernel, padding='same', activation='relu')(input_shape)
    for kernel in ((4, 4), (2, 2), (1, 1))
]
merged = Concatenate()([tower_1, tower_2, tower_3])
out = Flatten()(merged)
conv_model = Model(inputs=input_shape, outputs=out)

def model_builder(hp):
    """Build and compile one candidate value network for Keras Tuner.

    The network is the convolutional front end followed by five ReLU Dense
    layers (the first three each followed by Dropout) and a single linear
    output unit.

    Args:
        hp: the `HyperParameters` object supplied by the tuner. The function
            registers `units_1` .. `units_5`, `drop_1` .. `drop_3` and
            `learning_rate` on it.

    Returns:
        The compiled Keras model (Adam optimiser, mean-absolute-error loss).
    """
    model = tf.keras.Sequential()
    # Clone the front end so every trial gets its own freshly initialised conv
    # layers. Adding `conv_model` itself would make all trials share, and keep
    # training, one single set of convolution weights.
    model.add(tf.keras.models.clone_model(conv_model))

    # (name suffix, largest unit count, step, followed by a Dropout layer?)
    dense_specs = (
        (1, 512, 32, True),
        (2, 512, 32, True),
        (3, 256, 16, True),
        (4, 128, 8, False),
        (5, 64, 4, False),
    )
    for idx, max_units, step, with_dropout in dense_specs:
        hp_units = hp.Int(f'units_{idx}', min_value=8, max_value=max_units, step=step)
        model.add(Dense(units=hp_units, activation='relu'))
        if with_dropout:
            hp_fl = hp.Float(f'drop_{idx}', 0.1, 0.5)
            model.add(Dropout(hp_fl))

    model.add(Dense(1, activation='linear'))

    learning_rate = hp.Float("learning_rate", min_value=1e-5, max_value=1e-1, sampling="log")
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.MeanAbsoluteError(),
                  metrics=[tf.keras.metrics.MeanAbsoluteError(),
                           tf.keras.metrics.LogCoshError()])
    return model

# Smoke test: build one model with default hyperparameters to check that it constructs and compiles.
model_builder(kt.HyperParameters())

In [None]:
# Hyperband search over the hyperparameters registered by model_builder.
# Trials are ranked on the validation error (the search supplies a validation split);
# ranking on the training error would simply reward the models that memorise best.
tuner = kt.Hyperband(model_builder,
                     objective = kt.Objective("val_mean_absolute_error", direction = "min"),
                     max_epochs = 200,
                     overwrite = True, # the tuner needs a results directory; start it afresh on every run
                     project_name = None)

In [None]:
start = time.time()
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # silences the (numerous) deprecation warnings
# Stop a trial once the error on the held-out validation split has not improved for
# 5 epochs; monitoring the training error would let over-fitting trials run on.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_mean_absolute_error', patience=5)
# NOTE(review): Hyperband appears to set each trial's epoch budget itself (bounded by
# the tuner's max_epochs), so `epochs` here may be ignored - confirm.
tuner.search(X, y,
             epochs = 50,
             validation_split = 0.2,
             callbacks = [callback],
             verbose = 1)

In [None]:
# Print the time elapsed since `start` was last set.
time_taken(time.time() - start)

In [None]:
# Keep the winning hyperparameters and the model built from them under separate names:
# `.values` exists on the HyperParameters object, not on the Keras model.
best_hps = tuner.get_best_hyperparameters()[0]
# NOTE: build() constructs a new model from the hyperparameters; it does not load the
# weights of the best trial. Fit it (or use tuner.get_best_models()) before relying on its predictions.
model = tuner.hypermodel.build(best_hps)
best_hps.values

In [None]:
# Save the model to disk.
# NOTE(review): hard-coded absolute path - consider making it configurable. It also has no
# file extension; some Keras versions require '.keras' or '.h5' - confirm against the installed version.
path = '/home/conny4-Model'
model.save(path)
# Reload later with: tf.keras.models.load_model(path)

In [None]:
# Completion marker - presumably here to show in the output that the cells above have finished.
print('finito') 

In [None]:
# Display the hyperparameter sets of the three best trials.
tuner.get_best_hyperparameters(num_trials = 3)

In [None]:
# Print the tuner's summary of the search results.
tuner.results_summary()

### Plan going forwards

With this we have run 1 million games in parallel and used them to determine the best hyperparameters for the CNN.
Using this as our CNN, we will do the following to fully train the model:

1. Run 1 million games with the current CNN.
2. Feed the resulting score dictionary into the CNN - incremental learning with `train_on_batch` <br> https://stackoverflow.com/questions/64796163/is-incremental-learning-possible-with-tensorflow
3. Adjust the exploit rate in the `q_l` function and the learning rate of the CNN.
4. Repeat steps 1-3.

To adjust the exploit rate and the learning rates well, I have developed the function below to map iterative calls to a desired function.