In [5]:
import re

import pandas as pd
import numpy as np
from tqdm import tqdm

import keras
from keras import losses, metrics, layers, ops

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [6]:
train_df = pd.read_csv('data/train.csv')

In [7]:
train_set_df = train_df[['agent1', 'agent2', 'EnglishRules', 'LudRules']]

In [8]:
lud_preprocessor = layers.TextVectorization(
    vocabulary='ludii_tokens.dic',
    standardize='strip_punctuation'
)

eng_preprocessor = layers.TextVectorization(
    vocabulary='eng_tokens.dic'
)

max_eng = 0
max_lud = 0

eng_uniques = train_set_df['EnglishRules'].unique()

for engrul in tqdm(eng_uniques):
    eng_vector = eng_preprocessor(engrul)
    max_eng = max(max_eng, eng_vector.shape[0])

print(max_eng)


100%|██████████| 1328/1328 [00:05<00:00, 249.32it/s]

858





In [9]:
lud_uniques = train_set_df['LudRules'].unique()

EQPMNT_RE = r'\(equipment'
RULESTRIP_RE = r'[^a-zA-Z\(\)\{\}]'

for rule in lud_uniques:
    start = re.search(EQPMNT_RE, rule).span()[0]
    pure_rule = rule[start:]
    pure_rule = re.sub(RULESTRIP_RE, ' ', pure_rule)
    lud_vector = lud_preprocessor(pure_rule)
    max_lud = max(max_lud, lud_vector.shape[0])

print(max_lud)

23613


In [10]:
agent1_uniques = train_set_df['agent1'].unique()
agent2_uniques = train_set_df['agent2'].unique()

set(agent1_uniques) >= set(agent2_uniques)

code_agent = {}
agent_code = {}

for id, agent in enumerate(agent1_uniques):
    code_agent[id] = agent
    agent_code[agent] = id


eng_vector.shape=(900,)

lud_vector.shape=(23700,)

agents_to_categorical.shape=(144)

In [11]:
# 900+23700+144 = 24744
# defining input shape and input data distribution

agents_len = 144
engvector_len = 900
ludvector_len = 23700

enter = layers.Input(shape=(24744,))
agent_input_layer = layers.Input(shape=(agents_len,))
agent_input_data = ops.slice(enter, (0,), (agents_len,))

engrul_input_data = ops.slice(enter, (agents_len,), (engvector_len,))

ludrul_input_data = ops.slice(enter, (agents_len+engvector_len,), (ludvector_len,))

In [12]:
# English rules LSTM features extractor.
# 
# Train set contains 384 preextracted features.
# Let`s assume that game rule contains these features and ather that describes 
# Number of these features is unknown and needs to be discovered
# presume that total amount of fetures, that are important for game result prediction is not less then double amount 
# of preextracted features and to be 800

FEATURES_NUM = 800
ENG_VECTOR_DIM = 512
LUD_VECTOR_DIM = 768


In [13]:
# EngRule LSTM

engrul_input_layer = layers.Input(shape=(engvector_len,))
eng_emb = layers.Embedding(input_dim=3692, output_dim=ENG_VECTOR_DIM)(engrul_input_layer)
eng_x = layers.Bidirectional(layers.LSTM(ENG_VECTOR_DIM, return_sequences=True))(eng_emb)
eng_x = layers.Bidirectional(layers.LSTM(ENG_VECTOR_DIM))(eng_x)
eng_out = layers.Dense(FEATURES_NUM, activation='relu')(eng_x)

# model = keras.Model(inputs=[enter], outputs=[eng_out])

# model.summary()

In [14]:
# LudRule LSTM

ludrul_input_layer = layers.Input(shape=(ludvector_len,))
lud_emb = layers.Embedding(input_dim=1240, output_dim=LUD_VECTOR_DIM)(ludrul_input_layer)
lud_x = layers.Bidirectional(layers.LSTM(LUD_VECTOR_DIM, return_sequences=True))(lud_emb)
lud_x = layers.Bidirectional(layers.LSTM(LUD_VECTOR_DIM))(lud_x)
lud_out = layers.Dense(FEATURES_NUM, activation='relu')(lud_x)


In [15]:
assemble = layers.Concatenate(axis=1)([agent_input_layer, eng_out, lud_out])
x = layers.Dense(1200, activation='relu')(assemble)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.3)(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.3)(x)
out = layers.Dense(1, activation='tanh')(x)

model = keras.Model(inputs=[agent_input_layer, engrul_input_layer, ludrul_input_layer],
                    outputs=[out])


In [16]:
# model.summary()

In [17]:
# keras.utils.plot_model(model,
#                        show_shapes=True,
#                        expand_nested=True,
#                        show_layer_activations=True)

## DataGenerator

In [37]:

class GameDataGenerator(keras.utils.Sequence):
    eqpmnt_re = r'\(equipment'
    rulestrip_re = r'[^a-zA-Z\(\)\{\}]'
    engvector_len = 900
    ludvector_len = 23700
    
    def __init__(self, list_IDs, dataframe, labels_dict, batch_size=32, shuffle=True):
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.labels_dict = labels_dict
        self.df = dataframe[['agent1', 'agent2', 'EnglishRules', 'LudRules']]
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.list_IDs))
        self._agent_code = agent_code
    
    @property
    def agent_code(self):
        return self._agent_code

    @agent_code.setter
    def agent_code(self):
        agents = self.df['agent1'].unique().tolist()
        agents.sort()
        ret = {}
        for id_, agent in enumerate(agents):
            ret[agent] = id_
        self._agent_code = ret

    def __len__(self):
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, batch_num):
          # Generate indexes of the batch
        indexes = self.indexes[batch_num*self.batch_size:(batch_num+1)*self.batch_size]
        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_ends(self):
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)
    
    def _lud_prepare(self, rule) -> str:
        start = re.search(self.eqpmnt_re, rule).span()[0]
        pure_rule = rule[start:]
        pure_rule = re.sub(self.rulestrip_re, ' ', pure_rule)
        return pure_rule

    def __data_generation(self, list_IDs_temp):
        X = np.empty(self.batch_size, dtype=np.int32)
        y = []

        agents_num = len(self.agent_code)
        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            row = self.df.loc[ID]
            agent1, agent2, engrul, ludrul = row
            
            y.append(self.labels_dict[ID])

            agent1_encoded = keras.utils.to_categorical(self.agent_code[agent1], agents_num)
            agent2_encoded = keras.utils.to_categorical(self.agent_code[agent2], agents_num)
            agents_encoded = np.hstack((agent1_encoded, agent2_encoded))
            
            engrul_vector = eng_preprocessor(engrul)
            engrul_vector = keras.utils.pad_sequences((engrul_vector,), maxlen=self.engvector_len)

            ludrul_vector = lud_preprocessor(self._lud_prepare(ludrul))
            ludrul_vector = keras.utils.pad_sequences((ludrul_vector,), maxlen=self.ludvector_len)

            X[i] = [agents_encoded, engrul_vector, ludrul_vector]

        return X, np.array(y)

In [21]:
train_shuffled_df = shuffle(train_df)

labels = train_shuffled_df['utility_agent1'].to_dict()

X_train, X_test = train_test_split(train_shuffled_df, test_size=0.25)

In [38]:
train_generator = GameDataGenerator(X_train.index, train_shuffled_df, labels)
test_generator = GameDataGenerator(X_test.index, train_shuffled_df, labels)

In [29]:
model.compile(optimizer='adam',
              loss=losses.MeanSquaredError,
              metrics=[metrics.F1Score]
              )

In [39]:
model.fit(train_generator, epochs=10, validation_data=test_generator, verbose=1)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3,) + inhomogeneous part.