Cleaning

In [22]:
import os
import re

def parse_battle_log(log_text: str, username: str) -> list:
    """Turn one pipe-delimited battle log into (state, action) pairs for a player.

    The log is scanned twice. The first pass locates the ``|player|<role>|<name>``
    line whose name matches ``username`` (case-insensitively) to learn which
    role prefix (e.g. ``p1`` / ``p2``) belongs to that player. The second pass
    walks the log in order, accumulating a running textual context made of
    turn markers, moves, switches and faints. Every time the tracked player
    moves or switches, the context seen *so far* is emitted as the state and
    the player's choice as the action.

    Args:
        log_text: Full text of a single battle log.
        username: Name of the player whose decisions should be extracted.

    Returns:
        A list of ``(state_text, action_text)`` tuples in log order, where
        ``state_text`` is the space-joined context preceding the action and
        ``action_text`` is ``"move <name>"`` or ``"switch <species>"``.

    Raises:
        ValueError: If no ``|player|`` line names ``username``.
    """
    lines = [ln.strip() for ln in log_text.splitlines() if ln.strip()]

    # Pass 1: work out which side the requested user is playing.
    wanted_name = username.lower()
    player_role = None
    for ln in lines:
        if ln.startswith("|player|"):
            parts = ln.split("|")
            if len(parts) >= 4 and parts[3].lower() == wanted_name:
                player_role = parts[2]  # 'p1' or 'p2'
                break
    if player_role is None:
        raise ValueError("Username not found in log")

    data = []
    context_lines = []
    current_turn = 0

    # Pass 2: replay the log, emitting a sample at each of the player's actions.
    for ln in lines:
        # Join / leave / chat lines carry no battle information.
        if ln.startswith(("|j|", "|l|", "|c|")):
            continue

        if ln.startswith("|turn|"):
            try:
                current_turn = int(ln.split("|")[2])
            except (IndexError, ValueError):
                # Unreadable turn number: assume the turn counter simply advanced.
                current_turn += 1
            context_lines.append(f"Turn {current_turn}")
            continue

        if ln.startswith(("|move|", "|switch|")):
            parts = ln.split("|")
            if len(parts) < 4:
                # Truncated line with no move/species field: skip it rather than
                # raising IndexError and losing every sample from this log.
                continue
            action_type, actor, detail = parts[1], parts[2], parts[3]
            if action_type == "move":
                action_text = f"move {detail}"
            else:
                # Switch details look like "Species, L50, M"; keep the species only.
                action_text = f"switch {detail.split(',')[0]}"

            if actor.startswith(player_role):
                # The state is everything observed *before* this decision.
                data.append((" ".join(context_lines), action_text))
                context_lines.append(action_text)
            else:
                context_lines.append(f"opponent {action_text}")
        elif ln.startswith("|faint|"):
            faint_parts = ln.split("|")
            if len(faint_parts) >= 3:
                is_ally = faint_parts[2].startswith(player_role)
                context_lines.append("ally fainted" if is_ally else "opponent fainted")

    return data

# === ITERATE THROUGH FOLDER ===

username = "coatoverwatch"
log_folder = "DoubleBattleLogs"
all_samples = []

# os.listdir returns entries in arbitrary order; sorting makes the order of
# all_samples (and therefore any later seeded shuffle or split) reproducible.
for filename in sorted(os.listdir(log_folder)):
    if not filename.endswith(".log"):
        continue
    filepath = os.path.join(log_folder, filename)
    try:
        # Open inside the try so an unreadable or badly encoded file is reported
        # and skipped like any other per-file failure instead of aborting the scan.
        with open(filepath, "r", encoding="utf-8") as f:
            log_text = f.read()
        samples = parse_battle_log(log_text, username)
    except Exception as e:
        # Best-effort ingestion: one bad log must not discard the others.
        print(f"Error parsing {filename}: {e}")
        continue
    all_samples.extend(samples)
    print(f"Parsed {len(samples)} from {filename}")

print(f"\nTotal state-action pairs collected: {len(all_samples)}")
print("Example sample:", all_samples[0] if all_samples else "No samples found.")


Parsed 27 from gen9doublesubers-2374223819.log
Parsed 11 from gen9doublesubers-2374225913.log
Parsed 25 from gen9doublesubers-2374228798.log
Parsed 29 from gen9doublesubers-2374524555.log
Parsed 19 from gen9doublesubers-2374529916.log
Parsed 22 from gen9doublesubers-2374533533.log
Parsed 21 from gen9doublesubers-2374545013.log
Parsed 15 from gen9doublesubers-2374589924.log
Parsed 6 from gen9doublesubers-2374592302.log
Parsed 35 from gen9doublesubers-2374595064.log
Parsed 16 from gen9doublesubers-2374600429.log
Parsed 20 from gen9doublesubers-2374603218-i2jq8es1gtleug5vxnmvfn6xrqy58c1pw.log
Parsed 25 from gen9doublesubers-2374608156-cf6wq2nh5is68mk15iq4j1vx072jnkgpw.log
Parsed 41 from gen9doublesubers-2374618018.log
Parsed 22 from gen9doublesubers-2374624385.log
Parsed 28 from gen9doublesubers-2374633057-ky2fuunboavtsa55fwspvn676r1p2hipw.log

Total state-action pairs collected: 362
Example sample: ('opponent switch Maushold-Four opponent switch Archaludon', 'switch Amoonguss')


Tokenizer

In [23]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Split the collected (state, action) pairs into two parallel lists.
states = [s for (s, a) in all_samples]
actions = [a for (s, a) in all_samples]

# Whitespace tokenizer over the state strings. No characters are filtered and
# case is kept; tokens unseen during fitting map to "<UNK>".
tokenizer = Tokenizer(num_words=None, filters="", lower=False, oov_token="<UNK>")
tokenizer.fit_on_texts(states)
# +1 because index 0 is never assigned to a token (it is the padding value used
# below); "<UNK>" already has its own entry in word_index.
vocab_size = len(tokenizer.word_index) + 1
# Convert state texts to sequences of integers
state_sequences = tokenizer.texts_to_sequences(states)
# Pad sequences to a fixed length (maxlen)
maxlen = 200  # you can choose a max length, e.g. 200 tokens
# States are written in chronological order, so the END of a sequence holds the
# most recent events. Some states are longer than maxlen; truncating='pre' drops
# the oldest tokens and keeps the recent context the decision depends on.
# Padding stays at the end so kept tokens always start at position 0.
state_sequences = pad_sequences(state_sequences, maxlen=maxlen, padding='post', truncating='pre')

# Create mapping for actions to integer labels (sorted for a stable label order)
action_to_idx = {act: i for i, act in enumerate(sorted(set(actions)))}
idx_to_action = {i: act for act, i in action_to_idx.items()}
num_actions = len(action_to_idx)
# Convert actions to numeric labels
action_labels = [action_to_idx[a] for a in actions]

In [24]:
import tensorflow as tf
from tensorflow.keras import layers, models

# ---- Model hyperparameters ----

# Sizes dictated by the preprocessing step.
vocab_size = len(tokenizer.word_index) + 1   # number of rows in the token embedding table
num_actions = len(action_to_idx)             # one softmax output per distinct action
maxlen = 200                                 # must equal the padded sequence length

# Free architecture choices.
embed_dim = 64    # width of each token / position embedding vector
num_heads = 4     # attention heads in the transformer block
ff_dim = 128      # hidden units of the block's feed-forward sub-network

# 1. Token and Position Embedding Layer
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions the position table can represent.
        vocab_size: Number of distinct token ids the token table can represent.
        embed_dim: Size of each embedding vector.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...),
            forwarded to the base class so the layer can be named and rebuilt
            from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb   = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        # Use the runtime length of the last axis rather than the constructor's
        # maxlen, so positions always line up with the tokens actually passed in.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions  # position vectors broadcast across the leading axes

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

# 2. Transformer Encoder Block
class TransformerBlock(layers.Layer):
    """Single encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: Feature size of the block's input and output; also passed
            to ``MultiHeadAttention`` as ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Hidden size of the feed-forward sub-network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Standard ``Layer`` keyword arguments (``name``, ``dtype``, ...),
            forwarded to the base class so the layer can be named and rebuilt
            from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=False):
        # Self-attention: the sequence is both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)     # add & norm
        # Feed-forward
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)        # add & norm
        return out2

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

# 3. Assemble the classifier from the layers defined above.
inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)

# Encoder: embed the token ids, then run a single transformer block over them.
# (More TransformerBlock layers could be stacked here for a deeper model.)
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(embedding_layer(inputs))

# Classification head: average over the sequence axis to get one vector per
# sample, then a small dense layer with light dropout on either side.
for head_layer in (
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.1),
):
    x = head_layer(x)
outputs = layers.Dense(num_actions, activation='softmax')(x)

# Integer class labels are expected, hence the sparse cross-entropy loss.
model = models.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 200)]             0         
                                                                 
 token_and_position_embeddi  (None, 200, 64)           28160     
 ng_4 (TokenAndPositionEmbe                                      
 dding)                                                          
                                                                 
 transformer_block_4 (Trans  (None, 200, 64)           83200     
 formerBlock)                                                    
                                                                 
 global_average_pooling1d_4  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_18 (Dropout)        (None, 64)                0   

In [28]:
import numpy as np
from sklearn.model_selection import train_test_split

# Materialise the integer labels as an ndarray before splitting and fitting.
action_labels = np.array(action_labels)

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# itself repeatable for a given sample order.
split = train_test_split(
    state_sequences,
    action_labels,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split

# Fit for a fixed number of epochs, scoring the held-out set after each one.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=32,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [7]:
# Token count of every state string, before any padding or truncation.
seq_lengths = list(map(len, tokenizer.texts_to_sequences(states)))

longest = max(seq_lengths)
print(f"Max length: {longest}")

# Simple rank-based percentile: the element at index floor(0.95 * n) of the
# sorted lengths.
p95 = sorted(seq_lengths)[int(len(seq_lengths) * 0.95)]
print(f"95th percentile: {p95}")


Max length: 292
95th percentile: 223
