Initial Testing of a Decoder-only (GPT style) architecture.

# NOTE:

The tokenizer (and other items) were trained on the FULL dataset. This causes a data leak; proper train/val/test splitting must be done before training in the final product.

# Imports

In [None]:
import os

import tokenizers
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

import transformers
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, load_dataset, load_metric


import ast # Because the tokenized_text column data is stored as a string instead of a list...
import pandas as pd


# Grab the dataset

1) Get the dataset
2) Convert the tokenized_text column from a string to a list of strings
3) Pull out all of the words into a mega-list

In [None]:
# Load the processed card data; point the path at wherever your copy lives.
# Earlier location, kept for reference:
# df = pd.read_csv('../data/processed/mtg_carddata_processed.csv')
processed_cards_csv = '../data_git/mtg_carddata_processed_2_23_25.csv'
df = pd.read_csv(processed_cards_csv)

In [37]:
# Preview the loaded card data.
df

Unnamed: 0,name,mana_cost,type_line,oracle_text,power,toughness,colors,keywords,mtgo_id,loyalty,defense,tokenized_text,tfidf_vector,processed_oracle_text
0,"Nissa, Worldsoul Speaker",{3}{G},Legendary Creature — Elf Druid,"Landfall — Whenever a land you control enters,...",3,3,['G'],['Landfall'],,,,"['Landfall', 'Whenever', 'a', 'land', 'you', '...",[0. 0. 0. ... 0. 0. 0.],Landfall — Whenever a land you control enters ...
1,Static Orb,{3},Artifact,"As long as <name> is untapped, players can't u...",,,[],[],15870.0,,,"['As', 'long', 'as', '<name>', 'is', 'untapped...",[0. 0. 0. ... 0. 0. 0.],"As long as <name> is untapped , players can't ..."
2,Sensory Deprivation,{U},Enchantment — Aura,Enchant creature\r\nEnchanted creature gets -3...,,,['U'],['Enchant'],49283.0,,,"['Enchant', 'creature', '\\n', 'Enchanted', 'c...",[0. 0. 0. ... 0. 0. 0.],Enchant creature \n Enchanted creature gets -3...
3,Road of Return,{G}{G},Sorcery,Choose one —\r\n• Return target permanent card...,,,['G'],['Entwine'],77122.0,,,"['Choose', 'one', '\\n', 'Return', 'target', '...",[0. 0. 0. ... 0. 0. 0.],Choose one — \n • Return target permanent card...
4,Storm Crow,{1}{U},Creature — Bird,Flying (This creature can't be blocked except ...,1,2,['U'],['Flying'],22609.0,,,"['Flying', 'This', 'creature', ""can't"", 'be', ...",[0. 0. 0. ... 0. 0. 0.],Flying (This creature can't be blocked except ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31040,Devoted Hero,{W},Creature — Elf Soldier,,1,2,['W'],[],,,,[],[0. 0. 0. ... 0. 0. 0.],
31041,Without Weakness,{1}{B},Instant,Target creature you control gains indestructib...,,,['B'],['Cycling'],64646.0,,,"['Target', 'creature', 'you', 'control', 'gain...",[0. 0. 0. ... 0. 0. 0.],Target creature you control gains indestructib...
31042,Firesong and Sunspeaker,{4}{R}{W},Legendary Creature — Minotaur Cleric,Red instant and sorcery spells you control hav...,4,6,"['R', 'W']",[],101914.0,,,"['Red', 'instant', 'and', 'sorcery', 'spells',...",[0. 0. 0. ... 0. 0. 0.],Red instant and sorcery spells you control hav...
31043,"Samut, the Tested",{2}{R}{G},Legendary Planeswalker — Samut,+1: Up to one target creature gains double str...,,,"['G', 'R']",[],64772.0,4,,"['1', ':', 'Up', 'to', 'one', 'target', 'creat...",[0. 0. 0. ... 0. 0. 0.],+1 : Up to one target creature gains double st...


In [40]:
# Inspect the first row's raw value; the recorded output shows a single string
# that merely looks like a list, which is why it gets parsed further down.
df['tokenized_text'].iloc[0]

"['Landfall', 'Whenever', 'a', 'land', 'you', 'control', 'enters', ',', 'you', 'get', '{E}', '{E}', 'two', 'energy', 'counters', '.', '\\\\n', 'You', 'may', 'pay', 'eight', '{E}', 'rather', 'than', 'pay', 'the', 'mana', 'cost', 'for', 'permanent', 'spells', 'you', 'cast', '.']"

In [41]:
# Parse the stringified token lists into real Python lists.
# `ast` comes from the notebook's top import cell, so it is not re-imported here.
# Only str cells are parsed: re-running this cell after the column has already
# been converted is then a no-op instead of a ValueError from ast.literal_eval.
df['tokenized_text'] = df['tokenized_text'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

# Display first few rows to verify
df['tokenized_text'].head()

0    [Landfall, Whenever, a, land, you, control, en...
1    [As, long, as, <name>, is, untapped, ,, player...
2    [Enchant, creature, \n, Enchanted, creature, g...
3    [Choose, one, \n, Return, target, permanent, c...
4    [Flying, This, creature, can't, be, blocked, e...
Name: tokenized_text, dtype: object

In [42]:
# After parsing, the first row is a real list, so indexing yields its first token.
df['tokenized_text'].iloc[0][0]

'Landfall'

In [43]:
# Flatten every card's token list into one corpus-wide list of tokens.
all_tokens = [token for tokens in df['tokenized_text'] for token in tokens]

# Sanity check: overall size plus the first 20 tokens
print(f"Total tokens: {len(all_tokens)}")
print("First 20 tokens:", all_tokens[:20])

Total tokens: 974220
First 20 tokens: ['Landfall', 'Whenever', 'a', 'land', 'you', 'control', 'enters', ',', 'you', 'get', '{E}', '{E}', 'two', 'energy', 'counters', '.', '\\n', 'You', 'may', 'pay']


In [47]:
# Select the pre-processed oracle text; this Series is what later gets turned
# into corpus_list and fed to the tokenizer trainer.
# NOTE(review): the displayed output includes a NaN entry (row 31040), so missing
# values need handling before the values are converted to strings.
corpus = df['processed_oracle_text']
corpus

0        Landfall — Whenever a land you control enters ...
1        As long as <name> is untapped , players can't ...
2        Enchant creature \n Enchanted creature gets -3...
3        Choose one — \n • Return target permanent card...
4        Flying (This creature can't be blocked except ...
                               ...                        
31040                                                  NaN
31041    Target creature you control gains indestructib...
31042    Red instant and sorcery spells you control hav...
31043    +1 : Up to one target creature gains double st...
31044                     All Sliver creatures get +1/+1 .
Name: processed_oracle_text, Length: 31045, dtype: object

In [55]:
# Materialize the corpus as a plain Python list: use .tolist() when the object
# provides it, otherwise fall back to list().
if hasattr(corpus, 'tolist'):
    corpus_list = corpus.tolist()
else:
    corpus_list = list(corpus)


In [60]:
# Confirm what kind of object `corpus` is (recorded output: a pandas Series).
type(corpus)

pandas.core.series.Series

In [59]:
# Confirm the element type of the converted list (recorded output: str).
type(corpus_list[0])

str

In [62]:
# Print the first three corpus entries as a quick look at the text format.
print(corpus.head(3))

0    Landfall — Whenever a land you control enters ...
1    As long as <name> is untapped , players can't ...
2    Enchant creature \n Enchanted creature gets -3...
Name: processed_oracle_text, dtype: object


In [63]:
# Convert the corpus Series to a list of plain strings for tokenizer training.
# Cards without oracle text are NaN in the Series; calling str() on NaN would
# inject the literal word 'nan' into the training text, so map NaN to '' first.
# Filling (rather than dropping) keeps corpus_list aligned row-for-row with df.
corpus_list = corpus.fillna('').astype(str).tolist()



In [65]:
corpus_list[0]

'Landfall — Whenever a land you control enters , you get {E} {E} (two energy counters) . \\n You may pay eight {E} rather than pay the mana cost for permanent spells you cast .'

In [73]:
# Confirm the container is a list and that its elements are strings.
# Expected output: <class 'list'> followed by <class 'str'>
for probe in (corpus_list, corpus_list[0]):
    print(type(probe))

<class 'list'>
<class 'str'>


# Tokenize

In [None]:
# Make sure the folder that will hold the saved tokenizer file exists
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)

# WordPiece model; the Whitespace pre-tokenizer splits the raw text into
# word and punctuation pieces before sub-word units are learned
tokenizer = Tokenizer(models.WordPiece())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Padding uses [PAD] with id 0 ([PAD] is listed first among the special tokens
# below, which is what pad_id=0 assumes); sequences are capped at 512 tokens
tokenizer.enable_padding(pad_id=0, pad_token="[PAD]")
tokenizer.enable_truncation(max_length=512)  # open question: shorten this (original note suggested 258)

# Trainer settings: target vocabulary size plus the reserved special tokens
tk_trainer = trainers.WordPieceTrainer(
    vocab_size=8192,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)

# Learn the vocabulary from the in-memory list of card texts
tokenizer.train_from_iterator(corpus_list, trainer=tk_trainer)

# Persist to disk. The "leak" in the file name is a reminder that this tokenizer
# may have been trained on the full dataset (no train/val/test split yet).
tokenizer_path = os.path.join(models_dir, "tokenizer_leak.json")
tokenizer.save(tokenizer_path)

In [86]:
# Verify the tokenizer works
sample_text = corpus_list[0]
encoded = tokenizer.encode(sample_text)
print(f"Encoded: {encoded.tokens}")

Encoded: ['Landfall', '—', 'Whenever', 'a', 'land', 'you', 'control', 'enters', ',', 'you', 'get', '{', 'E', '}', '{', 'E', '}', '(', 'two', 'energy', 'counters', ')', '.', '\\', 'n', 'You', 'may', 'pay', 'eight', '{', 'E', '}', 'rather', 'than', 'pay', 'the', 'mana', 'cost', 'for', 'permanent', 'spells', 'you', 'cast', '.']


In [None]:
# Test the tokenizer on a string with a tab character and non-sensical text
test = tokenizer.encode("This is a test of \\t nonzensicallicalness")
print(test.tokens)

['This', 'is', 'a', 'te', '##st', 'of', '\\', 't', 'non', '##zen', '##sical', '##lic', '##al', '##ness']


## Load your Tokenizer

In [None]:
# Reload the previously trained tokenizer from its saved JSON file instead of
# retraining it.
tokenizer_path = os.path.join(models_dir, "tokenizer_leak.json")
tokenizer = Tokenizer.from_file(tokenizer_path)

# Build the Decoder

In [None]:
# Builds a Keras functional-API model named "transformer" from an encoder
# branch, a decoder branch and a decoder-encoder attention stage.
# NOTE(review): this is an encoder + decoder graph, whereas the notebook intro
# describes a decoder-only (GPT-style) model — confirm which one is intended.
# NOTE(review): `tf`, PositionalEmbedding, EncoderSelfAttention,
# DecoderSelfAttention, DecoderEncoderAttention, AddAndNormalization,
# SEQUENCE_LENGTH, fr_vocab_size, en_vocab_size and en_vec_layer are neither
# imported nor defined in the visible part of this notebook; unless they are
# provided elsewhere, this cell raises NameError on a fresh kernel.

# Hyperparameters shared by the layers below
EMBEDDING_DIM = 256
FEED_FORWARD_DIM = 2048
DROPOUT = 0.5
NUM_HEADS = 8

# Encoder branch
# Integer token ids of variable length -> positional embedding -> self-attention
encoder_inputs = tf.keras.Input(shape=(None,), dtype=tf.int64, name="encoder_inputs")
positional_embedding_fr = PositionalEmbedding(
    fr_vocab_size, EMBEDDING_DIM, SEQUENCE_LENGTH)(encoder_inputs)
encoder_self_attention = EncoderSelfAttention(NUM_HEADS, EMBEDDING_DIM)(positional_embedding_fr)
# Combine the attention output with its own input (residual-style pairing)
add_and_norm_fr_1 = AddAndNormalization()([encoder_self_attention, positional_embedding_fr])
# Feed Forward network
# Expand to FEED_FORWARD_DIM, project back to EMBEDDING_DIM, then apply dropout
ff_dense_fr_1 = tf.keras.layers.Dense(FEED_FORWARD_DIM, activation="relu")(add_and_norm_fr_1)
ff_dense_fr_2 = tf.keras.layers.Dense(EMBEDDING_DIM, activation="relu")(ff_dense_fr_1)
ff_dropout_fr_1 = tf.keras.layers.Dropout(DROPOUT)(ff_dense_fr_2)
add_and_norm_fr_2 = AddAndNormalization()([ff_dropout_fr_1, add_and_norm_fr_1])
# Stand-alone encoder sub-model; its input tensor is reused for the full model below
encoder = tf.keras.models.Model(inputs=encoder_inputs, outputs=add_and_norm_fr_2)

# Decoder branch
# Same pattern as the encoder front end, but with the decoder self-attention layer
decoder_inputs = tf.keras.Input(shape=(None,), dtype=tf.int64, name="decoder_inputs")
positional_embedding_en = PositionalEmbedding(
    en_vocab_size, EMBEDDING_DIM, SEQUENCE_LENGTH)(decoder_inputs)
decoder_self_attention = DecoderSelfAttention(NUM_HEADS, EMBEDDING_DIM)(positional_embedding_en)
add_and_norm_en_1 = AddAndNormalization()([decoder_self_attention, positional_embedding_en])
# Stand-alone decoder sub-model; its input tensor is reused for the full model below
decoder = tf.keras.models.Model(inputs=decoder_inputs, outputs=add_and_norm_en_1)

# Decoder-encoder branch
# Attention layer that receives both the encoder output and the decoder state
de_attention = DecoderEncoderAttention(NUM_HEADS, EMBEDDING_DIM)([add_and_norm_fr_2, add_and_norm_en_1])
de_add_and_norm_1 = AddAndNormalization()([de_attention, add_and_norm_en_1])
# Feed Forward network
ff_dense_de_1 = tf.keras.layers.Dense(FEED_FORWARD_DIM, activation="relu")(de_add_and_norm_1)
ff_dense_de_2 = tf.keras.layers.Dense(EMBEDDING_DIM, activation="relu")(ff_dense_de_1)
ff_dropout_de_1 = tf.keras.layers.Dropout(DROPOUT)(ff_dense_de_2)
de_add_and_norm_2 = AddAndNormalization()([ff_dropout_de_1, de_add_and_norm_1])

# Linear prediction layer
# Softmax over as many units as en_vec_layer's vocabulary has entries
prediction = tf.keras.layers.Dense(len(en_vec_layer.get_vocabulary()), activation="softmax")(de_add_and_norm_2)

# Full model: takes both the encoder and decoder inputs, outputs the softmax prediction
transformer = tf.keras.models.Model(
    inputs=[encoder.input, decoder.input], outputs=prediction, name="transformer")

# Tokenize your train/val/test

## Set up datasets for loading data

# Training Loop

# Setup Training Arguments

OLD CODING WORK:

# Set up the Trainer

# Collator that pads each batch and returns PyTorch tensors.
# NOTE(review): `tokenizer` in this notebook is a raw `tokenizers.Tokenizer`;
# DataCollatorWithPadding is documented to take a transformers tokenizer, so it
# probably needs wrapping (e.g. PreTrainedTokenizerFast) first — confirm.
# NOTE(review): `model`, `training_args` and `tokenized_dataset_reduced` are not
# defined in the visible part of this notebook and must exist before this runs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors='pt')

# Hugging Face Trainer wired to the train/validation splits.
# Fix: a stray `0` after the eval_dataset argument made this call a SyntaxError.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_reduced['train'],
    eval_dataset=tokenized_dataset_reduced['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator
)
