Initial Testing of a Decoder-only (GPT style) architecture.

# Imports

In [163]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from tokenizers.normalizers import NFKC, Sequence
import os
from typing import List, Optional, Union
from tqdm.notebook import tqdm
import random

import transformers
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, DataCollatorForLanguageModeling
from datasets import Dataset, DatasetDict,load_dataset #, load_metric

# import tensorflow as tf

from sklearn.model_selection import train_test_split

import torch


import ast # Because the tokenized_text column data is stored as a string instead of a list...
import pandas as pd


In [2]:
# Check tensforflow GPU availability
# print(tf.config.list_physical_devices(device_type='GPU'))

# It looks like Brandon's rig hates him right now... stupid WSL2...

In [3]:
# Check pytorch GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'You are usisng {device} on a {torch.cuda.get_device_name()}.')

You are usisng cuda on a NVIDIA GeForce GTX 1070.


# Grab the dataset

1) Get the dataset
2) Fix the tokenize_text column from a string to a list of strings
3) Pull out all of the words into a mega-list

**SOMTHING IS WRONG WITH THE DATASET.  Likely an issue with the preprocessing steps...**

In [4]:
# Read the processed card data from wherever you have it.

# df = pd.read_csv('../data/processed/mtg_carddata_processed.csv')
df = pd.read_csv('../data/processed/mtg_carddata_processed_2_23_25.csv')

In [5]:
df[:7]

Unnamed: 0,name,mana_cost,type_line,oracle_text,power,toughness,colors,keywords,mtgo_id,loyalty,defense,processed_oracle_text,tokenized_text,tfidf_vector
0,"Nissa, Worldsoul Speaker",{3}{G},Legendary Creature — Elf Druid,"Landfall — Whenever a land you control enters,...",3.0,3.0,['G'],['Landfall'],,,,Landfall — Whenever a land you control enters ...,"['Landfall', 'Whenever', 'a', 'land', 'you', '...",[0. 0. 0. ... 0. 0. 0.]
1,Static Orb,{3},Artifact,"As long as <name> is untapped, players can't u...",,,[],[],15870.0,,,"As long as Static Orb is untapped , players ca...","['As', 'long', 'as', '<name>', 'is', 'untapped...",[0. 0. 0. ... 0. 0. 0.]
2,Sensory Deprivation,{U},Enchantment — Aura,Enchant creature\r\nEnchanted creature gets -3...,,,['U'],['Enchant'],49283.0,,,Enchant creature \n Enchanted creature gets -3...,"['Enchant', 'creature', '\\n', 'Enchanted', 'c...",[0. 0. 0. ... 0. 0. 0.]
3,Road of Return,{G}{G},Sorcery,Choose one —\r\n• Return target permanent card...,,,['G'],['Entwine'],77122.0,,,Choose one — \n • Return target permanent card...,"['Choose', 'one', '\\n', 'Return', 'target', '...",[0. 0. 0. ... 0. 0. 0.]
4,Storm Crow,{1}{U},Creature — Bird,Flying (This creature can't be blocked except ...,1.0,2.0,['U'],['Flying'],22609.0,,,Flying (This creature can't be blocked except ...,"['Flying', 'This', 'creature', ""can't"", 'be', ...",[0. 0. 0. ... 0. 0. 0.]
5,Snarlfang Vermin,{B},Creature — Rat,Whenever <name> deals combat damage to a creat...,2.0,1.0,['B'],[],,,,Whenever Snarlfang Vermin deals combat damage ...,"['Whenever', '<name>', 'deals', 'combat', 'dam...",[0. 0. 0. ... 0. 0. 0.]
6,Walking Sponge,{1}{U},Creature — Sponge,{T}: Target creature loses your choice of flyi...,1.0,1.0,['U'],[],12637.0,,,{T} : Target creature loses your choice of fly...,"['{T}', ':', 'Target', 'creature', 'loses', 'y...",[0. 0. 0. ... 0. 0. 0.]


In [6]:
df['tokenized_text'][0]

"['Landfall', 'Whenever', 'a', 'land', 'you', 'control', 'enters', ',', 'you', 'get', '{E}', '{E}', 'two', 'energy', 'counters', '.', '\\\\n', 'You', 'may', 'pay', 'eight', '{E}', 'rather', 'than', 'pay', 'the', 'mana', 'cost', 'for', 'permanent', 'spells', 'you', 'cast', '.']"

In [7]:
# Apply ast.literal_eval to each row in the tokenized_text column
df['tokenized_text'] = df['tokenized_text'].apply(ast.literal_eval)

# Display first few rows to verify
df['tokenized_text'].head(10)

0    [Landfall, Whenever, a, land, you, control, en...
1    [As, long, as, <name>, is, untapped, ,, player...
2    [Enchant, creature, \n, Enchanted, creature, g...
3    [Choose, one, \n, Return, target, permanent, c...
4    [Flying, This, creature, can't, be, blocked, e...
5    [Whenever, <name>, deals, combat, damage, to, ...
6    [{T}, :, Target, creature, loses, your, choice...
7            [Exile, all, multicolored, permanents, .]
8    [When, <name>, enters, ,, create, a, Food, tok...
9    [<name>, deals, damage, to, any, target, equal...
Name: tokenized_text, dtype: object

In [8]:
df['tokenized_text'][0][0]

'Landfall'

In [9]:
# Combine all tokens into one large list
all_tokens = []
for tokens in df['tokenized_text']:
    all_tokens.extend(tokens)

# Alternative one-liner using list comprehension
# all_tokens = [token for tokens in df['tokenized_text'] for token in tokens]

# Display the first 20 tokens to verify
print(f"Total tokens: {len(all_tokens)}")
print("First 20 tokens:", all_tokens[:20])

Total tokens: 968000
First 20 tokens: ['Landfall', 'Whenever', 'a', 'land', 'you', 'control', 'enters', ',', 'you', 'get', '{E}', '{E}', 'two', 'energy', 'counters', '.', '\\n', 'You', 'may', 'pay']


In [10]:
corpus = df['processed_oracle_text']
corpus

0        Landfall — Whenever a land you control enters ...
1        As long as Static Orb is untapped , players ca...
2        Enchant creature \n Enchanted creature gets -3...
3        Choose one — \n • Return target permanent card...
4        Flying (This creature can't be blocked except ...
                               ...                        
30618                                                  NaN
30619    Target creature you control gains indestructib...
30620    Red instant and sorcery spells you control hav...
30621    +1 : Up to one target creature gains double st...
30622                     All Sliver creatures get +1/+1 .
Name: processed_oracle_text, Length: 30623, dtype: object

In [11]:
# Convert corpus to list of strings if it's not already
corpus_list = corpus.tolist() if hasattr(corpus, 'tolist') else list(corpus)


In [12]:
type(corpus)

pandas.core.series.Series

In [13]:
type(corpus_list[0])

str

In [14]:
print(corpus[0:3])

0    Landfall — Whenever a land you control enters ...
1    As long as Static Orb is untapped , players ca...
2    Enchant creature \n Enchanted creature gets -3...
Name: processed_oracle_text, dtype: object


In [15]:
# Convert pandas Series to list of strings
corpus_list = corpus.values.tolist()

# Ensure all elements are strings
corpus_list = [str(text) for text in corpus_list]



In [16]:
corpus_list[0]

'Landfall — Whenever a land you control enters , you get {E} {E} (two energy counters) . \\n You may pay eight {E} rather than pay the mana cost for permanent spells you cast .'

In [17]:
print(type(corpus_list))  # Should show: <class 'list'>
print(type(corpus_list[0]))  # Should show: <class 'str'>

<class 'list'>
<class 'str'>


# Split into train/val/test datasets

Do we need to keep other information with the corpus_list as we split it up?  Names, color, etc.?

If we're just generating text, I think that answer is no.  If we want more, then the answer is likely yes.

In [18]:
train_list, temp = train_test_split(corpus_list, test_size=0.2, random_state=42) # Set 80% for training
val, test = train_test_split(temp, test_size=0.5, random_state=42) # Set 10% for validation and 10% for testing

In [19]:
train = [{"sentence": text} for text in train_list]  # Wrap each sentence in a dict
val = [{"sentence": text} for text in val]
test = [{"sentence": text} for text in test]

In [None]:
# Create the Dataset Dictionary for future mapping
data = DatasetDict({
    'train': Dataset.from_list(train),
    'validation': Dataset.from_list(val),
    'test': Dataset.from_list(test)
    })

data

DatasetDict({
    train: Dataset({
        features: ['sentence'],
        num_rows: 24498
    })
    validation: Dataset({
        features: ['sentence'],
        num_rows: 3062
    })
    test: Dataset({
        features: ['sentence'],
        num_rows: 3063
    })
})

In [21]:
data['train'][:5]

{'sentence': ['Choose one — \\n • Destroy target Cleric . \\n • Return target Cleric card from your graveyard to your hand . \\n • Target player loses 2 life .',
  '( {T} : Add {W} or {B} . )',
  'Explosive Welcome deals 5 damage to any target and 3 damage to any other target . Add {R} {R} {R} .',
  "Vigilance (Attacking doesn't cause this creature to tap . )",
  "Trample \\n Alexios , Deimos of Kosmos attacks each combat if able , can't be sacrificed , and can't attack its owner . \\n At the beginning of each player's upkeep , that player gains control of Alexios , untaps it , and puts a +1/+1 counter on it . It gains haste until end of turn ."]}

In [24]:
data['train']

Dataset({
    features: ['sentence'],
    num_rows: 24498
})

In [25]:
data['validation']['sentence'][0]

'Flash \\n Other white creatures you control get +1/+1 . \\n Other blue creatures you control get +1/+1 .'

# Train the Tokenizer on Training Data

In [26]:
# Create a directory for tokenizer files if it doesn't exist
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)


In [None]:
# Create a ton of helper functions to build out a tokenizer and train it on the dataset


def create_gpt2_tokenizer(vocab_size=8192, min_frequency=2):
    """
    Create a GPT2-style BPE tokenizer from scratch using Hugging Face tokenizers library.

    Args:
        vocab_size: The size of the vocabulary to learn
        min_frequency: Minimum frequency for a token to be considered in the BPE algorithm

    Returns:
        A tokenizer object ready for training
    """
    # Initialize a ByteLevelBPE model-based tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Add byte-level pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

    # Add decoder to properly decode byte-level tokens
    tokenizer.decoder = decoders.ByteLevel()

    # Set up normalizers - GPT-2 doesn't do much normalization
    tokenizer.normalizer = Sequence([NFKC()])

    # Return the tokenizer (to be trained later)
    return tokenizer

def train_tokenizer_from_texts(
    tokenizer: Tokenizer,
    texts: List[str],
    vocab_size: int = 8192,
    min_frequency: int = 2,
    batch_size: int = 512,
    output_dir: str = "tokenizer"
):
    """
    Train the tokenizer on a list of texts using batching

    Args:
        tokenizer: The tokenizer object to train
        texts: List of strings to use for training
        vocab_size: Maximum vocabulary size
        min_frequency: Minimum frequency for a token
        batch_size: Number of texts to process in each batch
        output_dir: Directory to save the trained tokenizer

    Returns:
        The trained tokenizer
    """
    if not texts:
        raise ValueError("No texts provided for training")

    # Configure the BPE trainer
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<|pad|>"],
        show_progress=True,
    )

    # Create temporary files for batched training
    temp_dir = os.path.join(output_dir, "temp")
    os.makedirs(temp_dir, exist_ok=True)

    # Split texts into batches
    batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
    batch_files = []

    print(f"Preparing {len(batches)} batches for training...")

    # Write batches to temporary files
    for i, batch in enumerate(tqdm(batches)):
        batch_file = os.path.join(temp_dir, f"batch_{i}.txt")
        with open(batch_file, "w", encoding="utf-8") as f:
            f.write("\n<|endoftext|>\n".join(batch))
            f.write("\n<|endoftext|>\n")  # Add final separator
        batch_files.append(batch_file)

    # Train the tokenizer
    print("Training tokenizer...")
    tokenizer.train(batch_files, trainer)

    # Add post-processor to handle special tokens
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

    # Enable padding with the PAD token
    pad_id = tokenizer.token_to_id("<|pad|>")
    if pad_id is not None:
        tokenizer.enable_padding(pad_id=pad_id, pad_token="<|pad|>")
    else:
        print("Warning: <|pad|> token not found in vocabulary. Padding won't work correctly.")

    # Save the trained tokenizer
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    tokenizer.save(f"{output_dir}/gpt2_tokenizer.json")

    # Clean up temporary files
    for file in batch_files:
        os.remove(file)
    os.rmdir(temp_dir)

    print(f"Tokenizer trained and saved to {output_dir}/gpt2_tokenizer.json")
    return tokenizer

def use_tokenizer(tokenizer, text, padding=True, max_length=None):
    """
    Demonstrate how to use the tokenizer with padding

    Args:
        tokenizer: The trained tokenizer
        text: Text to tokenize or list of texts
        padding: Whether to pad the sequences
        max_length: Maximum length for padding (optional)

    Returns:
        Encoding object(s) with tokens, ids, etc.
    """
    # Handle both single text and lists of texts
    is_single = isinstance(text, str)
    texts = [text] if is_single else text

    # Configure padding in the tokenizer object itself
    if padding:
        pad_id = tokenizer.token_to_id("<|pad|>")
        if pad_id is not None:
            if max_length:
                tokenizer.enable_padding(pad_id=pad_id, pad_token="<|pad|>", length=max_length)
            else:
                tokenizer.enable_padding(pad_id=pad_id, pad_token="<|pad|>")

    # Encode text(s)
    encodings = tokenizer.encode_batch(texts)

    # Display results for single text for demonstration
    if is_single:
        print(f"Input text: {text}")
        print(f"Token IDs: {encodings[0].ids}")
        print(f"Tokens: {encodings[0].tokens}")
        if padding:
            print(f"Attention mask: {encodings[0].attention_mask}")

        # Decoding demonstration
        decoded = tokenizer.decode(encodings[0].ids)
        print(f"Decoded text: {decoded}")

    # Disable padding for future calls if it was temporarily enabled
    if padding and max_length:
        tokenizer.no_padding()

    return encodings[0] if is_single else encodings

# def load_pretrained_tokenizer(): # Gives me the GPT-2 pretrained tokenizer
#     """
#     Load the pre-trained GPT-2 tokenizer from Hugging Face

#     Returns:
#         A pre-trained GPT-2 tokenizer
#     """
#     from transformers import GPT2Tokenizer

#     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#     tokenizer.pad_token = tokenizer.eos_token  # Set padding token
#     return tokenizer


# Load the saved tokenizer from the JSON file
def load_tokenizer(path):
    """
    Load a previously saved tokenizer from a file

    Args:
        path: Path to the saved tokenizer JSON file

    Returns:
        The loaded tokenizer
    """
    tokenizer = Tokenizer.from_file(path)

    # Re-enable padding if needed
    pad_id = tokenizer.token_to_id("<|pad|>")
    if pad_id is not None:
        tokenizer.enable_padding(pad_id=pad_id, pad_token="<|pad|>")

    return tokenizer

# Example usage in a Jupyter Notebook
def demonstrate_tokenizer():
    """
    Full demonstration of creating, training, and using a GPT-2 style tokenizer
    """
    # Create a new tokenizer
    print("Creating tokenizer...")
    tokenizer = create_gpt2_tokenizer()

    # Generate some sample texts for demonstration
    print("Generating sample texts...")
    sample_texts = [
        "This is an example of text that could be used to train a tokenizer.",
        "It should include diverse vocabulary, punctuation (like commas, periods, question marks?).",
        "Multiple paragraphs are good to include.",
        "Numbers like 42, 3.14159, and 2023 should be represented.",
        "Code snippets might be important if your model will process code:\ndef hello_world():\n    print('Hello, world!')",
        # Add more sample texts as needed
    ]

    # Add some more generated texts for better training
    for i in range(50):
        words = ["The", "quick", "brown", "fox", "jumps", "over", "lazy", "dog",
                "Hello", "world", "Python", "programming", "is", "fun", "GPT",
                "natural", "language", "processing", "models", "work", "well"]
        length = random.randint(5, 20)
        text = " ".join(random.choices(words, k=length)) + "."
        sample_texts.append(text)

    # Train the tokenizer on the sample texts
    print(f"Training tokenizer on {len(sample_texts)} texts...")
    tokenizer = train_tokenizer_from_texts(
        tokenizer,
        sample_texts,
        vocab_size=1000,  # Smaller vocab for demo
        min_frequency=1,
        batch_size=20,
        output_dir="tokenizer_demo"
    )

    # Test the tokenizer
    print("\nTesting tokenizer with padding...")
    test_texts = [
        "This is a short text.",
        "This is a slightly longer text with more content.",
        "Let's see how padding works on texts of different lengths."
    ]

    # Encode with padding
    encodings = use_tokenizer(tokenizer, test_texts, padding=True)

    # Display batch results
    for i, enc in enumerate(encodings):
        print(f"\nText {i+1} length: {len(enc.ids)}")
        if hasattr(enc, 'attention_mask'):
            print(f"Attention mask: {enc.attention_mask}")

    return tokenizer

# Example for loading and using a pre-trained tokenizer
# Not going to use this because I don't want the pre-trained GPT2 tokenizer
# def use_pretrained_example():
#     pretrained = load_pretrained_tokenizer()

#     sample_text = "Hello, I'm a language model like GPT-2!"

#     # Tokenize with the Transformers tokenizer
#     tokens = pretrained.tokenize(sample_text)
#     encoded = pretrained(sample_text, padding=True, return_tensors="pt")
#     token_ids = encoded["input_ids"][0].tolist()

#     print(f"Input text: {sample_text}")
#     print(f"Tokens: {tokens}")
#     print(f"Token IDs: {token_ids}")
#     print(f"Decoded text: {pretrained.decode(token_ids)}")

#     return pretrained

# Run the demonstration if in a Jupyter notebook
# Just call demonstrate_tokenizer() in a cell to see the full workflow

In [None]:
# Prove that your tokenizer is doing things as anticipated
# Please work...
demonstrate_tokenizer()

Creating tokenizer...
Generating sample texts...
Training tokenizer on 55 texts...
Preparing 3 batches for training...


  0%|          | 0/3 [00:00<?, ?it/s]

Training tokenizer...
Tokenizer trained and saved to tokenizer_demo/gpt2_tokenizer.json

Testing tokenizer with padding...

Text 1 length: 33
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Text 2 length: 33
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

Text 3 length: 33
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Tokenizer(version="1.0", truncation=None, padding=PaddingParams(strategy=BatchLongest, direction=Right, pad_to_multiple_of=None, pad_id=1, pad_type_id=0, pad_token="<|pad|>"), added_tokens=[{"id":0, "content":"<|endoftext|>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"<|pad|>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=Sequence(normalizers=[NFKC()]), pre_tokenizer=ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True), post_processor=ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True), decoder=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={"<|endoftext|>":0, "<|pad|>":1, "!":2, "'":3, "(":4, ")":5, ",":6, ".":7, "0":8, "1":9, "2":10, "3":11, "4":12, 

In [132]:
# Load tokenizer from file if you have one saved; though training does not take long here
trained_tokenizer = load_tokenizer(models_dir + "/gpt2_tokenizer.json")

In [133]:
# Train a tokenizer if you don't have one yet
trained_tokenizer = create_gpt2_tokenizer(vocab_size=8192, min_frequency=2)

train_tokenizer_from_texts(tokenizer=trained_tokenizer, texts=data['train']['sentence'], batch_size=512, output_dir=models_dir)

Preparing 48 batches for training...


  0%|          | 0/48 [00:00<?, ?it/s]

Training tokenizer...
Tokenizer trained and saved to ../models/gpt2_tokenizer.json


Tokenizer(version="1.0", truncation=None, padding=PaddingParams(strategy=BatchLongest, direction=Right, pad_to_multiple_of=None, pad_id=1, pad_type_id=0, pad_token="<|pad|>"), added_tokens=[{"id":0, "content":"<|endoftext|>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"<|pad|>", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=Sequence(normalizers=[NFKC()]), pre_tokenizer=ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=True), post_processor=ByteLevel(add_prefix_space=True, trim_offsets=False, use_regex=True), decoder=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={"<|endoftext|>":0, "<|pad|>":1, "!":2, """:3, "#":4, "$":5, "&":6, "'":7, "(":8, ")":9, "+":10, ",":11, "-":12, 

In [134]:
encodings = use_tokenizer(trained_tokenizer, data['validation']['sentence'][1], padding=True)
encodings

Input text: {2} {U} : Daring Saboteur can't be blocked this turn . \n Whenever Daring Saboteur deals combat damage to a player , you may draw a card . If you do , discard a card .
Token IDs: [87, 17, 89, 168, 51, 89, 208, 4635, 3816, 1203, 188, 322, 293, 238, 485, 214, 219, 134, 166, 74, 341, 4635, 3816, 1203, 188, 328, 434, 279, 173, 133, 271, 143, 147, 257, 407, 133, 180, 134, 357, 147, 435, 143, 507, 133, 180, 134]
Tokens: ['{', '2', '}', 'Ġ{', 'U', '}', 'Ġ:', 'ĠDaring', 'ĠSab', 'ote', 'ur', 'Ġcan', "'t", 'Ġbe', 'Ġblocked', 'Ġthis', 'Ġturn', 'Ġ.', 'Ġ\\', 'n', 'ĠWhenever', 'ĠDaring', 'ĠSab', 'ote', 'ur', 'Ġdeals', 'Ġcombat', 'Ġdamage', 'Ġto', 'Ġa', 'Ġplayer', 'Ġ,', 'Ġyou', 'Ġmay', 'Ġdraw', 'Ġa', 'Ġcard', 'Ġ.', 'ĠIf', 'Ġyou', 'Ġdo', 'Ġ,', 'Ġdiscard', 'Ġa', 'Ġcard', 'Ġ.']
Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Decoded text: {2} {U} : Daring Saboteur can't be blocked this 

Encoding(num_tokens=46, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [136]:
# Tokenize the entire dataset

def tokenize_dataset_dict(dataset_dict, tokenizer, column_name="sentence", max_length=None, batch_size=1000):
    """
    Tokenize all sentences in a DatasetDict and return a new DatasetDict with tokenized data

    Args:
        dataset_dict: The input DatasetDict containing text data
        tokenizer: A Hugging Face tokenizer instance
        column_name: The name of the column containing the text to tokenize
        max_length: Optional maximum length for padding/truncation
        batch_size: Batch size for processing

    Returns:
        A new DatasetDict with tokenized data
    """
    # Configure padding if needed
    if max_length:
        pad_id = tokenizer.token_to_id("<|pad|>")
        if pad_id is not None:
            tokenizer.enable_padding(pad_id=pad_id, pad_token="<|pad|>", length=max_length)

    # Helper function to process a batch of examples
    def tokenize_batch(examples):
        # Get the texts from the specified column
        texts = examples[column_name]

        # Tokenize the batch
        encodings = tokenizer.encode_batch(texts)

        # Extract the required fields
        input_ids = [enc.ids for enc in encodings]
        attention_mask = [enc.attention_mask if hasattr(enc, 'attention_mask') else [1] * len(enc.ids) for enc in encodings]

        # Create the output dictionary
        result = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            # Keep the original text
            column_name: texts
        }

        return result

    # Create a new DatasetDict to store the results
    tokenized_datasets = DatasetDict()

    # Process each split in the dataset_dict
    for split_name, dataset in dataset_dict.items():
        print(f"Tokenizing {split_name} split...")

        # Use the map function to apply tokenization in batches
        tokenized_dataset = dataset.map(
            tokenize_batch,
            batched=True,
            batch_size=batch_size,
            desc=f"Tokenizing {split_name}"
        )

        # Add to the result DatasetDict
        tokenized_datasets[split_name] = tokenized_dataset

    # Disable padding if it was enabled
    if max_length:
        tokenizer.no_padding()

    return tokenized_datasets

## Tokenize the Datasets

In [None]:
# Tokenize the DatasetDict
# TODO: We can probably decrease the max_length here to 128 or 64 to save memory; We'll have to test it out
# Or maybe we don't use a max length...?? TODO: Figure out how to handle this
tokenize_dataset_dict = tokenize_dataset_dict(data, trained_tokenizer, max_length=256, batch_size=1000)
tokenize_dataset_dict

Tokenizing train split...


Tokenizing train:   0%|          | 0/24498 [00:00<?, ? examples/s]

Tokenizing validation split...


Tokenizing validation:   0%|          | 0/3062 [00:00<?, ? examples/s]

Tokenizing test split...


Tokenizing test:   0%|          | 0/3063 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 24498
    })
    validation: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 3062
    })
    test: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 3063
    })
})

In [138]:
tokenize_dataset_dict['validation']['sentence'][0]

'Flash \\n Other white creatures you control get +1/+1 . \\n Other blue creatures you control get +1/+1 .'

In [139]:
tokenize_dataset_dict['validation']['input_ids'][0]

[792,
 166,
 74,
 840,
 607,
 307,
 147,
 201,
 283,
 244,
 16,
 246,
 16,
 134,
 166,
 74,
 840,
 754,
 307,
 147,
 201,
 283,
 244,
 16,
 246,
 16,
 134,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [140]:
def ids_to_tokens(tokenizer, ids):
    '''
    Convert token IDs back to tokens using the tokenizer
    Really just a mapping function for tokenizer.decode()
    '''
    tokens = tokenizer.decode(ids)

    return tokens

In [141]:
trained_tokenizer.decode(tokenize_dataset_dict['validation']['input_ids'][0])

'Flash \\n Other white creatures you control get +1/+1 . \\n Other blue creatures you control get +1/+1 .'

In [142]:
ids_to_tokens(trained_tokenizer, tokenize_dataset_dict['validation']['input_ids'][0])

'Flash \\n Other white creatures you control get +1/+1 . \\n Other blue creatures you control get +1/+1 .'

In [143]:
tokenize_dataset_dict['validation']['attention_mask'][0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


### There are strange Dungeon and Token cards that are currently as of 2/27/25 not being parsed correctly.

I may need to go drop these items and then rerun the tokenizer... 

In [144]:
# Longest length of oracle text
# max_length = max([len(tokenizer.encode(text).ids) for text in corpus_list])
max_length = max([len(trained_tokenizer.encode(text).ids) for text in data['train']['sentence']])
max_length

388

In [145]:
max_len = 0
for c, item in enumerate(data['train']['sentence']):
    if len(item) > max_len:
        max_len = len(item)
        q = item
        card = c

print(f'The max length is: {max_len} on item: {card} {q}')

The max length is: 1569 on item: 19152 Crash Landing — Search your library for a basic land card , reveal it , put it into your hand , then shuffle . \n Goblin Camp — Create a Treasure token . \n Emerald Grove — Create a 2/2 white Knight creature token . \n Auntie's Teahouse — Scry 3 . \n Defiled Temple — You may sacrifice a permanent . If you do , draw a card . \n Mountain Pass — You may put a land card from your hand onto the battlefield . \n Ebonlake Grotto — Create two 1/1 blue Faerie Dragon creature tokens with flying . \n Grymforge — For each opponent , goad up to one target creature that player controls . \n Githyanki Crèche — Distribute three +1/+1 counters among up to three target creatures you control . \n Last Light Inn — Draw two cards . \n Reithwin Tollhouse — Roll 2d4 and create that many Treasure tokens . \n Moonrise Towers — Instant and sorcery spells you cast this turn cost 3 less to cast . \n Gauntlet of Shar — Each opponent loses 5 life . \n Balthazar's Lab — Return 

Item 19152 seems like it is too many cards put together... I need to figure out what is going on.


In [146]:
# TODO: Fix multiple cards being lumped together

In [37]:
data['train']['sentence'][19152]

'Crash Landing — Search your library for a basic land card , reveal it , put it into your hand , then shuffle . \\n Goblin Camp — Create a Treasure token . \\n Emerald Grove — Create a 2/2 white Knight creature token . \\n Auntie\'s Teahouse — Scry 3 . \\n Defiled Temple — You may sacrifice a permanent . If you do , draw a card . \\n Mountain Pass — You may put a land card from your hand onto the battlefield . \\n Ebonlake Grotto — Create two 1/1 blue Faerie Dragon creature tokens with flying . \\n Grymforge — For each opponent , goad up to one target creature that player controls . \\n Githyanki Crèche — Distribute three +1/+1 counters among up to three target creatures you control . \\n Last Light Inn — Draw two cards . \\n Reithwin Tollhouse — Roll 2d4 and create that many Treasure tokens . \\n Moonrise Towers — Instant and sorcery spells you cast this turn cost 3 less to cast . \\n Gauntlet of Shar — Each opponent loses 5 life . \\n Balthazar\'s Lab — Return up to two target creatu

In [147]:
tokenize_dataset_dict['train']['sentence'][19152]

'Crash Landing — Search your library for a basic land card , reveal it , put it into your hand , then shuffle . \\n Goblin Camp — Create a Treasure token . \\n Emerald Grove — Create a 2/2 white Knight creature token . \\n Auntie\'s Teahouse — Scry 3 . \\n Defiled Temple — You may sacrifice a permanent . If you do , draw a card . \\n Mountain Pass — You may put a land card from your hand onto the battlefield . \\n Ebonlake Grotto — Create two 1/1 blue Faerie Dragon creature tokens with flying . \\n Grymforge — For each opponent , goad up to one target creature that player controls . \\n Githyanki Crèche — Distribute three +1/+1 counters among up to three target creatures you control . \\n Last Light Inn — Draw two cards . \\n Reithwin Tollhouse — Roll 2d4 and create that many Treasure tokens . \\n Moonrise Towers — Instant and sorcery spells you cast this turn cost 3 less to cast . \\n Gauntlet of Shar — Each opponent loses 5 life . \\n Balthazar\'s Lab — Return up to two target creatu

In [148]:
tokenize_dataset_dict['train']['input_ids'][19152]

[33,
 2783,
 2374,
 190,
 397,
 798,
 194,
 330,
 291,
 133,
 786,
 384,
 180,
 143,
 539,
 185,
 143,
 285,
 185,
 399,
 194,
 299,
 143,
 406,
 559,
 134,
 166,
 74,
 867,
 4168,
 397,
 619,
 133,
 871,
 321,
 134,
 166,
 74,
 6750,
 3124,
 397,
 619,
 133,
 386,
 14,
 17,
 607,
 1124,
 161,
 321,
 134,
 166,
 74,
 5767,
 247,
 1207,
 2031,
 3317,
 397,
 1163,
 470,
 134,
 166,
 74,
 5401,
 3696,
 3736,
 397,
 403,
 257,
 537,
 133,
 402,
 134,
 357,
 147,
 435,
 143,
 407,
 133,
 180,
 134,
 166,
 74,
 1199,
 3401,
 397,
 403,
 257,
 285,
 133,
 384,
 180,
 269,
 194,
 299,
 540,
 170,
 369,
 134,
 166,
 74,
 3691,
 72,
 1110,
 2047,
 80,
 288,
 397,
 619,
 380,
 315,
 14,
 16,
 754,
 1774,
 793,
 161,
 605,
 243,
 477,
 134,
 166,
 74,
 5834,
 73,
 2009,
 162,
 397,
 827,
 280,
 334,
 143,
 1962,
 360,
 173,
 371,
 213,
 161,
 242,
 271,
 501,
 134,
 166,
 74,
 398,
 235,
 85,
 148,
 3145,
 1606,
 105,
 94,
 1157,
 397,
 5000,
 523,
 244,
 16,
 246,
 16,
 409,
 620,
 360,
 173,
 52

In [149]:
# Make sure the tokenized dataset is in the correct format for PyTorch
tokenize_dataset_dict.set_format(type='torch', columns=['sentence', 'input_ids', 'attention_mask'])

In [150]:
tokenize_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 24498
    })
    validation: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 3062
    })
    test: Dataset({
        features: ['sentence', 'input_ids', 'attention_mask'],
        num_rows: 3063
    })
})

In [None]:
# Note that these have been padded out to the max_length

for i in range(0, 11):
    print(tokenize_dataset_dict['train'][i]['input_ids'].shape)



torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])


In [152]:
for i in range(0, 11):
    print(tokenize_dataset_dict['train'][i]['attention_mask'].shape)

torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])
torch.Size([256])


In [153]:
for i in range(0, 11):
    print(tokenize_dataset_dict['train'][i]['sentence'])

Choose one — \n • Destroy target Cleric . \n • Return target Cleric card from your graveyard to your hand . \n • Target player loses 2 life .
( {T} : Add {W} or {B} . )
Explosive Welcome deals 5 damage to any target and 3 damage to any other target . Add {R} {R} {R} .
Vigilance (Attacking doesn't cause this creature to tap . )
Trample \n Alexios , Deimos of Kosmos attacks each combat if able , can't be sacrificed , and can't attack its owner . \n At the beginning of each player's upkeep , that player gains control of Alexios , untaps it , and puts a +1/+1 counter on it . It gains haste until end of turn .
When Gang of Devils dies , it deals 3 damage divided as you choose among one , two , or three targets .
Ninjutsu {2} {U} ( {2} {U} , Return an unblocked attacker you control to hand : Put this card onto the battlefield from your hand tapped and attacking . ) \n Whenever Mist-Syndicate Naga deals combat damage to a player , create a token that's a copy of Mist-Syndicate Naga .
Boros Ga

# Build the Decoder

## From TensorFlow Keras

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embed_dims, seq_len):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size + 1, embed_dims)
        self.pos_embedding = tf.keras.layers.Embedding(seq_len, embed_dims, mask_zero=True)

    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        index_range = tf.shape(x)[-1]
        y = self.embedding(x)
        indices = tf.range(index_range)
        pos = self.pos_embedding(indices)
        return y + pos

In [None]:
class DecoderSelfAttention(tf.keras.layers.Layer):
    def __init__(self, num_heads, key_dim):
        super().__init__()
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=key_dim)
        
    def call(self, x):
        y = self.attention(
            query=x,
            value=x,
            key=x,
            use_causal_mask=True)
        return y

In [None]:
VOCAB_SIZE = 8912 # Define this based on training data vocab size / tokenize size.
# TODO: Or does it need to relate to the tokenizer?
SEQUENCE_LENGTH = 500 # Define this based on training data sequence length.
EMBEDDING_DIM = 512 # Configure
FEED_FORWARD_DIM = 4 * EMBEDDING_DIM # This is usual value used by other LLMs, from below reference.
DROPOUT = 0.5 # Configure
NUM_HEADS = 16 # Configure
NUM_DECODER_LAYERS = 4 # Configure

### Reference: https://cameronrwolfe.substack.com/p/decoder-only-transformers-the-workhorse

decoder_inputs = tf.keras.Input(shape=(None,), dtype=tf.int64, name="decoder_inputs")
positional_embedding = PositionalEmbedding(
    VOCAB_SIZE, EMBEDDING_DIM, SEQUENCE_LENGTH)(decoder_inputs)

layer_input = positional_embedding
layer_output = None
for i in range(NUM_DECODER_LAYERS):
    decoder_norm_1 = tf.keras.layers.LayerNormalization()(layer_input)
    decoder_self_attention = DecoderSelfAttention(NUM_HEADS, EMBEDDING_DIM)(decoder_norm_1)
    decoder_add_1 = tf.keras.layers.Add()([decoder_self_attention, decoder_norm_1])
    decoder_norm_2 = tf.keras.layers.LayerNormalization()(decoder_add_1)
    decoder_feedforward_1 = tf.keras.layers.Dense(FEED_FORWARD_DIM, activation="gelu")(decoder_norm_2)
    decoder_feedforward_2 = tf.keras.layers.Dense(EMBEDDING_DIM)(decoder_feedforward_1)
    decoder_dropout = tf.keras.layers.Dropout(DROPOUT)(decoder_feedforward_2)
    decoder_add_2 = tf.keras.layers.Add()([decoder_dropout, decoder_add_1])

    layer_input = decoder_add_2
    layer_output = decoder_add_2

prediction = tf.keras.layers.Dense(VOCAB_SIZE, activation="softmax")(layer_output)
transformer = tf.keras.models.Model(
    inputs=decoder_inputs, outputs=prediction, name="transformer")

In [None]:
transformer.summary()

## A Short Rebuild of the above but with Hugging Face API

We might be able to leverage accelerated training by using HF so this could be worth to have.  It also uses a PyTorch backend which I know works on Brandon's system... TensorFlow has been giving me problems in my environment lately.

In [154]:
# Grab the decoder-only setup
from transformers import GPT2Config, GPT2LMHeadModel

In [155]:
VOCAB_SIZE = 8912 # Define this based on training data vocab size / tokenize size.
# TODO: Or does it need to relate to the tokenizer?
SEQUENCE_LENGTH = 500 # Define this based on training data sequence length. aka. context window
EMBEDDING_DIM = 512 # Configure
FEED_FORWARD_DIM = 4 * EMBEDDING_DIM # Standard for decoder-only items.
DROPOUT = 0.5 # Configure
NUM_HEADS = 16 # Configure attention heads
NUM_DECODER_LAYERS = 4 # Configure

# Define the model configuration
model_config = GPT2Config(
    vocab_size=VOCAB_SIZE,
    n_positions=SEQUENCE_LENGTH,
    n_ctx=SEQUENCE_LENGTH,
    n_embd=EMBEDDING_DIM,
    n_layer=NUM_DECODER_LAYERS,
    n_head=NUM_HEADS,
    resid_pdrop=DROPOUT,
    embd_pdrop=DROPOUT,
    attn_pdrop=DROPOUT,
    # layer_norm_epsilon=1e-5,
    # initializer_range=0.02,
)

# Initialize the model
gptmodel = GPT2LMHeadModel(model_config)
# Output the model layers for inspection
gptmodel

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(8912, 512)
    (wpe): Embedding(500, 512)
    (drop): Dropout(p=0.5, inplace=False)
    (h): ModuleList(
      (0-3): 4 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=1536, nx=512)
          (c_proj): Conv1D(nf=512, nx=512)
          (attn_dropout): Dropout(p=0.5, inplace=False)
          (resid_dropout): Dropout(p=0.5, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=2048, nx=512)
          (c_proj): Conv1D(nf=512, nx=2048)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.5, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=512, out_features=8912, bias=False)
)

In [156]:
# How many trainable parameters exist in the gptmodel? (setting p.requires_grad to True for this)
trainable_params = sum(p.numel() for p in gptmodel.parameters() if p.requires_grad)
print("Number of trainable parameters:", trainable_params)

Number of trainable parameters: 17429504


In [157]:
# Print each parameter's name and trainable count individually
for name, param in gptmodel.named_parameters():
    if param.requires_grad:
        print(f"{name}: {param.numel()} trainable parameters")

transformer.wte.weight: 4562944 trainable parameters
transformer.wpe.weight: 256000 trainable parameters
transformer.h.0.ln_1.weight: 512 trainable parameters
transformer.h.0.ln_1.bias: 512 trainable parameters
transformer.h.0.attn.c_attn.weight: 786432 trainable parameters
transformer.h.0.attn.c_attn.bias: 1536 trainable parameters
transformer.h.0.attn.c_proj.weight: 262144 trainable parameters
transformer.h.0.attn.c_proj.bias: 512 trainable parameters
transformer.h.0.ln_2.weight: 512 trainable parameters
transformer.h.0.ln_2.bias: 512 trainable parameters
transformer.h.0.mlp.c_fc.weight: 1048576 trainable parameters
transformer.h.0.mlp.c_fc.bias: 2048 trainable parameters
transformer.h.0.mlp.c_proj.weight: 1048576 trainable parameters
transformer.h.0.mlp.c_proj.bias: 512 trainable parameters
transformer.h.1.ln_1.weight: 512 trainable parameters
transformer.h.1.ln_1.bias: 512 trainable parameters
transformer.h.1.attn.c_attn.weight: 786432 trainable parameters
transformer.h.1.attn.c_at

There is a difference in the reported number of trained parameters between Etienne's custom build and the Hugging Face model.

I'm 90% certain it has to do with the positional eoncodings being done slightly different.  48M of his model parameters are from the PosEnc which doesn't show up the same way in the GPT-model.

Both models are likely to generate similar results.

# Training Loop

## Setup Training Arguments

In [158]:
# Set up the Training Args
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5, # Prior runs show that I am easily over fitting the data at 25 epochs...
    weight_decay=0.01,
    logging_dir='./logs',
    # logging_steps=10, # This made my loss vs epoch plot too noisy...
    logging_strategy='epoch',
    save_strategy='epoch',
)

In [167]:
# Set up the Trainer

#mlm is the masked language modeling

data_collator = DataCollatorForLanguageModeling(tokenizer=trained_tokenizer, mlm=False, return_tensors='pt')

trainer = Trainer(
    model=gptmodel,
    args=training_args,
    train_dataset=tokenize_dataset_dict['train'],
    eval_dataset=tokenize_dataset_dict['validation'],
    processing_class=trained_tokenizer,
    data_collator=data_collator
)

In [168]:
# Run the Trainer
trainer.train()

AttributeError: 'tokenizers.Tokenizer' object has no attribute 'pad'

OLD CODING WORK:

# Set up the Trainer

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors='pt')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_reduced['train'],
    eval_dataset=tokenized_dataset_reduced['validation'],0
    tokenizer=tokenizer,
    data_collator=data_collator
)
