#  BM20A6100 Advanced Data Analysis and Machine Learning
## Erik Kuitunen, 0537275

Cleaning data, load as dataset dictionary

In [104]:
from datasets import load_dataset

file = open("robinhood.txt", 'rb')
lines = []
for line in file:
    line = line.strip()
    line = line.decode("ascii", "ignore")
    if len(line) == 0:
        continue
    lines.append(line)
file.close()

file = open("robinhood_cleaned.txt", 'w')
for line in lines:
    file.write(line + "\n")
file.close()

datasets = load_dataset("text", data_files={"train": "robinhood_cleaned.txt" } )

text_all = " ".join( datasets["train"]["text"] )
words = text_all.split()

# set of characters that occur in the text
chars = set( [c for c in text_all] )

# Total items in our vocabulary
unique_chars = len( chars )



Generating train split: 0 examples [00:00, ? examples/s]

In [105]:
datasets["train"][0]

{'text': 'THE MERRY ADVENTURES OF ROBIN HOOD'}

Building a tokenizer using Byte-Pair Encoding (BPE) tokenization

In [106]:
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer

tokenizer = Tokenizer( models.BPE() )
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel( add_prefix_space=False )    # add_prefix_space: whether to add a space before the first word "hello" or " hello"



Training the tokenizer and adding post processing

In [107]:
tokenizer_trainer = trainers.BpeTrainer( vocab_size=unique_chars )    # Using same vocab size as last week for better comparison
tokenizer.train_from_iterator( text_all, trainer=tokenizer_trainer )

tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
tokenizer.decoder = decoders.ByteLevel()

Wrapping our own tokenizer into GPT-2 tokenizer object. We will be using GPT-2 architecture also model in training and evaluating

In [108]:
from transformers import GPT2TokenizerFast

robinhood_tokenizer = GPT2TokenizerFast( tokenizer_object=tokenizer )

Tokenizing the dataset

In [109]:
def tokenize_function(examples):
    return robinhood_tokenizer( examples["text"] )

tokenized_datasets = datasets.map( tokenize_function, remove_columns=["text"] )

Map:   0%|          | 0/9362 [00:00<?, ? examples/s]

Grouping text; I do not wholly understand everything this function does. I understand it combines the tokenized data and corresponding output ("X", and "y") for the model, in addition to splitting the data into equal sized chunks.

In [110]:
def group_texts(examples):
    block_size = 128        # Size of each input block, may be adjusted. Has big impact on training time.
    
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=100,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/9362 [00:00<?, ? examples/s]

Creating model (GPT-2 architecture)

In [111]:
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained( "gpt2" )
model = AutoModelForCausalLM.from_config( config )

Instantiating trainer and its arguments

In [112]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    "checkpoints",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs = 200
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
)

Training the model

In [113]:
trainer.train()

  0%|          | 0/111400 [00:00<?, ?it/s]

{'loss': 3.2439, 'grad_norm': 2.126720428466797, 'learning_rate': 1.991023339317774e-05, 'epoch': 0.9}
{'loss': 2.2916, 'grad_norm': 2.686671018600464, 'learning_rate': 1.9820466786355476e-05, 'epoch': 1.8}
{'loss': 2.0704, 'grad_norm': 3.501823663711548, 'learning_rate': 1.9730700179533215e-05, 'epoch': 2.69}
{'loss': 1.9011, 'grad_norm': 3.5388927459716797, 'learning_rate': 1.9640933572710953e-05, 'epoch': 3.59}
{'loss': 1.7795, 'grad_norm': 3.3054778575897217, 'learning_rate': 1.955116696588869e-05, 'epoch': 4.49}
{'loss': 1.688, 'grad_norm': 3.7335259914398193, 'learning_rate': 1.9461400359066428e-05, 'epoch': 5.39}
{'loss': 1.616, 'grad_norm': 3.553478479385376, 'learning_rate': 1.9371633752244166e-05, 'epoch': 6.28}
{'loss': 1.5593, 'grad_norm': 3.7092859745025635, 'learning_rate': 1.9281867145421905e-05, 'epoch': 7.18}
{'loss': 1.5077, 'grad_norm': 3.804635763168335, 'learning_rate': 1.9192100538599644e-05, 'epoch': 8.08}
{'loss': 1.4592, 'grad_norm': 3.8332812786102295, 'learni

TrainOutput(global_step=111400, training_loss=0.2805910570017953, metrics={'train_runtime': 12918.8094, 'train_samples_per_second': 68.969, 'train_steps_per_second': 8.623, 'total_flos': 5.8202800128e+16, 'train_loss': 0.2805910570017953, 'epoch': 200.0})

Saving the model and tokenizer locally for later use without need for retraining

In [None]:
# robinhood_tokenizer.save_pretrained("F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer")
# trainer.save_model("F:/Opiskelu/adaml-2024fall/week-11/robinhood_model")

('F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer\\tokenizer_config.json',
 'F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer\\special_tokens_map.json',
 'F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer\\vocab.json',
 'F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer\\merges.txt',
 'F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer\\added_tokens.json',
 'F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer\\tokenizer.json')

Testing model loading

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_test = AutoModelForCausalLM.from_pretrained("F:/Opiskelu/adaml-2024fall/week-11/robinhood_model")
tokenizer_test = AutoTokenizer.from_pretrained("F:/Opiskelu/adaml-2024fall/week-11/robinhood_tokenizer")

AttributeError: GPT2TokenizerFast has no attribute to

Testing text generation on the trained model using beam search. Reference: https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb

In [None]:
input_text = "Little John was sus, no cap."

input_ids = robinhood_tokenizer.encode( input_text, return_tensors="pt" ).to( model.device )

greedy_output_ids = model.generate(
    input_ids,  
    max_length=200, 
    num_beams=5, 
    early_stopping=True
)

greedy_output = robinhood_tokenizer.decode( greedy_output_ids[0], skip_special_tokens=False )

print( "\n", greedy_output )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



 Little John was sus, no cap.  There is no so far Little John and six to Little John and Will Stutely cameleaping and stood upon the stope, relad thime thim whilllly that t thathe d no be mear bext t t


In [133]:
input_text = "Little John was sus, no cap."

input_ids_test = tokenizer_test.encode( input_text, return_tensors="pt" ).to('cuda')

greedy_output_ids = model_test.generate(
    input_ids_test,  
    max_length=200, 
    num_beams=5, 
    early_stopping=True
)

greedy_output = tokenizer_test.decode( greedy_output_ids[0], skip_special_tokens=False )

print( "\n", greedy_output )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



 Little John was sus, no cap.  But for the two stout yeoman in Lincolngreen green away through the others were never could belonginging to the the r, this hatror had ppayoush roff t trof trof thenof t 


Epochs: Huge impact on computational time; block size: huge impact on comp time; 

References:

https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb

https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb

https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb

Additionally, Hugging Face's documentation on Transformers, Datasets and Tokenizers:

https://huggingface.co/docs