In [1]:
from datasets import load_dataset, DatasetDict

## Parameters

In [2]:

dataset_path='omgbobbyg/spock'
pretrained_model = "distilbert/distilgpt2"
finetuned_modelname = "distilgpt2-spock"
huggingface_username = "omgbobbyg"
huggingface_reponame = f"{huggingface_username}/{finetuned_modelname}"  

In [3]:
import torch; 
import gc;

is_gpu_available = torch.cuda.is_available()
device = 'cuda' if is_gpu_available else 'cpu'
if is_gpu_available:
    print("GPU available for notebook")
    torch.cuda.empty_cache()
    print("GPU Memory cleaned")
else:
    print("No GPU available for notebook")
    
gc.collect()


from huggingface_hub import notebook_login
notebook_login()

GPU available for notebook
GPU Memory cleaned


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import logging    
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Configure the logger if it hasn't been configured before
if not logger.handlers:
    handler = logging.FileHandler('training.log')
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [5]:
#load the spock dataset from the hub
spock_dataset = load_dataset(dataset_path)
print(spock_dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'original_airdate', 'production_number', 'dialogue'],
        num_rows: 3476
    })
    validation: Dataset({
        features: ['title', 'original_airdate', 'production_number', 'dialogue'],
        num_rows: 869
    })
})


## Model Creation

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(pretrained_model)

## Tokenization

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

block_size = 512
print(f"Using max block size of {block_size}")

def tokenize_function(examples):
    return tokenizer(examples["dialogue"])

tokenized_datasets = spock_dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=spock_dataset["train"].column_names)

print(tokenized_datasets)

Using max block size of 512
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3476
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 869
    })
})


In [8]:
#takes an input a tokenized dataset and then concatenates all the tokens together and then splits them into blocks of block_size
def concatenate_tokens(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()

    return result

final_tokenized_dataset = tokenized_datasets.map(concatenate_tokens, batched=True, num_proc=4)
print(final_tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 121
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 29
    })
})


In [9]:
print(len(final_tokenized_dataset['train'][0]['input_ids'])) #this should output the same as block_size

512


## Data Collator

In [10]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Trainer and Training Args

In [11]:
#Now we train the model using the Trainer API
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    finetuned_modelname,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=is_gpu_available,
    push_to_hub=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    hub_model_id=huggingface_reponame
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=final_tokenized_dataset["train"],
    eval_dataset=final_tokenized_dataset["validation"]
)

## Evaluate Untrained Model

In [12]:
import math
#lets now run against the base model and log the results
initial_results = trainer.evaluate()
print(initial_results)
#log the results to file
logger.info(f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")
print(f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'eval_loss': 4.568508148193359, 'eval_runtime': 1.3122, 'eval_samples_per_second': 22.1, 'eval_steps_per_second': 6.097}
Baseline distilbert/distilgpt2 Results: Perplexity: 96.40


In [13]:
from transformers import pipeline
#do a test prediction on the baseline model
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)

# grab a piece of text from the eval data set to use as a prompt
test_prompt = "It is your only other choice"
result = text_generator(test_prompt, max_length=100, num_return_sequences=1)
print(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")
logger.info(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Baseline distilbert/distilgpt2 generated result: It is your only other choice...It is your only other choice for the role that YOU can play, and your character is in your own right.



To help you start with the process of building for YOU, please remember that each member of your group has the resources and resources of their peers to do so. Every member of your group has the resources. They are all your own.
As you become a member, you will be able to support your friends. I know that every member of your group wants to


## Train and Evaluate Fine Tune Model

In [14]:
trainer.train()
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
eval_results['perplexity'] = perplexity

logger.info(f"Fine-tuned {finetuned_modelname} Results: Perplexity: {perplexity:.2f}")
print(f"Fine-tuned {finetuned_modelname} Results: Perplexity: {perplexity:.2f}")

#generate text based on the fine tuned model
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)
result = text_generator(test_prompt, max_length=100, num_return_sequences=1)
print(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}")
logger.info(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}")

Epoch,Training Loss,Validation Loss
1,No log,4.23254
2,No log,4.172004
3,No log,4.150917


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Fine-tuned distilgpt2-spock Results: Perplexity: 63.49
Fine-tuned distilgpt2-spock generated result: It is your only other choice...It is your only other choice. My only choice is an alternative. A small amount of light with no significant electrical input. In fact, you were the one who made the decision. I can see you are standing on the way.

A very short time ago the ship was still active.
At that moment it was completely unshielded. What were you thinking of it right now? What did you think of it all?
I can understand this decision. There was an impulse from
