In [1]:
from datasets import load_dataset, DatasetDict



In [2]:
spock_dataset  = load_dataset("csv", data_files="/home/bobby/projects/bll-ai-toolbox-model-1/datasets/_scripts_TOS_cleaned.csv")



In [3]:

#drop all the columns except dialogue
spock_dataset = spock_dataset.map(lambda x: {"dialogue": x["dialogue"]})
#split the dataset into a training and validation set
train_dataset = spock_dataset["train"]
dataset_split = train_dataset.train_test_split(test_size=0.2,seed=22)
#Rename the default 'test' split to 'validation'
dataset_split['validation'] = dataset_split.pop('test')
print(dataset_split)




DatasetDict({
    train: Dataset({
        features: ['title', 'original_airdate', 'production_number', 'dialogue'],
        num_rows: 3476
    })
    validation: Dataset({
        features: ['title', 'original_airdate', 'production_number', 'dialogue'],
        num_rows: 869
    })
})


In [5]:
#subset_size=100
#the following is temporary code to help debug training issues by choosing a small subset to work with
#train_subset = train_dataset.select(range(subset_size))
#validation_subset = dataset_split['validation'].select(range(subset_size))

#Create a new DatasetDict with the subsets
#dataset_split = DatasetDict({'train': train_subset, 'validation': validation_subset})





  ## Parameters

In [6]:
pretrained_model = "distilbert/distilgpt2"
finetuned_modelname = "distilgpt2-spock"
huggingface_username = "omgbobbyg"
huggingface_reponame = f"{huggingface_username}/{finetuned_modelname}"  


  ## Tokenization

In [7]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, AutoModelForCausalLM


model = AutoModelForCausalLM.from_pretrained(pretrained_model)
# Get the maximum context size
max_length = model.config.max_position_embeddings
print(f"Maximum context size: {max_length}")



Maximum context size: 1024


In [8]:
from transformers import GPT2Tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# Define a function to tokenize the text data
#def tokenize_function(examples):
#    return tokenizer(examples["dialogue"], max_length=max_length, truncation=True, padding=True)

def tokenize_function(examples):
    return tokenizer(examples["dialogue"],max_length=max_length)




# Apply the tokenization function to the entire dataset
tokenized_datasets = dataset_split.map(
    tokenize_function,
    batched=True,
    batch_size=10,
    remove_columns=dataset_split["train"].column_names
)



In [9]:
#Verify that none of our tokenized inputs are greater the maximum context size
# Example: Checking the length of the first few tokenized inputs
# for i, input_ids in enumerate(tokenized_datasets['train'][range(5)]):
#     print(f"Length of input {i}: {len(input_ids)}")
#     assert len(input_ids) <= max_length, f"Input {i} exceeds the maximum context size."




In [10]:
#We need to create data collator to manage the batches, we can use DataCollatorForLanguageModeling
from transformers import DataCollatorForLanguageModeling
tokenizer.pad_token ="<pad>" #tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)
# Iterate over the generator
out = data_collator([tokenized_datasets["train"][i] for i in range(1)])
for key in out:
    print(f"{key} shape: {out[key].shape}")



You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([1, 8])
attention_mask shape: torch.Size([1, 8])
labels shape: torch.Size([1, 8])


In [11]:
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
import torch; torch.cuda.is_available()



True

In [13]:
import logging    
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Configure the logger if it hasn't been configured before
if not logger.handlers:
    handler = logging.FileHandler('training.log')
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)



In [14]:
#we define our metrics computation
from transformers import EvalPrediction, PreTrainedTokenizer
import numpy as np
import math

def index_of_longest_sequence(labels):
    """
    Finds the index of the longest sequence in the labels array.
    
    Parameters:
    - labels: A 2D numpy.ndarray or a list of lists, where each inner list contains token IDs for one sequence.
              Padding tokens are represented by -100.
    
    Returns:
    - An integer representing the index of the longest sequence.
    """
    # Count the number of non-padding tokens (-100) in each sequence
    sequence_lengths = np.count_nonzero(labels != -100, axis=1)
    
    # Find the index of the sequence with the maximum length
    longest_sequence_index = np.argmax(sequence_lengths)
    
    return longest_sequence_index

def decode_prediction_row(tokenizer: PreTrainedTokenizer, prediction_logits_row):
    """
    Decodes a single row of prediction logits back to a string.

    Parameters:
    - tokenizer: An instance of transformers.PreTrainedTokenizer.
    - prediction_logits_row: A 2D tensor or array containing logits for a single row of predictions.
      The shape is expected to be [sequence_length, vocabulary_size].

    Returns:
    - A string decoded from the most likely token IDs in prediction_logits_row.
    """

     # Ensure prediction_logits_row is a PyTorch tensor
    if isinstance(prediction_logits_row, np.ndarray):
        prediction_logits_row = torch.tensor(prediction_logits_row)
    
    # Convert logits to token IDs by taking the argmax over the vocabulary dimension
    token_ids = torch.argmax(prediction_logits_row, dim=-1)
    
    # Decode the token IDs to a string
    decoded_string = tokenizer.decode(token_ids, skip_special_tokens=True)
    
    return decoded_string

def decode_labels_row(tokenizer: PreTrainedTokenizer, labels_row):
    """
    Decodes a single row of labels back to a string, ignoring padding tokens.

    Parameters:
    - tokenizer: An instance of transformers.PreTrainedTokenizer.
    - labels_row: A 1D tensor or array containing token IDs for a single row of labels.

    Returns:
    - A string decoded from the labels_row, excluding padding tokens.
    """
    # Filter out padding tokens (-100) from the labels row
    filtered_tokens = [token for token in labels_row if token != -100]
    
    # Decode the filtered token IDs to a string
    decoded_string = tokenizer.decode(filtered_tokens, skip_special_tokens=True)
    
    return decoded_string

def compute_metrics(eval_pred: EvalPrediction):
     # Create or get the logger
    logger = logging.getLogger(__name__)
    

    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    
    index = index_of_longest_sequence(labels)
    decoded_prediction = decode_prediction_row(tokenizer, logits[index])
    decoded_label = decode_labels_row(tokenizer, labels[index])

    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]
    #this code flattens the 3D array to a 2d array by collapsing dimension 1 and 2 together and keeping dimension 3 the same
    shift_logits_flat = shift_logits.reshape(-1, shift_logits.shape[-1])
    #this equivalent to doing reshape(-1)
    shift_labels_flat = shift_labels.flatten()
    
    loss_fct = torch.nn.CrossEntropyLoss(reduction='sum')
    loss = loss_fct(torch.from_numpy(shift_logits_flat), torch.from_numpy(shift_labels_flat))
    
    num_tokens = np.count_nonzero(shift_labels_flat != -100)
    perplexity = math.exp(loss.item() / num_tokens)
    logger.info(f"Perplexity: {perplexity}")

    
    # Log the predictions and labels
    logger.info(f'Generated Text: {decoded_prediction}')
    logger.info(f'Actual Text: {decoded_label}')
    logger.info(f"Perplexity: {perplexity}")
    return {"perplexity": perplexity}

def compute_metrics_2(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    # Ensure logits and labels are PyTorch tensors
    if not isinstance(logits, torch.Tensor):
        logits = torch.tensor(logits)
    if not isinstance(labels, torch.Tensor):
        labels = torch.tensor(labels)

    # Reshape and shift as necessary, foc
    # Shift labels to the right to align with model's predictions
    # Flatten the logits and labels to calculate loss only for non-ignored indices
    #logits = logits[..., :-1, :].reshape(-1, logits.shape[-1])
    #labels = labels[..., 1:].flatten()

    logits = logits.reshape(-1, logits.shape[-1])
    labels = labels.flatten()


    # Filter out `-100` used for ignored indices in labels
    valid_indices = labels != -100
    valid_logits = logits[valid_indices]
    valid_labels = labels[valid_indices]

    # Calculate Cross Entropy Loss for valid positions
    loss_fct = torch.nn.CrossEntropyLoss(reduction='mean')
    loss = loss_fct(valid_logits, valid_labels)

    # Calculate Perplexity
    perplexity = torch.exp(loss)

    return {"perplexity": perplexity.item()}

    



In [15]:
import gc

gc.collect()

torch.cuda.empty_cache()



In [16]:
#Now we train the model using the Trainer API
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    finetuned_modelname,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    push_to_hub=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    hub_model_id=huggingface_reponame
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)





In [17]:
#lets now run against the base model and log the results
initial_results = trainer.evaluate()
print(initial_results)
#log the results to file
logger.info(f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")
print(f"Baseline {pretrained_model} Results: Perplexity: {math.exp(initial_results['eval_loss']):.2f}")


RuntimeError: cannot pin 'torch.cuda.LongTensor' only dense CPU tensors can be pinned

In [17]:
#do a test prediction using the HuggingFace pipeline and this base model
from transformers import pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# grab a piece of text from the eval data set to use as a prompt
test_prompt = "It is your only other choice"
result = text_generator(test_prompt, max_length=100, num_return_sequences=1)
print(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")
logger.info(f"Baseline {pretrained_model} generated result: {test_prompt}...{result[0]['generated_text']}")



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
trainer.train()
eval_results = trainer.evaluate()
logger.info(f"Fine-tuned {finetuned_modelname} Results: Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
print(f"Fine-tuned {finetuned_modelname} Results: Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

#generate text based on the fine tuned model
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = text_generator(test_prompt, max_length=100, num_return_sequences=1)
print(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}")
logger.info(f"Fine-tuned {finetuned_modelname} generated result: {test_prompt}...{result[0]['generated_text']}")


In [None]:
trainer.push_to_hub()



In [None]:
import torch
for batch in trainer.get_train_dataloader():
    break
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: v.to(device) for k, v in batch.items()}
trainer.create_optimizer()

for _ in range(20):
    outputs = trainer.model(**batch)
    loss = outputs.loss
    loss.backward()
    trainer.optimizer.step()
    trainer.optimizer.zero_grad()



In [None]:
import numpy as np
import evaluate


with torch.no_grad():
    outputs = trainer.model(**batch)
preds = outputs.logits
labels = batch["labels"]

eval_prediction = EvalPrediction(predictions=preds.cpu().numpy(), label_ids=labels.cpu().numpy())
compute_metrics(eval_prediction)





