<a href="https://colab.research.google.com/github/0x11c11e/the-art-of-fine-tuning/blob/main/Mistral_RolePlay.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install transformers
! pip install datasets
! pip install torch

In [None]:
from datasets import load_dataset

# Load the role-play instruction dataset (only the 'train' split is requested)
dataset = load_dataset("iamketan25/roleplay-instructions-dataset", split='train')

# Hold out 10% of the data for evaluation.
# A fixed seed makes the shuffle deterministic, so the same examples end up in
# the train and eval sets on every "Restart & Run All".
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)

# Extract the training and evaluation datasets from the split
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


In [None]:
# Peek at the first training record to see which columns each example carries
first_record = train_dataset[0]
print(first_record)

In [None]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used to encode the dataset
tokenizer_checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Reuse the end-of-sequence token as the padding token so that padded
# tokenization (used below) has a pad token to work with
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples, max_length=512):
    """
    Tokenizes a batch of prompt/response pairs for causal language model fine-tuning.

    A causal LM is trained to predict token t+1 of a sequence from tokens 0..t of the
    *same* sequence, so the prompt and its chosen response are joined into a single
    sequence and the labels are a copy of that sequence's input ids. Padding
    positions are set to -100 in the labels so the loss ignores them.

    Parameters:
    - examples: A dictionary containing a list of input prompts under 'prompt' and the
      corresponding list of chosen responses under 'chosen'.
    - max_length: Fixed length every sequence is truncated or right-padded to.

    Returns:
    A dictionary with 'input_ids', 'attention_mask' and 'labels', each a list holding
    one list of `max_length` integers per example.
    """
    # Tokenize prompts and responses separately (no padding yet) so they can be
    # concatenated at the token level. Special tokens are added once, on the prompt.
    prompt_ids_batch = tokenizer(examples['prompt'], add_special_tokens=True)["input_ids"]
    response_ids_batch = tokenizer(examples['chosen'], add_special_tokens=False)["input_ids"]

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, response_ids in zip(prompt_ids_batch, response_ids_batch):
        # prompt + response + EOS, truncated on the right to the fixed length
        sequence = (list(prompt_ids) + list(response_ids) + [eos_id])[:max_length]
        num_pad = max_length - len(sequence)

        model_inputs["input_ids"].append(sequence + [pad_id] * num_pad)
        model_inputs["attention_mask"].append([1] * len(sequence) + [0] * num_pad)
        # The pad token is the EOS token here, so padding must be masked by
        # position (not by token id) to keep the real EOS in the loss.
        model_inputs["labels"].append(sequence + [-100] * num_pad)

    return model_inputs

# Run the tokenizer over both splits in batches. The raw columns are dropped so
# that only the fields produced by `tokenize_function` remain in each dataset.
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True, remove_columns=split.column_names)
    for split in (train_dataset, eval_dataset)
)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Fetch the pretrained checkpoint and instantiate it as a causal language model
model_checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Note: the matching tokenizer is expected to have been loaded in an earlier cell;
# it is what turns raw text into the inputs this model consumes.

In [None]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

import inspect

# Reuse the `model` loaded in the previous cell. Loading the 7B checkpoint a second
# time here would hold two full copies in memory while the new one is being built.

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`;
# use whichever keyword the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Configure the training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Directory where the model predictions and checkpoints will be stored
    num_train_epochs=3,  # Total number of training epochs
    per_device_train_batch_size=4,  # Batch size per device during training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Weight decay if we apply some.
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=10,  # Log every X updates steps.
    save_strategy="epoch",  # Must match the evaluation strategy for load_best_model_at_end
    load_best_model_at_end=True,  # Load the best model at the end of training
    **{_eval_strategy_key: "epoch"},  # Evaluation is done at the end of each epoch
)

# Initialize the Trainer
trainer = Trainer(
    model=model,  # The instantiated 🤗 Transformers model to be trained
    args=training_args,  # Training arguments, defined above
    train_dataset=tokenized_train,  # Training dataset
    eval_dataset=tokenized_eval,  # Evaluation dataset
)

# Note: `model`, `tokenized_train` and `tokenized_eval` must already exist,
# i.e. the model-loading and tokenization cells above have to be run first.


In [None]:
# Start the model training
trainer.train()

# After training, to visualize and monitor the training process using TensorBoard,
# the following Jupyter notebook magic commands are used:

# Load the TensorBoard notebook extension. This line is only needed once per notebook.
%load_ext tensorboard

# Launch TensorBoard within the notebook to visualize metrics.
# The log directory is specified to where the training logs are saved.
%tensorboard --logdir ./logs
