# Install libraries

In [None]:
# Installs: uncomment and run once on a fresh runtime that lacks these packages.
# !pip install -i https://pypi.org/simple/ bitsandbytes
# !pip install accelerate
# !pip install wandb
# !pip install peft
# !pip install datasets


# Logins


In [None]:
# Mount Google Drive (Colab only). `output_dir` is set to a path under
# /content/drive/MyDrive, so uncomment these lines if checkpoints should land on Drive.
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import wandb
# wandb.login()

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

# Set variables

In [None]:
import os

# Weights & Biases settings read from the environment: the project that runs
# are grouped under, and the model-logging mode ("checkpoint").
os.environ.update(
    {
        "WANDB_PROJECT": "llama_for_booking_travel",
        "WANDB_LOG_MODEL": "checkpoint",
    }
)

In [None]:
# Hub id of the checkpoint that is loaded and fine-tuned.
model_name = "KvrParaskevi/Llama-2-7b-Hotel-Booking-Model"
# Directory that receives training checkpoints and the final saved model
# (a path under the Colab Google Drive mount point).
output_dir = "/content/drive/MyDrive/llama_booking"


# Imports

In [None]:
# imports
import pandas as pd
import torch
from datasets import Dataset, load_from_disk
from peft import AutoPeftModelForCausalLM, LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizer,
    Trainer,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)
from transformers.integrations import WandbCallback
from transformers.trainer_utils import PredictionOutput


# Download model

In [None]:
# bitsandbytes quantization settings handed to `from_pretrained` when the model is loaded.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,  # dtype used for computation on the 4-bit weights
    load_in_4bit=True,  # keep the weights in 4-bit precision
)

In [None]:
# Fetch the checkpoint named by `model_name`, quantizing it on load
# according to `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Add adapter

In [None]:
# LoRA adapter settings. Rank and scaling are not passed here, so the
# library defaults apply to them.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language-modelling task
    target_modules=["q_proj", "v_proj"],  # only these projection modules receive adapters
    lora_dropout=0.05,  # dropout probability inside the LoRA layers
)

In [None]:
# Wrap the quantized base model with a LoRA adapter registered under the name 'booking'.
lora_model = get_peft_model(
    model,
    lora_config,
    adapter_name="booking",
)
# Report the trainable-versus-total parameter counts.
lora_model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199


trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.06220594176090199

In [None]:
# Display which adapter(s) are currently active on the PEFT-wrapped model.
lora_model.active_adapters

['booking']

['booking']

# Load dataset

In [None]:
# Read the dataset stored on disk in the 'data' directory (relative to the working directory).
dataset= load_from_disk('data')
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['system', 'user', 'assistant', 'text'],
    num_rows: 3355
})

In [None]:
# Hold out 10% of the rows for validation. A fixed seed makes the shuffle, and
# therefore the train/test membership, identical on every run, so validation
# losses from different runs are comparable.
# NOTE: this rebinds `dataset` from the loaded dataset to the split result, so
# re-run the load cell before re-running this one.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant', 'text'],
        num_rows: 3019
    })
    test: Dataset({
        features: ['system', 'user', 'assistant', 'text'],
        num_rows: 336
    })
})

In [None]:
# Load the tokenizer published with the same checkpoint as the model (`model_name`).
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize the 'text' column of every split. With batched=True the callable
# receives a batch of rows at a time rather than a single example.
tokenized_dataset = dataset.map(
    lambda batch: tokenizer(batch["text"]),
    batched=True,
)
tokenized_dataset

Map:   0%|          | 0/3019 [00:00<?, ? examples/s]

Map:   0%|          | 0/336 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['system', 'user', 'assistant', 'text', 'input_ids', 'attention_mask'],
        num_rows: 3019
    })
    test: Dataset({
        features: ['system', 'user', 'assistant', 'text', 'input_ids', 'attention_mask'],
        num_rows: 336
    })
})

# Set training arguments

In [None]:
# Training configuration passed to the Trainer.
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=5,

    # Batching: 2 sequences per device x 16 accumulation steps = 32 sequences
    # per optimizer step (per device).
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,

    # Optimisation.
    learning_rate=1e-5,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    # The plain "constant" schedule has no warmup phase, so `warmup_ratio` was
    # silently ignored. "constant_with_warmup" ramps the learning rate up over
    # the warmup steps and then holds it at `learning_rate`.
    lr_scheduler_type="constant_with_warmup",

    # Evaluate once per epoch.
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version before upgrading.
    evaluation_strategy="epoch",

    # Checkpoint every 25 steps; log the first step and then every 5 steps.
    save_steps=25,
    logging_first_step=True,
    logging_steps=5,

    report_to="wandb",
)

In [None]:
def decode_predictions(tokenizer: "PreTrainedTokenizer", predictions: "PredictionOutput") -> dict:
    """
    Decode model predictions and labels into text.

    The annotations are written as strings so that defining this function does
    not require the annotated classes to be resolvable at definition time.

    Args:
        tokenizer (PreTrainedTokenizer): Tokenizer used for decoding; only its
            ``batch_decode`` method is called.
        predictions (PredictionOutput): Object exposing ``predictions`` (an array
            whose last axis is arg-maxed to obtain token ids) and ``label_ids``.

    Returns:
        dict: ``{'user': [...], 'labels': [...], 'predictions': [...]}`` with one
        string per example. ``user`` is the label text between ``<</SYS>>`` and
        the first ``[/INST]``; ``labels`` is the segment following that
        ``[/INST]``, or an empty string when the marker is absent.
    """
    # Most likely token id at every position.
    predicted_ids = predictions.predictions.argmax(axis=-1)
    # Token id 0 is dropped before decoding (presumably the padding id - TODO confirm).
    cleaned_predictions = [[tok for tok in seq if tok != 0] for seq in predicted_ids]
    prediction_text = tokenizer.batch_decode(cleaned_predictions, skip_special_tokens=True)

    # -100 entries in the labels are not token ids and must be removed before decoding.
    cleaned_labels = [[tok for tok in seq if tok != -100] for seq in predictions.label_ids]
    decoded_labels = tokenizer.batch_decode(cleaned_labels, skip_special_tokens=True)

    user = []
    labels = []
    for text in decoded_labels:
        # Keep only what follows the system prompt, then split prompt from reply.
        segments = text.split('<</SYS>>')[-1].strip().split('[/INST]')
        user.append(segments[0].strip())
        # A label without '[/INST]' (e.g. a truncated example) has no reply
        # segment; report it as empty instead of raising IndexError.
        labels.append(segments[1].strip() if len(segments) > 1 else "")

    return {'user': user, "labels": labels, "predictions": prediction_text}


class WandbPredictionProgressCallback(WandbCallback):
    """
    Custom WandbCallback for logging sample predictions during training and evaluation.

    After every evaluation and every checkpoint save, the callback runs the
    trainer on a fixed subset of the validation data, decodes the result with
    ``decode_predictions`` and logs the decoded rows to W&B as a table.

    Type annotations are written as strings so that defining the class does not
    require the annotated names to be resolvable at definition time.
    """

    def __init__(self, trainer: "Trainer", tokenizer: "PreTrainedTokenizer", val_dataset: "Dataset",
                 num_samples_eval: int = 50, num_samples_on_save: int = 30):
        """
        Initializes the WandbPredictionProgressCallback.

        Args:
            trainer (Trainer): The Trainer object.
            tokenizer (PreTrainedTokenizer): The tokenizer used for decoding.
            val_dataset (Dataset): The validation dataset.
            num_samples_eval (int): Number of samples to use for evaluation predictions.
            num_samples_on_save (int): Number of samples to use for predictions during save checkpoints.

        Attributes:
            trainer (Trainer): The Trainer object.
            tokenizer (PreTrainedTokenizer): Tokenizer used for decoding.
            sample_dataset_eval (Dataset): Subset of validation dataset for evaluation.
            sample_dataset_on_save (Dataset): Subset of validation dataset for predictions on save.
            eval_table (wandb.Table): Wandb table for storing evaluation predictions.
            on_save_table (wandb.Table): Wandb table for storing predictions on checkpoint save.
        """
        super().__init__()
        self.trainer = trainer
        self.tokenizer = tokenizer
        # Take the first N validation rows, clamped to the dataset size so a
        # small validation set does not make `select` fail on out-of-range indices.
        available = len(val_dataset)
        self.sample_dataset_eval = val_dataset.select(range(min(num_samples_eval, available)))
        self.sample_dataset_on_save = val_dataset.select(range(min(num_samples_on_save, available)))
        # Tables that accumulate rows across calls. Rows are added positionally,
        # so the column order here must match the DataFrame column order.
        columns = ["user", "labels", "prediction", "epoch"]
        self.eval_table = wandb.Table(columns=columns)
        self.on_save_table = wandb.Table(columns=columns)

    def update_predictions_table(self, table: "wandb.Table", predictions_df: "pd.DataFrame") -> "wandb.Table":
        """
        Update a Wandb table with a predictions dataframe.

        Args:
            table (wandb.Table): Existing Wandb table; the new rows are appended to it in place.
            predictions_df (pd.DataFrame): DataFrame containing predictions. Its
                columns are matched to the table's columns by position.

        Returns:
            wandb.Table: A new table holding every row accumulated so far.
        """
        for row in predictions_df.values:
            table.add_data(*row)

        # A fresh Table is built from the accumulated rows, and that copy is what
        # gets logged (presumably so each log call carries a new object - TODO confirm).
        return wandb.Table(columns=table.columns, data=table.data)

    def _log_sample_predictions(self, sample_dataset: "Dataset", table: "wandb.Table",
                                log_key: str, state: "TrainerState") -> None:
        """
        Predict on ``sample_dataset``, decode the output and log it to W&B under ``log_key``.

        Args:
            sample_dataset (Dataset): Rows to run predictions on.
            table (wandb.Table): Accumulating table that receives the new rows.
            log_key (str): Key under which the table is logged.
            state (TrainerState): Trainer state; its ``epoch`` is recorded on every row.
        """
        raw_predictions = self.trainer.predict(sample_dataset)
        decoded = decode_predictions(self.tokenizer, raw_predictions)
        predictions_df = pd.DataFrame(decoded)
        predictions_df["epoch"] = state.epoch

        records_table = self.update_predictions_table(table, predictions_df)
        self._wandb.log({log_key: records_table})

    def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None:
        """
        Callback function called during evaluation to log sample predictions.

        Args:
            args (TrainingArguments): The training arguments.
            state (TrainerState): The trainer state.
            control (TrainerControl): The trainer control.
            **kwargs: Additional keyword arguments.
        """
        super().on_evaluate(args, state, control, **kwargs)
        self._log_sample_predictions(
            self.sample_dataset_eval, self.eval_table, "sample_predictions_eval", state
        )

    def on_save(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs) -> None:
        """
        Callback function called during model saving to log sample predictions.

        Args:
            args (TrainingArguments): The training arguments.
            state (TrainerState): The trainer state.
            control (TrainerControl): The trainer control.
            **kwargs: Additional keyword arguments.
        """
        super().on_save(args, state, control, **kwargs)
        self._log_sample_predictions(
            self.sample_dataset_on_save, self.on_save_table, "sample_predictions_on_save", state
        )




In [None]:
# Collator that assembles tokenized examples into batches for the Trainer;
# mlm=False selects causal (not masked) language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Build the Trainer from the LoRA-wrapped model, the training arguments,
# the tokenized train/test splits, the tokenizer and the collator.
trainer = Trainer(
    model=lora_model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

In [None]:
# Callback that logs sample predictions from the held-out split to W&B.
progress_callback = WandbPredictionProgressCallback(
    trainer=trainer,
    tokenizer=tokenizer,
    val_dataset=tokenized_dataset["test"],
)
# `report_to="wandb"` already registers a stock WandbCallback on the trainer, and
# the custom callback is a subclass that does the same work itself. Leaving both
# attached runs the W&B hooks twice (each checkpoint directory was being added to
# an artifact twice), so drop the existing one before registering the subclass.
trainer.remove_callback(WandbCallback)
trainer.add_callback(progress_callback)

In [None]:
# Run fine-tuning with the configured arguments, datasets and callbacks;
# the returned TrainOutput is displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33malgolovanova[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
0,3.0172,2.939822
1,2.0079,1.966136
2,0.9242,0.870978
3,0.4796,0.480028
4,0.3614,0.362291


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-25)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-25)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-50)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-50)... Done. 0.7s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-75)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-75)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-100)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-100)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-125)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-125)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-150)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-150)... Done. 0.5s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-175)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-175)... Done. 0.7s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-200)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-200)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-225)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-225)... Done. 0.2s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-250)... Done. 0.4s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-250)... Done. 0.6s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-275)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-275)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-300)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-300)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-325)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-325)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-350)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-350)... Done. 0.7s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-375)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-375)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-400)... Done. 0.2s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-400)... Done. 0.3s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-425)... Done. 0.4s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-425)... Done. 0.6s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')


[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-450)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (/content/drive/MyDrive/llama_booking/checkpoint-450)... Done. 0.4s


Index(['user', 'labels', 'predictions', 'epoch'], dtype='object')




TrainOutput(global_step=470, training_loss=1.650777107096733, metrics={'train_runtime': 11203.4241, 'train_samples_per_second': 1.347, 'train_steps_per_second': 0.042, 'total_flos': 1.912435607586816e+17, 'train_loss': 1.650777107096733, 'epoch': 4.9801324503311255})

In [None]:
# Persist the final model into the 'last' subdirectory of `output_dir`.
trainer.save_model(f"{output_dir}/last")