In [None]:
!pip install datasets transformers trl evaluate huggingface_hub pynvml psutil rouge_score trackio

Collecting trl
  Downloading trl-0.25.0-py3-none-any.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trackio
  Downloading trackio-0.8.1-py3-none-any.whl.metadata (8.4 kB)
Collecting plotly<7.0.0,>=6.0.0 (from trackio)
  Downloading plotly-6.4.0-py3-none-any.whl.metadata (8.5 kB)
Downloading trl-0.25.0-py3-none-any.whl (462 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trackio-0.8.1-py3-none-any.whl (874 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.0/875.0 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?

In [None]:


from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import evaluate
from transformers import TrainerCallback
import time
from huggingface_hub import login
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import pynvml
import psutil

# --- Hugging Face Hub authentication and dataset loading -------------------
print("login")

# NOTE(review): "YourToken" looks like a placeholder that must be replaced
# with a real access token before running — confirm, and avoid committing a
# real token into the notebook.
login(token="YourToken")

print("data is loading")

# Load full FINDSum dataset
# Both files are read with the generic "csv" builder. The training file is
# passed as a bare path and is then read back under the "train" split; the
# validation file is explicitly registered under the "validation" split name.
# The relative ./drive/MyDrive paths presumably point at a mounted Google
# Drive — verify the mount exists before running.
ds_train = load_dataset('csv', data_files='./drive/MyDrive/text_only_train_data.csv')
ds_val = load_dataset('csv', data_files={'validation':'./drive/MyDrive/text_only_val_data.csv'})

# Pull the individual splits out of the loaded dataset containers.
train = ds_train["train"]

val = ds_val["validation"]

def add_prefix(example):
    """Prepend the summarization instruction to an example's ``document``.

    The ``document`` entry of ``example`` is overwritten in place with the
    instruction followed by the original text, and the same mapping object
    is returned so the function can be used with ``Dataset.map``.
    """
    instruction = "Summarize the following document: "
    original_text = example["document"]
    example["document"] = instruction + original_text
    return example

# Apply the instruction prefix to every row of both splits.
train = train.map(add_prefix)
val = val.map(add_prefix)

print("model loading")

model_id_mistral = "mistralai/Mistral-7B-Instruct-v0.3"
# One entry per model to fine-tune: "id" is the identifier handed to
# from_pretrained, "name" is a short tag that later appears in the output
# directory and in the progress messages.
model_ids = [{"id": model_id_mistral, "name": "mistral_4"}]

# Collects one {"model", "tokenizer", "name"} dict per entry of model_ids.
model_tokenizer_list = []

for model_id in model_ids:
    tokenizer = AutoTokenizer.from_pretrained(model_id["id"])
    # Weights are requested in bfloat16; device placement is delegated to the
    # library via device_map="auto".
    model = AutoModelForCausalLM.from_pretrained(model_id["id"], dtype=torch.bfloat16, device_map="auto")
    model_tokenizer_list.append({
        "model": model,
        "tokenizer": tokenizer,
        "name": model_id["name"]
    })
# NOTE: after the loop, `tokenizer`, `model` and `model_id` remain bound to
# the last entry processed; compute_metrics reads `tokenizer` as a global and
# the training section prints `model_id`.

print("Loading ROUGE metric...")
# Metric object used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE-1/2/L between decoded predictions and decoded labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. ``predictions`` may be
            token IDs, logits with a trailing vocabulary axis, or a tuple
            whose first element is one of those. ``labels`` are token IDs in
            which ignored positions are marked with ``-100``.

    Returns:
        dict: The ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` scores.

    Notes:
        Decoding uses the module-level ``tokenizer`` and scoring uses the
        module-level ``rouge`` metric.
    """
    import numpy as np  # local import keeps the function self-contained

    predictions, labels = eval_preds

    # When several outputs are returned, the logits/IDs are the first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits of shape (batch, sequence, vocab) cannot be decoded directly;
    # reduce them to the most likely token ID per position first.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is not a valid vocabulary index, so swap it for a real token that
    # skip_special_tokens=True removes again during decoding.
    fill_id = tokenizer.pad_token_id
    if fill_id is None:
        fill_id = tokenizer.eos_token_id
    if fill_id is None:
        fill_id = 0
    predictions = np.where(predictions == -100, fill_id, predictions)
    labels = np.where(labels == -100, fill_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Run rouge evaluation on decoded text
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print("ROUGE Log", result)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }
class ResourceLoggingCallback(TrainerCallback):
    """Trainer callback that periodically prints GPU, CPU and RAM usage.

    A resource line is printed from ``on_step_end`` whenever at least
    ``log_interval`` seconds have passed since the previous line, and the
    accumulated FLOPs counter is printed once training ends.

    Args:
        log_interval: Minimum number of seconds between two resource lines.
        gpu_index: NVML index of the GPU whose memory is reported. Defaults
            to 0, which matches the previous hard-coded behaviour.
    """

    def __init__(self, log_interval=60, gpu_index=0):
        super().__init__()
        self.log_interval = log_interval  # seconds
        # A monotonic clock is used for interval measurement so that system
        # clock adjustments cannot suppress or duplicate log lines.
        self.last_log_time = time.monotonic()
        nvmlInit()
        self.handle = nvmlDeviceGetHandleByIndex(gpu_index)

    def on_step_end(self, args, state, control, **kwargs):
        """Print a resource usage line if the logging interval has elapsed."""
        current_time = time.monotonic()
        if current_time - self.last_log_time < self.log_interval:
            return

        # GPU memory, converted from bytes to GiB.
        mem_info = nvmlDeviceGetMemoryInfo(self.handle)
        gpu_used = mem_info.used / (1024 ** 3)
        gpu_total = mem_info.total / (1024 ** 3)

        # Host CPU utilisation and RAM usage.
        cpu_percent = psutil.cpu_percent()
        ram_info = psutil.virtual_memory()

        print(f"[Resource Log | Step {state.global_step}] "
              f"GPU Memory Used: {gpu_used:.2f} GB / {gpu_total:.2f} GB, "
              f"CPU Usage: {cpu_percent}%, "
              f"RAM Usage: {ram_info.used / (1024 ** 3):.2f} GB / {ram_info.total / (1024 ** 3):.2f} GB")

        self.last_log_time = current_time

    def on_train_end(self, args, state, control, **kwargs):
        """Print the total FLOPs recorded on the trainer state, if present."""
        total_flos = getattr(state, 'total_flos', None)
        if total_flos is not None:
            print(f"Total FLOPs during training: {total_flos}")
        else:
            print("Total FLOPs information not available.")


print(model_id)
print("training")

# LoRA adapter configuration shared by every model trained in this cell.
peft_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# NOTE(review): only the "document" column is used as training text and no
# summary/target column is referenced anywhere in this cell — confirm that
# "document" already contains the reference summary.
# NOTE(review): no evaluation schedule is configured here, so eval_dataset and
# compute_metrics presumably only take effect on an explicit
# trainer.evaluate() call — confirm against the SFTConfig defaults.
training_args = SFTConfig(
    dataset_text_field="document",
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=64,  # effective batch of 64 sequences per device
    num_train_epochs=2,
    report_to=["trackio"],
    save_strategy="epoch",  # Save checkpoint at end of each epoch
    gradient_checkpointing=True,
)

for models in model_tokenizer_list:
    print(models["name"] + " started training")
    training_args.output_dir = f"./drive/MyDrive/Results_Lora/Mistral/sft-lora-{models['name']}"

    # Train with this entry's own model and tokenizer rather than whatever
    # the loading loop left in the globals. `tokenizer` is rebound because
    # compute_metrics decodes with the module-level tokenizer.
    model = models["model"]
    tokenizer = models["tokenizer"]

    trainer = SFTTrainer(
        model=model,
        train_dataset=train,
        eval_dataset=val,
        peft_config=peft_config,
        args=training_args,
        compute_metrics=compute_metrics,
        callbacks=[ResourceLoggingCallback(log_interval=120)],
        processing_class=tokenizer,  # Pass tokenizer here, not as "tokenizer"
    )

    trainer.train()
    # NOTE: this path does not depend on the model name, so with more than one
    # entry in model_tokenizer_list each run overwrites the previous save.
    trainer.save_model("./drive/MyDrive/Results_Lora/Mistral/my_trained__mistral_model")
    trainer.save_state()
    print(models["name"] + " finished training")

login
data is loading


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16820 [00:00<?, ? examples/s]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]

model loading


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Loading ROUGE metric...


Downloading builder script: 0.00B [00:00, ?B/s]

{'id': 'mistralai/Mistral-7B-Instruct-v0.3', 'name': 'mistral_4'}
training
mistral_4started training


Adding EOS to train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: Dali-bot/trackio-dataset
* Found existing space: https://huggingface.co/spaces/Dali-bot/trackio
* View dashboard by going to: https://Dali-bot-trackio.hf.space/


* Created new run: Dali-bot-1762784594
[Resource Log | Step 1] GPU Memory Used: 15.72 GB / 40.00 GB, CPU Usage: 14.6%, RAM Usage: 5.39 GB / 83.47 GB


Step,Training Loss
10,1.5792
20,1.4905
30,1.474
40,1.4664
50,1.4672
60,1.459
70,1.467
80,1.4375
90,1.4371
100,1.4441


[Resource Log | Step 8] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.37 GB / 83.47 GB
[Resource Log | Step 15] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.38 GB / 83.47 GB
[Resource Log | Step 22] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.36 GB / 83.47 GB
[Resource Log | Step 29] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.36 GB / 83.47 GB
[Resource Log | Step 36] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.34 GB / 83.47 GB
[Resource Log | Step 43] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.33 GB / 83.47 GB
[Resource Log | Step 50] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.35 GB / 83.47 GB
[Resource Log | Step 57] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 5.33 GB / 83.47 GB
[Resource Log | Step 64] GPU Memory Used: 15.77 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.34 GB / 83.47 GB
[R

In [None]:


from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import evaluate
from transformers import TrainerCallback
import time
from huggingface_hub import login
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import pynvml
import psutil

# --- Hugging Face Hub authentication and dataset loading -------------------
print("login")

# NOTE(review): "YourToken" looks like a placeholder that must be replaced
# with a real access token before running — confirm, and avoid committing a
# real token into the notebook.
login(token="YourToken")

print("data is loading")

# Load full FINDSum dataset
# Both files are read with the generic "csv" builder. The training file is
# passed as a bare path and is then read back under the "train" split; the
# validation file is explicitly registered under the "validation" split name.
# The relative ./drive/MyDrive paths presumably point at a mounted Google
# Drive — verify the mount exists before running.
ds_train = load_dataset('csv', data_files='./drive/MyDrive/text_only_train_data.csv')
ds_val = load_dataset('csv', data_files={'validation':'./drive/MyDrive/text_only_val_data.csv'})

# Pull the individual splits out of the loaded dataset containers.
train = ds_train["train"]

val = ds_val["validation"]

def add_prefix(example):
    """Prepend the summarization instruction to an example's ``document``.

    The ``document`` entry of ``example`` is overwritten in place with the
    instruction followed by the original text, and the same mapping object
    is returned so the function can be used with ``Dataset.map``.
    """
    instruction = "Summarize the following document: "
    original_text = example["document"]
    example["document"] = instruction + original_text
    return example

# Apply the instruction prefix to every row of both splits.
train = train.map(add_prefix)
val = val.map(add_prefix)

print("model loading")

model_id_qwen2_5 = "Qwen/Qwen2.5-7B-Instruct"
# One entry per model to fine-tune: "id" is the identifier handed to
# from_pretrained, "name" is a short tag that later appears in the output
# directory and in the progress messages.
model_ids = [{"id": model_id_qwen2_5, "name": "qwen_4"}]

# Collects one {"model", "tokenizer", "name"} dict per entry of model_ids.
model_tokenizer_list = []

for model_id in model_ids:
    tokenizer = AutoTokenizer.from_pretrained(model_id["id"])
    # Weights are requested in bfloat16; device placement is delegated to the
    # library via device_map="auto".
    model = AutoModelForCausalLM.from_pretrained(model_id["id"], dtype=torch.bfloat16, device_map="auto")
    model_tokenizer_list.append({
        "model": model,
        "tokenizer": tokenizer,
        "name": model_id["name"]
    })
# NOTE: after the loop, `tokenizer`, `model` and `model_id` remain bound to
# the last entry processed; compute_metrics reads `tokenizer` as a global and
# the training section prints `model_id`.

print("Loading ROUGE metric...")
# Metric object used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE-1/2/L between decoded predictions and decoded labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. ``predictions`` may be
            token IDs, logits with a trailing vocabulary axis, or a tuple
            whose first element is one of those. ``labels`` are token IDs in
            which ignored positions are marked with ``-100``.

    Returns:
        dict: The ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` scores.

    Notes:
        Decoding uses the module-level ``tokenizer`` and scoring uses the
        module-level ``rouge`` metric.
    """
    import numpy as np  # local import keeps the function self-contained

    predictions, labels = eval_preds

    # When several outputs are returned, the logits/IDs are the first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits of shape (batch, sequence, vocab) cannot be decoded directly;
    # reduce them to the most likely token ID per position first.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is not a valid vocabulary index, so swap it for a real token that
    # skip_special_tokens=True removes again during decoding.
    fill_id = tokenizer.pad_token_id
    if fill_id is None:
        fill_id = tokenizer.eos_token_id
    if fill_id is None:
        fill_id = 0
    predictions = np.where(predictions == -100, fill_id, predictions)
    labels = np.where(labels == -100, fill_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Run rouge evaluation on decoded text
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print("ROUGE Log", result)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }
class ResourceLoggingCallback(TrainerCallback):
    """Trainer callback that periodically prints GPU, CPU and RAM usage.

    A resource line is printed from ``on_step_end`` whenever at least
    ``log_interval`` seconds have passed since the previous line, and the
    accumulated FLOPs counter is printed once training ends.

    Args:
        log_interval: Minimum number of seconds between two resource lines.
        gpu_index: NVML index of the GPU whose memory is reported. Defaults
            to 0, which matches the previous hard-coded behaviour.
    """

    def __init__(self, log_interval=60, gpu_index=0):
        super().__init__()
        self.log_interval = log_interval  # seconds
        # A monotonic clock is used for interval measurement so that system
        # clock adjustments cannot suppress or duplicate log lines.
        self.last_log_time = time.monotonic()
        nvmlInit()
        self.handle = nvmlDeviceGetHandleByIndex(gpu_index)

    def on_step_end(self, args, state, control, **kwargs):
        """Print a resource usage line if the logging interval has elapsed."""
        current_time = time.monotonic()
        if current_time - self.last_log_time < self.log_interval:
            return

        # GPU memory, converted from bytes to GiB.
        mem_info = nvmlDeviceGetMemoryInfo(self.handle)
        gpu_used = mem_info.used / (1024 ** 3)
        gpu_total = mem_info.total / (1024 ** 3)

        # Host CPU utilisation and RAM usage.
        cpu_percent = psutil.cpu_percent()
        ram_info = psutil.virtual_memory()

        print(f"[Resource Log | Step {state.global_step}] "
              f"GPU Memory Used: {gpu_used:.2f} GB / {gpu_total:.2f} GB, "
              f"CPU Usage: {cpu_percent}%, "
              f"RAM Usage: {ram_info.used / (1024 ** 3):.2f} GB / {ram_info.total / (1024 ** 3):.2f} GB")

        self.last_log_time = current_time

    def on_train_end(self, args, state, control, **kwargs):
        """Print the total FLOPs recorded on the trainer state, if present."""
        total_flos = getattr(state, 'total_flos', None)
        if total_flos is not None:
            print(f"Total FLOPs during training: {total_flos}")
        else:
            print("Total FLOPs information not available.")


print(model_id)
print("training")

# LoRA adapter configuration shared by every model trained in this cell.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# NOTE(review): only the "document" column is used as training text and no
# summary/target column is referenced anywhere in this cell — confirm that
# "document" already contains the reference summary.
# NOTE(review): no evaluation schedule is configured here, so eval_dataset and
# compute_metrics presumably only take effect on an explicit
# trainer.evaluate() call — confirm against the SFTConfig defaults.
training_args = SFTConfig(
    dataset_text_field="document",
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=64,  # effective batch of 64 sequences per device
    num_train_epochs=2,
    report_to=["trackio"],
    save_strategy="epoch",  # Save checkpoint at end of each epoch
    gradient_checkpointing=True,
)

for models in model_tokenizer_list:
    print(models["name"] + " started training")
    training_args.output_dir = f"./drive/MyDrive/Results_Lora/Qwen/sft-lora-{models['name']}"

    # Train with this entry's own model and tokenizer rather than whatever
    # the loading loop left in the globals. `tokenizer` is rebound because
    # compute_metrics decodes with the module-level tokenizer.
    model = models["model"]
    tokenizer = models["tokenizer"]

    trainer = SFTTrainer(
        model=model,
        train_dataset=train,
        eval_dataset=val,
        peft_config=peft_config,
        args=training_args,
        compute_metrics=compute_metrics,
        callbacks=[ResourceLoggingCallback(log_interval=120)],
        processing_class=tokenizer,  # Pass tokenizer here, not as "tokenizer"
    )

    trainer.train()
    # NOTE: this path does not depend on the model name, so with more than one
    # entry in model_tokenizer_list each run overwrites the previous save.
    trainer.save_model("./drive/MyDrive/Results_Lora/Qwen/my_trained_qwen_model")
    trainer.save_state()
    print(models["name"] + " finished training")

login
data is loading


Map:   0%|          | 0/16820 [00:00<?, ? examples/s]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]

model loading


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Loading ROUGE metric...
{'id': 'Qwen/Qwen2.5-7B-Instruct', 'name': 'qwen_4'}
training
qwen_4started training


Adding EOS to train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


* Created new run: Dali-bot-1762796197
[Resource Log | Step 1] GPU Memory Used: 17.92 GB / 40.00 GB, CPU Usage: 5.1%, RAM Usage: 7.99 GB / 83.47 GB


Step,Training Loss
10,1.7428
20,1.6662
30,1.6271
40,1.6172
50,1.6078
60,1.6019
70,1.6008
80,1.583
90,1.5828
100,1.5797


[Resource Log | Step 8] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.98 GB / 83.47 GB
[Resource Log | Step 15] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 7.97 GB / 83.47 GB
[Resource Log | Step 22] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.97 GB / 83.47 GB
[Resource Log | Step 29] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.97 GB / 83.47 GB
[Resource Log | Step 36] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 7.97 GB / 83.47 GB
[Resource Log | Step 43] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.97 GB / 83.47 GB
[Resource Log | Step 50] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.97 GB / 83.47 GB
[Resource Log | Step 57] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 7.94 GB / 83.47 GB
[Resource Log | Step 64] GPU Memory Used: 17.96 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.95 GB / 83.47 GB
[R

In [None]:


from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import evaluate
from transformers import TrainerCallback
import time
from huggingface_hub import login
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import pynvml
import psutil

# --- Hugging Face Hub authentication and dataset loading -------------------
print("login")

# NOTE(review): "YourToken" looks like a placeholder that must be replaced
# with a real access token before running — confirm, and avoid committing a
# real token into the notebook.
login(token="YourToken")

print("data is loading")

# Load full FINDSum dataset
# Both files are read with the generic "csv" builder. The training file is
# passed as a bare path and is then read back under the "train" split; the
# validation file is explicitly registered under the "validation" split name.
# The relative ./drive/MyDrive paths presumably point at a mounted Google
# Drive — verify the mount exists before running.
ds_train = load_dataset('csv', data_files='./drive/MyDrive/text_only_train_data.csv')
ds_val = load_dataset('csv', data_files={'validation':'./drive/MyDrive/text_only_val_data.csv'})

# Pull the individual splits out of the loaded dataset containers.
train = ds_train["train"]

val = ds_val["validation"]

def add_prefix(example):
    """Prepend the summarization instruction to an example's ``document``.

    The ``document`` entry of ``example`` is overwritten in place with the
    instruction followed by the original text, and the same mapping object
    is returned so the function can be used with ``Dataset.map``.
    """
    instruction = "Summarize the following document: "
    original_text = example["document"]
    example["document"] = instruction + original_text
    return example

# Apply the instruction prefix to every row of both splits.
train = train.map(add_prefix)
val = val.map(add_prefix)

print("model loading")

model_id_cohereLabs = "CohereLabs/c4ai-command-r7b-12-2024"
# One entry per model to fine-tune: "id" is the identifier handed to
# from_pretrained, "name" is a short tag that later appears in the output
# directory and in the progress messages.
model_ids = [{"id": model_id_cohereLabs, "name": "coherelabs_4"}]

# Collects one {"model", "tokenizer", "name"} dict per entry of model_ids.
model_tokenizer_list = []

for model_id in model_ids:
    tokenizer = AutoTokenizer.from_pretrained(model_id["id"])
    # Weights are requested in bfloat16; device placement is delegated to the
    # library via device_map="auto".
    model = AutoModelForCausalLM.from_pretrained(model_id["id"], dtype=torch.bfloat16, device_map="auto")
    model_tokenizer_list.append({
        "model": model,
        "tokenizer": tokenizer,
        "name": model_id["name"]
    })
# NOTE: after the loop, `tokenizer`, `model` and `model_id` remain bound to
# the last entry processed; compute_metrics reads `tokenizer` as a global and
# the training section prints `model_id`.

print("Loading ROUGE metric...")
# Metric object used by compute_metrics.
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE-1/2/L between decoded predictions and decoded labels.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. ``predictions`` may be
            token IDs, logits with a trailing vocabulary axis, or a tuple
            whose first element is one of those. ``labels`` are token IDs in
            which ignored positions are marked with ``-100``.

    Returns:
        dict: The ``"rouge1"``, ``"rouge2"`` and ``"rougeL"`` scores.

    Notes:
        Decoding uses the module-level ``tokenizer`` and scoring uses the
        module-level ``rouge`` metric.
    """
    import numpy as np  # local import keeps the function self-contained

    predictions, labels = eval_preds

    # When several outputs are returned, the logits/IDs are the first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits of shape (batch, sequence, vocab) cannot be decoded directly;
    # reduce them to the most likely token ID per position first.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 is not a valid vocabulary index, so swap it for a real token that
    # skip_special_tokens=True removes again during decoding.
    fill_id = tokenizer.pad_token_id
    if fill_id is None:
        fill_id = tokenizer.eos_token_id
    if fill_id is None:
        fill_id = 0
    predictions = np.where(predictions == -100, fill_id, predictions)
    labels = np.where(labels == -100, fill_id, labels)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Run rouge evaluation on decoded text
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print("ROUGE Log", result)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }
class ResourceLoggingCallback(TrainerCallback):
    """Trainer callback that periodically prints GPU, CPU and RAM usage.

    A resource line is printed from ``on_step_end`` whenever at least
    ``log_interval`` seconds have passed since the previous line, and the
    accumulated FLOPs counter is printed once training ends.

    Args:
        log_interval: Minimum number of seconds between two resource lines.
        gpu_index: NVML index of the GPU whose memory is reported. Defaults
            to 0, which matches the previous hard-coded behaviour.
    """

    def __init__(self, log_interval=60, gpu_index=0):
        super().__init__()
        self.log_interval = log_interval  # seconds
        # A monotonic clock is used for interval measurement so that system
        # clock adjustments cannot suppress or duplicate log lines.
        self.last_log_time = time.monotonic()
        nvmlInit()
        self.handle = nvmlDeviceGetHandleByIndex(gpu_index)

    def on_step_end(self, args, state, control, **kwargs):
        """Print a resource usage line if the logging interval has elapsed."""
        current_time = time.monotonic()
        if current_time - self.last_log_time < self.log_interval:
            return

        # GPU memory, converted from bytes to GiB.
        mem_info = nvmlDeviceGetMemoryInfo(self.handle)
        gpu_used = mem_info.used / (1024 ** 3)
        gpu_total = mem_info.total / (1024 ** 3)

        # Host CPU utilisation and RAM usage.
        cpu_percent = psutil.cpu_percent()
        ram_info = psutil.virtual_memory()

        print(f"[Resource Log | Step {state.global_step}] "
              f"GPU Memory Used: {gpu_used:.2f} GB / {gpu_total:.2f} GB, "
              f"CPU Usage: {cpu_percent}%, "
              f"RAM Usage: {ram_info.used / (1024 ** 3):.2f} GB / {ram_info.total / (1024 ** 3):.2f} GB")

        self.last_log_time = current_time

    def on_train_end(self, args, state, control, **kwargs):
        """Print the total FLOPs recorded on the trainer state, if present."""
        total_flos = getattr(state, 'total_flos', None)
        if total_flos is not None:
            print(f"Total FLOPs during training: {total_flos}")
        else:
            print("Total FLOPs information not available.")


print(model_id)
print("training")

# LoRA adapter configuration shared by every model trained in this cell.
peft_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# NOTE(review): only the "document" column is used as training text and no
# summary/target column is referenced anywhere in this cell — confirm that
# "document" already contains the reference summary.
# NOTE(review): no evaluation schedule is configured here, so eval_dataset and
# compute_metrics presumably only take effect on an explicit
# trainer.evaluate() call — confirm against the SFTConfig defaults.
training_args = SFTConfig(
    dataset_text_field="document",
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=64,  # effective batch of 64 sequences per device
    num_train_epochs=2,
    report_to=["trackio"],
    save_strategy="epoch",  # Save checkpoint at end of each epoch
    gradient_checkpointing=True,
)

for models in model_tokenizer_list:
    print(models["name"] + " started training")
    training_args.output_dir = f"./drive/MyDrive/Results_Lora/Command/sft-lora-{models['name']}"

    # Train with this entry's own model and tokenizer rather than whatever
    # the loading loop left in the globals. `tokenizer` is rebound because
    # compute_metrics decodes with the module-level tokenizer.
    model = models["model"]
    tokenizer = models["tokenizer"]

    trainer = SFTTrainer(
        model=model,
        train_dataset=train,
        eval_dataset=val,
        peft_config=peft_config,
        args=training_args,
        compute_metrics=compute_metrics,
        callbacks=[ResourceLoggingCallback(log_interval=120)],
        processing_class=tokenizer,  # Pass tokenizer here, not as "tokenizer"
    )

    trainer.train()
    # NOTE: this path does not depend on the model name, so with more than one
    # entry in model_tokenizer_list each run overwrites the previous save.
    trainer.save_model("./drive/MyDrive/Results_Lora/Command/my_trained_command_model")
    trainer.save_state()
    print(models["name"] + " finished training")

login
data is loading


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16820 [00:00<?, ? examples/s]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]

model loading


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/45.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/178 [00:00<?, ?B/s]

Loading ROUGE metric...


Downloading builder script: 0.00B [00:00, ?B/s]

{'id': 'CohereLabs/c4ai-command-r7b-12-2024', 'name': 'coherelabs_4'}
training
coherelabs_4started training


Adding EOS to train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: Dali-bot/trackio-dataset
* Found existing space: https://huggingface.co/spaces/Dali-bot/trackio
* View dashboard by going to: https://Dali-bot-trackio.hf.space/


* Created new run: Dali-bot-1762834190
[Resource Log | Step 1] GPU Memory Used: 20.62 GB / 40.00 GB, CPU Usage: 10.8%, RAM Usage: 5.27 GB / 83.47 GB


Step,Training Loss
10,1.7859
20,1.7023
30,1.6603
40,1.6513
50,1.6469
60,1.6416
70,1.6449
80,1.6235
90,1.6219
100,1.6235


[Resource Log | Step 7] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.31 GB / 83.47 GB
[Resource Log | Step 13] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.30 GB / 83.47 GB
[Resource Log | Step 19] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.30 GB / 83.47 GB
[Resource Log | Step 25] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.30 GB / 83.47 GB
[Resource Log | Step 31] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 5.29 GB / 83.47 GB
[Resource Log | Step 37] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 5.29 GB / 83.47 GB
[Resource Log | Step 43] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.30 GB / 83.47 GB
[Resource Log | Step 49] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.29 GB / 83.47 GB
[Resource Log | Step 55] GPU Memory Used: 20.67 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.26 GB / 83.47 GB
[R