In [None]:
!pip install datasets transformers trl evaluate huggingface_hub pynvml psutil rouge_score trackio



Collecting trl
  Downloading trl-0.25.0-py3-none-any.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trackio
  Downloading trackio-0.8.1-py3-none-any.whl.metadata (8.4 kB)
Collecting plotly<7.0.0,>=6.0.0 (from trackio)
  Downloading plotly-6.4.0-py3-none-any.whl.metadata (8.5 kB)
Downloading trl-0.25.0-py3-none-any.whl (462 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trackio-0.8.1-py3-none-any.whl (874 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.0/875.0 kB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[?

In [None]:
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import evaluate
from transformers import TrainerCallback
import time
from huggingface_hub import login
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import pynvml
import psutil
from peft import AdaLoraConfig, get_peft_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

torch.cuda.empty_cache()
print("login")

login(token="YourToken")

print("data is loading")

ds_train = load_dataset('csv', data_files='./drive/MyDrive/text_only_train_data.csv')
ds_val = load_dataset('csv', data_files={'validation':'./drive/MyDrive/text_only_val_data.csv'})

train = ds_train["train"]

val = ds_val["validation"]

# Optionally add prefix on the validation set for instruction tuning
def add_prefix(example):
    prefix = "Summarize the following document: "
    example["document"] = prefix + example["document"]
    return example

train = train.map(add_prefix)
val = val.map(add_prefix)

print("model loading")

model_id = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")

print("Loading ROUGE metric...")
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    # decode token IDs into strings if needed (tokenizer needed here)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Run rouge evaluation on decoded text
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print("ROUGE Log", result)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }
class ResourceLoggingCallback(TrainerCallback):
    def __init__(self, log_interval=60):
        super().__init__()
        self.log_interval = log_interval  # seconds
        self.last_log_time = time.time()
        nvmlInit()
        self.handle = nvmlDeviceGetHandleByIndex(0)  # GPU 0

    def on_step_end(self, args, state, control, **kwargs):
        current_time = time.time()
        if current_time - self.last_log_time >= self.log_interval:
            # Log GPU memory
            mem_info = nvmlDeviceGetMemoryInfo(self.handle)
            gpu_used = mem_info.used / (1024 ** 3)
            gpu_total = mem_info.total / (1024 ** 3)

            # Log CPU/RAM usage
            cpu_percent = psutil.cpu_percent()
            ram_info = psutil.virtual_memory()

            print(f"[Resource Log | Step {state.global_step}] "
                  f"GPU Memory Used: {gpu_used:.2f} GB / {gpu_total:.2f} GB, "
                  f"CPU Usage: {cpu_percent}%, "
                  f"RAM Usage: {ram_info.used / (1024 ** 3):.2f} GB / {ram_info.total / (1024 ** 3):.2f} GB")

            self.last_log_time = current_time

    def on_train_end(self, args, state, control, **kwargs):
        total_flos = getattr(state, 'total_flos', None)
        if total_flos is not None:
            print(f"Total FLOPs during training: {total_flos}")
        else:
            print("Total FLOPs information not available.")

print(model_id)
print("training")
adalora_config = AdaLoraConfig(
    init_r=64,
    target_r = 32,# rank for LoRA matrices
    target_modules=["q_proj", "v_proj"],  # modules to adapt
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    total_step=471
)

# Wrap the base model with AdaLoRA PEFT model
ada_lora_model = get_peft_model(model, adalora_config)

training_args = SFTConfig(
    dataset_text_field="document",
    learning_rate=2e-4,
    per_device_train_batch_size=1, # Further reduced batch size
    per_device_eval_batch_size=1, # Further reduced eval batch size
    gradient_accumulation_steps=64, # Increased gradient accumulation steps
    num_train_epochs=2,
    report_to=["trackio"],  # Add this to save checkpoints
    save_strategy="epoch",
    output_dir = "./drive/MyDrive/Results_AdaLora/Mistral/sft-mistral-ada-lora",  # Save checkpoint at end of each epoch
    gradient_checkpointing=True # Enable gradient checkpointing
)

trainer = SFTTrainer(
        model=ada_lora_model,  # reload or clone model if needed
        train_dataset=train,
        eval_dataset=val,
        args=training_args,
        compute_metrics=compute_metrics,
        callbacks=[ResourceLoggingCallback(log_interval=120)],
        processing_class=tokenizer,
    )
trainer.train()
trainer.save_model("./drive/MyDrive/Results_AdaLora/Mistral/my_trained_model")
trainer.save_state()

login
data is loading


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16820 [00:00<?, ? examples/s]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]

model loading


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Loading ROUGE metric...


Downloading builder script: 0.00B [00:00, ?B/s]

mistralai/Mistral-7B-Instruct-v0.3
training


Adding EOS to train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: Dali-bot/trackio-dataset
* Found existing space: https://huggingface.co/spaces/Dali-bot/trackio
* View dashboard by going to: https://Dali-bot-trackio.hf.space/


* Created new run: Dali-bot-1762764337
[Resource Log | Step 1] GPU Memory Used: 15.75 GB / 40.00 GB, CPU Usage: 6.1%, RAM Usage: 5.46 GB / 83.47 GB


Step,Training Loss
10,144.6066
20,95.6056
30,52.9004
40,29.0399
50,21.8623
60,13.0173
70,5.5984
80,4.1084
90,3.0012
100,2.7252


[Resource Log | Step 7] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 5.47 GB / 83.47 GB
[Resource Log | Step 13] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.46 GB / 83.47 GB
[Resource Log | Step 19] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.45 GB / 83.47 GB
[Resource Log | Step 25] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.45 GB / 83.47 GB
[Resource Log | Step 31] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.44 GB / 83.47 GB
[Resource Log | Step 37] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.42 GB / 83.47 GB
[Resource Log | Step 43] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.43 GB / 83.47 GB
[Resource Log | Step 49] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.44 GB / 83.47 GB
[Resource Log | Step 55] GPU Memory Used: 15.85 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.43 GB / 83.47 GB
[R

In [None]:
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import evaluate
from transformers import TrainerCallback
import time
from huggingface_hub import login
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import pynvml
import psutil
from peft import AdaLoraConfig, get_peft_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

torch.cuda.empty_cache()
print("login")

login(token="YourToken")

print("data is loading")

ds_train = load_dataset('csv', data_files='./drive/MyDrive/text_only_train_data.csv')
ds_val = load_dataset('csv', data_files={'validation':'./drive/MyDrive/text_only_val_data.csv'})

train = ds_train["train"]

val = ds_val["validation"]

# Optionally add prefix on the validation set for instruction tuning
def add_prefix(example):
    prefix = "Summarize the following document: "
    example["document"] = prefix + example["document"]
    return example

train = train.map(add_prefix)
val = val.map(add_prefix)

print("model loading")

model_id = "CohereLabs/c4ai-command-r7b-12-2024"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")

print("Loading ROUGE metric...")
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    # decode token IDs into strings if needed (tokenizer needed here)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Run rouge evaluation on decoded text
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print("ROUGE Log", result)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }
class ResourceLoggingCallback(TrainerCallback):
    def __init__(self, log_interval=60):
        super().__init__()
        self.log_interval = log_interval  # seconds
        self.last_log_time = time.time()
        nvmlInit()
        self.handle = nvmlDeviceGetHandleByIndex(0)  # GPU 0

    def on_step_end(self, args, state, control, **kwargs):
        current_time = time.time()
        if current_time - self.last_log_time >= self.log_interval:
            # Log GPU memory
            mem_info = nvmlDeviceGetMemoryInfo(self.handle)
            gpu_used = mem_info.used / (1024 ** 3)
            gpu_total = mem_info.total / (1024 ** 3)

            # Log CPU/RAM usage
            cpu_percent = psutil.cpu_percent()
            ram_info = psutil.virtual_memory()

            print(f"[Resource Log | Step {state.global_step}] "
                  f"GPU Memory Used: {gpu_used:.2f} GB / {gpu_total:.2f} GB, "
                  f"CPU Usage: {cpu_percent}%, "
                  f"RAM Usage: {ram_info.used / (1024 ** 3):.2f} GB / {ram_info.total / (1024 ** 3):.2f} GB")

            self.last_log_time = current_time

    def on_train_end(self, args, state, control, **kwargs):
        total_flos = getattr(state, 'total_flos', None)
        if total_flos is not None:
            print(f"Total FLOPs during training: {total_flos}")
        else:
            print("Total FLOPs information not available.")

print(model_id)
print("training")
adalora_config = AdaLoraConfig(
    init_r=64,
    target_r = 32,# rank for LoRA matrices
    target_modules=["q_proj", "v_proj"],  # modules to adapt
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    total_step=471
)

# Wrap the base model with AdaLoRA PEFT model
ada_lora_model = get_peft_model(model, adalora_config)

training_args = SFTConfig(
    dataset_text_field="document",
    learning_rate=2e-4,
    per_device_train_batch_size=1, # Further reduced batch size
    per_device_eval_batch_size=1, # Further reduced eval batch size
    gradient_accumulation_steps=64, # Increased gradient accumulation steps
    num_train_epochs=2,
    report_to=["trackio"],  # Add this to save checkpoints
    save_strategy="epoch",
    output_dir = "./drive/MyDrive/Results_AdaLora/Command/sft-command-ada-lora",  # Save checkpoint at end of each epoch
    gradient_checkpointing=True # Enable gradient checkpointing
)

trainer = SFTTrainer(
        model=ada_lora_model,  # reload or clone model if needed
        train_dataset=train,
        eval_dataset=val,
        args=training_args,
        compute_metrics=compute_metrics,
        callbacks=[ResourceLoggingCallback(log_interval=120)],
        processing_class=tokenizer,
    )
trainer.train()
trainer.save_model("./drive/MyDrive/Results_AdaLora/Command/my_trained_command_model")
trainer.save_state()

login
data is loading


Map:   0%|          | 0/16820 [00:00<?, ? examples/s]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]

model loading


tokenizer_config.json:   0%|          | 0.00/45.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/178 [00:00<?, ?B/s]

Loading ROUGE metric...
CohereLabs/c4ai-command-r7b-12-2024
training


Adding EOS to train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


* Created new run: Dali-bot-1762776522
[Resource Log | Step 1] GPU Memory Used: 20.65 GB / 40.00 GB, CPU Usage: 13.8%, RAM Usage: 5.72 GB / 83.47 GB


Step,Training Loss
10,144.7676
20,95.7743
30,53.0629
40,29.2067
50,22.0321
60,13.2084
70,5.8044
80,4.3077
90,3.2087
100,2.9371


[Resource Log | Step 6] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 5.76 GB / 83.47 GB
[Resource Log | Step 11] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 5.73 GB / 83.47 GB
[Resource Log | Step 16] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.74 GB / 83.47 GB
[Resource Log | Step 21] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.74 GB / 83.47 GB
[Resource Log | Step 26] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.74 GB / 83.47 GB
[Resource Log | Step 31] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.73 GB / 83.47 GB
[Resource Log | Step 36] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.5%, RAM Usage: 5.74 GB / 83.47 GB
[Resource Log | Step 41] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 5.71 GB / 83.47 GB
[Resource Log | Step 46] GPU Memory Used: 20.75 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 5.72 GB / 83.47 GB
[R

In [None]:
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import evaluate
from transformers import TrainerCallback
import time
from huggingface_hub import login
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
import pynvml
import psutil
from peft import AdaLoraConfig, get_peft_model
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

torch.cuda.empty_cache()
print("login")

login(token="YourToken")

print("data is loading")

ds_train = load_dataset('csv', data_files='./drive/MyDrive/text_only_train_data.csv')
ds_val = load_dataset('csv', data_files={'validation':'./drive/MyDrive/text_only_val_data.csv'})

train = ds_train["train"]

val = ds_val["validation"]

# Optionally add prefix on the validation set for instruction tuning
def add_prefix(example):
    prefix = "Summarize the following document: "
    example["document"] = prefix + example["document"]
    return example

train = train.map(add_prefix)
val = val.map(add_prefix)

print("model loading")

model_id = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")

print("Loading ROUGE metric...")
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    predictions, labels = eval_preds
    # decode token IDs into strings if needed (tokenizer needed here)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Run rouge evaluation on decoded text
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    print("ROUGE Log", result)
    return {
        "rouge1": result["rouge1"],
        "rouge2": result["rouge2"],
        "rougeL": result["rougeL"]
    }
class ResourceLoggingCallback(TrainerCallback):
    def __init__(self, log_interval=60):
        super().__init__()
        self.log_interval = log_interval  # seconds
        self.last_log_time = time.time()
        nvmlInit()
        self.handle = nvmlDeviceGetHandleByIndex(0)  # GPU 0

    def on_step_end(self, args, state, control, **kwargs):
        current_time = time.time()
        if current_time - self.last_log_time >= self.log_interval:
            # Log GPU memory
            mem_info = nvmlDeviceGetMemoryInfo(self.handle)
            gpu_used = mem_info.used / (1024 ** 3)
            gpu_total = mem_info.total / (1024 ** 3)

            # Log CPU/RAM usage
            cpu_percent = psutil.cpu_percent()
            ram_info = psutil.virtual_memory()

            print(f"[Resource Log | Step {state.global_step}] "
                  f"GPU Memory Used: {gpu_used:.2f} GB / {gpu_total:.2f} GB, "
                  f"CPU Usage: {cpu_percent}%, "
                  f"RAM Usage: {ram_info.used / (1024 ** 3):.2f} GB / {ram_info.total / (1024 ** 3):.2f} GB")

            self.last_log_time = current_time

    def on_train_end(self, args, state, control, **kwargs):
        total_flos = getattr(state, 'total_flos', None)
        if total_flos is not None:
            print(f"Total FLOPs during training: {total_flos}")
        else:
            print("Total FLOPs information not available.")

print(model_id)
print("training")
adalora_config = AdaLoraConfig(
    init_r=64,
    target_r = 32,# rank for LoRA matrices
    target_modules=["q_proj", "v_proj"],  # modules to adapt
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    total_step=471
)

# Wrap the base model with AdaLoRA PEFT model
ada_lora_model = get_peft_model(model, adalora_config)

training_args = SFTConfig(
    dataset_text_field="document",
    learning_rate=2e-4,
    per_device_train_batch_size=1, # Further reduced batch size
    per_device_eval_batch_size=1, # Further reduced eval batch size
    gradient_accumulation_steps=64, # Increased gradient accumulation steps
    num_train_epochs=2,
    report_to=["trackio"],  # Add this to save checkpoints
    save_strategy="epoch",
    output_dir = "./drive/MyDrive/Results_AdaLora/Qwen/sft-qwen-ada-lora",  # Save checkpoint at end of each epoch
    gradient_checkpointing=True # Enable gradient checkpointing
)

trainer = SFTTrainer(
        model=ada_lora_model,  # reload or clone model if needed
        train_dataset=train,
        eval_dataset=val,
        args=training_args,
        compute_metrics=compute_metrics,
        callbacks=[ResourceLoggingCallback(log_interval=120)],
        processing_class=tokenizer,
    )
trainer.train()
trainer.save_model("./drive/MyDrive/Results_AdaLora/Qwen/my_trained_qwen_model")
trainer.save_state()

login
data is loading


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16820 [00:00<?, ? examples/s]

Map:   0%|          | 0/2102 [00:00<?, ? examples/s]

model loading


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Loading ROUGE metric...


Downloading builder script: 0.00B [00:00, ?B/s]

Qwen/Qwen2.5-7B-Instruct
training


Adding EOS to train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/16820 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/2102 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


* Trackio project initialized: huggingface
* Trackio metrics will be synced to Hugging Face Dataset: Dali-bot/trackio-dataset
* Found existing space: https://huggingface.co/spaces/Dali-bot/trackio
* View dashboard by going to: https://Dali-bot-trackio.hf.space/


* Created new run: Dali-bot-1762796536
[Resource Log | Step 1] GPU Memory Used: 17.92 GB / 40.00 GB, CPU Usage: 12.1%, RAM Usage: 7.78 GB / 83.47 GB


Step,Training Loss
10,123.3038
20,81.8767
30,53.669
40,50.0236
50,43.5228
60,39.1568
70,35.2217
80,31.4325
90,27.5817
100,23.6817


[Resource Log | Step 7] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.80 GB / 83.47 GB
[Resource Log | Step 13] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 7.80 GB / 83.47 GB
[Resource Log | Step 19] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.79 GB / 83.47 GB
[Resource Log | Step 25] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 7.78 GB / 83.47 GB
[Resource Log | Step 31] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.75 GB / 83.47 GB
[Resource Log | Step 37] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.6%, RAM Usage: 7.79 GB / 83.47 GB
[Resource Log | Step 43] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 7.79 GB / 83.47 GB
[Resource Log | Step 49] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.7%, RAM Usage: 7.76 GB / 83.47 GB
[Resource Log | Step 55] GPU Memory Used: 17.98 GB / 40.00 GB, CPU Usage: 10.4%, RAM Usage: 7.78 GB / 83.47 GB
[R

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
