<a href="https://colab.research.google.com/github/AgaZgo/peft_fine_tuning/blob/main/flan_t5_lora_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install \
    transformers==4.31.0 \
    datasets==2.14.4 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    peft==0.4.0 \
    loralib==0.1.1 \
    accelerate==0.21.0 \
    wandb==0.15.8 \
    ray[tune]==2.6.0 \
    bayesian-optimization -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.6 MB/s[0m e

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.bayesopt import BayesOptSearch

from datetime import datetime

import numpy as np
import evaluate
import nltk
import torch
import wandb
import os

# Weights & Biases settings, read from the environment by the W&B / Trainer integration.
# 'end': upload the model as a W&B artifact once training has finished.
os.environ['WANDB_LOG_MODEL']='end'
# Name of the W&B project the runs are logged under.
os.environ['WANDB_PROJECT']='sandbox'

In [None]:
# Authenticate with Weights & Biases; prompts for an API key when none is stored yet.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Which dataset on the Hugging Face Hub to use and which FLAN-T5 checkpoint size.
DATASET_NAME = 'knkarthick/dialogsum'
MODEL_SIZE = 'base'
model_name = f"google/flan-t5-{MODEL_SIZE}"

# Dialogue-summarisation dataset (the outputs below show train / validation / test splits).
dataset = load_dataset(DATASET_NAME)

# Tokenizer and seq2seq model come from the same checkpoint; device_map='auto'
# lets the library decide where the weights are placed.
# Note: no torch_dtype is passed here; the original remark was that bf16 is
# only an option on Ampere GPUs.
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto')

def preprocess_data(examples):
    """Tokenize a batch of dialogue/summary pairs into model inputs and labels.

    Each dialogue is wrapped in a summarisation prompt and tokenized with the
    module-level ``tokenizer``; each summary is tokenized as the target.

    Args:
        examples: Batch dict with a ``'dialogue'`` and a ``'summary'`` entry,
            each a list of strings (the batched form used by ``dataset.map``).

    Returns:
        The same dict with three entries added:
        ``'input_ids'`` and ``'attention_mask'`` for the prompt, and ``'label'``
        holding the summary token ids with padding replaced by -100.
    """
    start_prompt = "Summarize the following dialogue:\n\n"
    end_prompt = "\n\nSummarize:"
    prompts = [start_prompt + dialogue + end_prompt for dialogue in examples['dialogue']]

    model_inputs = tokenizer(prompts, padding='max_length', truncation=True)
    examples['input_ids'] = model_inputs['input_ids']
    # Keep the mask next to the ids: the inputs are padded to max length, and
    # without it the pad positions would be treated as real tokens downstream.
    examples['attention_mask'] = model_inputs['attention_mask']

    target_ids = tokenizer(examples['summary'], padding='max_length', truncation=True)['input_ids']
    pad_id = tokenizer.pad_token_id
    # -100 marks padded label positions so they are excluded from the loss
    # (compute_metrics maps -100 back to the pad id before decoding).
    examples['label'] = [
        [-100 if token == pad_id else token for token in label] for label in target_ids
    ]
    return examples

# Apply preprocess_data to the dataset in batches (batched=True hands the function lists of examples).
dataset_tokenized = dataset.map(preprocess_data, batched=True)
# Switch the output format so the dataset returns torch tensors.
dataset_tokenized.set_format('torch')

Downloading readme:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/442k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
# Download NLTK's 'punkt' resource ahead of the nltk.sent_tokenize calls in compute_metrics.
nltk.download('punkt', quiet=True)

True

In [None]:
def train_model(config):
    """Run one LoRA fine-tuning trial of FLAN-T5 and return its evaluation metrics.

    Used as the Ray Tune trainable: loads a fresh base model and tokenizer,
    wraps the model with a LoRA adapter built from ``config``, trains it on the
    notebook-level ``dataset_tokenized`` and evaluates on its validation split.
    The trial is tracked as its own Weights & Biases run.

    Args:
        config: Trial hyperparameters. Keys read here are ``'lora_r'`` and
            ``'lora_alpha'`` (both truncated to ``int``) and ``'learning_rate'``.

    Returns:
        The metrics dict returned by ``Seq2SeqTrainer.evaluate()``.

    Raises:
        Exception: Anything raised while loading, training or evaluating is
            re-raised after the W&B run has been closed as failed.
    """
    MODEL_SIZE = 'base'
    model_name = f"google/flan-t5-{MODEL_SIZE}"

    # Take a single timestamp so the W&B run name and the output directory
    # always refer to the same moment.
    started_at = datetime.now()
    run_name = f"lora_{started_at.strftime('%m%d%H%M')}"
    output_dir = f"./lora_training/{started_at.strftime('%y%m%d%H%M')}"

    wandb.init(name=run_name, config=config)
    try:
        base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map='auto')
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        rouge = evaluate.load('rouge')

        def compute_metrics(eval_preds):
            """Decode generated and reference token ids and score them with ROUGE."""
            preds, labels = eval_preds

            # -100 is only a masking marker and cannot be decoded; swap it for the pad id.
            preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

            # sentences separated with '\n' as required by rougeLsum
            decoded_preds = ["\n".join(nltk.sent_tokenize(text.strip())) for text in decoded_preds]
            decoded_labels = ["\n".join(nltk.sent_tokenize(text.strip())) for text in decoded_labels]

            return rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

        # The search space samples floats, while rank and alpha are passed as integers here.
        lora_config = LoraConfig(
            r=int(config['lora_r']),
            lora_alpha=int(config['lora_alpha']),
            target_modules=['q', 'v'],
            lora_dropout=0.05,
            bias='none',
            task_type="SEQ_2_SEQ_LM",
        )
        lora_model = get_peft_model(base_model, lora_config)

        # NOTE(review): with max_steps=10 and no save_steps set, presumably no
        # checkpoint is written, leaving load_best_model_at_end nothing to restore — confirm.
        lora_training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            learning_rate=config['learning_rate'],
            max_steps=10,
            auto_find_batch_size=True,
            logging_steps=5,
            evaluation_strategy='steps',
            logging_dir=output_dir + "/logs",
            predict_with_generate=True,
            load_best_model_at_end=True,
            disable_tqdm=True,
            report_to='wandb',
            run_name=run_name,
        )

        lora_trainer = Seq2SeqTrainer(
            model=lora_model,
            args=lora_training_args,
            train_dataset=dataset_tokenized['train'],
            eval_dataset=dataset_tokenized['validation'],
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )
        lora_trainer.train()
        result = lora_trainer.evaluate()
    except Exception:
        # Close the run as failed so it does not stay open in a worker process
        # that goes on to run the next trial.
        wandb.finish(exit_code=1)
        raise

    wandb.finish()
    return result

In [None]:
# Attach the per-trial resource request (one GPU, one CPU) to the trainable.
# NOTE(review): the name is rebound, so re-running this cell wraps the previous
# wrapper again — confirm that is harmless or re-run the definition cell first.
train_model = tune.with_resources(
    trainable=train_model,
    resources={"gpu": 1, "cpu": 1},
)

In [None]:
# Hyperparameters explored per trial. lora_r and lora_alpha are sampled as
# floats (train_model casts them to int); the learning rate is held fixed.
search_space = dict(
    lora_r=tune.uniform(8, 25),
    lora_alpha=tune.uniform(1, 13),
    learning_rate=1e-3,
)

# Four trials; both the Bayesian-optimisation searcher and the ASHA scheduler
# are told to minimise the reported 'eval_loss'.
tune_config = tune.TuneConfig(
    search_alg=BayesOptSearch(mode='min', metric='eval_loss'),
    scheduler=ASHAScheduler(mode='min', metric='eval_loss'),
    num_samples=4,
)

tuner = tune.Tuner(
    trainable=train_model,
    param_space=search_space,
    tune_config=tune_config,
)

In [None]:
# Run the hyperparameter search; the returned value (a ResultGrid, per the output below) is displayed as the cell output.
tuner.fit()

2023-08-29 15:05:34,581	INFO tune.py:666 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+--------------------------------------------------------------------+
| Configuration for experiment     train_model_2023-08-29_15-05-31   |
+--------------------------------------------------------------------+
| Search algorithm                 SearchGenerator                   |
| Scheduler                        AsyncHyperBandScheduler           |
| Number of trials                 4                                 |
+--------------------------------------------------------------------+

View detailed results here: /root/ray_results/train_model_2023-08-29_15-05-31
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/train_model_2023-08-29_15-05-31`

Trial status: 1 PENDING
Current time: 2023-08-29 15:05:34. Total running time: 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-----------------------------------------------------------+
| Trial name             status       lora_r     lora_alpha |
+-----------------------------------------------



Trial train_model_10157ce5 started with configuration:
+-----------------------------------------------+
| Trial train_model_10157ce5 config             |
+-----------------------------------------------+
| learning_rate                           0.001 |
| lora_alpha                            10.3324 |
| lora_r                                12.0382 |
+-----------------------------------------------+



[2m[36m(train_model pid=7876)[0m wandb: Currently logged in as: agazgo. Use `wandb login --relogin` to force relogin
[2m[36m(train_model pid=7876)[0m wandb: wandb version 0.15.9 is available!  To upgrade, please run:
[2m[36m(train_model pid=7876)[0m wandb:  $ pip install wandb --upgrade
[2m[36m(train_model pid=7876)[0m wandb: Tracking run with wandb version 0.15.8
[2m[36m(train_model pid=7876)[0m wandb: Run data is saved locally in /root/ray_results/train_model_2023-08-29_15-05-31/train_model_10157ce5_1_learning_rate=0.0010,lora_alpha=10.3324,lora_r=12.0382_2023-08-29_15-05-34/wandb/run-20230829_150545-qy14zal0
[2m[36m(train_model pid=7876)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(train_model pid=7876)[0m wandb: Syncing run lora_08291505
[2m[36m(train_model pid=7876)[0m wandb: ⭐️ View project at https://wandb.ai/agazgo/sandbox
[2m[36m(train_model pid=7876)[0m wandb: 🚀 View run at https://wandb.ai/agazgo/sandbox/runs/qy14zal0
[2m[36m(train_

[2m[36m(train_model pid=7876)[0m {'loss': 2.0991, 'learning_rate': 0.0005, 'epoch': 0.0}
Trial status: 1 RUNNING | 1 PENDING
Current time: 2023-08-29 15:06:04. Total running time: 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-----------------------------------------------------------+
| Trial name             status       lora_r     lora_alpha |
+-----------------------------------------------------------+
| train_model_10157ce5   RUNNING     12.0382        10.3324 |
| train_model_ae1d9d5c   PENDING     24.4177        10.8913 |
+-----------------------------------------------------------+

Trial status: 1 RUNNING | 1 PENDING
Current time: 2023-08-29 15:06:34. Total running time: 1min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-----------------------------------------------------------+
| Trial name             status       lora_r     lora_alpha |
+-----------------------------------------------------------+
| train_model_10157ce5   RUNNING     12.0382        10.3324 |

[2m[36m(train_model pid=7876)[0m wandb: Waiting for W&B process to finish... (success).


[2m[36m(train_model pid=7876)[0m {'eval_loss': 1.4855414628982544, 'eval_rouge1': 0.2840209812529511, 'eval_rouge2': 0.09359461581076228, 'eval_rougeL': 0.24282252467012738, 'eval_rougeLsum': 0.26156406370659135, 'eval_runtime': 106.8238, 'eval_samples_per_second': 4.681, 'eval_steps_per_second': 0.59, 'epoch': 0.0}


[2m[36m(train_model pid=7876)[0m wandb: 
[2m[36m(train_model pid=7876)[0m wandb: Run history:
[2m[36m(train_model pid=7876)[0m wandb:                      eval/loss █▁▁
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge1 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge2 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rougeL ▁██
[2m[36m(train_model pid=7876)[0m wandb:                 eval/rougeLsum ▁██
[2m[36m(train_model pid=7876)[0m wandb:                   eval/runtime ▃█▁
[2m[36m(train_model pid=7876)[0m wandb:        eval/samples_per_second ▆▁█
[2m[36m(train_model pid=7876)[0m wandb:          eval/steps_per_second ▆▁█
[2m[36m(train_model pid=7876)[0m wandb:                    train/epoch ▁▁▁▁▁▁
[2m[36m(train_model pid=7876)[0m wandb:              train/global_step ▁▁████
[2m[36m(train_model pid=7876)[0m wandb:            train/learning_rate █▁
[2m[36m(train_model pid=7876)[0m w

Trial train_model_10157ce5 completed after 1 iterations at 2023-08-29 15:11:33. Total running time: 5min 59s
+-----------------------------------------------+
| Trial train_model_10157ce5 result             |
+-----------------------------------------------+
| time_this_iter_s                      348.983 |
| time_total_s                          348.983 |
| training_iteration                          1 |
| epoch                                       0 |
| eval_loss                             1.48554 |
| eval_rouge1                           0.28402 |
| eval_rouge2                           0.09359 |
| eval_rougeL                           0.24282 |
| eval_rougeLsum                        0.26156 |
| eval_runtime                          106.824 |
| eval_samples_per_second                 4.681 |
| eval_steps_per_second                    0.59 |
+-----------------------------------------------+

Trial train_model_ae1d9d5c started with configuration:
+----------------------------------

[2m[36m(train_model pid=7876)[0m wandb: - Waiting for wandb.init()...
[2m[36m(train_model pid=7876)[0m wandb: \ Waiting for wandb.init()...


Trial status: 1 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2023-08-29 15:11:35. Total running time: 6min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status         lora_r     lora_alpha     iter     total time (s)     eval_loss     eval_rouge1     eval_rouge2     eval_rougeL |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_ae1d9d5c   RUNNING       24.4177        10.8913                                                                                           |
| train_model_10157ce5   TERMINATED    12.0382        10.3324        1            348.983       1.48554        0.284021       0.0935946        0.242823 |
| train_model_939fcb6d   PENDING       15.7086        12.6712        

[2m[36m(train_model pid=7876)[0m wandb: wandb version 0.15.9 is available!  To upgrade, please run:
[2m[36m(train_model pid=7876)[0m wandb:  $ pip install wandb --upgrade
[2m[36m(train_model pid=7876)[0m wandb: Tracking run with wandb version 0.15.8
[2m[36m(train_model pid=7876)[0m wandb: Run data is saved locally in /root/ray_results/train_model_2023-08-29_15-05-31/train_model_ae1d9d5c_2_learning_rate=0.0010,lora_alpha=10.8913,lora_r=24.4177_2023-08-29_15-05-44/wandb/run-20230829_151133-xrd9jq0r
[2m[36m(train_model pid=7876)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(train_model pid=7876)[0m wandb: Syncing run lora_08291511
[2m[36m(train_model pid=7876)[0m wandb: ⭐️ View project at https://wandb.ai/agazgo/sandbox
[2m[36m(train_model pid=7876)[0m wandb: 🚀 View run at https://wandb.ai/agazgo/sandbox/runs/xrd9jq0r
[2m[36m(train_model pid=7876)[0m Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/

[2m[36m(train_model pid=7876)[0m {'loss': 2.1007, 'learning_rate': 0.0005, 'epoch': 0.0}
Trial status: 1 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2023-08-29 15:12:05. Total running time: 6min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status         lora_r     lora_alpha     iter     total time (s)     eval_loss     eval_rouge1     eval_rouge2     eval_rougeL |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_ae1d9d5c   RUNNING       24.4177        10.8913                                                                                           |
| train_model_10157ce5   TERMINATED    12.0382        10.3324        1            348.983       1.48554        0.284021       0.09

[2m[36m(train_model pid=7876)[0m wandb: Waiting for W&B process to finish... (success).


[2m[36m(train_model pid=7876)[0m {'eval_loss': 1.4840362071990967, 'eval_rouge1': 0.2825821259708068, 'eval_rouge2': 0.09470185046321697, 'eval_rougeL': 0.24406965294983257, 'eval_rougeLsum': 0.26164535710503106, 'eval_runtime': 106.9606, 'eval_samples_per_second': 4.675, 'eval_steps_per_second': 0.589, 'epoch': 0.0}


[2m[36m(train_model pid=7876)[0m wandb: - 12.496 MB of 12.496 MB uploaded (0.000 MB deduped)
[2m[36m(train_model pid=7876)[0m wandb: \ 12.496 MB of 12.496 MB uploaded (0.000 MB deduped)
[2m[36m(train_model pid=7876)[0m wandb: | 12.497 MB of 12.517 MB uploaded (0.000 MB deduped)
[2m[36m(train_model pid=7876)[0m wandb: / 12.497 MB of 12.517 MB uploaded (0.000 MB deduped)
[2m[36m(train_model pid=7876)[0m wandb: 
[2m[36m(train_model pid=7876)[0m wandb: Run history:
[2m[36m(train_model pid=7876)[0m wandb:                      eval/loss █▁▁
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge1 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge2 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rougeL ▁██
[2m[36m(train_model pid=7876)[0m wandb:                 eval/rougeLsum ▁██
[2m[36m(train_model pid=7876)[0m wandb:                   eval/runtime █▇▁
[2m[36m(train_model pid=7876)[0m wandb:      

Trial train_model_ae1d9d5c completed after 1 iterations at 2023-08-29 15:17:21. Total running time: 11min 47s
+-----------------------------------------------+
| Trial train_model_ae1d9d5c result             |
+-----------------------------------------------+
| time_this_iter_s                      347.913 |
| time_total_s                          347.913 |
| training_iteration                          1 |
| epoch                                       0 |
| eval_loss                             1.48404 |
| eval_rouge1                           0.28258 |
| eval_rouge2                            0.0947 |
| eval_rougeL                           0.24407 |
| eval_rougeLsum                        0.26165 |
| eval_runtime                          106.961 |
| eval_samples_per_second                 4.675 |
| eval_steps_per_second                   0.589 |
+-----------------------------------------------+

Trial train_model_939fcb6d started with configuration:
+---------------------------------

[2m[36m(train_model pid=7876)[0m wandb: - Waiting for wandb.init()...
[2m[36m(train_model pid=7876)[0m wandb: \ Waiting for wandb.init()...
[2m[36m(train_model pid=7876)[0m wandb: wandb version 0.15.9 is available!  To upgrade, please run:
[2m[36m(train_model pid=7876)[0m wandb:  $ pip install wandb --upgrade
[2m[36m(train_model pid=7876)[0m wandb: Tracking run with wandb version 0.15.8
[2m[36m(train_model pid=7876)[0m wandb: Run data is saved locally in /root/ray_results/train_model_2023-08-29_15-05-31/train_model_939fcb6d_3_learning_rate=0.0010,lora_alpha=12.6712,lora_r=15.7086_2023-08-29_15-11-33/wandb/run-20230829_151721-0vd16lvn
[2m[36m(train_model pid=7876)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(train_model pid=7876)[0m wandb: Syncing run lora_08291517
[2m[36m(train_model pid=7876)[0m wandb: ⭐️ View project at https://wandb.ai/agazgo/sandbox
[2m[36m(train_model pid=7876)[0m wandb: 🚀 View run at https://wandb.ai/agazgo/sandbox/runs

[2m[36m(train_model pid=7876)[0m {'loss': 2.0826, 'learning_rate': 0.0005, 'epoch': 0.0}
Trial status: 2 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2023-08-29 15:17:35. Total running time: 12min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status         lora_r     lora_alpha     iter     total time (s)     eval_loss     eval_rouge1     eval_rouge2     eval_rougeL |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_939fcb6d   RUNNING       15.7086       12.6712                                                                                            |
| train_model_10157ce5   TERMINATED    12.0382       10.3324         1            348.983       1.48554        0.284021       0.09

[2m[36m(train_model pid=7876)[0m wandb: Waiting for W&B process to finish... (success).


[2m[36m(train_model pid=7876)[0m {'eval_loss': 1.4557255506515503, 'eval_rouge1': 0.2923953596483436, 'eval_rouge2': 0.09815064803369242, 'eval_rougeL': 0.25109032452452756, 'eval_rougeLsum': 0.26871385753291366, 'eval_runtime': 106.3196, 'eval_samples_per_second': 4.703, 'eval_steps_per_second': 0.593, 'epoch': 0.0}
Trial status: 2 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2023-08-29 15:23:05. Total running time: 17min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status         lora_r     lora_alpha     iter     total time (s)     eval_loss     eval_rouge1     eval_rouge2     eval_rougeL |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_939fcb6d   RUNNING       15.7086       

[2m[36m(train_model pid=7876)[0m wandb: 
[2m[36m(train_model pid=7876)[0m wandb: Run history:
[2m[36m(train_model pid=7876)[0m wandb:                      eval/loss █▁▁
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge1 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge2 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rougeL ▁██
[2m[36m(train_model pid=7876)[0m wandb:                 eval/rougeLsum ▁██
[2m[36m(train_model pid=7876)[0m wandb:                   eval/runtime ▇█▁
[2m[36m(train_model pid=7876)[0m wandb:        eval/samples_per_second ▂▁█
[2m[36m(train_model pid=7876)[0m wandb:          eval/steps_per_second ▂▁█
[2m[36m(train_model pid=7876)[0m wandb:                    train/epoch ▁▁▁▁▁▁
[2m[36m(train_model pid=7876)[0m wandb:              train/global_step ▁▁████
[2m[36m(train_model pid=7876)[0m wandb:            train/learning_rate █▁
[2m[36m(train_model pid=7876)[0m w

Trial train_model_939fcb6d completed after 1 iterations at 2023-08-29 15:23:09. Total running time: 17min 35s
+-----------------------------------------------+
| Trial train_model_939fcb6d result             |
+-----------------------------------------------+
| time_this_iter_s                      348.199 |
| time_total_s                          348.199 |
| training_iteration                          1 |
| epoch                                       0 |
| eval_loss                             1.45573 |
| eval_rouge1                            0.2924 |
| eval_rouge2                           0.09815 |
| eval_rougeL                           0.25109 |
| eval_rougeLsum                        0.26871 |
| eval_runtime                           106.32 |
| eval_samples_per_second                 4.703 |
| eval_steps_per_second                   0.593 |
+-----------------------------------------------+

Trial train_model_0d0e1d10 started with configuration:
+---------------------------------

[2m[36m(train_model pid=7876)[0m wandb: - Waiting for wandb.init()...
[2m[36m(train_model pid=7876)[0m wandb: \ Waiting for wandb.init()...
[2m[36m(train_model pid=7876)[0m wandb: wandb version 0.15.9 is available!  To upgrade, please run:
[2m[36m(train_model pid=7876)[0m wandb:  $ pip install wandb --upgrade
[2m[36m(train_model pid=7876)[0m wandb: Tracking run with wandb version 0.15.8
[2m[36m(train_model pid=7876)[0m wandb: Run data is saved locally in /root/ray_results/train_model_2023-08-29_15-05-31/train_model_0d0e1d10_4_learning_rate=0.0010,lora_alpha=8.3085,lora_r=21.1840_2023-08-29_15-17-21/wandb/run-20230829_152310-h0v2d35e
[2m[36m(train_model pid=7876)[0m wandb: Run `wandb offline` to turn off syncing.
[2m[36m(train_model pid=7876)[0m wandb: Syncing run lora_08291523
[2m[36m(train_model pid=7876)[0m wandb: ⭐️ View project at https://wandb.ai/agazgo/sandbox
[2m[36m(train_model pid=7876)[0m wandb: 🚀 View run at https://wandb.ai/agazgo/sandbox/runs/

[2m[36m(train_model pid=7876)[0m {'loss': 2.1134, 'learning_rate': 0.0005, 'epoch': 0.0}
Trial status: 3 TERMINATED | 1 RUNNING
Current time: 2023-08-29 15:23:36. Total running time: 18min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name             status         lora_r     lora_alpha     iter     total time (s)     eval_loss     eval_rouge1     eval_rouge2     eval_rougeL |
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| train_model_0d0e1d10   RUNNING       21.184         8.30851                                                                                           |
| train_model_10157ce5   TERMINATED    12.0382       10.3324         1            348.983       1.48554        0.284021       0.0935946       

[2m[36m(train_model pid=7876)[0m wandb: Waiting for W&B process to finish... (success).


[2m[36m(train_model pid=7876)[0m {'eval_loss': 1.5143159627914429, 'eval_rouge1': 0.2647286761954122, 'eval_rouge2': 0.0836573576250884, 'eval_rougeL': 0.22919919887289258, 'eval_rougeLsum': 0.24345419741749189, 'eval_runtime': 107.1695, 'eval_samples_per_second': 4.666, 'eval_steps_per_second': 0.588, 'epoch': 0.0}


[2m[36m(train_model pid=7876)[0m wandb: - 11.230 MB of 11.230 MB uploaded (0.000 MB deduped)
[2m[36m(train_model pid=7876)[0m wandb: \ 11.230 MB of 11.230 MB uploaded (0.000 MB deduped)
[2m[36m(train_model pid=7876)[0m wandb: | 11.241 MB of 11.251 MB uploaded (0.000 MB deduped)
[2m[36m(train_model pid=7876)[0m wandb: / 11.241 MB of 11.251 MB uploaded (0.000 MB deduped)


Trial train_model_0d0e1d10 completed after 1 iterations at 2023-08-29 15:28:57. Total running time: 23min 22s
+-----------------------------------------------+
| Trial train_model_0d0e1d10 result             |
+-----------------------------------------------+
| time_this_iter_s                      347.178 |
| time_total_s                          347.178 |
| training_iteration                          1 |
| epoch                                       0 |
| eval_loss                             1.51432 |
| eval_rouge1                           0.26473 |
| eval_rouge2                           0.08366 |
| eval_rougeL                            0.2292 |
| eval_rougeLsum                        0.24345 |
| eval_runtime                          107.169 |
| eval_samples_per_second                 4.666 |
| eval_steps_per_second                   0.588 |
+-----------------------------------------------+

Trial status: 4 TERMINATED
Current time: 2023-08-29 15:28:57. Total running time: 23min 2

[2m[36m(train_model pid=7876)[0m wandb: 
[2m[36m(train_model pid=7876)[0m wandb: Run history:
[2m[36m(train_model pid=7876)[0m wandb:                      eval/loss █▁▁
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge1 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rouge2 ▁██
[2m[36m(train_model pid=7876)[0m wandb:                    eval/rougeL ▁██
[2m[36m(train_model pid=7876)[0m wandb:                 eval/rougeLsum ▁██
[2m[36m(train_model pid=7876)[0m wandb:                   eval/runtime ▃█▁
[2m[36m(train_model pid=7876)[0m wandb:        eval/samples_per_second ▆▁█
[2m[36m(train_model pid=7876)[0m wandb:          eval/steps_per_second ▆▁█
[2m[36m(train_model pid=7876)[0m wandb:                    train/epoch ▁▁▁▁▁▁
[2m[36m(train_model pid=7876)[0m wandb:              train/global_step ▁▁████
[2m[36m(train_model pid=7876)[0m wandb:            train/learning_rate █▁
[2m[36m(train_model pid=7876)[0m w

ResultGrid<[
  Result(
    metrics={'eval_loss': 1.4855414628982544, 'eval_rouge1': 0.2840209812529511, 'eval_rouge2': 0.09359461581076228, 'eval_rougeL': 0.24282252467012738, 'eval_rougeLsum': 0.26156406370659135, 'eval_runtime': 106.8238, 'eval_samples_per_second': 4.681, 'eval_steps_per_second': 0.59, 'epoch': 0.0, 'done': True, 'trial_id': '10157ce5', 'experiment_tag': '1_learning_rate=0.0010,lora_alpha=10.3324,lora_r=12.0382'},
    path='/root/ray_results/train_model_2023-08-29_15-05-31/train_model_10157ce5_1_learning_rate=0.0010,lora_alpha=10.3324,lora_r=12.0382_2023-08-29_15-05-34',
    checkpoint=None
  ),
  Result(
    metrics={'eval_loss': 1.4840362071990967, 'eval_rouge1': 0.2825821259708068, 'eval_rouge2': 0.09470185046321697, 'eval_rougeL': 0.24406965294983257, 'eval_rougeLsum': 0.26164535710503106, 'eval_runtime': 106.9606, 'eval_samples_per_second': 4.675, 'eval_steps_per_second': 0.589, 'epoch': 0.0, 'done': True, 'trial_id': 'ae1d9d5c', 'experiment_tag': '2_learning_ra