In [None]:
# Remove any preinstalled transformers/tokenizers so the pinned versions below win.
!pip uninstall -y transformers tokenizers -q

# Fine-tuning stack, pinned to exact versions.
!pip install --no-cache-dir "transformers==4.43.3" "peft==0.11.1" "accelerate==0.30.1"
# Data loading and evaluation metrics (installed unpinned).
!pip install --no-cache-dir datasets bert-score textstat
!pip install --no-cache-dir sentencepiece
!pip install --no-cache-dir alignscore-SpeedOfMagic spacy nltk
!python -m spacy download en_core_web_sm

# NOTE(review): the recorded output of this cell ends with
# "numpy.dtype size changed, may indicate binary incompatibility" and spaCy asks
# for a runtime restart — presumably the kernel must be restarted after this
# cell before the rest of the notebook is run; confirm.
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")


Collecting transformers==4.43.3
  Downloading transformers-4.43.3-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.11.1
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Collecting accelerate==0.30.1
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.43.3)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.43.3-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m188.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m123.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
import os
# Set every environment flag in one go: the two bitsandbytes switches and the
# two variables that turn Weights & Biases off completely.
os.environ.update(
    {
        "BITSANDBYTES_NOWELCOME": "1",
        "BITSANDBYTES_DISABLE": "1",
        "WANDB_DISABLED": "true",
        "WANDB_MODE": "disabled",
    }
)

In [None]:
import os
import zipfile
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any

import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model

from bert_score import score as bert_score
import textstat
from alignscore import AlignScore


# Where the zipped corpus lives and where it is unpacked.
ZIP_PATH = "/content/training_data.zip"
ROOT_DATA_DIR = "/content/training_data"
TRAINING_DATA_PATH = "/content/training_data/training_data"

# Base checkpoint on the Hugging Face Hub and the folder that receives the LoRA adapter.
BASE_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ADAPTER_DIR = "/content/deepseek-r1-distill-qwen-1.5b-pls"

os.makedirs(ROOT_DATA_DIR, exist_ok=True)

# Unpack the archive when it is present; otherwise just report that it is missing.
if not os.path.exists(ZIP_PATH):
    print("No encontré training_data.zip")
else:
    with zipfile.ZipFile(ZIP_PATH, "r") as archive:
        archive.extractall(ROOT_DATA_DIR)
    print("Datos extraídos en:", ROOT_DATA_DIR)


# Print a shallow tree of /content (folders down to depth 2 and their files)
# to confirm where the extracted data landed.
for root, dirs, files in os.walk("/content"):
    level = root.replace("/content", "").count(os.sep)
    indent = " " * (2 * level)
    print(f"{indent}{os.path.basename(root)}")
    for f in files:
        print(f"{indent}  - {f}")
    if level >= 2:
        # Prune in place: os.walk will not descend below this folder but still
        # visits the remaining siblings. A `break` here stopped the whole
        # listing at the first depth-2 folder, hiding every later directory.
        dirs[:] = []

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)


Datos extraídos en: /content/training_data
content
  - training_data.zip
  .config
    - .last_opt_in_prompt.yaml
    - .last_survey_prompt.yaml
    - .last_update_check.json
    - hidden_gcloud_config_universe_descriptor_data_cache_configs.db
    - config_sentinel
    - active_config
    - gce
    - default_configs.db
    configurations
      - config_default
Device: cuda


In [None]:
@dataclass
class PromptPair:
    """One medical text together with its plain-language counterpart.

    ``load_cochrane_pairs`` fills ``medical``, ``plain`` and ``pair_id`` and
    passes ``context=None``; ``flesch_score`` keeps its default there.
    """

    # Source text (rows labelled "no_pls" in load_cochrane_pairs).
    medical: str
    # Target plain-language text (rows labelled "pls" in load_cochrane_pairs).
    plain: str
    # Optional extra context; always None in the loader visible in this notebook.
    context: str | None = None
    # Normalized document id the pair was grouped under.
    pair_id: str | None = None
    # Not populated by the code visible here — presumably a readability score; TODO confirm.
    flesch_score: float | None = None


def normalize_doc_id(doc_id: str) -> str:
    """Reduce a raw ``doc_id`` to the key used to group rows of one document.

    The value is stringified, everything from the first underscore onwards is
    dropped, and every ``-abstract`` / ``-pls`` occurrence is removed.
    """
    base, _, _ = str(doc_id).partition("_")
    for marker in ("-abstract", "-pls"):
        base = base.replace(marker, "")
    return base


def load_cochrane_pairs(training_data_path: str | Path):
    training_data_path = Path(training_data_path)

    print("\n=== DEBUG RUTA DE DATOS ===")
    print("training_data_path:", training_data_path)
    if training_data_path.exists():
        print("Contenido de la carpeta:", os.listdir(training_data_path))
    else:
        print("Ruta no existe")

    parquet_files = [
        "main_train.parquet",
        "main_test.parquet",
        "augmented_train.parquet",
        "augmented_test.parquet",
        "no_pls_clean.parquet",
        "pls_clean.parquet",
    ]

    all_rows = []

    print("\nLeyendo archivos y agrupando por doc_id normalizado...")
    for fname in parquet_files:
        fp = training_data_path / fname
        if not fp.exists():
            print(f" - [OMITIDO] {fname} (no existe en {training_data_path})")
            continue

        df = pd.read_parquet(fp)
        print(f" - [OK] {fname}: {df.shape[0]} filas")
        for col in ["doc_id", "text", "label"]:
            if col not in df.columns:
                raise ValueError(f"{fname} no tiene columna {col}")
        df = df[["doc_id", "text", "label"]].copy()
        df["doc_id_norm"] = df["doc_id"].apply(normalize_doc_id)
        all_rows.append(df)

    if not all_rows:
        raise RuntimeError("No se encontraron datos parquet válidos en la ruta.")

    full_df = pd.concat(all_rows, ignore_index=True)

    grouped = full_df.groupby("doc_id_norm")

    pairs: list[PromptPair] = []
    for doc_id_norm, g in grouped:
        med_candidates = g[g["label"] == "no_pls"]
        plain_candidates = g[g["label"] == "pls"]
        if med_candidates.empty or plain_candidates.empty:
            continue

        med_row = med_candidates.loc[med_candidates["text"].str.len().idxmax()]
        plain_row = plain_candidates.loc[plain_candidates["text"].str.len().idxmax()]

        pairs.append(
            PromptPair(
                medical=str(med_row["text"]),
                plain=str(plain_row["text"]),
                context=None,
                pair_id=doc_id_norm,
            )
        )

    print(f"\nTotal de doc_id (normalizados) con al menos 1 medical y 1 plain: {len(pairs)}")

    import random
    random.seed(42)
    random.shuffle(pairs)

    n_total = len(pairs)
    n_train = int(n_total * 0.70)
    n_val   = int(n_total * 0.15)
    n_test  = n_total - n_train - n_val

    train_pairs = pairs[:n_train]
    val_pairs   = pairs[n_train:n_train + n_val]
    eval_pairs  = pairs[n_train + n_val:]

    print(f"Total de pares construidos: {n_total}")
    print("Split:")
    print(f" - Train: {len(train_pairs)}")
    print(f" - Val:   {len(val_pairs)}")
    print(f" - Test:  {len(eval_pairs)}")

    return train_pairs, val_pairs, eval_pairs


# Build the three splits once and also keep one flat list holding every pair.
train_pairs, val_pairs, eval_pairs = load_cochrane_pairs(TRAINING_DATA_PATH)
all_pairs = [*train_pairs, *val_pairs, *eval_pairs]
print(f"\nTotal de pares preparados para RL / LoRA: {len(all_pairs)}")



=== DEBUG RUTA DE DATOS ===
training_data_path: /content/training_data/training_data
Contenido de la carpeta: ['no_pls_clean.parquet', 'main_train.parquet', 'augmented_test.parquet', 'augmented_train.parquet', 'main_test.parquet', 'pls_clean.parquet']

Leyendo archivos y agrupando por doc_id normalizado...
 - [OK] main_train.parquet: 7408 filas
 - [OK] main_test.parquet: 1851 filas
 - [OK] augmented_train.parquet: 16469 filas
 - [OK] augmented_test.parquet: 4120 filas
 - [OK] no_pls_clean.parquet: 8401 filas
 - [OK] pls_clean.parquet: 6778 filas

Total de doc_id (normalizados) con al menos 1 medical y 1 plain: 5688
Total de pares construidos: 5688
Split:
 - Train: 3981
 - Val:   853
 - Test:  854

Total de pares preparados para RL / LoRA: 5688


In [None]:
# Instruction prefix that generate_summary() puts in front of every prompt.
# PLSDataset and build_prompt() repeat the same wording in their own templates,
# so any change here has to be mirrored there to keep training and inference
# prompts aligned.
INSTRUCTION = (
    "You are a specialist in healthcare communication. "
    "Use the context to transform the following medical text into a clear, concise, "
    "and easy-to-understand summary for a patient and their family. "
    "Retain all relevant clinical data, but explain technical terms using simple language and short sentences.\n\n"
)

import torch
from torch.utils.data import Dataset

class PLSDataset(Dataset):
    """Medical-text / plain-summary pairs encoded for causal-LM fine-tuning.

    Every item is the token sequence ``prompt + summary + EOS`` padded to
    ``max_length``. Only the summary tokens (and the closing EOS) carry labels;
    prompt and padding positions are set to ``-100`` so they are ignored by the
    loss.

    Length handling:
      * If prompt + summary is too long but the prompt itself fits, the summary
        is cut at the end.
      * If the prompt alone fills the window, the medical text inside it is
        shortened so the section header that introduces the summary and at
        least part of the summary still fit. Previously such samples ended up
        with no supervised token at all.
    """

    # Template placed around the medical text (same wording as the inference prompt).
    _PROMPT_HEAD = (
        "You are a specialist in healthcare communication. "
        "Use the context to transform the following medical text into a clear, concise, "
        "and easy-to-understand summary for a patient and their family. "
        "Retain all relevant clinical data, but explain technical terms using simple "
        "language and short sentences.\n\n"
        "### Medical text:\n"
    )
    _PROMPT_TAIL = "\n\n### Simplified summary:\n"
    # Label value skipped by the cross-entropy loss.
    _IGNORE_INDEX = -100

    def __init__(self, pairs, tokenizer, max_length=2048):
        """
        Args:
            pairs: Sequence of objects exposing ``medical`` and ``plain`` texts
                (e.g. ``PromptPair``).
            tokenizer: Tokenizer of the model being fine-tuned.
            max_length: Fixed length, in tokens, of every returned tensor.
        """
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def _build_prompt(self, medical):
        """Return the full prompt text for one medical text."""
        return f"{self._PROMPT_HEAD}{medical}{self._PROMPT_TAIL}"

    def _encode(self, text, add_special_tokens=True):
        """Tokenize ``text`` into a 1-D int64 tensor (also valid for empty text)."""
        ids = self.tokenizer(text, add_special_tokens=add_special_tokens)["input_ids"]
        return torch.tensor(list(ids), dtype=torch.long)

    def _shrink_prompt(self, medical, budget):
        """Re-encode the prompt with the medical text cut so it fits in ``budget`` tokens."""
        overhead = len(self._encode(self._build_prompt("")))
        medical_ids = self._encode(medical, add_special_tokens=False)
        keep = max(0, budget - overhead)
        prompt_ids = self._encode(self._build_prompt(""))
        # Decoding and re-encoding can shift the token count slightly, so retry
        # a few times with a smaller cut before falling back to a hard clamp.
        for _ in range(4):
            shortened = self.tokenizer.decode(
                medical_ids[:keep].tolist(), skip_special_tokens=True
            )
            prompt_ids = self._encode(self._build_prompt(shortened))
            excess = len(prompt_ids) - budget
            if excess <= 0 or keep == 0:
                break
            keep = max(0, keep - excess)
        return prompt_ids[:budget]

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` of shape ``(max_length,)``."""
        pair = self.pairs[idx]

        prompt_ids = self._encode(self._build_prompt(pair.medical))

        # Target without leading special tokens but closed by EOS, so the model
        # learns where a summary ends.
        target_ids = self._encode(pair.plain, add_special_tokens=False)
        eos_id = self.tokenizer.eos_token_id
        if eos_id is not None:
            target_ids = torch.cat([target_ids, torch.tensor([eos_id], dtype=torch.long)])

        if len(prompt_ids) >= self.max_length:
            # The prompt alone fills the window: reserve part of it for the
            # summary and shorten the medical text instead.
            reserve = min(len(target_ids), max(1, self.max_length // 4))
            prompt_ids = self._shrink_prompt(pair.medical, max(0, self.max_length - reserve))

        # Keep the prompt whole and cut the summary to the remaining room.
        room = max(0, self.max_length - len(prompt_ids))
        target_ids = target_ids[:room]

        sequence = torch.cat([prompt_ids, target_ids])
        n_real = len(sequence)

        pad_token_id = (
            self.tokenizer.pad_token_id
            if self.tokenizer.pad_token_id is not None
            else self.tokenizer.eos_token_id
        )
        n_pad = self.max_length - n_real
        if n_pad > 0:
            if pad_token_id is None:
                raise ValueError("tokenizer defines neither pad_token_id nor eos_token_id")
            padding = torch.full((n_pad,), pad_token_id, dtype=torch.long)
            sequence = torch.cat([sequence, padding])

        # The mask follows the real length, not the token values: when the pad
        # token doubles as EOS, a value-based mask would also hide real EOS tokens.
        attention_mask = torch.zeros(self.max_length, dtype=torch.long)
        attention_mask[:n_real] = 1

        # Supervise only the summary span; everything else is ignored by the loss.
        labels = torch.full((self.max_length,), self._IGNORE_INDEX, dtype=torch.long)
        labels[len(prompt_ids):n_real] = target_ids

        return {
            "input_ids": sequence,
            "attention_mask": attention_mask,
            "labels": labels,
        }



In [None]:
def setup_quantization_config():
    """Return a ``BitsAndBytesConfig`` with 4-bit and 8-bit loading both switched off."""
    quantization_flags = {
        "load_in_4bit": False,
        "load_in_8bit": False,
    }
    return BitsAndBytesConfig(**quantization_flags)

def setup_lora_config():
    """Return the ``LoraConfig`` used for fine-tuning.

    Rank 16 with alpha 16 and 5% dropout, applied to the ``q_proj``, ``k_proj``,
    ``v_proj`` and ``o_proj`` modules, no bias terms, causal-LM task type.
    """
    adapted_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
    lora_kwargs = dict(
        r=16,
        lora_alpha=16,
        target_modules=adapted_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )
    return LoraConfig(**lora_kwargs)

# Load the base tokenizer and model, then wrap the model with LoRA adapters.
print("="*80)
print(f"CARGANDO MODELO BASE {BASE_MODEL_ID}")
print("="*80)

# NOTE(review): trust_remote_code=True allows code shipped with the Hub repo to
# run locally — confirm it is actually required for this checkpoint.
base_tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True
)
# Fall back to EOS as the padding token when the tokenizer defines none.
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

# Weights are loaded in bfloat16 without quantization (quantization_config=None;
# setup_quantization_config() is not used here).
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization_config=None,
)

print("Preparando modelo para LoRA...")
lora_config = setup_lora_config()
# NOTE(review): per the PEFT documentation get_peft_model() injects the adapter
# layers into `base_model` itself, so `base_model` presumably no longer behaves
# as the untouched base model after this line — confirm before using it as a baseline.
model_lora = get_peft_model(base_model, lora_config)
model_lora.print_trainable_parameters()


CARGANDO MODELO BASE deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Preparando modelo para LoRA...


NameError: name 'setup_lora_config' is not defined

In [None]:
# Wrap the train and validation pairs; both use the same tokenizer and a 2048-token window.
train_dataset = PLSDataset(pairs=train_pairs, tokenizer=base_tokenizer, max_length=2048)
val_dataset = PLSDataset(pairs=val_pairs, tokenizer=base_tokenizer, max_length=2048)

print("Tamaño train_dataset:", len(train_dataset))
print("Tamaño val_dataset:", len(val_dataset))

# Sanity check: shape and dtype of every tensor in the first training example.
sample = train_dataset[0]
for k in sample:
    v = sample[k]
    print(k, v.shape, v.dtype)


Tamaño train_dataset: 3981
Tamaño val_dataset: 853
input_ids torch.Size([2048]) torch.int64
attention_mask torch.Size([2048]) torch.int64
labels torch.Size([2048]) torch.int64


In [None]:
# Fine-tune the LoRA adapter with the Hugging Face Trainer and save it to ADAPTER_DIR.
from transformers import default_data_collator

os.makedirs(ADAPTER_DIR, exist_ok=True)

training_args = TrainingArguments(
    output_dir="./deepseek_lora_out",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=3e-4,
    num_train_epochs=2,
    logging_steps=20,
    save_steps=200,
    save_total_limit=2,

    fp16=False,
    # NOTE(review): bf16 needs GPU support for bfloat16 (presumably fine on
    # A100/L4, not on T4 — confirm); set bf16=False if NaNs show up.
    bf16=True,

    eval_strategy="no",
    report_to="none",
    run_name="deepseek_lora_run",

    logging_nan_inf_filter=False,  # log NaN/inf losses as they are instead of filtering them
)

# PLSDataset already returns fixed-length input_ids / attention_mask / labels,
# with prompt and padding positions set to -100. DataCollatorForLanguageModeling
# (mlm=False) rebuilds `labels` from `input_ids`, which throws that masking away
# and trains on the prompt as well. The default collator just stacks the
# tensors and keeps the dataset's labels.
data_collator = default_data_collator

trainer = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("Iniciando fine-tuning...")
trainer.train()
print("Fine-tuning completado.")

model_lora.save_pretrained(ADAPTER_DIR)
print("Adaptador LoRA guardado en:", ADAPTER_DIR)


Iniciando fine-tuning...


Step,Training Loss
20,1.507
40,1.3557
60,1.3795
80,1.3776
100,1.3513
120,1.3443
140,1.363
160,1.3039
180,1.3155
200,1.3326


Fine-tuning completado.
Adaptador LoRA guardado en: /content/deepseek-r1-distill-qwen-1.5b-pls


In [None]:
class EvaluationMetrics:
    """Relevance, factuality and readability metrics for generated summaries.

    * relevance   — BERTScore of the generated text against the reference.
    * factuality  — AlignScore of the generated text against the source.
    * readability — textstat readability indices of the generated text.
    """

    def __init__(self, device=None):
        """Download the AlignScore checkpoint and build the scorer.

        Args:
            device: Torch device for the scorer; defaults to CUDA when
                available, otherwise CPU.
        """
        print("Inicializando EvaluationMetrics...")

        self.device = device or (
            torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        )
        print("Dispositivo evaluador:", self.device)

        # 1) Fetch the AlignScore checkpoint from the Hugging Face Hub.
        from huggingface_hub import hf_hub_download

        ckpt_path = hf_hub_download(
            repo_id="yzha/AlignScore",
            filename="AlignScore-base.ckpt",
        )
        print("Checkpoint AlignScore local:", ckpt_path)

        # 2) Build the AlignScore scorer on top of roberta-base.
        self.align_scorer = AlignScore(
            model="roberta-base",
            batch_size=4,
            device=str(self.device),
            ckpt_path=ckpt_path,
            evaluation_mode="nli_sp",
        )

        # BERTScore model, created on first use and then reused (see relevance()).
        self._bert_scorer = None

        print("EvaluationMetrics listo.")

    # ------------------------ RELEVANCE ------------------------
    def relevance(self, generated: str, reference: str):
        """Return BERTScore precision / recall / F1 of ``generated`` vs ``reference``.

        The scorer is built once and cached: calling ``bert_score.score`` for
        every sample reloads the underlying model on each call.
        """
        scorer = getattr(self, "_bert_scorer", None)
        if scorer is None:
            from bert_score import BERTScorer

            scorer = BERTScorer(lang="en")
            self._bert_scorer = scorer

        precision, recall, f1 = scorer.score(
            [generated],
            [reference],
            verbose=False,
        )
        return {
            "precision": float(precision.item()),
            "recall": float(recall.item()),
            "f1": float(f1.item()),
        }

    # ------------------------ FACTUALITY ------------------------
    def factuality(self, generated: str, source: str):
        """Return ``{"score": AlignScore}`` of ``generated`` against ``source``.

        Best effort: inputs that are empty, not strings or shorter than 10
        characters, as well as any scorer error, yield a score of 0.0.
        """
        try:
            if not generated or not isinstance(generated, str) or len(generated.strip()) < 10:
                return {"score": 0.0}
            if not source or not isinstance(source, str) or len(source.strip()) < 10:
                return {"score": 0.0}
            score = self.align_scorer.score(
                contexts=[source],
                claims=[generated],
            )[0]
            return {"score": float(score)}
        except Exception as e:
            print(f"[WARN] Error calculando factualidad: {e}")
            return {"score": 0.0}

    # ------------------------ READABILITY ------------------------
    def readability(self, text: str):
        """Return a dict of readability indices for ``text``.

        Best effort: text that is empty, not a string or shorter than 10
        characters, as well as any textstat error, yields the fixed fallback
        values of ``_default_readability_metrics``.
        """
        try:
            if not text or not isinstance(text, str):
                return self._default_readability_metrics()

            txt = text.strip()
            if len(txt) < 10:
                return self._default_readability_metrics()

            fre  = float(textstat.flesch_reading_ease(txt))
            fk   = float(textstat.flesch_kincaid_grade(txt))
            cli  = float(textstat.coleman_liau_index(txt))
            gfi  = float(textstat.gunning_fog(txt))
            smog = float(textstat.smog_index(txt))
            dale = float(textstat.dale_chall_readability_score(txt))

            # Clamp Flesch Reading Ease to the 0-100 range.
            fre = max(0.0, min(100.0, fre))

            return {
                "flesch_reading_ease": fre,
                "flesch_kincaid_grade_level": fk,
                "coleman_liau_index": cli,
                "gunning_fog_index": gfi,
                "smog_index": smog,
                "dale_chall_readability_score": dale,
            }
        except Exception as e:
            print(f"[WARN] Error calculando legibilidad: {e}")
            return self._default_readability_metrics()

    def _default_readability_metrics(self):
        """Fallback readability values used when the text cannot be scored.

        NOTE(review): these are fixed placeholder numbers, so failed samples
        still contribute to averaged metrics — confirm that is intended.
        """
        return {
            "flesch_reading_ease": 30.0,
            "flesch_kincaid_grade_level": 12.0,
            "coleman_liau_index": 12.0,
            "gunning_fog_index": 12.0,
            "smog_index": 12.0,
            "dale_chall_readability_score": 9.0,
        }

    # ------------------------ COMBINED METRICS ------------------------
    def evaluate(self, generated: str, reference: str, source: str):
        """Return relevance, factuality and readability results in one dict."""
        return {
            "relevance":   self.relevance(generated, reference),
            "factuality":  self.factuality(generated, source),
            "readability": self.readability(generated),
        }


# Build the shared evaluator once; evaluate_model() reads it as the global `evaluator`.
print("Cargando evaluador...")
evaluator = EvaluationMetrics()


Cargando evaluador...
Inicializando EvaluationMetrics...
Dispositivo evaluador: cuda
Checkpoint AlignScore local: /root/.cache/huggingface/hub/models--yzha--AlignScore/snapshots/8509e78d25bb914939fc585c626500c9b2944249/AlignScore-base.ckpt


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.7.7 to v2.5.6. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--yzha--AlignScore/snapshots/8509e78d25bb914939fc585c626500c9b2944249/AlignScore-base.ckpt`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/usr/local/lib/python3.12/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['base_model.embeddings.position_ids']


EvaluationMetrics listo.


In [None]:
def _fit_summary_prompt(tokenizer, medical_text: str, max_input_length: int) -> str:
    """Build the summary prompt, shortening the medical text so the prompt fits the budget."""
    head = INSTRUCTION + "### Medical text:\n"
    tail = "\n\n### Simplified summary:\n"
    prompt = head + medical_text + tail
    if len(tokenizer(prompt)["input_ids"]) <= max_input_length:
        return prompt

    # Cut the medical text rather than the end of the prompt: plain right
    # truncation would remove the section header that tells the model to start
    # the summary.
    overhead = len(tokenizer(head + tail)["input_ids"])
    medical_ids = tokenizer(medical_text, add_special_tokens=False)["input_ids"]
    keep = max(0, max_input_length - overhead)
    shortened = tokenizer.decode(medical_ids[:keep], skip_special_tokens=True)
    return head + shortened + tail


def generate_summary(model, tokenizer, medical_text: str, max_new_tokens: int = 256,
                     max_input_length: int = 2048) -> str:
    """Greedily generate a plain-language summary of ``medical_text``.

    Args:
        model: Causal LM used for generation; it is switched to eval mode.
        tokenizer: Tokenizer matching ``model``.
        medical_text: Source text to summarize.
        max_new_tokens: Maximum number of generated tokens.
        max_input_length: Token budget for the prompt.

    Returns:
        The generated summary, without the prompt.
    """
    prompt = _fit_summary_prompt(tokenizer, medical_text.strip(), max_input_length)

    # truncation stays on as a safety net in case re-tokenization drifts by a few tokens.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)

    # Make sure dropout (including LoRA dropout) is off, e.g. right after training.
    model.eval()

    pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=pad_token_id,
        )

    # Decode only the continuation, so the prompt is never returned as part of
    # the summary even when the section header is not present in the decoded text.
    prompt_length = inputs["input_ids"].shape[1]
    summary = tokenizer.decode(out[0][prompt_length:], skip_special_tokens=True).strip()
    if "### Simplified summary:" in summary:
        # The model repeated the header: keep what follows its last occurrence.
        summary = summary.split("### Simplified summary:")[-1].strip()
    return summary


def evaluate_model(model, tokenizer, pairs, sample_size: int = 30):
    import random
    subset = random.sample(pairs, min(sample_size, len(pairs)))

    results = []
    for p in subset:
        gen = generate_summary(model, tokenizer, p.medical)
        metrics = evaluator.evaluate(
            generated=gen,
            reference=p.plain,
            source=p.medical,
        )
        results.append(metrics)
    return results


def summarize_metrics(results: List[Dict[str, Any]]) -> Dict[str, float]:
    """Average the headline metric of each group over all per-sample results.

    Returns an empty dict when ``results`` is empty; otherwise the mean
    BERTScore F1, factuality score and Flesch Reading Ease.
    """
    import numpy as np

    if not results:
        return {}

    # output key -> (metric group, field inside that group)
    headline_fields = {
        "bertscore_f1": ("relevance", "f1"),
        "factuality": ("factuality", "score"),
        "flesch_reading_ease": ("readability", "flesch_reading_ease"),
    }
    return {
        out_key: float(np.mean([row[group][field] for row in results]))
        for out_key, (group, field) in headline_fields.items()
    }


In [None]:
# Compare the base model against the LoRA fine-tuned model on one shared test subset.
import random

# One fixed subset for both models, so the comparison is paired and reproducible.
eval_subset = random.Random(42).sample(eval_pairs, min(30, len(eval_pairs)))

print("="*80)
print("EVALUANDO MODELO BASE")
print("="*80)
# get_peft_model() injected the LoRA layers into `base_model` itself, so calling
# `base_model` directly would score the fine-tuned weights. Disabling the
# adapter on the PEFT wrapper gives the real base-model behaviour.
with model_lora.disable_adapter():
    base_results = evaluate_model(model_lora, base_tokenizer, eval_subset, sample_size=len(eval_subset))
BASE_MODEL_METRICS = summarize_metrics(base_results)
print("Base:", BASE_MODEL_METRICS)

print("="*80)
print("EVALUANDO MODELO FINE-TUNED (LoRA)")
print("="*80)

# The fine-tuned model is model_lora, which is already in memory.
finetuned_model = model_lora
finetuned_results = evaluate_model(finetuned_model, base_tokenizer, eval_subset, sample_size=len(eval_subset))
FINETUNED_MODEL_METRICS = summarize_metrics(finetuned_results)
print("Fine-tuned:", FINETUNED_MODEL_METRICS)

import pandas as pd

metrics_table = {
    "BERTScore F1": {
        "Modelo Base": BASE_MODEL_METRICS["bertscore_f1"],
        "Fine-tuned (LoRA)": FINETUNED_MODEL_METRICS["bertscore_f1"],
    },
    "AlignScore (Factualidad)": {
        "Modelo Base": BASE_MODEL_METRICS["factuality"],
        "Fine-tuned (LoRA)": FINETUNED_MODEL_METRICS["factuality"],
    },
    "Flesch Reading Ease": {
        "Modelo Base": BASE_MODEL_METRICS["flesch_reading_ease"],
        "Fine-tuned (LoRA)": FINETUNED_MODEL_METRICS["flesch_reading_ease"],
    },
}

df = pd.DataFrame(metrics_table).T
print("\nTabla comparativa:")
print(df)

print("\nDiferencias (Fine-tuned - Base):")
print("Δ BERTScore F1:", FINETUNED_MODEL_METRICS["bertscore_f1"] - BASE_MODEL_METRICS["bertscore_f1"])
print("Δ Factuality  :", FINETUNED_MODEL_METRICS["factuality"] - BASE_MODEL_METRICS["factuality"])
print("Δ Flesch      :", FINETUNED_MODEL_METRICS["flesch_reading_ease"] - BASE_MODEL_METRICS["flesch_reading_ease"])


EVALUANDO MODELO BASE




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.64it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluati

Base: {'bertscore_f1': 0.8654821276664734, 'factuality': 0.6583624462286631, 'flesch_reading_ease': 32.77305093932154}
EVALUANDO MODELO FINE-TUNED (LoRA)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  9.39it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.73it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluati

Fine-tuned: {'bertscore_f1': 0.8636986096700032, 'factuality': 0.6487543299794197, 'flesch_reading_ease': 34.11356237484331}

Tabla comparativa:
                          Modelo Base  Fine-tuned (LoRA)
BERTScore F1                 0.865482           0.863699
AlignScore (Factualidad)     0.658362           0.648754
Flesch Reading Ease         32.773051          34.113562

Diferencias (Fine-tuned - Base):
Δ BERTScore F1: -0.0017835179964701409
Δ Factuality  : -0.009608116249243404
Δ Flesch      : 1.340511435521769





In [None]:
# Reinforcement-learning stack for the TD3 section that follows, pinned to exact versions.
!pip install stable-baselines3==2.3.0 sb3-contrib==2.3.0 gymnasium==0.29.1

Collecting stable-baselines3==2.3.0
  Downloading stable_baselines3-2.3.0-py3-none-any.whl.metadata (5.1 kB)
Collecting sb3-contrib==2.3.0
  Downloading sb3_contrib-2.3.0-py3-none-any.whl.metadata (3.6 kB)
Collecting gymnasium==0.29.1
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Reason for being yanked: Loading broken with PyTorch 1.13[0m[33m
[0mDownloading stable_baselines3-2.3.0-py3-none-any.whl (182 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.1/182.1 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sb3_contrib-2.3.0-py3-none-any.whl (80 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.3/80.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gymnasium, stable-baselines3, sb3-contrib
  Attem

In [None]:
# Confirm which stable-baselines3 version the kernel actually imports.
import stable_baselines3
print(stable_baselines3.__version__)

2.3.0


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  return datetime.utcnow().replace(tzinfo=utc)


In [None]:
# ==============================================================
# SECCIÓN 7: TD3 PARA AJUSTAR LA DECODIFICACIÓN
# ==============================================================

import gymnasium as gym
from gymnasium import spaces
import numpy as np
from stable_baselines3 import TD3
from stable_baselines3.common.env_util import make_vec_env

# ------------------------------------------------------------------
# 7.1 Helper: construir prompt y generar resumen con temperatura/top_p
# ------------------------------------------------------------------

def build_prompt(medical_text: str) -> str:
    """Wrap ``medical_text`` in the instruction and section headers used for generation."""
    pieces = [
        "You are a specialist in healthcare communication. ",
        "Use the context to transform the following medical text into a clear, ",
        "concise, and easy-to-understand summary for a patient and their family. ",
        "Retain all relevant clinical data, but explain technical terms using simple ",
        "language and short sentences.\n\n",
        "### Medical text:\n",
        f"{medical_text}\n\n",
        "### Simplified summary:\n",
    ]
    return "".join(pieces)

@torch.no_grad()
def generate_with_params(model, tokenizer, medical_text: str,
                         temperature: float = 0.7,
                         top_p: float = 0.9,
                         max_new_tokens: int = 256,
                         max_input_length: int = 1024) -> str:
    """Sample a summary from the model with the given decoding parameters.

    Args:
        model: Causal LM used for generation.
        tokenizer: Tokenizer matching ``model``.
        medical_text: Source text to summarize.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        max_new_tokens: Maximum number of generated tokens.
        max_input_length: Token budget for the prompt.

    Returns:
        The generated summary, without the prompt.
    """
    prompt = build_prompt(medical_text)
    if len(tokenizer(prompt)["input_ids"]) > max_input_length:
        # Shorten the medical text instead of letting right truncation remove
        # the "### Simplified summary:" header at the end of the prompt.
        overhead = len(tokenizer(build_prompt(""))["input_ids"])
        medical_ids = tokenizer(medical_text, add_special_tokens=False)["input_ids"]
        keep = max(0, max_input_length - overhead)
        shortened = tokenizer.decode(medical_ids[:keep], skip_special_tokens=True)
        prompt = build_prompt(shortened)

    # truncation stays on as a safety net in case re-tokenization drifts by a few tokens.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length
    ).to(model.device)

    output_ids = model.generate(
        **inputs,
        do_sample=True,
        temperature=float(temperature),
        top_p=float(top_p),
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the continuation, so the prompt is never returned as part of
    # the summary even when the header is not present in the decoded text.
    prompt_length = inputs["input_ids"].shape[1]
    summary = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True).strip()
    if "### Simplified summary:" in summary:
        # The model repeated the header: keep what follows its last occurrence.
        return summary.split("### Simplified summary:")[-1].strip()
    return summary

# ------------------------------------------------------------------
# 7.2 Definición del entorno Gym para TD3
#     - Acción: [temperature, top_p] en [0.1, 1.0]
#     - Observación: vector simple con longitud del texto
# ------------------------------------------------------------------

class SummarizationTD3Env(gym.Env):
    """Single-step environment where an agent picks decoding parameters.

    Every episode uses one item of ``pairs``: its ``medical`` text is
    summarized with the temperature / top_p chosen by the agent, the summary
    is scored by ``evaluator`` and the episode ends immediately (contextual
    bandit).

    Action: ``[temperature, top_p]``, each clipped to ``[0.1, 1.0]``.
    Observation: one float, the character length of the medical text divided
    by 8000 and capped at 1.0.
    Reward: ``0.4 * factuality + 0.4 * relevance F1 + 0.2 * Flesch / 100``.
    """

    metadata = {"render.modes": []}

    # Bounds shared by the action space and the clipping applied in step().
    _PARAM_LOW = 0.1
    _PARAM_HIGH = 1.0
    # Decoding parameters used when an empty action is received.
    _DEFAULT_TEMPERATURE = 0.7
    _DEFAULT_TOP_P = 0.9
    # Number of characters that maps to an observation value of 1.0.
    _OBS_CHAR_SCALE = 8000.0

    def __init__(self, model, tokenizer, pairs, evaluator, max_new_tokens=256):
        """Store the collaborators and declare the action/observation spaces.

        Args:
            model: Language model handed to ``generate_with_params``.
            tokenizer: Tokenizer handed to ``generate_with_params``.
            pairs: Non-empty indexable collection whose items expose
                ``medical`` (source text) and ``plain`` (reference summary).
            evaluator: Object whose ``evaluate(generated=, reference=, source=)``
                returns a mapping with ``["relevance"]["f1"]``,
                ``["factuality"]["score"]`` and
                ``["readability"]["flesch_reading_ease"]``.
            max_new_tokens: Generation budget per summary.

        Raises:
            ValueError: If ``pairs`` is empty (no episode could be sampled).
        """
        super().__init__()
        if len(pairs) == 0:
            raise ValueError("SummarizationTD3Env requires at least one pair.")

        self.model = model
        self.tokenizer = tokenizer
        self.pairs = pairs
        self.evaluator = evaluator
        self.max_new_tokens = max_new_tokens

        # Continuous action: temperature and top_p.
        self.action_space = spaces.Box(
            low=np.array([self._PARAM_LOW, self._PARAM_LOW], dtype=np.float32),
            high=np.array([self._PARAM_HIGH, self._PARAM_HIGH], dtype=np.float32),
            dtype=np.float32,
        )

        # Observation: [normalized character length of the medical text].
        self.observation_space = spaces.Box(
            low=np.array([0.0], dtype=np.float32),
            high=np.array([1.0], dtype=np.float32),
            dtype=np.float32,
        )

        self.current_index = 0

    def _get_obs_for_index(self, idx: int):
        """Return the observation (normalized text length) for ``pairs[idx]``."""
        n_chars = len(self.pairs[idx].medical)
        return np.array([min(1.0, n_chars / self._OBS_CHAR_SCALE)], dtype=np.float32)

    def _sample_index(self) -> int:
        """Draw a pair index from the environment's own seeded generator."""
        # self.np_random is the generator that reset(seed=...) seeds; drawing
        # from the global np.random would make that seed a no-op.
        return int(self.np_random.integers(0, len(self.pairs)))

    def _parse_action(self, action):
        """Convert an action of any shape into clipped ``(temperature, top_p)``.

        The action is flattened: the first value is the temperature and the
        second is top_p. A single value is used for both; an empty action
        falls back to the defaults (0.7, 0.9).
        """
        values = np.asarray(action, dtype=np.float32).ravel()
        if values.size == 0:
            return self._DEFAULT_TEMPERATURE, self._DEFAULT_TOP_P

        raw_temperature = values[0]
        raw_top_p = values[1] if values.size >= 2 else values[0]
        temperature = float(np.clip(raw_temperature, self._PARAM_LOW, self._PARAM_HIGH))
        top_p = float(np.clip(raw_top_p, self._PARAM_LOW, self._PARAM_HIGH))
        return temperature, top_p

    @staticmethod
    def _as_score(value) -> float:
        """Cast a metric to ``float``; NaN/inf become 0.0 so the reward stays finite."""
        value = float(value)
        return value if np.isfinite(value) else 0.0

    def reset(self, seed=None, options=None):
        """Start a new episode on a randomly chosen pair.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_index = self._sample_index()
        return self._get_obs_for_index(self.current_index), {}

    def step(self, action):
        """Generate one summary with the chosen parameters and score it.

        Returns:
            ``(observation, reward, terminated, truncated, info)``. The episode
            always terminates after this single step. ``info`` carries the raw
            ``metrics`` (an empty dict if evaluation failed before returning)
            and the clipped ``temperature`` / ``top_p`` actually used.
        """
        temperature, top_p = self._parse_action(action)

        pair = self.pairs[self.current_index]
        medical = pair.medical
        reference = pair.plain
        source = pair.medical  # factuality is judged against the technical text

        generated = generate_with_params(
            self.model,
            self.tokenizer,
            medical,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=self.max_new_tokens,
        )

        # Bound before the try block so `info` can always be built, even when
        # the evaluator raises.
        metrics = {}
        try:
            metrics = self.evaluator.evaluate(
                generated=generated,
                reference=reference,
                source=source,
            )
            bert_f1 = self._as_score(metrics["relevance"]["f1"])
            fact = self._as_score(metrics["factuality"]["score"])
            fre = self._as_score(metrics["readability"]["flesch_reading_ease"])
        except Exception as e:
            # Best effort: a failed evaluation yields the lowest reward instead
            # of aborting training.
            print(f"[WARN] Error en evaluación TD3: {e}")
            bert_f1 = 0.0
            fact = 0.0
            fre = 0.0

        # Map Flesch reading ease (nominally 0-100) onto [0, 1].
        fre_norm = max(0.0, min(100.0, fre)) / 100.0

        # Weighted reward: factuality and relevance carry 0.4 each,
        # readability 0.2.
        reward = float(
            0.4 * fact +
            0.4 * bert_f1 +
            0.2 * fre_norm
        )

        obs = self._get_obs_for_index(self.current_index)
        terminated = True   # one step = one episode (contextual bandit)
        truncated = False
        info = {
            "metrics": metrics,
            "temperature": temperature,
            "top_p": top_p,
        }

        # The next episode will use a different (randomly drawn) index.
        self.current_index = self._sample_index()

        return obs, reward, terminated, truncated, info

# ------------------------------------------------------------------
# 7.3 Crear entornos para Base y LoRA
# ------------------------------------------------------------------

# eval_pairs serves as the sample pool for RL (train_pairs would work too).
rl_pairs = eval_pairs


def _td3_env_factory(llm):
    """Return a zero-argument builder for a SummarizationTD3Env around ``llm``.

    Both environments share ``base_tokenizer``, ``rl_pairs`` and ``evaluator``;
    only the language model differs.
    """
    def _build():
        return SummarizationTD3Env(llm, base_tokenizer, rl_pairs, evaluator)

    return _build


# One single-env vectorized wrapper per model under comparison.
env_base = make_vec_env(_td3_env_factory(base_model), n_envs=1)

env_lora = make_vec_env(_td3_env_factory(model_lora), n_envs=1)

# ------------------------------------------------------------------
# 7.4 Entrenar TD3 para Base y para LoRA
# ------------------------------------------------------------------

TD3_BASE_DIR = "/content/td3_base_agent"
TD3_LORA_DIR = "/content/td3_lora_agent"
td3_steps = 200  # can be raised to 1000+ if time allows

# Stable-Baselines3 stores an agent as a single "<path>.zip" archive, not as a
# directory containing policy.pkl. Checking for the archive is what makes the
# "load instead of retrain" shortcut actually trigger on later runs.
TD3_BASE_ZIP = TD3_BASE_DIR + ".zip"
TD3_LORA_ZIP = TD3_LORA_DIR + ".zip"


def _train_td3(env, total_timesteps):
    """Create a TD3 agent on ``env`` with the shared hyperparameters and train it."""
    agent = TD3(
        "MlpPolicy",
        env,
        learning_rate=1e-3,
        batch_size=32,
        verbose=1,
    )
    agent.learn(total_timesteps=total_timesteps)
    return agent


# Reuse a previously saved agent when its archive exists; otherwise train.
if os.path.isfile(TD3_BASE_ZIP):
    print("Cargando agente TD3 BASE existente...")
    td3_base = TD3.load(TD3_BASE_ZIP, env=env_base)
    print("Agente TD3 BASE cargado exitosamente.")
else:
    print("Entrenando TD3 para MODELO BASE...")
    td3_base = _train_td3(env_base, td3_steps)

if os.path.isfile(TD3_LORA_ZIP):
    print("\nCargando agente TD3 LORA existente...")
    td3_lora = TD3.load(TD3_LORA_ZIP, env=env_lora)
    print("Agente TD3 LORA cargado exitosamente.")
else:
    print("\nEntrenando TD3 para MODELO LoRA...")
    td3_lora = _train_td3(env_lora, td3_steps)

print("Entrenamiento TD3 completado.")

# Directories kept as in the original cell in case later cells expect them;
# the agents themselves are written to the .zip archives next to them.
os.makedirs(TD3_BASE_DIR, exist_ok=True)
os.makedirs(TD3_LORA_DIR, exist_ok=True)

# Save the agents (whether they were just trained or loaded).
print("\nGuardando agentes TD3...")
td3_base.save(TD3_BASE_ZIP)
td3_lora.save(TD3_LORA_ZIP)
print(f"✓ Agente TD3 BASE guardado en: {TD3_BASE_ZIP}")
print(f"✓ Agente TD3 LORA guardado en: {TD3_LORA_ZIP}")
print("\nNota: Los agentes TD3 guardados pueden ser cargados en futuras ejecuciones para evitar reentrenar.")


Entrenando TD3 para MODELO BASE...
Using cuda device


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluati

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 0        |
|    time_elapsed    | 85       |
|    total_timesteps | 4        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.83it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.665    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 0        |
|    time_elapsed    | 169      |
|    total_timesteps | 8        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.674    |
| time/              |          |
|    episodes        | 12       |
|    fps             | 0        |
|    time_elapsed    | 254      |
|    total_timesteps | 12       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  9.54it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.70it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.661    |
| time/              |          |
|    episodes        | 16       |
|    fps             | 0        |
|    time_elapsed    | 337      |
|    total_timesteps | 16       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.55s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.97it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.666    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 0        |
|    time_elapsed    | 422      |
|    total_timesteps | 20       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.31it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.80it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.667    |
| time/              |          |
|    episodes        | 24       |
|    fps             | 0        |
|    time_elapsed    | 505      |
|    total_timesteps | 24       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.24it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.67     |
| time/              |          |
|    episodes        | 28       |
|    fps             | 0        |
|    time_elapsed    | 589      |
|    total_timesteps | 28       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.50it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 32       |
|    fps             | 0        |
|    time_elapsed    | 672      |
|    total_timesteps | 32       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.46it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.682    |
| time/              |          |
|    episodes        | 36       |
|    fps             | 0        |
|    time_elapsed    | 756      |
|    total_timesteps | 36       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.48it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.685    |
| time/              |          |
|    episodes        | 40       |
|    fps             | 0        |
|    time_elapsed    | 841      |
|    total_timesteps | 40       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.81it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.88it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.682    |
| time/              |          |
|    episodes        | 44       |
|    fps             | 0        |
|    time_elapsed    | 924      |
|    total_timesteps | 44       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:02<00:00,  2.07s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.683    |
| time/              |          |
|    episodes        | 48       |
|    fps             | 0        |
|    time_elapsed    | 1010     |
|    total_timesteps | 48       |
---------------------------------


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.76it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluati

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.681    |
| time/              |          |
|    episodes        | 52       |
|    fps             | 0        |
|    time_elapsed    | 1095     |
|    total_timesteps | 52       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.82it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.681    |
| time/              |          |
|    episodes        | 56       |
|    fps             | 0        |
|    time_elapsed    | 1181     |
|    total_timesteps | 56       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.59it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 60       |
|    fps             | 0        |
|    time_elapsed    | 1264     |
|    total_timesteps | 60       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.41it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.681    |
| time/              |          |
|    episodes        | 64       |
|    fps             | 0        |
|    time_elapsed    | 1349     |
|    total_timesteps | 64       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.69it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 68       |
|    fps             | 0        |
|    time_elapsed    | 1433     |
|    total_timesteps | 68       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.18it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.681    |
| time/              |          |
|    episodes        | 72       |
|    fps             | 0        |
|    time_elapsed    | 1516     |
|    total_timesteps | 72       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.17it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 76       |
|    fps             | 0        |
|    time_elapsed    | 1599     |
|    total_timesteps | 76       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.83it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.38it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 80       |
|    fps             | 0        |
|    time_elapsed    | 1681     |
|    total_timesteps | 80       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 84       |
|    fps             | 0        |
|    time_elapsed    | 1769     |
|    total_timesteps | 84       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.28it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.20it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 88       |
|    fps             | 0        |
|    time_elapsed    | 1852     |
|    total_timesteps | 88       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.18s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 10.69it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 92       |
|    fps             | 0        |
|    time_elapsed    | 1937     |
|    total_timesteps | 92       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.04it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.82it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 96       |
|    fps             | 0        |
|    time_elapsed    | 2021     |
|    total_timesteps | 96       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.16s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.24it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 100      |
|    fps             | 0        |
|    time_elapsed    | 2106     |
|    total_timesteps | 100      |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 104      |
|    fps             | 0        |
|    time_elapsed    | 2191     |
|    total_timesteps | 104      |
| train/             |          |
|    actor_loss      | -0.698   |
|    critic_loss     | 0.0435   |
|    learning_rate   | 0.001    |
|    n_updates       | 3        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 108      |
|    fps             | 0        |
|    time_elapsed    | 2277     |
|    total_timesteps | 108      |
| train/             |          |
|    actor_loss      | -0.813   |
|    critic_loss     | 0.0691   |
|    learning_rate   | 0.001    |
|    n_updates       | 7        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 112      |
|    fps             | 0        |
|    time_elapsed    | 2364     |
|    total_timesteps | 112      |
| train/             |          |
|    actor_loss      | -0.508   |
|    critic_loss     | 0.0522   |
|    learning_rate   | 0.001    |
|    n_updates       | 11       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.69it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 116      |
|    fps             | 0        |
|    time_elapsed    | 2450     |
|    total_timesteps | 116      |
| train/             |          |
|    actor_loss      | -0.614   |
|    critic_loss     | 0.039    |
|    learning_rate   | 0.001    |
|    n_updates       | 15       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.42it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.61it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 120      |
|    fps             | 0        |
|    time_elapsed    | 2535     |
|    total_timesteps | 120      |
| train/             |          |
|    actor_loss      | -0.834   |
|    critic_loss     | 0.0259   |
|    learning_rate   | 0.001    |
|    n_updates       | 19       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.23it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.20it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 124      |
|    fps             | 0        |
|    time_elapsed    | 2619     |
|    total_timesteps | 124      |
| train/             |          |
|    actor_loss      | -0.798   |
|    critic_loss     | 0.0218   |
|    learning_rate   | 0.001    |
|    n_updates       | 23       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.13s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.12it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 128      |
|    fps             | 0        |
|    time_elapsed    | 2705     |
|    total_timesteps | 128      |
| train/             |          |
|    actor_loss      | -0.607   |
|    critic_loss     | 0.0127   |
|    learning_rate   | 0.001    |
|    n_updates       | 27       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.05it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 132      |
|    fps             | 0        |
|    time_elapsed    | 2787     |
|    total_timesteps | 132      |
| train/             |          |
|    actor_loss      | -0.612   |
|    critic_loss     | 0.024    |
|    learning_rate   | 0.001    |
|    n_updates       | 31       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.43it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.58s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 136      |
|    fps             | 0        |
|    time_elapsed    | 2873     |
|    total_timesteps | 136      |
| train/             |          |
|    actor_loss      | -0.768   |
|    critic_loss     | 0.0107   |
|    learning_rate   | 0.001    |
|    n_updates       | 35       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.59it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.49s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 140      |
|    fps             | 0        |
|    time_elapsed    | 2958     |
|    total_timesteps | 140      |
| train/             |          |
|    actor_loss      | -0.762   |
|    critic_loss     | 0.0101   |
|    learning_rate   | 0.001    |
|    n_updates       | 39       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.39it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.42it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 144      |
|    fps             | 0        |
|    time_elapsed    | 3042     |
|    total_timesteps | 144      |
| train/             |          |
|    actor_loss      | -0.621   |
|    critic_loss     | 0.0109   |
|    learning_rate   | 0.001    |
|    n_updates       | 43       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.12it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 148      |
|    fps             | 0        |
|    time_elapsed    | 3127     |
|    total_timesteps | 148      |
| train/             |          |
|    actor_loss      | -0.617   |
|    critic_loss     | 0.0132   |
|    learning_rate   | 0.001    |
|    n_updates       | 47       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 152      |
|    fps             | 0        |
|    time_elapsed    | 3212     |
|    total_timesteps | 152      |
| train/             |          |
|    actor_loss      | -0.728   |
|    critic_loss     | 0.00615  |
|    learning_rate   | 0.001    |
|    n_updates       | 51       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.73it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 156      |
|    fps             | 0        |
|    time_elapsed    | 3296     |
|    total_timesteps | 156      |
| train/             |          |
|    actor_loss      | -0.716   |
|    critic_loss     | 0.0132   |
|    learning_rate   | 0.001    |
|    n_updates       | 55       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.35s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 160      |
|    fps             | 0        |
|    time_elapsed    | 3381     |
|    total_timesteps | 160      |
| train/             |          |
|    actor_loss      | -0.635   |
|    critic_loss     | 0.00706  |
|    learning_rate   | 0.001    |
|    n_updates       | 59       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.60it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 164      |
|    fps             | 0        |
|    time_elapsed    | 3467     |
|    total_timesteps | 164      |
| train/             |          |
|    actor_loss      | -0.674   |
|    critic_loss     | 0.00924  |
|    learning_rate   | 0.001    |
|    n_updates       | 63       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.98it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 168      |
|    fps             | 0        |
|    time_elapsed    | 3554     |
|    total_timesteps | 168      |
| train/             |          |
|    actor_loss      | -0.71    |
|    critic_loss     | 0.0103   |
|    learning_rate   | 0.001    |
|    n_updates       | 67       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.39it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 172      |
|    fps             | 0        |
|    time_elapsed    | 3640     |
|    total_timesteps | 172      |
| train/             |          |
|    actor_loss      | -0.679   |
|    critic_loss     | 0.0112   |
|    learning_rate   | 0.001    |
|    n_updates       | 71       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.31it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.47it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 176      |
|    fps             | 0        |
|    time_elapsed    | 3724     |
|    total_timesteps | 176      |
| train/             |          |
|    actor_loss      | -0.672   |
|    critic_loss     | 0.0115   |
|    learning_rate   | 0.001    |
|    n_updates       | 75       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 31.96it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.86it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 180      |
|    fps             | 0        |
|    time_elapsed    | 3808     |
|    total_timesteps | 180      |
| train/             |          |
|    actor_loss      | -0.709   |
|    critic_loss     | 0.00452  |
|    learning_rate   | 0.001    |
|    n_updates       | 79       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.21it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 184      |
|    fps             | 0        |
|    time_elapsed    | 3893     |
|    total_timesteps | 184      |
| train/             |          |
|    actor_loss      | -0.697   |
|    critic_loss     | 0.00666  |
|    learning_rate   | 0.001    |
|    n_updates       | 83       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:02<00:00,  2.47s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 188      |
|    fps             | 0        |
|    time_elapsed    | 3982     |
|    total_timesteps | 188      |
| train/             |          |
|    actor_loss      | -0.644   |
|    critic_loss     | 0.0112   |
|    learning_rate   | 0.001    |
|    n_updates       | 87       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.23it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.40s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.68     |
| time/              |          |
|    episodes        | 192      |
|    fps             | 0        |
|    time_elapsed    | 4066     |
|    total_timesteps | 192      |
| train/             |          |
|    actor_loss      | -0.705   |
|    critic_loss     | 0.0111   |
|    learning_rate   | 0.001    |
|    n_updates       | 91       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.26it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.92it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 196      |
|    fps             | 0        |
|    time_elapsed    | 4150     |
|    total_timesteps | 196      |
| train/             |          |
|    actor_loss      | -0.652   |
|    critic_loss     | 0.0106   |
|    learning_rate   | 0.001    |
|    n_updates       | 95       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.36it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.31it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 200      |
|    fps             | 0        |
|    time_elapsed    | 4235     |
|    total_timesteps | 200      |
| train/             |          |
|    actor_loss      | -0.7     |
|    critic_loss     | 0.00755  |
|    learning_rate   | 0.001    |
|    n_updates       | 99       |
---------------------------------






Entrenando TD3 para MODELO LoRA...
Using cuda device


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.68it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.62it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluati

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.665    |
| time/              |          |
|    episodes        | 4        |
|    fps             | 0        |
|    time_elapsed    | 85       |
|    total_timesteps | 4        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.654    |
| time/              |          |
|    episodes        | 8        |
|    fps             | 0        |
|    time_elapsed    | 169      |
|    total_timesteps | 8        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.34s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.673    |
| time/              |          |
|    episodes        | 12       |
|    fps             | 0        |
|    time_elapsed    | 254      |
|    total_timesteps | 12       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.18it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.91it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 16       |
|    fps             | 0        |
|    time_elapsed    | 339      |
|    total_timesteps | 16       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.17it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.674    |
| time/              |          |
|    episodes        | 20       |
|    fps             | 0        |
|    time_elapsed    | 425      |
|    total_timesteps | 20       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 12.04it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 24       |
|    fps             | 0        |
|    time_elapsed    | 510      |
|    total_timesteps | 24       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.88it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.32it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.673    |
| time/              |          |
|    episodes        | 28       |
|    fps             | 0        |
|    time_elapsed    | 594      |
|    total_timesteps | 28       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 32       |
|    fps             | 0        |
|    time_elapsed    | 679      |
|    total_timesteps | 32       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.91it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.49it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.666    |
| time/              |          |
|    episodes        | 36       |
|    fps             | 0        |
|    time_elapsed    | 763      |
|    total_timesteps | 36       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.72it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.45it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.661    |
| time/              |          |
|    episodes        | 40       |
|    fps             | 0        |
|    time_elapsed    | 846      |
|    total_timesteps | 40       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.28it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.36it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.664    |
| time/              |          |
|    episodes        | 44       |
|    fps             | 0        |
|    time_elapsed    | 931      |
|    total_timesteps | 44       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.32it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.30it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.668    |
| time/              |          |
|    episodes        | 48       |
|    fps             | 0        |
|    time_elapsed    | 1016     |
|    total_timesteps | 48       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.71it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.665    |
| time/              |          |
|    episodes        | 52       |
|    fps             | 0        |
|    time_elapsed    | 1099     |
|    total_timesteps | 52       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.80it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.67it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.664    |
| time/              |          |
|    episodes        | 56       |
|    fps             | 0        |
|    time_elapsed    | 1184     |
|    total_timesteps | 56       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.45it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.665    |
| time/              |          |
|    episodes        | 60       |
|    fps             | 0        |
|    time_elapsed    | 1270     |
|    total_timesteps | 60       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.67     |
| time/              |          |
|    episodes        | 64       |
|    fps             | 0        |
|    time_elapsed    | 1357     |
|    total_timesteps | 64       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:02<00:00,  2.03s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 68       |
|    fps             | 0        |
|    time_elapsed    | 1445     |
|    total_timesteps | 68       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.42it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 72       |
|    fps             | 0        |
|    time_elapsed    | 1529     |
|    total_timesteps | 72       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.16it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 76       |
|    fps             | 0        |
|    time_elapsed    | 1612     |
|    total_timesteps | 76       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.46it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 80       |
|    fps             | 0        |
|    time_elapsed    | 1696     |
|    total_timesteps | 80       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.83it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.09it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 84       |
|    fps             | 0        |
|    time_elapsed    | 1779     |
|    total_timesteps | 84       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 22.83it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.675    |
| time/              |          |
|    episodes        | 88       |
|    fps             | 0        |
|    time_elapsed    | 1864     |
|    total_timesteps | 88       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.16it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 92       |
|    fps             | 0        |
|    time_elapsed    | 1947     |
|    total_timesteps | 92       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.45it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 96       |
|    fps             | 0        |
|    time_elapsed    | 2034     |
|    total_timesteps | 96       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.61it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.676    |
| time/              |          |
|    episodes        | 100      |
|    fps             | 0        |
|    time_elapsed    | 2118     |
|    total_timesteps | 100      |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.31it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.76it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 104      |
|    fps             | 0        |
|    time_elapsed    | 2202     |
|    total_timesteps | 104      |
| train/             |          |
|    actor_loss      | -0.627   |
|    critic_loss     | 0.0274   |
|    learning_rate   | 0.001    |
|    n_updates       | 3        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.41s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 108      |
|    fps             | 0        |
|    time_elapsed    | 2289     |
|    total_timesteps | 108      |
| train/             |          |
|    actor_loss      | -0.853   |
|    critic_loss     | 0.107    |
|    learning_rate   | 0.001    |
|    n_updates       | 7        |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.33s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.07it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 112      |
|    fps             | 0        |
|    time_elapsed    | 2374     |
|    total_timesteps | 112      |
| train/             |          |
|    actor_loss      | -0.541   |
|    critic_loss     | 0.0574   |
|    learning_rate   | 0.001    |
|    n_updates       | 11       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.25it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.11it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.678    |
| time/              |          |
|    episodes        | 116      |
|    fps             | 0        |
|    time_elapsed    | 2459     |
|    total_timesteps | 116      |
| train/             |          |
|    actor_loss      | -0.629   |
|    critic_loss     | 0.0499   |
|    learning_rate   | 0.001    |
|    n_updates       | 15       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.28it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 120      |
|    fps             | 0        |
|    time_elapsed    | 2542     |
|    total_timesteps | 120      |
| train/             |          |
|    actor_loss      | -0.886   |
|    critic_loss     | 0.0188   |
|    learning_rate   | 0.001    |
|    n_updates       | 19       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.63s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.677    |
| time/              |          |
|    episodes        | 124      |
|    fps             | 0        |
|    time_elapsed    | 2628     |
|    total_timesteps | 124      |
| train/             |          |
|    actor_loss      | -0.952   |
|    critic_loss     | 0.0352   |
|    learning_rate   | 0.001    |
|    n_updates       | 23       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.89it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.10it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.679    |
| time/              |          |
|    episodes        | 128      |
|    fps             | 0        |
|    time_elapsed    | 2712     |
|    total_timesteps | 128      |
| train/             |          |
|    actor_loss      | -0.706   |
|    critic_loss     | 0.016    |
|    learning_rate   | 0.001    |
|    n_updates       | 27       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.53it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.68     |
| time/              |          |
|    episodes        | 132      |
|    fps             | 0        |
|    time_elapsed    | 2796     |
|    total_timesteps | 132      |
| train/             |          |
|    actor_loss      | -0.57    |
|    critic_loss     | 0.0313   |
|    learning_rate   | 0.001    |
|    n_updates       | 31       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.83it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.684    |
| time/              |          |
|    episodes        | 136      |
|    fps             | 0        |
|    time_elapsed    | 2881     |
|    total_timesteps | 136      |
| train/             |          |
|    actor_loss      | -0.712   |
|    critic_loss     | 0.0177   |
|    learning_rate   | 0.001    |
|    n_updates       | 35       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.95it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.41it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.687    |
| time/              |          |
|    episodes        | 140      |
|    fps             | 0        |
|    time_elapsed    | 2966     |
|    total_timesteps | 140      |
| train/             |          |
|    actor_loss      | -0.802   |
|    critic_loss     | 0.0152   |
|    learning_rate   | 0.001    |
|    n_updates       | 39       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.75it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.687    |
| time/              |          |
|    episodes        | 144      |
|    fps             | 0        |
|    time_elapsed    | 3051     |
|    total_timesteps | 144      |
| train/             |          |
|    actor_loss      | -0.698   |
|    critic_loss     | 0.0185   |
|    learning_rate   | 0.001    |
|    n_updates       | 43       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.12it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.04it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.685    |
| time/              |          |
|    episodes        | 148      |
|    fps             | 0        |
|    time_elapsed    | 3135     |
|    total_timesteps | 148      |
| train/             |          |
|    actor_loss      | -0.641   |
|    critic_loss     | 0.0116   |
|    learning_rate   | 0.001    |
|    n_updates       | 47       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.03it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.11it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.686    |
| time/              |          |
|    episodes        | 152      |
|    fps             | 0        |
|    time_elapsed    | 3221     |
|    total_timesteps | 152      |
| train/             |          |
|    actor_loss      | -0.69    |
|    critic_loss     | 0.0062   |
|    learning_rate   | 0.001    |
|    n_updates       | 51       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  1.13it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.688    |
| time/              |          |
|    episodes        | 156      |
|    fps             | 0        |
|    time_elapsed    | 3303     |
|    total_timesteps | 156      |
| train/             |          |
|    actor_loss      | -0.714   |
|    critic_loss     | 0.00825  |
|    learning_rate   | 0.001    |
|    n_updates       | 55       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.01it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  6.26it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.688    |
| time/              |          |
|    episodes        | 160      |
|    fps             | 0        |
|    time_elapsed    | 3389     |
|    total_timesteps | 160      |
| train/             |          |
|    actor_loss      | -0.686   |
|    critic_loss     | 0.00783  |
|    learning_rate   | 0.001    |
|    n_updates       | 59       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.62s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.47it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.684    |
| time/              |          |
|    episodes        | 164      |
|    fps             | 0        |
|    time_elapsed    | 3473     |
|    total_timesteps | 164      |
| train/             |          |
|    actor_loss      | -0.687   |
|    critic_loss     | 0.0136   |
|    learning_rate   | 0.001    |
|    n_updates       | 63       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.86it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.55it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.681    |
| time/              |          |
|    episodes        | 168      |
|    fps             | 0        |
|    time_elapsed    | 3559     |
|    total_timesteps | 168      |
| train/             |          |
|    actor_loss      | -0.668   |
|    critic_loss     | 0.0102   |
|    learning_rate   | 0.001    |
|    n_updates       | 67       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.48s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  5.42it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.682    |
| time/              |          |
|    episodes        | 172      |
|    fps             | 0        |
|    time_elapsed    | 3645     |
|    total_timesteps | 172      |
| train/             |          |
|    actor_loss      | -0.662   |
|    critic_loss     | 0.0087   |
|    learning_rate   | 0.001    |
|    n_updates       | 71       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.00s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.64it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.684    |
| time/              |          |
|    episodes        | 176      |
|    fps             | 0        |
|    time_elapsed    | 3729     |
|    total_timesteps | 176      |
| train/             |          |
|    actor_loss      | -0.689   |
|    critic_loss     | 0.00561  |
|    learning_rate   | 0.001    |
|    n_updates       | 75       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.85it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 11.33it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.684    |
| time/              |          |
|    episodes        | 180      |
|    fps             | 0        |
|    time_elapsed    | 3813     |
|    total_timesteps | 180      |
| train/             |          |
|    actor_loss      | -0.683   |
|    critic_loss     | 0.00476  |
|    learning_rate   | 0.001    |
|    n_updates       | 79       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.62it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  7.11it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.685    |
| time/              |          |
|    episodes        | 184      |
|    fps             | 0        |
|    time_elapsed    | 3896     |
|    total_timesteps | 184      |
| train/             |          |
|    actor_loss      | -0.695   |
|    critic_loss     | 0.00941  |
|    learning_rate   | 0.001    |
|    n_updates       | 83       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.57s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.40it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.686    |
| time/              |          |
|    episodes        | 188      |
|    fps             | 0        |
|    time_elapsed    | 3981     |
|    total_timesteps | 188      |
| train/             |          |
|    actor_loss      | -0.672   |
|    critic_loss     | 0.00729  |
|    learning_rate   | 0.001    |
|    n_updates       | 87       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.31it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  4.32it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.687    |
| time/              |          |
|    episodes        | 192      |
|    fps             | 0        |
|    time_elapsed    | 4065     |
|    total_timesteps | 192      |
| train/             |          |
|    actor_loss      | -0.7     |
|    critic_loss     | 0.00703  |
|    learning_rate   | 0.001    |
|    n_updates       | 91       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  8.58it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.686    |
| time/              |          |
|    episodes        | 196      |
|    fps             | 0        |
|    time_elapsed    | 4150     |
|    total_timesteps | 196      |
| train/             |          |
|    actor_loss      | -0.695   |
|    critic_loss     | 0.00981  |
|    learning_rate   | 0.001    |
|    n_updates       | 95       |
---------------------------------



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  2.96it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 18.57it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluat

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 0.684    |
| time/              |          |
|    episodes        | 200      |
|    fps             | 0        |
|    time_elapsed    | 4235     |
|    total_timesteps | 200      |
| train/             |          |
|    actor_loss      | -0.67    |
|    critic_loss     | 0.00826  |
|    learning_rate   | 0.001    |
|    n_updates       | 99       |
---------------------------------
Entrenamiento TD3 completado.





In [None]:
# ==============================================================
# Parchear el método step de los entornos existentes
# ==============================================================
# Esto es necesario porque los entornos ya fueron creados con la versión antigua
# de la clase. Parcheamos el método step sin tener que recrear los entornos.

import types

def _decoding_params_from_action(action, low=0.1, high=1.0,
                                 default_temperature=0.7, default_top_p=0.9):
    """Translate a raw agent action into a ``(temperature, top_p)`` pair.

    The action may be a scalar, a 1-D vector or a batched/multi-dimensional
    array. It is flattened and its first two entries are used; when only one
    entry exists it drives both parameters. Every value is clipped to
    ``[low, high]``.

    An empty action (of any dimensionality) or a NaN component falls back to
    the corresponding default instead of raising or propagating NaN.
    """
    flat = np.asarray(action, dtype=np.float32).reshape(-1)
    if flat.size == 0:
        return default_temperature, default_top_p

    raw_temperature = flat[0]
    raw_top_p = flat[1] if flat.size >= 2 else flat[0]

    def _bounded(value, fallback):
        # np.clip lets NaN through unchanged, so it has to be checked first.
        if np.isnan(value):
            return fallback
        return float(np.clip(value, low, high))

    return (
        _bounded(raw_temperature, default_temperature),
        _bounded(raw_top_p, default_top_p),
    )


def _nan_to_zero(value):
    """Return ``value`` as a float, mapping NaN to 0.0 (no credit)."""
    value = float(value)
    return 0.0 if np.isnan(value) else value


def step_patched(self, action):
    """Replacement ``step`` with robust action and metric handling.

    Generates a summary for the current pair using the decoding parameters
    encoded in ``action``, scores it with ``self.evaluator`` and returns a
    single-step episode transition.

    Args:
        action: scalar or array-like; the first two flattened entries are
            read as (temperature, top_p), each clipped to [0.1, 1.0].

    Returns:
        ``(obs, reward, terminated, truncated, info)`` where ``reward`` is
        ``0.4 * factuality + 0.4 * BERTScore F1 + 0.2 * normalized Flesch``,
        ``terminated`` is always True, ``truncated`` is always False and
        ``info`` carries the raw metrics plus the decoding parameters used.
    """
    temperature, top_p = _decoding_params_from_action(action)

    pair = self.pairs[self.current_index]
    medical = pair.medical
    reference = pair.plain
    source = pair.medical

    # Generate the summary with the decoding parameters chosen by the agent.
    generated = generate_with_params(
        self.model,
        self.tokenizer,
        medical,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=self.max_new_tokens,
    )

    # Best-effort scoring: a failing evaluator yields zero metrics (and a
    # warning) rather than aborting the whole training run.
    try:
        metrics = self.evaluator.evaluate(
            generated=generated,
            reference=reference,
            source=source,
        )

        bert_f1 = metrics["relevance"]["f1"]
        fact = metrics["factuality"]["score"]
        fre = metrics["readability"]["flesch_reading_ease"]
    except Exception as e:
        print(f"[WARN] Error en evaluación TD3: {e}")
        bert_f1 = 0.0
        fact = 0.0
        fre = 0.0
        # Default metrics so that ``info`` keeps the same structure.
        metrics = {
            "relevance": {"f1": 0.0},
            "factuality": {"score": 0.0},
            "readability": {"flesch_reading_ease": 0.0}
        }

    # A NaN metric must not reach the reward: NaN relevance/factuality would
    # make the reward NaN, and min(100.0, nan) evaluates to 100.0, which would
    # grant full readability credit to an undefined score.
    bert_f1 = _nan_to_zero(bert_f1)
    fact = _nan_to_zero(fact)
    fre = _nan_to_zero(fre)

    # Normalize Flesch reading ease from [0, 100] to [0, 1].
    fre_norm = max(0.0, min(100.0, fre)) / 100.0

    # Weighted reward.
    reward = float(
        0.4 * fact +
        0.4 * bert_f1 +
        0.2 * fre_norm
    )

    obs = self._get_obs_for_index(self.current_index)
    terminated = True
    truncated = False
    info = {
        "metrics": metrics,
        "temperature": temperature,
        "top_p": top_p,
    }

    # The next episode will use a different, randomly drawn pair.
    self.current_index = np.random.randint(0, len(self.pairs))

    return obs, reward, terminated, truncated, info

# Rebind ``step`` on the environment instances that already exist, so they do
# not have to be recreated from the updated class.
def _rebind_step(vec_env):
    """Bind ``step_patched`` as the ``step`` method of ``vec_env.envs[0].env``."""
    inner_env = vec_env.envs[0].env
    inner_env.step = types.MethodType(step_patched, inner_env)


_rebind_step(env_base)
_rebind_step(env_lora)

print("✓ Método step parcheado en entornos existentes")


In [None]:
# ==============================================================
# 7.5 Obtener parámetros de decodificación "óptimos" de TD3
# ==============================================================

def get_mean_action(agent, env, n_samples=64):
    """Average the agent's deterministic action over several observations.

    Starting from ``env.reset()``, the agent's deterministic prediction is
    recorded for ``n_samples`` consecutive observations and the mean of the
    first two action components is returned as decoding parameters.

    Args:
        agent: object exposing ``predict(obs, deterministic=True)`` returning
            ``(action, state)``.
        env: environment whose ``reset()`` returns only the observation and
            whose ``step(action)`` returns a tuple with the next observation
            first (the vectorized-env convention of stable-baselines3).
        n_samples: number of observations to average over; must be >= 1.

    Returns:
        ``(temperature, top_p)`` as floats, each clipped to [0.1, 1.0].

    Raises:
        ValueError: if ``n_samples`` is smaller than 1 or the agent returns
            an empty action.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    # Vectorized stable-baselines3 envs return only the observation on reset.
    obs = env.reset()

    sampled = []
    for _ in range(n_samples):
        raw_action, _state = agent.predict(obs, deterministic=True)

        # Reduce whatever shape the agent produced (scalar, vector, batch) to
        # a 2-element vector used only for averaging.
        flat = np.asarray(raw_action, dtype=np.float32).reshape(-1)
        if flat.size == 0:
            raise ValueError("agent.predict returned an empty action")
        sampled.append(flat[:2] if flat.size >= 2 else np.repeat(flat, 2))

        # Step with the action exactly as predicted. A vectorized env indexes
        # the leading axis per sub-environment, so handing it the flattened
        # 2-vector would deliver only the first component (a scalar) to the
        # wrapped env and top_p would silently be replaced by temperature.
        # step() returns (obs, reward, done, info) or
        # (obs, reward, terminated, truncated, info); obs is first either way.
        obs = env.step(raw_action)[0]

    mean_action = np.mean(sampled, axis=0)
    temperature = float(np.clip(mean_action[0], 0.1, 1.0))
    top_p = float(np.clip(mean_action[1], 0.1, 1.0))
    return temperature, top_p

# Number of observations averaged per model (get_mean_action defaults to 64).
# Fewer samples = faster but noisier estimate.
# More samples = slower but more precise estimate.
# NOTE(review): every sample calls env.step(); with the patched step above that
# means one generation plus one metric evaluation per sample — confirm cost.
N_SAMPLES = 32 

print(f"Calculando parámetros óptimos con {N_SAMPLES} muestras por modelo...")
print("Esto puede tomar 10-30 minutos dependiendo de tu GPU...")
print("="*80)

import time
# Wall-clock start; covers both the base and the LoRA runs below.
start_time = time.time()

# Mean deterministic action of the TD3 agent trained on the base model.
temp_base, top_p_base = get_mean_action(td3_base, env_base, n_samples=N_SAMPLES)
print(f"✓ Base completado: temperature={temp_base:.3f}, top_p={top_p_base:.3f}")

# Same procedure for the TD3 agent trained on the LoRA model.
temp_lora, top_p_lora = get_mean_action(td3_lora, env_lora, n_samples=N_SAMPLES)
print(f"✓ LoRA completado: temperature={temp_lora:.3f}, top_p={top_p_lora:.3f}")

elapsed_time = time.time() - start_time
print("="*80)
print(f"Tiempo total: {elapsed_time/60:.1f} minutos ({elapsed_time:.0f} segundos)")
# Final summary; temp_*/top_p_* are consumed by the 7.6 evaluation cell.
print(f"\nParámetros TD3 - BASE: temperature={temp_base:.3f}, top_p={top_p_base:.3f}")
print(f"Parámetros TD3 - LORA: temperature={temp_lora:.3f}, top_p={top_p_lora:.3f}")


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# ==============================================================
# Tabla de resultados formateada: BASE + TD3 vs LORA + TD3
# ==============================================================
import pandas as pd

# Build the table only when both TD3 summaries exist in the kernel namespace;
# otherwise tell the user which cell has to run first.
if 'BASE_TD3_METRICS' in globals() and 'LORA_TD3_METRICS' in globals():
    # Keys of the summary dicts, in the same order as the row labels below.
    summary_keys = ("bertscore_f1", "factuality", "flesch_reading_ease")

    resultados_td3 = {
        "Métrica": [
            "BERTScore F1",
            "AlignScore (Factualidad)",
            "Flesch Reading Ease",
        ],
        "BASE + TD3": [BASE_TD3_METRICS[key] for key in summary_keys],
        "LORA + TD3": [LORA_TD3_METRICS[key] for key in summary_keys],
    }

    df_td3 = pd.DataFrame(resultados_td3).set_index("Métrica")

    rule = "=" * 80
    print("\n" + rule)
    print("TABLA DE RESULTADOS: BASE + TD3 vs LORA + TD3")
    print(rule)
    print(df_td3.to_string())
    print(rule)

    # Per-metric difference: (LoRA + TD3) minus (base + TD3).
    print("\nDiferencias (LORA + TD3 - BASE + TD3):")
    for prefix, key, spec in (
        ("Δ BERTScore F1:        ", "bertscore_f1", "+.4f"),
        ("Δ AlignScore (Fact.):  ", "factuality", "+.4f"),
        ("Δ Flesch Reading Ease: ", "flesch_reading_ease", "+.2f"),
    ):
        delta = LORA_TD3_METRICS[key] - BASE_TD3_METRICS[key]
        print(f"{prefix}{delta:{spec}}")
else:
    print("⚠️ ADVERTENCIA: BASE_TD3_METRICS o LORA_TD3_METRICS no están definidas.")
    print("Por favor, ejecuta primero la celda 17 que calcula estas métricas.")
    print("Esta celda requiere que se hayan evaluado los modelos con TD3.")


In [None]:
# ==============================================================
# 7.6 Evaluar modelos con TD3 vs sin TD3
# ==============================================================

from tqdm.auto import tqdm

def evaluate_model_with_params(model, tokenizer, pairs, sample_size=30,
                               temperature=0.7, top_p=0.9, max_new_tokens=256):
    """Generate and score summaries for the first ``sample_size`` pairs.

    Each pair's ``medical`` text is summarized with the given decoding
    parameters and scored by the module-level ``evaluator`` against the pair's
    ``plain`` reference, using the ``medical`` text as the source.

    Returns:
        ``(summary, results)`` where ``summary`` holds the mean
        ``bertscore_f1``, ``factuality`` and ``flesch_reading_ease`` as floats
        and ``results`` is the list of per-example metric dicts.
    """
    results = []
    for pair in tqdm(pairs[:sample_size], desc="Evaluating with TD3 params"):
        medical, reference = pair.medical, pair.plain
        generated = generate_with_params(
            model,
            tokenizer,
            medical,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
        )
        # The medical text doubles as the source for the evaluator.
        results.append(evaluator.evaluate(generated, reference, medical))

    def _mean_of(section, field):
        """Mean of ``metrics[section][field]`` over all evaluated examples."""
        return np.mean([metrics[section][field] for metrics in results])

    # Same aggregation as summarize_metrics.
    mean_f1 = _mean_of("relevance", "f1")
    mean_fact = _mean_of("factuality", "score")
    mean_fre = _mean_of("readability", "flesch_reading_ease")

    # Return both the aggregate and the individual results.
    return {
        "bertscore_f1": float(mean_f1),
        "factuality": float(mean_fact),
        "flesch_reading_ease": float(mean_fre),
    }, results
# Evaluate on at most 30 pairs from eval_pairs.
# NOTE(review): the original note says this matches the eval_pairs/sample_size
# used earlier in the notebook — confirm against the baseline evaluation cell.
sample_size = min(30, len(eval_pairs))

# Base model decoded with the TD3-derived temperature/top_p from cell 7.5.
print("\n=== Evaluando BASE + TD3 ===")
BASE_TD3_METRICS, base_td3_results = evaluate_model_with_params(
    base_model,
    base_tokenizer,
    eval_pairs,
    sample_size=sample_size,
    temperature=temp_base,
    top_p=top_p_base,
)

# LoRA model with its own TD3-derived parameters; it reuses base_tokenizer.
print("\n=== Evaluando LORA + TD3 ===")
LORA_TD3_METRICS, lora_td3_results = evaluate_model_with_params(
    model_lora,
    base_tokenizer,
    eval_pairs,
    sample_size=sample_size,
    temperature=temp_lora,
    top_p=top_p_lora,
)

# Aggregate summaries, then the number of per-example results kept in memory.
print("BASE + TD3:", BASE_TD3_METRICS)
print("LORA + TD3:", LORA_TD3_METRICS)
print(f"\n✓ Resultados individuales guardados: base_td3_results ({len(base_td3_results)} muestras)")
print(f"✓ Resultados individuales guardados: lora_td3_results ({len(lora_td3_results)} muestras)")


In [None]:
# ==============================================================
# 7.7 Tabla comparativa final: Base vs LoRA vs TD3
# ==============================================================

import pandas as pd

# Row label shown in the table -> key of that metric inside a summary dict.
_summary_keys = {
    "BERTScore F1": "bertscore_f1",
    "AlignScore (Factualidad)": "factuality",
    "Flesch Reading Ease": "flesch_reading_ease",
}

# Column label -> summary dict of each local configuration, in display order.
_local_summaries = {
    "Base": BASE_MODEL_METRICS,
    "Base + TD3": BASE_TD3_METRICS,
    "LoRA": FINETUNED_MODEL_METRICS,
    "LoRA + TD3": LORA_TD3_METRICS,
}

# One inner dict per metric, holding that metric's value for every model.
metrics_table = {
    row_label: {
        column_label: summary[key]
        for column_label, summary in _local_summaries.items()
    }
    for row_label, key in _summary_keys.items()
}

# Transpose so that metrics are rows and models are columns.
df = pd.DataFrame(metrics_table).T
print("\n=== TABLA COMPARATIVA FINAL ===")
print(df)

# Per-metric difference of every non-Base column against the Base column.
print("\nDiferencias (modelo - Base):")
non_base_columns = [column for column in df.columns if column != "Base"]
for name, row in df.iterrows():
    print(f"\n{name}:")
    baseline = row["Base"]
    for col in non_base_columns:
        delta = row[col] - baseline
        print(f"  {col} - Base: {delta:+.4f}")


In [None]:
# ==============================================================
# SECCIÓN 8: EVALUACIÓN DE APIs COMERCIALES
# ==============================================================

# Instalar librerías necesarias para APIs comerciales
!pip install openai anthropic google-generativeai -q

import os
import time
import numpy as np
from typing import Optional
from tqdm.auto import tqdm

# ==============================================================
# CONFIGURAR API KEYS
# ==============================================================
# Look for each API key first in Colab secrets, then in environment variables.
try:
    from google.colab import userdata  # importable only inside a Colab runtime
    _colab_secrets = userdata
except ImportError:
    # Not running in Colab: only environment variables are consulted.
    _colab_secrets = None


def load_api_key(name: str) -> Optional[str]:
    """Return the API key stored under ``name``, or None when it is not configured.

    Lookup order: Colab secrets (only when google.colab could be imported),
    then the environment variable with the same name. Surrounding whitespace
    is stripped and a blank value is treated as missing.

    Args:
        name: Secret / environment variable name, e.g. "API_KEY_OPENAI".

    Returns:
        The stripped key, or None if neither source provides a non-blank value.
    """
    value = None
    if _colab_secrets is not None:
        try:
            # userdata.get() only accepts the secret name in current Colab versions.
            value = _colab_secrets.get(name)
        except Exception as exc:
            # Best effort: a missing secret or denied notebook access must not
            # abort the cell. Report the error type (never the value) and fall
            # back to the environment.
            print(f" Aviso: no se pudo leer {name} desde los secretos de Colab ({type(exc).__name__})")
    if not value:
        value = os.getenv(name)
    value = value.strip() if value else None
    return value or None


API_KEY_ANTHROPIC = load_api_key("API_KEY_ANTHROPIC")
API_KEY_GEMINI = load_api_key("API_KEY_GEMINI")
API_KEY_OPENAI = load_api_key("API_KEY_OPENAI")

# Warn about every key that is still missing after both lookups.
for _key_name, _key_value in (
    ("API_KEY_ANTHROPIC", API_KEY_ANTHROPIC),
    ("API_KEY_GEMINI", API_KEY_GEMINI),
    ("API_KEY_OPENAI", API_KEY_OPENAI),
):
    if not _key_value:
        print(f" ADVERTENCIA: {_key_name} no está configurada")

# Build the API clients. Each client variable stays None when its SDK cannot
# be imported, its key is missing/invalid, or construction fails, so later
# code only needs to test the variable's truthiness.
# Key material is never printed: notebook outputs are saved and shared.
client_anthropic = None
try:
    import anthropic
    if API_KEY_ANTHROPIC:
        API_KEY_ANTHROPIC = API_KEY_ANTHROPIC.strip()  # drop stray whitespace
        if API_KEY_ANTHROPIC.startswith('sk-ant-'):
            try:
                client_anthropic = anthropic.Anthropic(api_key=API_KEY_ANTHROPIC)
                print(" Anthropic configurado correctamente")
            except Exception as e:
                print(f" Error al inicializar Anthropic: {e}")
        else:
            # Do not echo the rejected value: it may be a real key for another service.
            print(" ADVERTENCIA: API_KEY_ANTHROPIC no tiene el formato correcto (debe empezar con 'sk-ant-').")
except ImportError:
    print(" No se pudo importar anthropic")

model_gemini = None
try:
    import google.generativeai as genai
    if API_KEY_GEMINI:
        try:
            genai.configure(api_key=API_KEY_GEMINI.strip())
            model_gemini = genai.GenerativeModel('gemini-2.5-flash')
        except Exception as e:
            # Same policy as Anthropic: a bad key/config disables this API only.
            print(f" Error al inicializar Gemini: {e}")
except ImportError:
    print(" No se pudo importar google.generativeai")

client_openai = None
try:
    from openai import OpenAI
    if API_KEY_OPENAI:
        try:
            client_openai = OpenAI(api_key=API_KEY_OPENAI.strip())
        except Exception as e:
            print(f" Error al inicializar OpenAI: {e}")
except ImportError:
    print(" No se pudo importar openai")

# Función para construir el prompt (mismo que se usa localmente)
def build_api_prompt(medical_text: str) -> str:
    """Build the prompt sent to the commercial APIs.

    The prompt is an instruction paragraph, a "### Medical text:" section
    containing ``medical_text``, and a trailing "### Simplified summary:"
    header left open for the model to complete.
    """
    instruction = (
        "You are a specialist in healthcare communication. "
        "Use the context to transform the following medical text into a clear, "
        "concise, and easy-to-understand summary for a patient and their family. "
        "Retain all relevant clinical data, but explain technical terms using simple "
        "language and short sentences.\n\n"
    )
    sections = [
        instruction,
        "### Medical text:\n",
        f"{medical_text}\n\n",
        "### Simplified summary:\n",
    ]
    return "".join(sections)

# Funciones para generar resúmenes con cada API
def generate_anthropic(medical_text: str, max_retries: int = 3) -> Optional[str]:
    """Generate a plain-language summary of ``medical_text`` with Anthropic Claude.

    Args:
        medical_text: Medical text to simplify.
        max_retries: Maximum number of API attempts. Failed attempts are
            separated by an exponential backoff of 2**attempt seconds.

    Returns:
        The summary (only the part after a "### Simplified summary:" marker if
        the model echoed it), or None when the client is not configured or
        every attempt failed.
    """
    if not client_anthropic:
        return None

    prompt = build_api_prompt(medical_text)
    marker = "### Simplified summary:"

    for attempt in range(max_retries):
        try:
            message = client_anthropic.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=1024,
                # Same sampling temperature as generate_gemini / generate_openai,
                # so the three APIs are compared under equal settings.
                temperature=0.7,
                messages=[
                    {"role": "user", "content": prompt}
                ],
            )
            # Concatenate every block that carries text instead of assuming
            # that content[0] exists and is a text block.
            response = "".join(
                getattr(block, "text", None) or "" for block in message.content
            )
            if not response.strip():
                # Handled by the except below: retried like any other failure.
                raise ValueError("la respuesta no contiene texto")
            # Keep only the summary if the model repeated the prompt marker.
            if marker in response:
                return response.split(marker)[-1].strip()
            return response.strip()
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff
                continue
            print(f"Error en Anthropic: {e}")
            return None

    # Only reached when max_retries <= 0 (no attempt was made).
    return None

def generate_gemini(medical_text: str, max_retries: int = 3) -> Optional[str]:
    """Generate a plain-language summary of ``medical_text`` with Google Gemini.

    Args:
        medical_text: Medical text to simplify.
        max_retries: Maximum number of API attempts. Attempts that raise are
            separated by an exponential backoff of 2**attempt seconds.

    Returns:
        The summary (only the part after a "### Simplified summary:" marker if
        the model echoed it), or None when the model is not configured, the
        response carries no usable text, or every attempt raised.
    """
    if not model_gemini:
        return None

    prompt = build_api_prompt(medical_text)
    marker = "### Simplified summary:"

    # finish_reason codes handled here: 1 = STOP (success), 2 = MAX_TOKENS,
    # 3 = SAFETY, 4 = RECITATION. Compared with == (not hashed) so both plain
    # ints and enum-like values match.
    early_stop_warnings = (
        (2, "Warning: Gemini alcanzó el límite de tokens (finish_reason=2). Usando texto parcial generado."),
        (3, "Warning: Gemini bloqueó contenido por seguridad (finish_reason=3)"),
        (4, "Warning: Gemini bloqueó contenido por recitación (finish_reason=4)"),
    )

    def _summary_from(text):
        """Return the text after the last marker (or the whole text), stripped."""
        if marker in text:
            return text.split(marker)[-1].strip()
        return text.strip()

    def _candidate_text(candidate):
        """Concatenate the text of every part of ``candidate`` ('' when there is none)."""
        if candidate.content and candidate.content.parts:
            # Join all parts: reading only parts[0] drops the rest of a
            # multi-part answer.
            return "".join(getattr(part, "text", "") or "" for part in candidate.content.parts)
        return ""

    for attempt in range(max_retries):
        try:
            response = model_gemini.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    # NOTE(review): far larger than the 1024-token cap used for
                    # the other two APIs; presumably deliberate, confirm.
                    max_output_tokens=8192,
                    temperature=0.7,
                )
            )

            # Inspect finish_reason before touching response.text.
            if response.candidates:
                candidate = response.candidates[0]
                finish_reason = candidate.finish_reason
                text = _candidate_text(candidate)

                warning = next(
                    (msg for code, msg in early_stop_warnings if finish_reason == code),
                    None,
                )
                if warning is not None:
                    print(warning)
                    # Use the partial text only when it is long enough to be meaningful.
                    if len(text.strip()) > 10:
                        return _summary_from(text)
                    return None

                if finish_reason == 1 and text:
                    return _summary_from(text)

            # Fallback for any other finish_reason: try the response.text accessor.
            try:
                text = response.text
                if text:
                    return _summary_from(text)
            except Exception:
                # Deliberate best effort: if the accessor fails, treat it as "no output".
                pass

            return None
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff
                continue
            print(f"Error en Gemini: {e}")
            return None

    # Only reached when max_retries <= 0 (no attempt was made).
    return None

def generate_openai(medical_text: str, max_retries: int = 3) -> Optional[str]:
    """Generate a plain-language summary of ``medical_text`` with OpenAI GPT.

    Args:
        medical_text: Medical text to simplify.
        max_retries: Maximum number of API attempts. Failed attempts are
            separated by an exponential backoff of 2**attempt seconds.

    Returns:
        The summary (only the part after a "### Simplified summary:" marker if
        the model echoed it), or None when the client is not configured or
        every attempt failed.
    """
    if not client_openai:
        return None

    prompt = build_api_prompt(medical_text)
    marker = "### Simplified summary:"

    for attempt in range(max_retries):
        try:
            response = client_openai.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a specialist in healthcare communication."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1024,
                temperature=0.7,
            )
            text = response.choices[0].message.content
            if text is None:
                # The message content can be None; fail with a clear message
                # instead of a TypeError from the `in` test below. Handled by
                # the except clause, so it is retried like any other failure.
                raise ValueError("la respuesta no contiene texto (content=None)")
            # Keep only the summary if the model repeated the prompt marker.
            if marker in text:
                return text.split(marker)[-1].strip()
            return text.strip()
        except Exception as e:
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff
                continue
            print(f"Error en OpenAI: {e}")
            return None

    # Only reached when max_retries <= 0 (no attempt was made).
    return None

# Función para evaluar una API
def evaluate_api(generate_fn, api_name: str, pairs, sample_size: int = 30):
    """Score one commercial API on the first ``sample_size`` pairs.

    Args:
        generate_fn: Callable that receives the medical text and returns the
            generated summary, or None on failure.
        api_name: Display name used in the progress bar and log messages.
        pairs: Sliceable sequence of objects exposing ``.medical`` and ``.plain``.
        sample_size: Number of leading pairs to evaluate.

    Returns:
        None when no sample could be scored. Otherwise a dict with the mean
        "bertscore_f1", "factuality" and "flesch_reading_ease" over the scored
        samples, plus "n_samples" (scored) and "errors" (skipped or failed).
    """
    subset = pairs[:sample_size]
    scored = []
    failures = 0

    print(f"\nEvaluando {api_name}...")
    for position, item in enumerate(tqdm(subset, desc=f"Evaluating {api_name}")):
        # The medical text is both the generation input and the factuality source.
        source_text = item.medical
        reference_text = item.plain

        candidate = generate_fn(source_text)

        # Missing or near-empty generations count as errors and are not scored.
        usable = candidate is not None and len(candidate.strip()) >= 10
        if not usable:
            failures += 1
            continue

        try:
            scored.append(evaluator.evaluate(candidate, reference_text, source_text))
        except Exception as exc:
            print(f"Error evaluando muestra {position}: {exc}")
            failures += 1

    if len(scored) == 0:
        print(f" No se pudieron evaluar muestras para {api_name}")
        return None

    if failures > 0:
        print(f" {failures} errores durante la evaluación de {api_name}")

    def _mean_of(section, field):
        """Average one nested metric over all scored samples."""
        return float(np.mean([entry[section][field] for entry in scored]))

    return {
        "bertscore_f1": _mean_of("relevance", "f1"),
        "factuality": _mean_of("factuality", "score"),
        "flesch_reading_ease": _mean_of("readability", "flesch_reading_ease"),
        "n_samples": len(scored),
        "errors": failures,
    }

# Evaluate every commercial API that has a configured client.
sample_size = min(30, len(eval_pairs))
banner = "="*80
print(banner)
print("EVALUANDO APIs COMERCIALES")
print(banner)
print(f"Tamaño de muestra: {sample_size}")

# Defined up front so later cells can test them even if an evaluation aborts.
ANTHROPIC_METRICS = None
GEMINI_METRICS = None
OPENAI_METRICS = None


def _evaluate_api_if_configured(handle, generate_fn, api_name, unavailable_message):
    """Run evaluate_api when ``handle`` is truthy; otherwise print why it is skipped.

    Returns the summary dict from evaluate_api (printed when truthy), or None
    when the API is unavailable or nothing could be evaluated.
    """
    if not handle:
        print(unavailable_message)
        return None
    summary = evaluate_api(generate_fn, api_name, eval_pairs, sample_size)
    if summary:
        print(f"{api_name}: {summary}")
    return summary


ANTHROPIC_METRICS = _evaluate_api_if_configured(
    client_anthropic,
    generate_anthropic,
    "Anthropic Claude",
    " Anthropic no disponible (API key no configurada)",
)

GEMINI_METRICS = _evaluate_api_if_configured(
    model_gemini,
    generate_gemini,
    "Google Gemini",
    " Gemini no disponible (API key no configurada)",
)

OPENAI_METRICS = _evaluate_api_if_configured(
    client_openai,
    generate_openai,
    "OpenAI GPT-4o",
    " OpenAI no disponible (API key no configurada)",
)


In [None]:
# ==============================================================
# 8.1 Tabla comparativa final: Modelos locales vs APIs comerciales
# ==============================================================

separator = "="*80
print(separator)
print("TABLA COMPARATIVA FINAL: MODELOS LOCALES vs APIs COMERCIALES")
print(separator)

# Row label shown in the table -> key of that metric inside a summary dict.
_row_fields = {
    "BERTScore F1": "bertscore_f1",
    "AlignScore (Factualidad)": "factuality",
    "Flesch Reading Ease": "flesch_reading_ease",
}

# Metrics table: one (initially empty) dict of columns per metric row.
metrics_table = {row_label: {} for row_label in _row_fields}


def _add_model_column(column_label, summary):
    """Copy the three summary metrics of one model into ``metrics_table``."""
    for row_label, field in _row_fields.items():
        metrics_table[row_label][column_label] = summary[field]


# Local models: added only when their summary variable exists in this session.
for column_label, variable_name in (
    ("Base", "BASE_MODEL_METRICS"),
    ("Base + TD3", "BASE_TD3_METRICS"),
    ("LoRA", "FINETUNED_MODEL_METRICS"),
    ("LoRA + TD3", "LORA_TD3_METRICS"),
):
    if variable_name in globals():
        _add_model_column(column_label, globals()[variable_name])

# Commercial APIs: added only when their evaluation produced a summary.
for column_label, api_summary in (
    ("Anthropic Claude", ANTHROPIC_METRICS),
    ("Google Gemini", GEMINI_METRICS),
    ("OpenAI GPT-4o", OPENAI_METRICS),
):
    if api_summary:
        _add_model_column(column_label, api_summary)

# Build the DataFrame (metrics as rows, models as columns) and show it.
df_final = pd.DataFrame(metrics_table).T
print("\n" + separator)
print(df_final.to_string())
print(separator)

# Differences with respect to the Base model.
print("\n" + separator)
print("DIFERENCIAS RESPECTO AL MODELO BASE")
print(separator)

comparison_columns = [column for column in df_final.columns if column != "Base"]
for name, row in df_final.iterrows():
    print(f"\n{name}:")
    base_value = row.get("Base")
    if base_value is not None:
        for col in comparison_columns:
            # Models without a value for this metric are skipped.
            if pd.notna(row[col]):
                delta = row[col] - base_value
                print(f"  {col:25s}: {row[col]:.4f} (Δ {delta:+.4f})")

# Best model per metric (highest value wins).
print("\n" + separator)
print("MEJORES MODELOS POR MÉTRICA")
print(separator)

for metric in df_final.index:
    valid_values = df_final.loc[metric].dropna()
    if not valid_values.empty:
        best_model = valid_values.idxmax()
        best_value = valid_values.max()
        print(f"{metric:30s}: {best_model:25s} ({best_value:.4f})")


In [None]:
# ==============================================================
# 8.2 Gráficas comparativas: Modelos locales vs APIs comerciales
# ==============================================================

import matplotlib.pyplot as plt
import numpy as np

# ==============================================================
# SELECT THE LOCAL MODEL TO COMPARE
# ==============================================================
# Set this to the local model that should be plotted next to the commercial APIs.
# Options: "Base", "Base + TD3", "LoRA", "LoRA + TD3"
MODELO_LOCAL_SELECCIONADO = "LoRA"

# Each option -> name of the global holding its summary metrics. The lookup is
# by name so that only the selected model's evaluation needs to have been run.
_local_metric_globals = {
    "Base": "BASE_MODEL_METRICS",
    "Base + TD3": "BASE_TD3_METRICS",
    "LoRA": "FINETUNED_MODEL_METRICS",
    "LoRA + TD3": "LORA_TD3_METRICS",
}

# Fail loudly on a typo instead of silently plotting an empty (NaN) bar.
if MODELO_LOCAL_SELECCIONADO not in _local_metric_globals:
    raise ValueError(
        f"MODELO_LOCAL_SELECCIONADO={MODELO_LOCAL_SELECCIONADO!r} no es válido; "
        f"opciones: {list(_local_metric_globals)}"
    )
_selected_global = _local_metric_globals[MODELO_LOCAL_SELECCIONADO]
if _selected_global not in globals():
    raise NameError(
        f"{_selected_global} no está definido; ejecuta antes la evaluación de "
        f"'{MODELO_LOCAL_SELECCIONADO}'"
    )

# Models to plot, in display order: the selected local model first, then every
# commercial API whose evaluation produced a summary.
_plotted_summaries = {MODELO_LOCAL_SELECCIONADO: globals()[_selected_global]}
if ANTHROPIC_METRICS:
    _plotted_summaries["Anthropic Claude"] = ANTHROPIC_METRICS
if GEMINI_METRICS:
    _plotted_summaries["Google Gemini"] = GEMINI_METRICS
if OPENAI_METRICS:
    _plotted_summaries["OpenAI GPT-4o"] = OPENAI_METRICS

# Plot title of each metric -> key of that metric inside a summary dict.
_plot_metric_keys = {
    "BERTScore F1": "bertscore_f1",
    "AlignScore (Factualidad)": "factuality",
    "Flesch Reading Ease": "flesch_reading_ease",
}

# Data for the charts: metric title -> {model label -> value}.
metrics_data = {
    metric_label: {
        model_label: summary[key]
        for model_label, summary in _plotted_summaries.items()
    }
    for metric_label, key in _plot_metric_keys.items()
}

# Bar colour of each model (hex RGB).
model_colors = {
    "Base": "#8B7355",
    "Base + TD3": "#4A90E2",
    "LoRA": "#D3D3D3",
    "LoRA + TD3": "#FF6B35",
    "Anthropic Claude": "#2E7D32",
    "Google Gemini": "#4285F4",
    "OpenAI GPT-4o": "#10A37F",
}

# Order of the bars: selected local model followed by the available APIs.
model_order = list(_plotted_summaries)

# Figure 1: one subplot per metric, one bar per model.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
fig.suptitle(f"Comparación de Métricas: {MODELO_LOCAL_SELECCIONADO} vs APIs Comerciales",
             fontsize=16, fontweight='bold', y=1.02)

# Bar colours, in the same order as model_order (grey for unknown models).
colors = [model_colors.get(model, "#808080") for model in model_order]

for idx, (metric_name, metric_values) in enumerate(metrics_data.items()):
    ax = axes[idx]

    # Values in plotting order; models without this metric become NaN.
    values = [metric_values.get(model, np.nan) for model in model_order]

    bars = ax.bar(range(len(model_order)), values, color=colors, alpha=0.8, edgecolor='black', linewidth=1.2)

    # Value label at the tip of each bar: above positive bars, below negative ones.
    for bar, val in zip(bars, values):
        if not np.isnan(val):
            ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                   f'{val:.2f}',
                   ha='center', va='bottom' if val >= 0 else 'top',
                   fontsize=9, fontweight='bold')

    # Axis cosmetics.
    ax.set_xticks(range(len(model_order)))
    ax.set_xticklabels(model_order, rotation=45, ha='right', fontsize=10)
    ax.set_ylabel('Score', fontsize=11, fontweight='bold')
    ax.set_title(metric_name, fontsize=12, fontweight='bold', pad=10)
    ax.grid(axis='y', alpha=0.3, linestyle='--')

    # Y range with 15% headroom. Zero is always included, and negative values
    # (Flesch Reading Ease can go below 0) extend the axis downwards instead
    # of producing an inverted or empty range.
    finite_values = [v for v in values if not np.isnan(v)]
    if finite_values:
        highest = max(finite_values)
        lowest = min(finite_values)
        upper = highest * 1.15 if highest > 0 else 0.0
        lower = lowest * 1.15 if lowest < 0 else 0.0
        if upper == lower:
            upper = 1.0  # every value is exactly 0: keep a visible range
        ax.set_ylim(lower, upper)
    else:
        ax.set_ylim(0, 100)

    # Reference line at the Base model's value. It is drawn only when "Base"
    # is one of the plotted models for this metric.
    if "Base" in metric_values and not np.isnan(metric_values["Base"]):
        ax.axhline(y=metric_values["Base"], color='gray', linestyle=':', linewidth=1.5, alpha=0.7, label='Base')
        if idx == 0:  # show the legend on the first subplot only
            ax.legend(loc='upper right', fontsize=8)

plt.tight_layout()
plt.show()

# Figure 2: combined chart with the three metrics grouped per model.
fig2, ax2 = plt.subplots(figsize=(16, 8))

x = np.arange(len(model_order))
width = 0.25  # width of each bar


def _values_in_plot_order(metric_label):
    """Values of one metric following model_order (NaN where a model has none)."""
    return [metrics_data[metric_label].get(model, np.nan) for model in model_order]


y1 = _values_in_plot_order("BERTScore F1")
y2 = _values_in_plot_order("AlignScore (Factualidad)")
y3 = _values_in_plot_order("Flesch Reading Ease")

# Flesch Reading Ease is divided by 10 so it fits the scale of the other two
# metrics (NaN stays NaN under division).
y3_norm = [v / 10 for v in y3]

# (x positions, bar heights, values shown as labels, legend label, colour).
# The Flesch bars use the scaled height but are labelled with the raw value.
series = [
    (x - width, y1, y1, 'BERTScore F1', '#FF6B35'),
    (x, y2, y2, 'AlignScore (Factualidad)', '#4A90E2'),
    (x + width, y3_norm, y3, 'Flesch Reading Ease (÷10)', '#2E7D32'),
]

for positions, heights, shown_values, series_label, series_color in series:
    bars = ax2.bar(positions, heights, width, label=series_label, color=series_color,
                   alpha=0.8, edgecolor='black', linewidth=1)
    # Value label at the tip of each bar: above positive bars, below negative ones.
    for bar, val in zip(bars, shown_values):
        if not np.isnan(val):
            ax2.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
                    f'{val:.2f}',
                    ha='center', va='bottom' if val >= 0 else 'top',
                    fontsize=8, fontweight='bold')

# Axis cosmetics.
ax2.set_xlabel('Modelos', fontsize=12, fontweight='bold')
ax2.set_ylabel('Score', fontsize=12, fontweight='bold')
ax2.set_title(f'Comparación Completa de Métricas: {MODELO_LOCAL_SELECCIONADO} vs APIs Comerciales',
              fontsize=14, fontweight='bold', pad=15)
ax2.set_xticks(x)
ax2.set_xticklabels(model_order, rotation=45, ha='right', fontsize=10)
ax2.legend(loc='upper left', fontsize=10)
ax2.grid(axis='y', alpha=0.3, linestyle='--')

plt.tight_layout()
plt.show()

print("\n" + "="*80)
print("GRÁFICAS GENERADAS")
print("="*80)
print("✓ Gráfica 1: Comparación por métrica (3 subplots)")
print("✓ Gráfica 2: Comparación combinada (todas las métricas)")
print("\nNota: Flesch Reading Ease está normalizado (÷10) en la gráfica combinada para mejor visualización.")


In [None]:
# ==============================================================
# 9. DESCARGAR MODELOS DESDE COLAB
# ==============================================================
# Esta celda permite descargar los modelos entrenados:
# 1. Modelo LoRA (finetuning)
# 2. Agente TD3 Base
# 3. Agente TD3 LoRA

import zipfile
import shutil
from google.colab import files, drive
from pathlib import Path

header_rule = "="*80
print(header_rule)
print("DESCARGAR MODELOS DESDE COLAB")
print(header_rule)

# Output locations (they must match the paths used earlier in the notebook).
ADAPTER_DIR = "/content/deepseek-r1-distill-qwen-1.5b-pls"
TD3_BASE_DIR = "/content/td3_base_agent"
TD3_LORA_DIR = "/content/td3_lora_agent"

# (label stored in modelos_disponibles, name used in the log line, path)
_model_candidates = [
    ("LoRA (Finetuning)", "Modelo LoRA", ADAPTER_DIR),
    ("TD3 Base", "Agente TD3 Base", TD3_BASE_DIR),
    ("TD3 LoRA", "Agente TD3 LoRA", TD3_LORA_DIR),
]

# Keep the (label, path) of every candidate that exists on disk.
modelos_disponibles = []
for model_label, log_name, model_path in _model_candidates:
    if os.path.exists(model_path):
        modelos_disponibles.append((model_label, model_path))
        print(f"✓ {log_name} encontrado en: {model_path}")
    else:
        print(f"✗ {log_name} NO encontrado en: {model_path}")

def _add_path_to_zip(zipf, ruta):
    """Add ``ruta`` (a single file or a whole directory tree) to the open ZipFile.

    Entries are stored relative to the parent of ``ruta``, so each model ends
    up under exactly one top-level folder named after it
    (e.g. ``td3_base_agent/...``) instead of a doubled
    ``td3_base_agent/td3_base_agent/...`` prefix.
    """
    ruta = os.path.normpath(ruta)
    if os.path.isfile(ruta):
        # os.walk yields nothing for a plain file, so add it explicitly.
        zipf.write(ruta, os.path.basename(ruta))
        return
    parent_dir = os.path.dirname(ruta)
    for root, _dirs, filenames in os.walk(ruta):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            zipf.write(file_path, os.path.relpath(file_path, parent_dir))


if not modelos_disponibles:
    print("\n No se encontraron modelos para descargar.")
    print("Asegúrate de haber ejecutado las celdas de entrenamiento primero.")
else:
    print(f"\n Se encontraron {len(modelos_disponibles)} modelo(s) para descargar.")

    # ==============================================================
    # OPTION 1: Download as a ZIP (recommended for large files)
    # ==============================================================
    print("\n" + "="*80)
    print("OPCIÓN 1: DESCARGAR COMO ARCHIVO ZIP")
    print("="*80)

    ZIP_OUTPUT = "/content/modelos_deepseek-r1-distill-qwen-1.5b.zip"

    # Create one ZIP containing every available model.
    with zipfile.ZipFile(ZIP_OUTPUT, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for nombre, ruta in modelos_disponibles:
            if os.path.exists(ruta):
                print(f"Agregando {nombre}...")
                _add_path_to_zip(zipf, ruta)

    # Report the size of the archive.
    zip_size_mb = os.path.getsize(ZIP_OUTPUT) / (1024 * 1024)
    print(f"\n✓ ZIP creado: {ZIP_OUTPUT}")
    print(f"  Tamaño: {zip_size_mb:.2f} MB")

    # Trigger the browser download of the ZIP.
    print("\n⬇ Descargando ZIP...")
    files.download(ZIP_OUTPUT)
    print("✓ Descarga iniciada. El archivo se descargará automáticamente.")

    # ==============================================================
    # OPTION 2: Upload to Google Drive (for very large files)
    # ==============================================================
    # Only prints the instructions; nothing is mounted or copied here.
    print("\n" + "="*80)
    print("OPCIÓN 2: SUBIR A GOOGLE DRIVE (Opcional)")
    print("="*80)
    print("Si el archivo ZIP es muy grande, puedes subirlo a Google Drive.")
    print("Descomenta las siguientes líneas para usar esta opción:\n")

    print("# Montar Google Drive")
    print("# drive.mount('/content/drive')")
    print("#")
    print("# Copiar ZIP a Drive")
    print("# drive_dest = '/content/drive/MyDrive/modelos_deepseek-r1-distill-qwen-1.5b.zip'")
    print("# shutil.copy(ZIP_OUTPUT, drive_dest)")
    print("# print(f'✓ Archivo copiado a: {drive_dest}')")

    # ==============================================================
    # OPTION 3: Download models individually (small files only)
    # ==============================================================
    print("\n" + "="*80)
    print("OPCIÓN 3: DESCARGAR MODELOS INDIVIDUALES (Solo para archivos pequeños)")
    print("="*80)
    print("Nota: Esta opción puede fallar si los archivos son muy grandes.")
    print("Recomendamos usar la OPCIÓN 1 (ZIP) en su lugar.\n")

    # Uncomment to download each model as its own ZIP:
    # for nombre, ruta in modelos_disponibles:
    #     if os.path.exists(ruta):
    #         print(f"Descargando {nombre}...")
    #         # Build an individual ZIP
    #         zip_individual = f"/content/{os.path.basename(ruta)}.zip"
    #         shutil.make_archive(zip_individual.replace('.zip', ''), 'zip', ruta)
    #         files.download(zip_individual)

    print("\n" + "="*80)
    print("RESUMEN")
    print("="*80)
    print(f"✓ ZIP creado con {len(modelos_disponibles)} modelo(s)")
    print(f"✓ Ubicación: {ZIP_OUTPUT}")
    print(f"✓ Tamaño: {zip_size_mb:.2f} MB")
    print("\n Para usar los modelos en otro lugar:")
    print("   1. Descarga el archivo ZIP")
    print("   2. Extrae el contenido")
    print("   3. Carga los modelos usando:")
    print("      - LoRA: model_lora = PeftModel.from_pretrained(base_model, ADAPTER_DIR)")
    print("      - TD3: td3_agent = TD3.load(TD3_DIR, env=env)")
    print("\n" + "="*80)
