# FacebookAI/roberta-base reward model

In [None]:
!pip install trl==0.7.10 transformers==4.38.2 torch==2.2.1
!pip install --upgrade fsspec datasets gcsfs firebase-admin pandas huggingface_hub transformers accelerate
!pip install --force-reinstall git+https://github.com/huggingface/trl.git

import numpy
print("Numpy version:", numpy.__version__)
import transformers
print("Transformers:", transformers.__version__)
import trl
print("TRL:", trl.__version__)

In [None]:
try:
    from trl import DataCollatorForRewardTraining
except ImportError:
    from trl import RewardDataCollatorWithPadding as DataCollatorForRewardTraining

In [None]:
# Import necessary libraries
import pandas as pd
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
import firebase_admin
from firebase_admin import credentials, db
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorWithPadding
)
from huggingface_hub import HfApi, login
from datasets import Dataset
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from trl import RewardConfig, RewardTrainer # RewardConfig and RewardTrainer from trl
from huggingface_hub import create_repo

# Custom Data Collator implementation
@dataclass
class CustomRewardDataCollator:
    """Collate pairwise preference examples for reward-model training.

    Each incoming feature must carry the keys ``input_ids_chosen``,
    ``attention_mask_chosen``, ``input_ids_rejected`` and
    ``attention_mask_rejected``. The chosen and rejected sequences are padded
    independently with ``DataCollatorWithPadding`` and returned under the
    same four key names, plus ``"return_loss": True``.

    Attributes are forwarded unchanged to ``DataCollatorWithPadding``.
    """

    tokenizer: Any
    padding: Union[bool, str] = True
    # ``None`` means "no explicit limit"; the annotation now reflects that.
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    @staticmethod
    def _select(features: List[Dict[str, Any]], suffix: str) -> List[Dict[str, Any]]:
        """Extract the ``input_ids``/``attention_mask`` pair stored under ``*_<suffix>``."""
        return [
            {
                "input_ids": feature[f"input_ids_{suffix}"],
                "attention_mask": feature[f"attention_mask_{suffix}"],
            }
            for feature in features
        ]

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad the chosen and rejected sides of ``features`` into two batches.

        Args:
            features: One dict per example, holding the four tokenized keys
                described in the class docstring. Extra keys are ignored.

        Returns:
            A dict with the padded ``*_chosen`` / ``*_rejected`` batches and
            ``"return_loss": True``.

        Raises:
            KeyError: If a feature is missing one of the required keys.
        """
        # Built per call so that later changes to the dataclass fields are honoured.
        collator = DataCollatorWithPadding(
            self.tokenizer,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )

        batch_chosen = collator(self._select(features, "chosen"))
        batch_rejected = collator(self._select(features, "rejected"))

        return {
            "input_ids_chosen": batch_chosen["input_ids"],
            "attention_mask_chosen": batch_chosen["attention_mask"],
            "input_ids_rejected": batch_rejected["input_ids"],
            "attention_mask_rejected": batch_rejected["attention_mask"],
            "return_loss": True,
        }

# Firebase credentials
# NOTE(review): the service-account dict is embedded in the notebook (shown
# redacted here). Presumably it should come from a secret store or an
# environment variable rather than being committed — confirm.
credentials_dict = {...}

# Initialise the default Firebase app only when none is registered yet, so
# re-running this cell does not call initialize_app a second time.
# NOTE(review): `_apps` is a private attribute of firebase_admin.
if not firebase_admin._apps:
    cred = credentials.Certificate(credentials_dict)
    firebase_admin.initialize_app(cred, {
        'databaseURL': 'https://...'
    })

# Define RLHF Feedback Pipeline
class RLHFFeedbackPipeline:
    """Turn user feedback stored under ``/feedback`` into preference pairs.

    The Firebase app must already be initialised (via
    ``firebase_admin.initialize_app``) before an instance is created.
    """

    # The only values of ``responses.selected`` that identify a preference.
    _VALID_SELECTIONS = ('Response 1', 'Response 2')

    def __init__(self, database_url: str):
        """Bind to the ``/feedback`` node of the default Firebase app.

        Args:
            database_url: Kept for interface compatibility and stored for
                reference; the database actually used is the one configured
                on the already-initialised Firebase app.
        """
        self.database_url = database_url
        self.ref = db.reference('/feedback')

    def fetch_feedback(self, days: int = 7) -> Dict[str, Any]:
        """Return every feedback entry, keyed by its Firebase push id.

        Args:
            days: Accepted for backward compatibility. No time filter is
                applied: all entries are returned regardless of age.

        Returns:
            The raw feedback mapping, or ``{}`` when the node is empty or the
            read fails (the error is printed, not raised).
        """
        try:
            return self.ref.get() or {}
        except Exception as e:
            # Best-effort by design: callers treat "no data" and "fetch
            # failed" the same way, so report the error and carry on.
            print(f"Error fetching feedback: {e}")
            return {}

    def extract_response_data(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Flatten one raw feedback entry.

        Args:
            entry: A single value from the mapping returned by
                ``fetch_feedback``.

        Returns:
            A dict with ``prompt``, ``response1``, ``response2``,
            ``selected``, ``quality`` (float) and ``timestamp``; or ``None``
            when the entry is malformed (not a mapping, or a non-numeric
            quality rating).
        """
        try:
            responses = entry.get('responses', {})
            ratings = entry.get('ratings', {})
            return {
                'prompt': entry.get('prompt', ''),
                'response1': responses.get('response1', ''),
                'response2': responses.get('response2', ''),
                'selected': responses.get('selected', ''),
                'quality': float(ratings.get('overall_quality', 0)),
                'timestamp': entry.get('timestamp', '')
            }
        except (AttributeError, TypeError, ValueError) as e:
            print(f"Error extracting data: {e}")
            return None

    def create_rlhf_dataset(self, min_quality: float = 4) -> pd.DataFrame:
        """Build the (prompt, chosen, rejected) preference table.

        An entry is kept only if it parses, its quality rating is at least
        ``min_quality``, and ``selected`` is exactly ``'Response 1'`` or
        ``'Response 2'``. Entries with a missing or unrecognised selection
        are skipped instead of being counted as a vote for response 2.

        Args:
            min_quality: Minimum overall-quality rating required to keep an
                entry. Defaults to 4.

        Returns:
            A DataFrame with columns ``prompt``, ``chosen``, ``rejected``,
            ``quality`` and ``timestamp`` (empty when nothing qualifies).
        """
        columns = ['prompt', 'chosen', 'rejected', 'quality', 'timestamp']
        rlhf_data = []
        for entry in self.fetch_feedback().values():
            data = self.extract_response_data(entry)
            if not data or data['quality'] < min_quality:
                continue
            if data['selected'] not in self._VALID_SELECTIONS:
                continue

            if data['selected'] == 'Response 1':
                chosen, rejected = data['response1'], data['response2']
            else:
                chosen, rejected = data['response2'], data['response1']

            rlhf_data.append({
                'prompt': data['prompt'],
                'chosen': chosen,
                'rejected': rejected,
                'quality': data['quality'],
                'timestamp': data['timestamp']
            })
        return pd.DataFrame(rlhf_data, columns=columns)

# Initialize pipeline
# The URL argument is not used by RLHFFeedbackPipeline.__init__ above; the
# database is whatever the already-initialised Firebase app points at.
# NOTE(review): the name `pipeline` is also imported from transformers in a
# later cell of this notebook, so the binding depends on cell execution order.
pipeline = RLHFFeedbackPipeline("https://...")
rlhf_df = pipeline.create_rlhf_dataset()

# NOTE(review): the Hub token is a string literal in the notebook (shown
# redacted). Presumably it should be read from a secret store — confirm.
HF_TOKEN = "..."
login(HF_TOKEN)

if not rlhf_df.empty:
    print(f"Dataset size: {len(rlhf_df)} examples")

    # Identifiers reused several times below are defined once.
    base_model_name = "FacebookAI/roberta-base"
    hub_repo_id = "ArsenKe/reward-model-roberta"
    max_seq_length = 512

    # Load model and tokenizer. num_labels=1 yields a single logit per
    # sequence, which serves as the scalar reward.
    reward_model = AutoModelForSequenceClassification.from_pretrained(
        base_model_name,
        num_labels=1
    )
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Set dropout to zero
    for module in reward_model.modules():
        if isinstance(module, torch.nn.Dropout):
            module.p = 0

    def prepare_reward_dataset(df: pd.DataFrame) -> Dataset:
        """Convert the preference DataFrame into a `datasets.Dataset`.

        Only the "prompt", "chosen" and "rejected" columns are carried over.
        """
        dataset_dict = {
            "prompt": df['prompt'].tolist(),
            "chosen": df['chosen'].tolist(),
            "rejected": df['rejected'].tolist()
        }
        return Dataset.from_dict(dataset_dict)

    def preprocess_function(examples):
        """Tokenize a batch of chosen/rejected responses.

        Sequences are truncated to `max_seq_length` but left unpadded:
        CustomRewardDataCollator pads each batch to its longest member, so
        padding every example to the full length here only wastes compute
        and memory.

        NOTE(review): only the responses are tokenized; the prompt is not
        part of the reward model's input. Confirm this is intended.
        """
        chosen = tokenizer(examples["chosen"],
                           truncation=True,
                           max_length=max_seq_length)

        rejected = tokenizer(examples["rejected"],
                             truncation=True,
                             max_length=max_seq_length)

        return {
            "input_ids_chosen": chosen["input_ids"],
            "attention_mask_chosen": chosen["attention_mask"],
            "input_ids_rejected": rejected["input_ids"],
            "attention_mask_rejected": rejected["attention_mask"],
        }

    reward_dataset = prepare_reward_dataset(rlhf_df)
    tokenized_dataset = reward_dataset.map(preprocess_function, batched=True)

    # Create data collator (pads chosen and rejected sides per batch)
    data_collator = CustomRewardDataCollator(tokenizer=tokenizer, max_length=max_seq_length)

    # Initialize trainer. remove_unused_columns=False keeps the
    # *_chosen / *_rejected columns that the custom collator reads.
    trainer = RewardTrainer(
        model=reward_model,
        args=RewardConfig(
            output_dir="./reward_model",
            per_device_train_batch_size=8,
            num_train_epochs=3,
            logging_dir="./logs",
            logging_steps=1,
            remove_unused_columns=False,
            fp16=False,
            disable_dropout=True
        ),
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
    )

    # Train and save
    print(" Starting Training...")
    trainer.train()
    print("Training Completed!")

    save_dir = "./fine_tuned_roberta_rlhf"
    trainer.save_model(save_dir)
    # The tokenizer is saved next to the weights so the uploaded folder can
    # be loaded on its own.
    tokenizer.save_pretrained(save_dir)
    print(" Model saved to", save_dir)

    create_repo(repo_id=hub_repo_id, repo_type="model", exist_ok=True)

    # Upload to Hub
    api = HfApi()
    api.upload_folder(
        folder_path=save_dir,
        path_in_repo=".",
        repo_id=hub_repo_id,
        repo_type="model"
    )
else:
    print(" No data available")

Error fetching feedback: ('invalid_grant: Invalid JWT Signature.', {'error': 'invalid_grant', 'error_description': 'Invalid JWT Signature.'})
⚠️ No data available


In [None]:
# Manual re-upload of the saved reward model folder.
# Relies on `save_dir` and `HfApi` from the training cell above; `save_dir`
# is only assigned there when the feedback dataset was non-empty.
from huggingface_hub import create_repo

create_repo(repo_id="ArsenKe/reward-model-roberta", repo_type="model", exist_ok=True)
# Upload to Hub
api = HfApi()
api.upload_folder(
    folder_path=save_dir,
    path_in_repo=".",
    repo_id="ArsenKe/reward-model-roberta",
    repo_type="model"
)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ArsenKe/reward-model-roberta/commit/af7a6cf7e95f196309b29a5b3bb2b41f6d209fcf', commit_message='Upload folder using huggingface_hub', commit_description='', oid='af7a6cf7e95f196309b29a5b3bb2b41f6d209fcf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ArsenKe/reward-model-roberta', endpoint='https://huggingface.co', repo_type='model', repo_id='ArsenKe/reward-model-roberta'), pr_revision=None, pr_num=None)

In [None]:
tokenizer.save_pretrained(save_dir)


('./fine_tuned_roberta_rlhf/tokenizer_config.json',
 './fine_tuned_roberta_rlhf/special_tokens_map.json',
 './fine_tuned_roberta_rlhf/vocab.json',
 './fine_tuned_roberta_rlhf/merges.txt',
 './fine_tuned_roberta_rlhf/added_tokens.json',
 './fine_tuned_roberta_rlhf/tokenizer.json')

In [None]:
# Re-upload after the tokenizer files were written to `save_dir`.
from huggingface_hub import create_repo, HfApi

# NOTE(review): the repo id passed to create_repo differs from the one
# passed to upload_folder below — confirm both refer to the same repository.
create_repo(repo_id=".../reward-model-roberta", repo_type="model", exist_ok=True)

# Upload everything
api = HfApi()
api.upload_folder(
    folder_path=save_dir,
    path_in_repo=".",
    repo_id="ArsenKe/reward-model-roberta",
    repo_type="model"
)


No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/ArsenKe/reward-model-roberta/commit/af7a6cf7e95f196309b29a5b3bb2b41f6d209fcf', commit_message='Upload folder using huggingface_hub', commit_description='', oid='af7a6cf7e95f196309b29a5b3bb2b41f6d209fcf', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ArsenKe/reward-model-roberta', endpoint='https://huggingface.co', repo_type='model', repo_id='ArsenKe/reward-model-roberta'), pr_revision=None, pr_num=None)

# Fine-tuning your chatbot using Proximal Policy Optimization (PPO).


In [None]:
!pip install trl accelerate transformers datasets torch
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

# Load base model
model = AutoModelForCausalLMWithValueHead.from_pretrained(".../MT5_large_finetuned_chatbot")
tokenizer = AutoTokenizer.from_pretrained(".../MT5_large_finetuned_chatbot")

# Load reward model
from transformers import AutoModelForSequenceClassification
reward_model = AutoModelForSequenceClassification.from_pretrained(".../reward-model-roberta")

# PPO configs
ppo_config = PPOConfig(
    model_name=".../MT5_large_finetuned_chatbot",
    learning_rate=1e-5,
    batch_size=8,
    log_with="wandb",
    project_kwargs={"name": "mt5-ppo-rlhf"},
)

# Initialize PPO trainer
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
    reward_model=reward_model
)

# Fine-tuning loop using Firebase
from transformers import pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

for step in range(50):  #  50 PPO steps
    query = "What is Vienna famous for?"
    response = generator(query, max_new_tokens=50)[0]["generated_text"]

    # Reward scoring
    inputs = tokenizer(response, return_tensors="pt", truncation=True, padding=True)
    reward = reward_model(**inputs).logits.item()

    # Optimize via PPO
    ppo_trainer.step([query], [response], [reward])


*ValueError: Unrecognized configuration class <class 'transformers.models.mt5.configuration_mt5.MT5Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of AriaTextConfig, BambaConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, DeepseekV3Config, DiffLlamaConfig, ElectraConfig, Emu3Config, ErnieConfig, FalconConfig, FalconMambaConfig, FuyuConfig, GemmaConfig, Gemma2Config, Gemma3Config, Gemma3TextConfig, GitConfig, GlmConfig, Glm4Config, GotOcr2Config, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GraniteConfig, GraniteMoeConfig, GraniteMoeSharedConfig, HeliumConfig, JambaConfig, JetMoeConfig, LlamaConfig, Llama4Config, Llama4TextConfig, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MllamaConfig, MoshiConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NemotronConfig, OlmoConfig, Olmo2Config, OlmoeConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, Phi4MultimodalConfig, PhimoeConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, Qwen3Config, Qwen3MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcode...*

In [None]:
# Persist the PPO-tuned policy locally, then publish the folder to the Hub.
from huggingface_hub import create_repo, upload_folder

ppo_output_dir = "MT5_RLHF_finetuned"
ppo_repo_id = "ArsenKe/MT5_RLHF_finetuned"

# Depending on the installed TRL version the trainer exposes either
# `save_model` or `save_pretrained`; use whichever is available.
if hasattr(ppo_trainer, "save_model"):
    ppo_trainer.save_model(ppo_output_dir)
else:
    ppo_trainer.save_pretrained(ppo_output_dir)

# Make sure the target repository exists before uploading, as the other
# upload cells in this notebook do.
create_repo(repo_id=ppo_repo_id, repo_type="model", exist_ok=True)
upload_folder(repo_id=ppo_repo_id, folder_path="./MT5_RLHF_finetuned", repo_type="model")


To trigger training automatically when new data arrives (for example, feedback from the Gradio app stored in Firebase), combine Hugging Face Webhooks with a small backend such as a Google Cloud Function or a publicly reachable Colab endpoint.

Bonus: the same automation can also be built with Hugging Face Spaces or GitHub Actions.


In [None]:
!pip install flask-ngrok
from flask import Flask, request

app = Flask(__name__)

@app.route("/webhook", methods=["POST"])
def webhook():
    data = request.json
    print("🔔 New Hugging Face webhook event received:", data)
    # Call your training function here
    return "Webhook received!", 200

from flask_ngrok import run_with_ngrok
run_with_ngrok(app)
app.run()


# DPO

In [None]:
!pip install --upgrade --no-cache-dir \
    pandas==2.2.2 \
    numpy==1.25.3

# PyTorch + CUDA 12.4 ecosystem
!pip install --upgrade --no-cache-dir \
    --index-url https://download.pytorch.org/whl/cu124 \
    torch==2.6.0+cu124 \
    torchvision==0.21.0+cu124 \
    torchaudio==2.6.0+cu124

#  fastai
!pip install fastai==2.7.19

# Hugging Face and RLHF tooling
!pip install --upgrade --no-cache-dir \
    transformers==4.46.0 \
    trl==0.16.1 \
    accelerate==1.6.0 \
    datasets==3.5.0 \
    tokenizers==0.20.3 \
    huggingface-hub==0.30.2 \
    bitsandbytes==0.39.0
# Peft
!pip install peft==0.4.0
# Filesystems & packaging
!pip install --upgrade --no-cache-dir \
    fsspec==2024.12.0 \
    gcsfs==2024.12.0 \
    packaging==24.2.0 \
    rich==13.7.1 \
    jedi>=0.16

# Firebase Admin SDK
!pip install firebase-admin==6.7.0

In [None]:
import torch, transformers, trl, tokenizers, numpy
import torchvision

print("Torch:", torch.__version__)
print("CUDA available?", torch.cuda.is_available())
print("Transformers:", transformers.__version__)
print("TRL:", trl.__version__)
print("Tokenizers:", tokenizers.__version__)
print("Torchvision version:", torchvision.__version__)
print("Numpy:",numpy.__version__)
print(numpy.__path__)

Torch: 2.6.0+cu124
CUDA available? True
Transformers: 4.46.0
TRL: 0.16.1
Tokenizers: 0.20.3
Torchvision version: 0.21.0+cu124
Numpy: 2.0.2
['/usr/local/lib/python3.11/dist-packages/numpy']


In [None]:
!pip show bitsandbytes
!pip install bitsandbytes --prefer-binary --no-cache-dir
!pip install bitsandbytes --upgrade
!python -m bitsandbytes


In [None]:

import pandas as pd
from datetime import datetime, timedelta
from typing import Dict, List, Any, Optional
import firebase_admin
from firebase_admin import credentials, db
import os
import torch
import wandb
from huggingface_hub import login, HfApi, create_repo
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig
from transformers  import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, PeftModel
import gc

# Efficient memory allocation
# NOTE(review): this is set after `import torch` above; presumably the
# allocator still reads it at the first CUDA allocation — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


# NOTE(review): both secrets are literals in the notebook (shown redacted).
# Presumably they should be loaded from a secret store — confirm.
HF_TOKEN = "..."
FIREBASE_CREDENTIALS = {...}

login(HF_TOKEN)

# initialize Firebase
# Only when no app is registered yet, so re-running the cell does not call
# initialize_app a second time. `_apps` is a private firebase_admin attribute.
if not firebase_admin._apps:
    cred = credentials.Certificate(FIREBASE_CREDENTIALS)
    firebase_admin.initialize_app(cred, {
        'databaseURL': 'https://...'
    })

class RLHFFeedbackPipeline:
    """Turn user feedback stored under ``/feedback`` into DPO preference pairs.

    The Firebase app must already be initialised before an instance is
    created.
    """

    # The only values of ``responses.selected`` that identify a preference.
    _VALID_SELECTIONS = ('Response 1', 'Response 2')

    def __init__(self, db_url: str):
        """Bind to the ``/feedback`` node of the default Firebase app.

        Args:
            db_url: Kept for interface compatibility and stored for
                reference; the database actually used is the one configured
                on the already-initialised Firebase app.
        """
        self.db_url = db_url
        self.ref = db.reference('/feedback')

    def fetch_feedback(self) -> Dict[str, Any]:
        """Return every feedback entry, keyed by its Firebase push id.

        Returns:
            The raw mapping, or ``{}`` when the node is empty or the read
            fails (the error is printed, not raised).
        """
        try:
            all_feedback = self.ref.get() or {}
            print("Fetched total entries:", len(all_feedback))
            return all_feedback
        except Exception as e:
            # Best-effort by design: the caller raises on an empty dataset.
            print(f"Error fetching feedback: {e}")
            return {}

    def extract_response_data(self, entry: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Flatten one raw feedback entry.

        The quality rating is read from the top-level ``overall_quality``
        key, falling back to ``ratings.overall_quality``. Whichever source
        supplies it, the value is converted to ``float`` when possible and
        left untouched otherwise.

        Returns:
            A dict with ``prompt``, ``response1``, ``response2``,
            ``selected``, ``overall_quality`` and ``timestamp``; or ``None``
            when the entry is not a mapping of the expected shape.
        """
        try:
            responses = entry.get('responses', {})
            overall_quality = entry.get('overall_quality')
            if overall_quality is None:
                overall_quality = entry.get('ratings', {}).get('overall_quality')

            # Normalise regardless of where the rating came from.
            try:
                overall_quality = float(overall_quality)
            except (TypeError, ValueError):
                pass

            return {
                'prompt': entry.get('prompt', ''),
                'response1': responses.get('response1', ''),
                'response2': responses.get('response2', ''),
                'selected': responses.get('selected', ''),
                'overall_quality': overall_quality,
                'timestamp': entry.get('timestamp', '')
            }
        except (AttributeError, TypeError) as e:
            print(f"Error extracting data: {e}")
            return None

    def create_rlhf_dataset(self) -> pd.DataFrame:
        """Create RLHF training dataset.

        Entries that cannot be parsed, or whose ``selected`` value is neither
        ``'Response 1'`` nor ``'Response 2'``, are skipped.

        Returns:
            A DataFrame with columns ``prompt``, ``chosen``, ``rejected``,
            ``quality`` and ``timestamp`` (empty when nothing qualifies).
        """
        columns = ['prompt', 'chosen', 'rejected', 'quality', 'timestamp']
        data = []
        for key, entry in self.fetch_feedback().items():
            item = self.extract_response_data(entry)
            # Check for None BEFORE touching the item: one malformed entry
            # must not abort the whole dataset build.
            if item is None:
                print(f"Skipping malformed entry: {key}")
                continue

            print(f"Entry: {key}, Quality: {item['overall_quality']}, Selected: {item['selected']}")
            if item['selected'] not in self._VALID_SELECTIONS:
                continue

            if item['selected'] == 'Response 1':
                chosen, rejected = item['response1'], item['response2']
            else:
                chosen, rejected = item['response2'], item['response1']

            data.append({
                'prompt': item['prompt'],
                'chosen': chosen,
                'rejected': rejected,
                'quality': item['overall_quality'],
                'timestamp': item['timestamp']
            })
        return pd.DataFrame(data, columns=columns)

pipeline = RLHFFeedbackPipeline("https://...")
rlhf_df = pipeline.create_rlhf_dataset()

if rlhf_df.empty:
    raise ValueError("No preference data found.")

def prepare_dpo_dataset(df: pd.DataFrame) -> Dataset:
    """Convert the preference DataFrame into a `datasets.Dataset`.

    Only the "prompt", "chosen" and "rejected" columns are carried over,
    each as a plain Python list.
    """
    wanted_columns = ("prompt", "chosen", "rejected")
    columns = {name: df[name].tolist() for name in wanted_columns}
    return Dataset.from_dict(columns)

dpo_dataset = prepare_dpo_dataset(rlhf_df)

tokenizer = AutoTokenizer.from_pretrained(".../MT5_large_finetuned_chatbot")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess(example):
    """Tokenize the prompt, chosen and rejected texts of one example.

    Each text is padded/truncated to 128 tokens and returned as the
    tokenizer's PyTorch ``input_ids`` under ``<field>_input_ids``.
    """
    def _encode(text):
        # Same tokenizer settings for all three fields.
        encoded = tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        return encoded.input_ids

    prompt_ids = _encode(example["prompt"])
    chosen_ids = _encode(example["chosen"])
    rejected_ids = _encode(example["rejected"])
    return {
        "prompt_input_ids": prompt_ids,
        "chosen_input_ids": chosen_ids,
        "rejected_input_ids": rejected_ids,
    }

tokenized_dataset = dpo_dataset.map(preprocess)

# Use bfloat16 for 4-bit compute when the GPU supports it, else float16.
compute_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16

# 4-bit NF4 quantisation with double quantisation, shared by both models.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=compute_dtype,
)

# Policy model (receives the LoRA adapters below).
model = AutoModelForSeq2SeqLM.from_pretrained(
    ".../MT5_large_finetuned_chatbot",
    quantization_config=bnb_config,
    device_map="auto",
)
# Reference model: a second copy of the same checkpoint.
# NOTE(review): with a PEFT policy, TRL's DPOTrainer can presumably run with
# ref_model=None and use the adapter-disabled policy as the reference, which
# would avoid loading this second copy — confirm against the TRL docs.
ref_model = AutoModelForSeq2SeqLM.from_pretrained(
    ".../MT5_large_finetuned_chatbot",
    quantization_config=bnb_config,
    device_map="auto",
)
# Called before wrapping with PEFT; presumably needed so gradients reach the
# adapters when gradient checkpointing is enabled — confirm.
model.enable_input_require_grads()

# Attach LoRA adapters
# Adapters target the modules named "q" and "v".
peft_config = LoraConfig(
    r=16, lora_alpha=32, target_modules=["q", "v"],
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, peft_config)

model.gradient_checkpointing_enable()
# NOTE(review): the reference model is not trained in this notebook, so
# enabling gradient checkpointing on it presumably has no benefit — confirm.
ref_model.gradient_checkpointing_enable()

# Release cached memory before training starts.
gc.collect()
torch.cuda.empty_cache()

# Configure and train with DPO

# NOTE(review): the training log recorded in this notebook shows the loss
# staying around 2.7–2.9 over 170 steps. Presumably learning_rate=1e-6
# combined with beta=0.001 is too small for the policy to move — confirm and
# tune.
dpo_args = DPOConfig(
    output_dir="./dpo_results",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=6,
    learning_rate=1e-6,
    logging_steps=10,
    save_strategy="epoch",
    report_to="wandb",
    run_name="dpo_mt5_colab",
    beta=0.001
)


# The tokenizer is passed as `processing_class`, and beta is set on DPOConfig
# rather than on the trainer.
# NOTE(review): the recorded output shows the trainer tokenizing the dataset
# itself ("Tokenizing train dataset"), so the *_input_ids columns added by
# the earlier `preprocess` map are presumably redundant — confirm.
trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=dpo_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer
)

trainer.train()


# Evaluation
def evaluate_model(model, tokenizer, prompts: List[str], max_length: int = 128,
                   max_new_tokens: int = 100):
    """Generate and print one response per prompt.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate(**kwargs)``.
        tokenizer: Tokenizer used both to encode the prompts and to decode
            the generated ids.
        prompts: Prompts to run; an empty list prints nothing.
        max_length: Truncation limit for each prompt, in tokens. Passing it
            explicitly avoids the "no maximum length is provided" warning
            raised when ``truncation=True`` is used on its own.
        max_new_tokens: Generation budget per prompt.

    Returns:
        None. Results are printed.
    """
    model.eval()
    for prompt in prompts:
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        ).to(model.device)
        # Inference only: no gradients needed. The whole encoding is passed
        # so the attention mask accompanies input_ids.
        with torch.no_grad():
            output = model.generate(**inputs, max_new_tokens=max_new_tokens)
        print(f"\nPrompt: {prompt}\nResponse: {tokenizer.decode(output[0], skip_special_tokens=True)}")

evaluate_model(model, tokenizer, rlhf_df['prompt'].sample(3).tolist())

# Save and upload
save_path = "./mt5-dpo-finetuned"

model.save_pretrained(save_path)

# save_path
tokenizer.save_pretrained(save_path)


✅ Fetched total entries: 117
🔹 Entry: -OMmQ--7C2Li1D3gm1EL, Quality: 4.0, Selected: Response 1
🔹 Entry: -OMrbxH8k4Ec7f-Qg1Xu, Quality: 3.0, Selected: Response 1
🔹 Entry: -OMrcKFAUQ2fB2Ic_YYv, Quality: 4.0, Selected: Response 1
🔹 Entry: -OMrxHyrwdEvesXrjGYS, Quality: 3.0, Selected: Response 1
🔹 Entry: -OMrxglQvZSGsRTi3PMs, Quality: 5.0, Selected: Response 2
🔹 Entry: -OMs2ZJlqaTSsRdNzxQl, Quality: 3.0, Selected: Response 1
🔹 Entry: -OMs3kBLozbdUJdJx7_K, Quality: 3.0, Selected: Response 2
🔹 Entry: -OMs3v4Gwk_miDg2jPhZ, Quality: 3.0, Selected: Response 2
🔹 Entry: -OMs4n_wU73aHBy5IUGW, Quality: 5.0, Selected: Response 1
🔹 Entry: -OMvLUEHtQ70PjnCC4zm, Quality: 2.0, Selected: Response 2
🔹 Entry: -OMvNNQK5a3wvkJ_cDjo, Quality: 2.0, Selected: Response 2
🔹 Entry: -OMvOG4Wz96ZlIab6MxM, Quality: 2.0, Selected: Response 2
🔹 Entry: -OMvOYHN9Khqmbj6e_IZ, Quality: 5.0, Selected: Response 1
🔹 Entry: -OMvOZKvSWCfU9xBkxMb, Quality: 5.0, Selected: Response 1
🔹 Entry: -OMvOsPZ0yybTe1TdGNh, Quality: 5.0, Se

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

Map:   0%|          | 0/116 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

Extracting prompt in train dataset:   0%|          | 0/116 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/116 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/116 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marsenke[0m ([33marsenke-fh-tech-wien[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.8051
20,2.7406
30,2.7799
40,2.9273
50,2.8034
60,2.8421
70,2.7809
80,2.8154
90,2.8679
100,2.7456


Step,Training Loss
10,2.8051
20,2.7406
30,2.7799
40,2.9273
50,2.8034
60,2.8421
70,2.7809
80,2.8154
90,2.8679
100,2.7456


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Prompt: Are pets allowed on the Royal E-Carriage Tour?
Response: Chatbot: Unfortunately, pets aren’t permitted on the Royal E-Carriage Tour for safety reasons. Contact office@royal-ecars.com if you have concerns.

Prompt: Hi tell how you can help me
Response: Chatbot: Of course! We can assist you with any questions.

Prompt: what museums we see on tour
Response: Chatbot: The Platinum Sightseeing Tour includes a private eco-car ride, a guide, and stops at Zentralfriedhof. Costs start at 105 euro. Contact office@royal-ecars.com for more info.Royal E-Cars Tours | Contact: office@royal-ecars.com | Phone: +43676849696200 | Address: Geigergasse 5, 3. Stock, 1050 Wien


('./mt5-dpo-finetuned/tokenizer_config.json',
 './mt5-dpo-finetuned/special_tokens_map.json',
 './mt5-dpo-finetuned/spiece.model',
 './mt5-dpo-finetuned/added_tokens.json',
 './mt5-dpo-finetuned/tokenizer.json')



Step	Training Loss
10	2.805100
20	2.740600
30	2.779900
40	2.927300
50	2.803400
60	2.842100
70	2.780900
80	2.815400
90	2.867900
100	2.745600
110	2.823600
120	2.761600
130	2.779200
140	2.794400
150	2.684300
160	2.730800
170	2.694700


In [None]:
# upload
repo_id = ".../MT5_DPO_finetuned"

create_repo(repo_id, repo_type="model", exist_ok=True)

# `model` is already the PEFT-wrapped policy returned by get_peft_model, so
# it is pushed directly instead of being wrapped a second time with
# PeftModel.from_pretrained. `token` replaces the deprecated `use_auth_token`.
model.push_to_hub(
    repo_id,
    token=HF_TOKEN,
    commit_message="Upload DPO-trained LoRA adapter"
)

# Report the repository that was actually pushed to.
print(f"LoRA adapter uploaded: https://huggingface.co/{repo_id}")

# Upload the saved folder (tokenizer files and the locally saved adapter).
HfApi().upload_folder(
    folder_path=save_path,
    path_in_repo=".",
    repo_id=repo_id,
    repo_type="model",
    token=HF_TOKEN
)

print("✅ Tokenizers uploaded to: https://huggingface.co/.../MT5_DPO_finetuned")





adapter_model.safetensors:   0%|          | 0.00/18.9M [00:00<?, ?B/s]

✅ LoRA adapter uploaded: https://huggingface.co/ArsenKe/mt5-dpo-adapter


Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

✅ Tokenizers uploaded to: https://huggingface.co/ArsenKe/MT5_DPO_finetuned


LoRA adapter and tokenizer files uploaded.