In [1]:
!pip install compressed-tensors #ignore this, I tried to install a quantized model of mistral but nothing but errors were thrown

Collecting compressed-tensors
  Downloading compressed_tensors-0.9.2-py3-none-any.whl.metadata (7.0 kB)
Downloading compressed_tensors-0.9.2-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: compressed-tensors
Successfully installed compressed-tensors-0.9.2


In [1]:
!pip install datasets



In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq, EarlyStoppingCallback
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import json
import torch

from huggingface_hub import login

login(token="put your token, if necessary here")


print("CUDA disponible:", torch.cuda.is_available())
torch.cuda.empty_cache()


model_name = "google-t5/t5-base"  
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


dataset_path = "/kaggle/input/input-simple/train_dataset.jsonl"
train_data = Dataset.from_json(dataset_path)

def tokenize_function(batch):

    inputs = batch["input"]
    inputs = [(" ".join(i) if isinstance(i, list) else i).strip() for i in inputs]
    inputs = [i if i else "No input provided." for i in inputs]

    outputs = batch["output"]
    outputs = [(", ".join(str(o) for o in i) if isinstance(i, list) else i).strip() for i in outputs]
    outputs = [o if o else "No output provided." for o in outputs]

    input_encodings = tokenizer(inputs, truncation=True, max_length=512, padding="max_length")
    output_encodings = tokenizer(outputs, truncation=True, max_length=64, padding="max_length")

    labels = [
        [(label if label != tokenizer.pad_token_id else -100) for label in output]
        for output in output_encodings["input_ids"]
    ]

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": labels,
    }

tokenized_train = train_data.map(tokenize_function, batched=True, num_proc=4)

data_split = tokenized_train.train_test_split(test_size=0.20)
train_dataset = data_split["train"]
eval_dataset = data_split["test"]

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="/kaggle/working/results2",
    eval_steps=200,  
    save_steps=400,
    learning_rate=4e-5,
    evaluation_strategy="steps",  
    save_strategy="steps", 
    per_device_train_batch_size=4,  
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="/kaggle/working/logs2",
    report_to=[],  
    logging_steps=100,
    gradient_accumulation_steps=2, 
    lr_scheduler_type="cosine",
    load_best_model_at_end=True,  
    metric_for_best_model="eval_loss",  
    greater_is_better=False,  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)


trainer.train()


model.save_pretrained("/kaggle/working/fine_tuned_t5_base")
tokenizer.save_pretrained("/kaggle/working/fine_tuned_t5_base")


CUDA disponible: True


  trainer = Trainer(


Step,Training Loss,Validation Loss
200,1.1055,0.802409
400,0.8599,0.673343
600,0.7478,0.591101
800,0.6889,0.527997
1000,0.6113,0.498179
1200,0.5595,0.455288
1400,0.5232,0.436658
1600,0.5169,0.417634
1800,0.4829,0.40588
2000,0.4245,0.393323


Could not locate the best model at /kaggle/working/results2/checkpoint-2600/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


('/kaggle/working/fine_tuned_t5_base/tokenizer_config.json',
 '/kaggle/working/fine_tuned_t5_base/special_tokens_map.json',
 '/kaggle/working/fine_tuned_t5_base/spiece.model',
 '/kaggle/working/fine_tuned_t5_base/added_tokens.json')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json
import torch


def generate_outputs(input_data):
    model = T5ForConditionalGeneration.from_pretrained("/kaggle/working/fine_tuned_t5_base2")
    tokenizer = T5Tokenizer.from_pretrained("/kaggle/working/fine_tuned_t5_base2")

    inputs = [example["input"] for example in input_data] 
    batch_size = 4

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    decoded_outputs = []

    for i in range(0, len(inputs), batch_size):
        batch_inputs = inputs[i:i + batch_size]
        inputs_encodings = tokenizer(batch_inputs, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
        inputs_encodings = {key: val.to(device) for key, val in inputs_encodings.items()}
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs_encodings["input_ids"],
                attention_mask=inputs_encodings["attention_mask"],
                max_length=64, 
                num_beams=5,  
                early_stopping=True
            )
        batch_decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        decoded_outputs.extend(batch_decoded_outputs)

    output_file_path = "/kaggle/working/generated_outputs.jsonl"
    with open(output_file_path, "w", encoding="utf-8") as f:
        for input_example, output in zip(input_data, decoded_outputs):
            f.write(json.dumps({"input": input_example["input"], "output": output}) + "\n")

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

path = load_json("/kaggle/input/groundtruth/dev_dataset.jsonl")
generate_outputs(path)