In [2]:
!pip install torch transformers accelerate bitsandbytes datasets jsonlines


Collecting torch
  Downloading torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
Collecting accelerate
  Downloading accelerate-1.5.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-win_amd64.whl.metadata (5.1 kB)
Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face Hub id of the base checkpoint to fine-tune.
model_name = "meta-llama/Meta-Llama-3-8B"

# Load the tokenizer and the model weights.
# NOTE: no quantization config is passed here, so bitsandbytes is NOT used and
# the weights are loaded unquantized.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let accelerate place the weights on the available device(s)
    torch_dtype="auto",  # take the dtype from the checkpoint instead of the float32 default
)

# Build the message from model_name so it always names the checkpoint that was
# actually loaded.
print(f"✅ {model_name} cargado correctamente.")


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

# Fixed length (in tokens) every example is padded/truncated to. An explicit
# value is required: without it `padding="max_length"` falls back to the
# tokenizer's model_max_length, which is not a usable training length.
# Tune this to the contract lengths and the available GPU memory.
MAX_SEQ_LENGTH = 1024

# Load the JSONL dataset. Kept under its own name so re-running this cell never
# tokenizes an already-tokenized dataset.
raw_dataset = load_dataset("json", data_files="contratos_dataset.jsonl", split="train")

# Padding needs a pad token; if the tokenizer does not define one, reuse EOS.
# Padded positions are excluded from the loss below via the attention mask.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(example):
    """Tokenize a batch of prompt/response pairs for causal-LM fine-tuning.

    A causal LM is trained on a single sequence whose labels are aligned with
    its input ids, so each prompt and response are joined into one text
    (terminated by EOS) instead of tokenizing the response separately as a
    seq2seq target.

    Args:
        example: a batch from `datasets.map(batched=True)`; its "prompt" and
            "response" columns are parallel lists of strings.

    Returns:
        The tokenizer output ("input_ids", "attention_mask") plus "labels":
        a copy of the input ids with padded positions set to -100 so the loss
        ignores them.
    """
    texts = [
        f"{question}\n{answer}{tokenizer.eos_token}"
        for question, answer in zip(example["prompt"], example["response"])
    ]
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )
    # Mask on attention_mask rather than on the pad id: pad may be the same id
    # as EOS, and the real EOS at the end of each example must still be learned.
    encoded["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


# Apply the tokenization to the whole dataset.
dataset = raw_dataset.map(tokenize_function, batched=True)


In [None]:
from transformers import TrainingArguments, Trainer

# Every tunable training option lives in one mapping so it can be inspected or
# tweaked before the TrainingArguments object is built.
training_config = dict(
    output_dir="./llama3_contratos",  # checkpoints are written here
    per_device_train_batch_size=2,  # adjust to the GPU's VRAM
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_total_limit=2,
    fp16=True,  # FP16 mixed precision to reduce memory use
    report_to="none",
)
training_args = TrainingArguments(**training_config)

# Wire the model, the arguments and the tokenized dataset together.
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Start training; as the last expression, the result is displayed by the notebook.
trainer.train()


In [None]:
# Persist the fine-tuned weights and the tokenizer files into the same folder
# so the pair can be reloaded together with from_pretrained.
for artifact in (model, tokenizer):
    artifact.save_pretrained("llama3_contratos_tuned")

print("✅ Modelo guardado en 'llama3_contratos_tuned'.")


In [None]:
# Reload the fine-tuned model and tokenizer from disk.
model_path = "llama3_contratos_tuned"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",  # same as the initial load; otherwise weights default to float32
)

# Example of a new contract clause to analyze.
prompt = "Analiza este contrato y determina si cumple con la ley dominicana:\nEl inquilino renuncia a cualquier reclamo legal."
# Send the inputs to wherever device_map="auto" placed the model instead of
# assuming a CUDA device is present.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate the answer. Note that max_length counts the prompt tokens as well as
# the newly generated ones.
outputs = model.generate(**inputs, max_length=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [2]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without storing a credential in the
# notebook. The token is read from the HF_TOKEN environment variable; when it
# is not set, `login` receives None and prompts for the token interactively.
# SECURITY: a real access token used to be hardcoded in this cell. It must be
# treated as leaked and revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))


In [4]:
import json

# Input: a file holding several pretty-printed (multi-line) JSON objects.
# Output: the same objects, one per line (JSON Lines).
input_file = "contratos_todos.jsonl"
output_file = "contratos_todos_fixed.jsonl"


def iter_json_values(text):
    """Yield every JSON value found in `text`, in order.

    `text` may contain any number of JSON values back to back, separated by
    whitespace and/or commas. Each value is parsed by the real JSON decoder,
    so quotes, escape sequences, numbers and nested or multi-line values are
    handled correctly, unlike splitting lines on ": ".

    Raises:
        json.JSONDecodeError: if a value is not valid JSON.
    """
    # strict=False tolerates raw control characters (e.g. newlines) in strings.
    decoder = json.JSONDecoder(strict=False)
    position, end = 0, len(text)
    while True:
        # raw_decode does not skip leading separators itself.
        while position < end and text[position] in " \t\r\n,\ufeff":
            position += 1
        if position >= end:
            return
        value, position = decoder.raw_decode(text, position)
        yield value


# Parse everything before opening the output file, so a malformed input never
# leaves a half-written result behind.
with open(input_file, "r", encoding="utf-8") as file:
    records = list(iter_json_values(file.read()))

with open(output_file, "w", encoding="utf-8") as fixed_file:
    for record in records:
        # ensure_ascii=False keeps accented characters readable in the UTF-8 output.
        fixed_file.write(json.dumps(record, ensure_ascii=False) + "\n")

print("✅ Archivo corregido y guardado como contratos_todos_fixed.jsonl")


✅ Archivo corregido y guardado como contratos_todos_fixed.jsonl
