# AFINAMIENTO CODELLAMA

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-c

In [None]:
import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType
)
import os

# DATA

In [None]:
# Input and output locations used by the rest of the notebook.
DATASET_PATH = "/content/train.jsonl"  # JSONL file with the training records
OUTPUT_DIR = "./codellama-c-to-cpp-finetuned"  # destination for the fine-tuned artifacts

# Warn early when the dataset file is absent. This only prints a message;
# it does not stop execution.
if not os.path.exists(DATASET_PATH):
    print(f"Error: No se encontró el archivo {DATASET_PATH}")

In [None]:
def load_jsonl_dataset(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Blank lines (typically a trailing newline at the end of the
            # file) are not records; json.loads("") would raise on them.
            if not line:
                continue
            records.append(json.loads(line))
    return records

def create_prompt(source_code, target_code=None):
    """Format a C -> C++ translation prompt.

    Args:
        source_code: The C code to translate.
        target_code: The reference C++ translation. When it is truthy it is
            appended on a new line after the "Code C++:" marker (training
            form); otherwise the prompt ends right at the marker.

    Returns:
        The formatted prompt string.
    """
    instruction = "Translate the following code from C to C++:\n\nCode C:\n"
    prompt = f"{instruction}{source_code}\n\nCode C++:"

    # Only the training form carries the expected answer.
    if target_code:
        prompt = f"{prompt}\n{target_code}"

    return prompt

In [None]:
def preprocess_dataset(data, tokenizer, max_length=512):
    """Build a tokenized causal-LM dataset from C -> C++ translation records.

    Args:
        data: Iterable of dict records. Only records containing both the
            'source_code' and 'target_code' keys are used; the rest are
            skipped.
        tokenizer: Tokenizer called on the list of prompts. The batch is
            padded, so it needs a padding token configured.
        max_length: Maximum number of tokens per example; longer prompts are
            truncated.

    Returns:
        A `Dataset` with 'input_ids', 'attention_mask' and 'labels' columns.
        Labels are a copy of the input ids in which padded positions are
        replaced by -100.

    Raises:
        ValueError: If no record contains both required keys.
    """
    required_keys = ('source_code', 'target_code')
    texts = [
        create_prompt(item['source_code'], item['target_code'])
        for item in data
        if all(key in item for key in required_keys)
    ]

    # Fail with an explicit message instead of handing an empty batch to the
    # tokenizer.
    if not texts:
        raise ValueError(
            "No se encontraron muestras con las claves 'source_code' y 'target_code'."
        )

    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )

    # Padding must not count as a training target. -100 is the default
    # ignore_index of PyTorch's cross-entropy loss. The attention mask (not
    # the token id) identifies padding, so a pad token that shares its id
    # with a real token is handled correctly.
    labels = encodings['input_ids'].clone()
    labels[encodings['attention_mask'] == 0] = -100

    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    })

In [None]:
# Read every record of the JSONL file into memory and report how many were loaded.
print("Cargando y procesando dataset...")
raw_data = load_jsonl_dataset(DATASET_PATH)
print(f"Cargadas {len(raw_data)} muestras")

Cargando y procesando dataset...
Cargadas 3705 muestras


# MODELO

In [None]:
# Hugging Face Hub id of the base checkpoint to fine-tune.
MODEL_NAME = "codellama/CodeLlama-7b-hf"

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Usando dispositivo: {DEVICE}")

Usando dispositivo: cuda


In [None]:
# Quantization settings handed to the model loader: 4-bit weights using the
# "nf4" quantization type with double quantization enabled, and bfloat16 as
# the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

In [None]:
def setup_model_and_tokenizer():
    """Load the tokenizer and the quantized causal LM named by MODEL_NAME.

    Returns:
        A `(model, tokenizer)` tuple. The model is loaded with the
        module-level `bnb_config` quantization settings.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    # Batched tokenization with padding needs a pad token; reuse the
    # end-of-sequence token when the tokenizer does not define one.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to run at load time — confirm it is needed for this checkpoint.
    load_kwargs = dict(
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    llm = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

    return llm, tok

def setup_lora_config():
    """Build the LoRA adapter configuration used for fine-tuning.

    Returns:
        A `LoraConfig` for causal language modeling with rank 16, alpha 32
        and dropout 0.1, targeting the projection modules listed below.
    """
    attention_projections = ["q_proj", "v_proj", "k_proj", "o_proj"]
    mlp_projections = ["gate_proj", "up_proj", "down_proj"]

    return LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=attention_projections + mlp_projections,
    )

In [None]:
# Load the base model and its tokenizer; both names are used by all later cells.
print("Cargando modelo y tokenizer...")
model, tokenizer = setup_model_and_tokenizer()

Cargando modelo y tokenizer...


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Rebind `model` to the result of PEFT's preparation helper for k-bit
# (quantized) training. This must run before the LoRA adapters are attached.
print("Preparando modelo para entrenamiento...")
model = prepare_model_for_kbit_training(model)

Preparando modelo para entrenamiento...


In [None]:
# Wrap the prepared model with the LoRA configuration; `model` now refers to
# the PEFT-wrapped model that the Trainer will optimize.
print("Configurando LoRA...")
lora_config = setup_lora_config()
model = get_peft_model(model, lora_config)

Configurando LoRA...


# PREPROCESAR DATOS

In [None]:
# Hold out the last 10% of the records for validation. The split follows the
# file order; nothing is shuffled.
split_idx = int(len(raw_data) * 0.9)
train_data, val_data = raw_data[:split_idx], raw_data[split_idx:]

In [None]:
# Tokenize each split with its own call, so padding is applied per split.
train_dataset = preprocess_dataset(train_data, tokenizer)
val_dataset = preprocess_dataset(val_data, tokenizer)

# Report the number of examples that made it into each tokenized dataset.
print(f"Dataset de entrenamiento: {len(train_dataset)} muestras")
print(f"Dataset de validación: {len(val_dataset)} muestras")

Dataset de entrenamiento: 3334 muestras
Dataset de validación: 371 muestras


# ENTRENAMIENTO

In [None]:
# Effective batch size is 24 * 4 = 96 sequences per optimizer step, so one
# epoch over the ~3.3k training samples is only about 35 optimizer steps.
# Warmup and logging intervals are sized for that short run: a warmup longer
# than the run keeps the learning rate from ever reaching its target, and a
# logging interval longer than the run reports no training loss at all.
# NOTE(review): fp16 autocast is used here while the quantization config
# computes in bfloat16 — confirm the intended precision for the target GPU.
training_args = TrainingArguments(
  output_dir=OUTPUT_DIR,
  num_train_epochs=1,
  per_device_train_batch_size=24,
  per_device_eval_batch_size=24,
  gradient_accumulation_steps=4,
  warmup_steps=4,  # roughly 10% of the ~35 optimizer steps
  learning_rate=1e-4,
  fp16=True,
  logging_steps=5,  # several training-loss points per epoch
  eval_strategy="epoch",
  save_strategy="epoch",
  save_total_limit=2,
  load_best_model_at_end=True,
  greater_is_better=False,  # the tracked metric is a loss: lower is better
  dataloader_pin_memory=False,
  remove_unused_columns=False,
  # The Trainer cannot infer label columns for a PEFT-wrapped model, so name
  # them explicitly.
  label_names=["labels"],
  optim="adamw_torch",
  report_to="none"
  )

In [None]:
# mlm=False configures the collator for causal (next-token) language modeling
# rather than masked language modeling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Wire the PEFT-wrapped model, the training arguments, both tokenized splits
# and the batch collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run the training loop. Evaluation and checkpointing follow the strategies
# set in `training_args`.
print("Iniciando entrenamiento...")
trainer.train()

Iniciando entrenamiento...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,No log,0.29554


TrainOutput(global_step=35, training_loss=0.46778411865234376, metrics={'train_runtime': 3635.9724, 'train_samples_per_second': 0.917, 'train_steps_per_second': 0.01, 'total_flos': 6.808284763363738e+16, 'train_loss': 0.46778411865234376, 'epoch': 1.0})

In [None]:
# Persist the trained model through the Trainer and write the tokenizer files
# into OUTPUT_DIR so the folder can be loaded or uploaded later.
print("Guardando modelo...")
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Modelo guardado en: {OUTPUT_DIR}")

Guardando modelo...
Modelo guardado en: ./codellama-c-to-cpp-finetuned


# GUARDAR MODELO EN HUGGINGFACE

In [None]:
from huggingface_hub import login
from huggingface_hub import HfApi

In [None]:
# Authenticate against the Hugging Face Hub with the HF_TOKEN Colab secret.
try:
    from google.colab import userdata

    HF_TOKEN = userdata.get('HF_TOKEN')
    if HF_TOKEN is not None:
        login(token=HF_TOKEN)
        print("Inicio de sesión en Hugging Face exitoso.")
    else:
        print("Error: HF_TOKEN no encontrado en los secretos de Colab.")
except ImportError:
    # google.colab could not be imported, so there is no secrets store to read.
    print("No estás en un entorno Colab o no se pudo acceder a userdata.")

Inicio de sesión en Hugging Face exitoso.


In [None]:
# Local folder whose contents are uploaded (same value as the training output directory).
OUTPUT_DIR = "./codellama-c-to-cpp-finetuned"

# Target model repository on the Hugging Face Hub.
REPO_NAME = "Berly0/CodeLlama-7b-hf-c-to-cpp"

api = HfApi()

try:
    # exist_ok=True makes this call succeed when the repository already exists.
    api.create_repo(repo_id=REPO_NAME, exist_ok=True)
    print(f"Repositorio '{REPO_NAME}' creado o ya existente.")

    # Upload every file found under OUTPUT_DIR to the model repository.
    print(f"Subiendo archivos desde '{OUTPUT_DIR}' a '{REPO_NAME}'...")
    api.upload_folder(
        folder_path=OUTPUT_DIR,
        repo_id=REPO_NAME,
        repo_type="model",
    )
    print(f"Archivos subidos exitosamente a '{REPO_NAME}'.")

# Any failure is reported and not re-raised, so the cell finishes even when
# the repository creation or the upload did not succeed.
except Exception as e:
    print(f"Error al subir el modelo a Hugging Face: {e}")

Repositorio 'Berly0/CodeLlama-7b-hf-c-to-cpp' creado o ya existente.
Subiendo archivos desde './codellama-c-to-cpp-finetuned' a 'Berly0/CodeLlama-7b-hf-c-to-cpp'...


Uploading...:   0%|          | 0.00/641M [00:00<?, ?B/s]

Archivos subidos exitosamente a 'Berly0/CodeLlama-7b-hf-c-to-cpp'.
