In [1]:
from dotenv import load_dotenv
import os

# Carga las variables de entorno desde el archivo .env
load_dotenv()

# Accede al token de Hugging Face
token = os.getenv("HUGGINGFACE_TOKEN")

# LLAMA-3-8B Instruct

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn as nn

class ModelParallelAutoModelForCausalLM(nn.Module):
    def __init__(self, model_name, tokenizer, device_map, token):
        super(ModelParallelAutoModelForCausalLM, self).__init__()
        self.tokenizer = tokenizer
        self.device_map = device_map
        
        # Cargar el modelo
        self.model = AutoModelForCausalLM.from_pretrained(model_name, token=token)

        # Dividir las capas del modelo en dispositivos según el device_map
        # Distribuyendo las capas del decodificador en múltiples GPUs
        self.model.model.layers = nn.ModuleList(
            [self.model.model.layers[i].to(device_map[i % len(device_map)]) 
             for i in range(len(self.model.model.layers))]
        )

    def forward(self, input_ids):
        # Asegúrate de que input_ids esté en el dispositivo correcto
        input_ids = input_ids.to(self.device_map[0])
        outputs = self.model(input_ids)
        return outputs

# Configuración de los dispositivos
device_map = ["cuda:0", "cuda:1", "cuda:2"]  # Asegúrate de tener las GPU correspondientes disponibles

# Carga del tokenizer y del modelo
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=token)
# Inicializar el modelo paralelo
model = ModelParallelAutoModelForCausalLM("meta-llama/Meta-Llama-3-8B-Instruct", tokenizer, device_map, token)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.37it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU  has a total capacity of 23.69 GiB of which 5.06 MiB is free. Process 467491 has 816.00 MiB memory in use. Process 485582 has 256.00 MiB memory in use. Process 816227 has 22.03 GiB memory in use. Process 1020839 has 256.00 MiB memory in use. Including non-PyTorch memory, this process has 350.00 MiB memory in use. Of the allocated memory 96.00 MiB is allocated by PyTorch, and 0 bytes is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [3]:
# Configura el token de padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Usualmente se puede usar el token EOS

# Definimos el código dividiendolo entre requests y sus outputs
train_texts = ["write a python function to print hello world", "write a python loop to print numbers from 0 to 9", "write code", "give me code"]
train_codes = ["import patata\n# your code here", "import patata\n# your code here", "import patata\n# your code here", "import patata\n# your code here"]

# Preprocesamiento de entradas y salidas usando el método __call__ con text_target para las etiquetas
train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=512)
train_labels = tokenizer(train_codes, padding="max_length", truncation=True, max_length=512, return_tensors="pt")

# Preprocesamiento de entradas y salidas
train_encodings = tokenizer(train_texts, padding="max_length", truncation=True, max_length=512)
with tokenizer.as_target_tokenizer():
    train_labels = tokenizer(train_codes, padding="max_length", truncation=True, max_length=512)

# Verifica que todo está correcto
print(train_encodings['input_ids'][0])  # Imprime los IDs de entrada del primer ejemplo para verificar
print(train_labels['input_ids'][0])  

[128000, 5040, 264, 10344, 734, 311, 1194, 24748, 1917, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 



In [4]:
from torch.utils.data import Dataset

class CodeGenDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item

    def __len__(self):
        return len(self.labels['input_ids'])

train_dataset = CodeGenDataset(train_encodings, train_labels)


In [None]:
import torch
from torch.optim import AdamW  # Importa AdamW de PyTorch en lugar de transformers
from torch.utils.data import DataLoader

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model = nn.DataParallel(model)
model.train()

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

optimizer = AdamW(model.parameters(), lr=5e-5)

for epoch in range(20):  # Número de épocas
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch} Loss {loss.item()}")

por aqui se puede guarar o cargar el modelo finetuneado

In [1]:
def generate_code(model, tokenizer, prompt, max_length=1024):
    # Codificar el texto de entrada para el modelo
    input_ids = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True).input_ids

    # Colocar los input_ids en el mismo dispositivo que la primera parte del modelo
    input_ids = input_ids.to(model.device_map[0])  # model.device_map[0] es el dispositivo de la primera parte del modelo

    # Generar código utilizando el modelo, sin necesidad de mover el modelo entre dispositivos
    outputs = model.generate(input_ids, max_length=max_length)

    # Decodificar y retornar la salida generada
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_code

In [4]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)  # Mueve el modelo al dispositivo adecuado
#model = nn.DataParallel(model)

test_prompts = [
    "write a python function to add two numbers",
    "create a loop in python that prints numbers from 1 to 5"
]

# Generar código para cada descripción
for prompt in test_prompts:
    generated_code = generate_code(model, tokenizer, prompt)
    print(f"Description: {prompt}")
    print(f"Generated Code:\n{generated_code}\n")

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 

# PHI-3-mini-128k Instruct

https://huggingface.co/microsoft/Phi-3-mini-128k-instruct

Cargamos el modelo

In [2]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer_phi3 = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)
model_phi3 = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-128k-instruct", trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

In [None]:
# Lista de mensajes con el rol de "user" para generar código
messages = [
    {"role": "user", "content": "write a python function to add two numbers"},
    {"role": "user", "content": "create a loop in python that prints numbers from 1 to 5"}
]

# Función adaptada para aceptar un mensaje y generar código
def generate_code_from_message(model, tokenizer, message, max_length=1024):
    # Extraer el contenido del mensaje
    prompt = message["content"]
    
    # Codificar el texto de entrada para el modelo
    input_ids = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True).input_ids

    # Colocar los input_ids en el mismo dispositivo que la primera parte del modelo
    input_ids = input_ids.to(model.device_map[0])  # model.device_map[0] es el dispositivo de la primera parte del modelo

    # Generar código utilizando el modelo, sin necesidad de mover el modelo entre dispositivos
    outputs = model.generate(input_ids, max_length=max_length)

    # Decodificar y retornar la salida generada
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_code

# Generar código para cada mensaje
for message in messages:
    generated_code = generate_code_from_message(model_phi3, tokenizer_phi3, message)
    print(f"Message: {message['content']}")
    print(f"Generated Code:\n{generated_code}\n")
