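Este notebook tiene como objetivo naturalizar la columna "docstring" de un dataset orientado al fine-tuning conversacional para la generación de código con LLMs; es decir, reescribir cada docstring como una petición en lenguaje natural, tal como la formularía un usuario.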

# 2 - Naturalización del Dataset

In [None]:
%pip install tqdm

In [None]:
import os
import pandas as pd
import torch
from transformers import pipeline
from tqdm import tqdm

# Output locations and number of rows between checkpoint writes
output_dir = './datasets'
checkpoint_dir = f'{output_dir}/checkpoints'
checkpoint_interval = 50

# Creating the checkpoint folder also creates ./datasets when it is missing
os.makedirs(checkpoint_dir, exist_ok=True)

# Load the filtered splits from their JSON Lines files (train, validation, test)
df_train_filtered, df_validation_filtered, df_test_filtered = (
    pd.read_json(f'{output_dir}/{split}_filtered.json', orient='records', lines=True)
    for split in ('train', 'validation', 'test')
)

# Build the text-generation pipeline around LLaMA 3 8B Instruct
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

text_generation_pipeline = pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
    model_kwargs={"torch_dtype": torch.bfloat16},
)

# Turn a docstring into a contextual, natural-language request using the LLaMA 3 8B Instruct pipeline
def transformar_docstring_con_llama(docstring, code):
    """Ask the chat model to restate a function's docstring as a user request.

    Args:
        docstring: Existing docstring of the function; interpolated into the prompt.
        code: Source code of the function; interpolated into the prompt.

    Returns:
        The assistant's reply with surrounding whitespace stripped, or an empty
        string when the pipeline output contains no assistant content.
    """
    messages = [
        {"role": "system", "content": "Imagine you're a user working on a project and you need to request a specific function. Describe your need in a detailed and natural way, as if you were asking a developer to implement this function for you. Do not include any code."},
        {"role": "user", "content": f"I'm working on a project and I need help with the following function:\n\nFunction:\n{code}\n\nCurrent docstring:\n'{docstring}'\n\nPlease describe in a natural way what you need this function to do, including the context or scenario where it will be used."},
    ]

    response = text_generation_pipeline(
        messages,
        max_new_tokens=250,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )

    # Guard against an empty or unexpectedly shaped result instead of indexing blindly
    first = response[0] if isinstance(response, list) and response else None
    generated = (first.get("generated_text") or []) if isinstance(first, dict) else []

    if isinstance(generated, str):
        # NOTE(review): a plain-string result is used as-is; depending on the
        # pipeline settings it may also contain the prompt - confirm if this
        # path is ever hit.
        cleaned_text = generated.strip()
    else:
        # Keep only the assistant turns of the returned conversation
        cleaned_text = " ".join(
            part["content"]
            for part in generated
            if isinstance(part, dict)
            and part.get("role") == "assistant"
            and isinstance(part.get("content"), str)
        ).strip()

    print("Cleaned text:", cleaned_text)  # Debug output: shows the cleaned generated text
    return cleaned_text

# Process a DataFrame row by row, saving partial results to a checkpoint file
def procesar_y_guardar(df, df_name):
    """Generate a contextual description for every row of ``df`` and save the result.

    Every ``checkpoint_interval`` rows the descriptions generated so far are
    written to a checkpoint file under ``checkpoint_dir``. If that file already
    exists on entry, the rows it covers are not regenerated. Once all rows are
    done, the DataFrame is written to ``output_dir`` and the checkpoint file is
    removed.

    Args:
        df: DataFrame with ``docstring`` and ``code`` columns. It is modified in
            place: a ``docstring_contextual`` column is added.
        df_name: Split name used in file names and progress messages.

    Raises:
        ValueError: If an existing checkpoint holds more rows than ``df``.
    """
    checkpoint_path = os.path.join(checkpoint_dir, f'{df_name}_contextual_checkpoint.json')
    tmp_checkpoint_path = f'{checkpoint_path}.tmp'

    # Resume from a previous run when a checkpoint is present
    contextual_docstrings = []
    if os.path.exists(checkpoint_path):
        df_checkpoint = pd.read_json(checkpoint_path, orient='records', lines=True)
        contextual_docstrings = df_checkpoint['docstring_contextual'].tolist()
    start_idx = len(contextual_docstrings)

    # Fail early with a clear message rather than with a length mismatch on
    # the column assignment below
    if start_idx > len(df):
        raise ValueError(
            f"Checkpoint {checkpoint_path} has {start_idx} rows but the "
            f"{df_name} dataset only has {len(df)}; delete the stale checkpoint."
        )

    # Iterate only over the rows still pending; `initial` keeps the progress
    # bar aligned with the absolute row position
    pending_rows = tqdm(
        df.iloc[start_idx:].iterrows(),
        total=len(df),
        initial=start_idx,
        desc=f"Processing {df_name} dataset",
    )
    for i, (_, row) in enumerate(pending_rows, start=start_idx):
        contextual_docstrings.append(transformar_docstring_con_llama(row['docstring'], row['code']))

        if (i + 1) % checkpoint_interval == 0:
            df_checkpoint = pd.DataFrame({'docstring_contextual': contextual_docstrings})
            # Write to a temporary file and swap it in, so an interruption
            # during the write cannot leave a truncated checkpoint behind
            df_checkpoint.to_json(tmp_checkpoint_path, orient='records', lines=True)
            os.replace(tmp_checkpoint_path, checkpoint_path)
            print(f"Checkpoint guardado en {checkpoint_path} hasta la fila {i + 1}.")

    df['docstring_contextual'] = contextual_docstrings
    df.to_json(f'{output_dir}/{df_name}_filtered_contextual.json', orient='records', lines=True)
    print(f"{df_name.capitalize()} dataset guardado exitosamente en {output_dir}.")

    # The final file is on disk, so the checkpoint is no longer needed
    if os.path.exists(checkpoint_path):
        os.remove(checkpoint_path)

# Run the checkpointed processing over each split, in order
for split_name, split_df in (
    ("train", df_train_filtered),
    ("validation", df_validation_filtered),
    ("test", df_test_filtered),
):
    procesar_y_guardar(split_df, split_name)
