## Instalación de dependencias

In [None]:
# Core libraries used later in the notebook: sentence embeddings, dataset
# loading and plotting.
%pip install sentence_transformers datasets matplotlib seaborn

# PyTorch nightly wheels built for ROCm 6.3 (AMD GPU support).
%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3

# Optional helper: word-cloud rendering for the dataset statistics cell.
%pip install wordcloud

# Accelerate, constrained to version 0.26.0 or newer.
%pip install "accelerate>=0.26.0"

## Configuración del entorno de ejecución (GPU y Accelerate)

Establecemos variables de entorno para controlar el uso de GPU y evitar problemas comunes al entrenar modelos en notebooks. En particular, se fuerza el uso de una única GPU y se desactiva el mixed precision de Accelerate para mejorar la estabilidad del entrenamiento en este entorno.

In [None]:
import os

# Runtime switches, applied before any GPU library is imported:
#   CUDA_VISIBLE_DEVICES       -> expose only device 0 (avoids DataParallel
#                                 issues in notebooks)
#   ACCELERATE_MIXED_PRECISION -> "no" (stability fix for this environment)
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "0",
        "ACCELERATE_MIXED_PRECISION": "no",
    }
)

## Importación de librerías

Esta sección carga todas las dependencias necesarias para el procesamiento de texto, entrenamiento del modelo y utilidades auxiliares.

In [None]:
import csv
import gc
import math
import random
import re
from collections import Counter
from datetime import datetime, timedelta

import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from datasets import DatasetDict, load_dataset
from IPython.display import clear_output, display
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

## Configuración del dispositivo de ejecución

Establecemos el dispositivo de cómputo a utilizar durante el entrenamiento y la inferencia. Utilizamos la GPU si está disponible, de lo contrario el código se ejecuta en CPU.

In [None]:
# Select the compute device: the first visible GPU when PyTorch reports one,
# otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Dispositivo seleccionado: {device}")

# Report which GPU was picked up (index 0) and its compute capability.
# The separator is a real em dash; it was previously stored mis-encoded.
if device.type == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    gpu_cap = torch.cuda.get_device_capability(0)
    print(f"GPU: {gpu_name} — Compute Capability: {gpu_cap}")

## Parámetros y filtros de datos irrelevantes

En esta sección se definen patrones típicos de WhatsApp que deben ser descartados durante la limpieza del dataset.

In [None]:
# Substrings that identify WhatsApp system / placeholder messages which must
# not reach the training set. All patterns are lowercase.
irrelevant_data = {
    # Spanish
    "eliminaste este mensaje",
    "se eliminó este mensaje",
    "<multimedia omitido>",
    "multimedia omitido",
    "los mensajes y las llamadas están cifrados de extremo a extremo",

    # English
    "you deleted this message",
    "this message was deleted",
    "<media omitted>",
    "media omitted",
    "messages and calls are end-to-end encrypted",
    # Same notice as it looks once text cleaning has replaced the hyphens
    # with spaces.
    "messages and calls are end to end encrypted",
}

def contains_irrelevant_data(message: str) -> bool:
    """
    Return True if the message contains any irrelevant WhatsApp system string.

    The comparison is case-insensitive: the message is lowercased here, so
    callers may pass either raw or already-lowercased text.
    """
    lowered = message.lower()
    return any(pattern in lowered for pattern in irrelevant_data)


## Procesamiento del chat de WhatsApp

Esta celda contiene todas las funciones relacionadas con la limpieza, parseo y estructuración del chat de WhatsApp. No realiza acciones por sí misma; solo define el procesamiento que luego será utilizado por la interfaz interactiva.

In [None]:
# ============================================================
# 4. WHATSAPP CHAT PROCESSING (k-turns with roles)
# ============================================================

MSG_SEP = "<|msg_sep|>"   # separator between messages inside the same turn


def clean_text(text: str) -> str:
    """
    Apply light text cleaning: lowercase, trim, remove unusual symbols,
    normalize whitespace.

    Characters kept: ASCII letters, Spanish accented vowels, ñ, ü, digits,
    whitespace, the apostrophe and the punctuation , . ; : ¡ ! ¿ ?
    Everything else (emoji, hyphens, angle brackets, ...) becomes a space.

    This function does not decide whether a message is irrelevant; that is
    handled by `contains_irrelevant_data`.
    """
    text = text.lower().strip()
    # The character class must contain the real accented letters; a
    # mis-encoded class silently strips every accent from the chat.
    text = re.sub(r"[^a-záéíóúñü0-9,.;:¡!¿?\s']", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def parse_datetime(line: str):
    """
    Extract the datetime of a WhatsApp message line if present.

    The line must start with ``<d>/<m>/<y>`` followed by a comma and/or a
    space, ``<H>:<M>`` and then `` -``. Both 2-digit and 4-digit years are
    accepted, day first.

    Returns a datetime object, or None if the line has no timestamp or the
    timestamp is not a valid date/time.
    """
    match = re.match(r"(\d+/\d+/\d+[, ]\s?\d+:\d+)\s-", line)
    if match is None:
        return None

    # Turn the comma into a space rather than deleting it, so "date,time"
    # (no space) still parses; strptime accepts any run of whitespace.
    stamp = match.group(1).replace(",", " ")
    for fmt in ("%d/%m/%y %H:%M", "%d/%m/%Y %H:%M"):
        try:
            return datetime.strptime(stamp, fmt)
        except ValueError:
            # Wrong year width or an impossible date: try the next format.
            continue
    return None


def group_consecutive_messages(messages, max_gap: timedelta = timedelta(hours=1)):
    """
    Group consecutive messages from the same author into a single turn
    if they are close in time.

    Args:
        messages: iterable of ``(author, text, timestamp)`` tuples in chat
            order; ``timestamp`` may be None.
        max_gap: a message joins the previous turn only when it arrives
            strictly less than this long after the turn's latest message.
            Defaults to one hour.

    Returns:
        A list of ``(author, text, timestamp)`` tuples, one per turn. The
        timestamp of a merged turn is that of its last message. Messages
        without a timestamp are never merged.

    When several messages from the same author are grouped, the MSG_SEP
    token is inserted between them:

        msg_1 <|msg_sep|> msg_2 <|msg_sep|> msg_3
    """
    grouped = []
    for author, msg, ts in messages:
        if grouped:
            prev_author, prev_msg, prev_ts = grouped[-1]
            same_turn = (
                prev_author == author
                and ts is not None
                and prev_ts is not None
                and (ts - prev_ts) < max_gap
            )
            if same_turn:
                # Extend the open turn and move its timestamp forward so the
                # window is measured from the most recent message.
                grouped[-1] = (author, f"{prev_msg} {MSG_SEP} {msg}", ts)
                continue
        grouped.append((author, msg, ts))
    return grouped


def process_whatsapp_chat_with_roles(
    filepath: str,
    target_author: str,
    k_history: int = 4,
    time_gap: timedelta = timedelta(hours=3),
):
    """
    Build <PROMPT, RESPONSE> pairs from a WhatsApp export.

    Turns are defined as consecutive messages from the same author,
    grouped when they are close in time.

    For each turn where `target_author` speaks, up to `k_history` previous
    turns (both authors) are used as context. Walking backwards, the context
    stops at the first pair of neighbouring turns separated by more than
    `time_gap`.

    Roles are made explicit as [target_author] / [OTRO] in the prompt, and messages
    inside each turn are separated by MSG_SEP (<|msg_sep|>).

    Final format:

        PROMPT   = "<|talk|><|ax1|> [OTRO] ... <|msg_sep|> ... [target_author] ... <|ax2|>"
        RESPONSE = " target_author_reply <|endoftext|>"

    Args:
        filepath: path of the exported chat (.txt, UTF-8, optional BOM).
        target_author: author name exactly as it appears in the export.
        k_history: maximum number of previous turns used as context.
        time_gap: maximum pause allowed inside one context window.

    Returns:
        ``(prompts, responses)``: two lists of equal length. Both are empty
        when the file contains no usable message.
    """
    print("Procesando chat (k-turns con roles)...")

    header_re = re.compile(r"\d+/\d+/\d+[, ]\s?\d+:\d+\s-\s([^:]+):\s(.+)")
    messages = []

    def keep(author, raw_msg, ts):
        # Clean one complete message and store it unless it is empty or a
        # WhatsApp system/placeholder text.
        msg = clean_text(raw_msg)
        if msg and not contains_irrelevant_data(msg):
            messages.append((author, msg, ts))

    # --- Read and parse raw lines ---
    # "utf-8-sig" drops a leading BOM, which would otherwise stop the first
    # line from matching the header pattern.
    pending = None  # (author, raw_text, timestamp) of the message being read
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line in f:
            ts = parse_datetime(line)
            match = header_re.match(line)
            if match:
                if pending is not None:
                    keep(*pending)
                pending = (match.group(1).strip(), match.group(2), ts)
            elif ts is not None:
                # Timestamped line without "author: text" (system notice):
                # it closes the current message and is itself discarded.
                if pending is not None:
                    keep(*pending)
                pending = None
            elif pending is not None and line.strip():
                # Line without a timestamp: continuation of a multi-line
                # message, appended to the message being read.
                author, raw_msg, msg_ts = pending
                pending = (author, f"{raw_msg} {line.strip()}", msg_ts)
    if pending is not None:
        keep(*pending)

    if not messages:
        print("No se encontraron mensajes válidos.")
        return [], []

    # --- Group consecutive messages from the same author (turns) ---
    messages = group_consecutive_messages(messages)
    print(f"Total de turnos agrupados: {len(messages)}")

    def fmt_turn(a, m):
        speaker = f"[{target_author}]" if a == target_author else "[OTRO]"
        return f"{speaker} {m}"

    prompts, responses = [], []

    # --- Iterate over turns and build training pairs ---
    for i in range(1, len(messages)):
        author_i, msg_i, ts_i = messages[i]

        # We only care about turns where the target author is responding
        if author_i != target_author:
            continue

        # Collect up to k_history previous turns, newest first
        context = []
        last_ts = ts_i

        for j in range(i - 1, -1, -1):
            a_j, m_j, ts_j = messages[j]

            # Session break if the time gap is too large
            if ts_j and last_ts and (last_ts - ts_j) > time_gap:
                break

            context.append((a_j, m_j))
            if ts_j is not None:
                last_ts = ts_j

            if len(context) >= k_history:
                break

        if not context:
            continue

        # Restore chronological order before formatting
        context.reverse()
        context_str = " ".join(fmt_turn(a, m) for (a, m) in context)

        prompts.append(f"<|talk|><|ax1|> {context_str} <|ax2|>")
        responses.append(f" {msg_i} <|endoftext|>")

    print(f"Total de pares generados: {len(prompts)}")
    return prompts, responses


## Procesamiento interactivo del chat
Esta celda ofrece una interfaz interactiva para cargar el archivo del chat, ingresar el nombre del autor y ajustar parámetros como el historial considerado y el tiempo máximo entre mensajes. Permite procesar el chat sin modificar código.

In [None]:
# --- Input controls for the interactive chat-processing form ---

# Single-file picker restricted to the exported chat (.txt)
chat_uploader = widgets.FileUpload(
    description="Subir chat (.txt)",
    accept=".txt",
    multiple=False,
)

# Free-text field: the target author, spelled exactly as in the export
author_input = widgets.Text(
    placeholder="Nombre exactamente como figura en el chat",
    description="Autor:",
    layout=widgets.Layout(width="60%"),
)

# How many previous turns are used as context (1..10, default 4)
k_history_slider = widgets.IntSlider(
    description="k_history:",
    min=1,
    max=10,
    value=4,
    step=1,
    continuous_update=False,
)

# Pause, in hours, that cuts a conversation session (1..24, default 3)
time_gap_slider = widgets.IntSlider(
    description="time_gap (h):",
    min=1,
    max=24,
    value=3,
    step=1,
    continuous_update=False,
)

# Button that launches the processing
process_button = widgets.Button(
    button_style="primary",
    description="Procesar chat",
)

# Area that captures everything printed while processing
output_proc = widgets.Output()


def on_process_clicked(_):
    """
    Click handler of the "Procesar chat" button.

    Validates the form, writes the uploaded chat to ``./uploaded_chat.txt``,
    runs `process_whatsapp_chat_with_roles` with the slider values and
    publishes the result as the global variables ``prompts`` and
    ``responses``. All messages are printed inside ``output_proc``.
    """
    global prompts, responses

    with output_proc:
        clear_output()

        # Basic validation
        uploaded = chat_uploader.value
        if not uploaded:
            print("Por favor, sube un archivo de chat en formato .txt.")
            return

        target_author = author_input.value.strip()
        if not target_author:
            print("Por favor, ingresa el nombre del autor exactamente como aparece en el chat.")
            return

        # Extract uploaded file content.
        # ipywidgets >= 8 exposes a tuple of dicts; ipywidgets 7 exposes a
        # dict keyed by file name. Both carry the bytes under "content".
        if isinstance(uploaded, dict):
            upload_info = next(iter(uploaded.values()))
        else:
            upload_info = uploaded[0]
        # "utf-8-sig" drops a leading BOM if the export has one
        content = bytes(upload_info["content"]).decode("utf-8-sig", errors="replace")

        # Save to temporary file so we can reuse the existing function
        tmp_path = "./uploaded_chat.txt"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.write(content)

        # Call the processing function and expose results as global variables
        prompts, responses = process_whatsapp_chat_with_roles(
            filepath=tmp_path,
            target_author=target_author,
            k_history=k_history_slider.value,
            time_gap=timedelta(hours=time_gap_slider.value),
        )

        print(f"Pares generados: {len(prompts)}")
        if prompts:
            print("\nEjemplo de PROMPT:")
            print(prompts[0][:500] + ("..." if len(prompts[0]) > 500 else ""))
            print("\nEjemplo de RESPONSE:")
            print(responses[0][:500] + ("..." if len(responses[0]) > 500 else ""))
        print("\nVariables disponibles para las siguientes celdas: 'prompts', 'responses'.")


# Wire the button to its handler, then render the form top to bottom.
process_button.on_click(on_process_clicked)

display(
    widgets.VBox(
        children=[
            widgets.HTML("<b>Procesamiento interactivo del chat de WhatsApp</b>"),
            chat_uploader,
            author_input,
            k_history_slider,
            time_gap_slider,
            process_button,
            output_proc,
        ]
    )
)


## Creación del dataset, limpieza y guardado

Esta celda toma las listas prompts y responses generadas previamente, construye un DataFrame, aplica una limpieza básica (elimina ejemplos triviales y enlaces) y guarda un archivo CSV con el dataset inicial. Además, muestra algunos ejemplos aleatorios para inspeccionar el resultado.

In [None]:
# Safety check: the interactive processing cell must have produced the
# global lists `prompts` and `responses` before a dataset can be built.
if "prompts" not in globals() or "responses" not in globals():
    raise RuntimeError(
        "Las variables 'prompts' y 'responses' no existen. "
        "Ejecuta primero el procesamiento interactivo del chat (sección 4-bis)."
    )

def build_dataset(prompts, responses,
                  min_prompt_len: int = 4,
                  min_response_len: int = 3,
                  filter_links: bool = True):
    """
    Assemble the prompt/response pairs into a cleaned DataFrame.

    Steps, in order:
    1. Rows with a missing prompt or response are dropped.
    2. Rows whose prompt has fewer than `min_prompt_len` whitespace-separated
       words, or whose response has fewer than `min_response_len`, are dropped.
    3. When `filter_links` is true, rows whose prompt or response contains
       "http", "www" or ".com" are dropped.

    The returned frame has the columns "prompt" and "response" and a fresh
    0..n-1 index.
    """
    frame = pd.DataFrame({"prompt": prompts, "response": responses}).dropna()

    # Length filter, measured in whitespace-separated words
    prompt_words = frame["prompt"].str.split().str.len()
    response_words = frame["response"].str.split().str.len()
    long_enough = (prompt_words >= min_prompt_len) & (response_words >= min_response_len)
    frame = frame[long_enough].reset_index(drop=True)

    if not filter_links:
        return frame

    # Link filter: a hit on either side discards the pair
    url_pattern = r"http|www|\.com"
    prompt_has_url = frame["prompt"].str.contains(url_pattern, regex=True)
    response_has_url = frame["response"].str.contains(url_pattern, regex=True)
    return frame[~(prompt_has_url | response_has_url)].reset_index(drop=True)


# Default parameters (aligned with your original thresholds)
min_prompt_len_default = 4
min_response_len_default = 3
filter_links_default = True
output_path_default = "train_data_raw.csv"

# Build dataset with default configuration
data = build_dataset(
    prompts,
    responses,
    min_prompt_len=min_prompt_len_default,
    min_response_len=min_response_len_default,
    filter_links=filter_links_default,
)

# Save preprocessed dataset (every field quoted, so separators inside the
# chat text cannot break the CSV)
output_path = output_path_default
data.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)

print(f"Dataset inicial guardado → {len(data)} pares.")
print("\nVista aleatoria de algunos ejemplos:\n")
# Draw the preview rows in one call (without replacement) so the same
# example is never shown twice.
for _, s in data.sample(n=min(5, len(data))).iterrows():
    print(f"Prompt:\n{s['prompt']}\n→ Response:\n{s['response']}\n{'-'*70}")

## Filtrado semántico global

Esta celda aplica un filtrado semántico basado en embeddings para quedarse únicamente con pares prompt–response que tengan una relación de significado suficientemente fuerte. Se calcula la similitud coseno entre cada prompt y su respuesta, se agregan estos valores al DataFrame y se guarda un dataset filtrado.

In [None]:
# Nothing to embed if the previous cleaning step removed every pair
if data.empty:
    raise RuntimeError(
        "El dataset está vacío: no hay pares para el filtrado semántico."
    )

# Load the sentence transformer model (only once per kernel session)
if "model_emb" not in globals():
    model_emb = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Encode prompts and responses
emb_prompts = model_emb.encode(
    data["prompt"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
)
emb_resps = model_emb.encode(
    data["response"].tolist(),
    convert_to_tensor=True,
    show_progress_bar=True
)

# Cosine similarity between each prompt and its paired response.
# Computed row by row: only the N paired values are needed, so the full
# N x N similarity matrix is never materialized.
similarities = (
    torch.nn.functional.cosine_similarity(emb_prompts, emb_resps, dim=1)
    .cpu()
    .numpy()
)
data["similarity"] = similarities

print(f"Media de similitud: {data['similarity'].mean():.3f}")

# Filtering configuration
SIM_THRESHOLD = 0.30       # minimum prompt/response cosine similarity
MAX_PROMPT_LEN = 600       # in characters
MAX_RESPONSE_LEN = 400     # in characters

# Keep only examples with strong semantic relation and reasonable length
filtered = data[
    (data["similarity"] > SIM_THRESHOLD) &
    (data["prompt"].str.len() < MAX_PROMPT_LEN) &
    (data["response"].str.len() < MAX_RESPONSE_LEN)
].reset_index(drop=True)

output_filtered_path = "filtered_train_data.csv"
filtered.to_csv(output_filtered_path, index=False)

print(f"Dataset final guardado en '{output_filtered_path}'.")
print(f"Total de pares útiles: {len(filtered)} (de {len(data)}).")

# Quick preview of filtered examples (sampled without replacement so no
# example is repeated)
print("\nVista rápida de algunos ejemplos filtrados:\n")
for _, s in filtered.sample(n=min(3, len(filtered))).iterrows():
    print(
        f"Prompt:\n{s['prompt']}\n"
        f"→ Response:\n{s['response']}\n"
        f"Similarity: {s['similarity']:.3f}\n"
        + "-" * 70
    )

### Análisis estadístico y calidad del dataset final

Este bloque analiza el dataset final utilizado para el entrenamiento supervisado del modelo, es decir, el conjunto de datos luego de aplicar todos los filtros de preprocesamiento y limpieza definidos en el pipeline.

El objetivo es verificar la calidad, tamaño y estructura del dataset definitivo, y caracterizar el vocabulario que el modelo aprenderá, evitando mezclar información de etapas intermedias que no participan directamente del entrenamiento.

En particular, se reportan las siguientes métricas:

* **Dataset**: identifica el conjunto analizado (dataset final).

* **Samples**: cantidad total de ejemplos de entrenamiento. Cada ejemplo corresponde a un par *prompt–response* serializado que el modelo debe aprender a continuar.

* **Exact duplicates**: número de ejemplos idénticos detectados.

* **Chars avg**: longitud promedio de los ejemplos medida en caracteres. Proporciona una noción general del tamaño del texto sin depender del tokenizer.

* **Words avg**: cantidad promedio de palabras por ejemplo, utilizada como una medida lingüística intuitiva de longitud.

* **Words p95**: percentil 95 de la longitud en palabras. Indica que el 95 % de los ejemplos tiene una longitud menor o igual a este valor y permite identificar conversaciones largas.

* **Unique words**: cantidad total de palabras distintas presentes en el dataset (tamaño del vocabulario).

* **Vocab richness (%)**: proporción entre palabras únicas y el total de palabras. Valores relativamente bajos son esperables en chats personales y reflejan consistencia de estilo y repetición de expresiones.

* **Tokens avg**: cantidad promedio de tokens por ejemplo según el tokenizer del modelo, que representa lo que efectivamente procesa el modelo durante el entrenamiento.

* **Tokens p95**: percentil 95 de la longitud en tokens. Es una métrica clave para estimar el uso del contexto y prevenir truncamiento.

Además, se incluye un análisis del vocabulario más frecuente del dataset final en dos vistas complementarias:

* **Con muletillas**: ranking de palabras según frecuencia cruda, útil para detectar expresiones dominantes y patrones naturales del habla.

* **Sin muletillas**: ranking de palabras tras remover stopwords y términos técnicos del template, lo que permite identificar contenido más informativo y semánticamente relevante.

Finalmente, se presenta una nube de palabras construida a partir del dataset final (sin stopwords), donde las palabras más frecuentes aparecen con mayor tamaño. Esta visualización ofrece una caracterización rápida del estilo conversacional y los temas predominantes aprendidos por el modelo.


In [None]:
# Optional dependency: the word cloud is skipped when the package cannot be
# imported for any reason.
try:
    from wordcloud import WordCloud
except Exception:
    WORDCLOUD_AVAILABLE = False
else:
    WORDCLOUD_AVAILABLE = True

# Pick the dataset to analyse: the semantically filtered frame when it
# exists, otherwise the unfiltered one.
final_df = None
final_name = None

if isinstance(globals().get("filtered"), pd.DataFrame):
    final_df, final_name = filtered, "Final (filtered)"
elif isinstance(globals().get("data"), pd.DataFrame):
    final_df, final_name = data, "Final (data)"
else:
    raise RuntimeError("No encuentro `filtered` ni `data`. Necesito el dataset final en un DataFrame.")

# The GPT-2 pipeline works on prompt/response pairs, so both columns must exist
required_cols = {"prompt", "response"}
if not required_cols <= set(final_df.columns):
    raise RuntimeError(f"El dataset final debe tener columnas {required_cols}. Columnas actuales: {list(final_df.columns)}")

# Helpers for word stats.
# WORD_RE matches one word: lowercase ASCII letters, Spanish accented vowels,
# ñ, ü, digits and apostrophes. The accented letters must be the real
# characters, otherwise accented words are split into fragments.
WORD_RE = re.compile(r"[a-záéíóúñü0-9']+")
TAG_RE_GENERIC = re.compile(r"<\|.*?\|>")  # removes <|talk|>, <|ax1|>..., <|endoftext|>, etc.

# Exclude target author name from stats if available
AUTHOR_STOP_WORDS = set()
if "author_input" in globals() and getattr(author_input, "value", "").strip():
    AUTHOR_STOP_WORDS = {
        w for w in re.findall(r"[a-záéíóúñü]+", author_input.value.strip().lower())
        if len(w) >= 2
    }

# Technical leftovers / control tokens (after stripping tags these might still appear)
STOP_WORDS_EXTRA = {
    "talk", "ax1", "ax2", "ax3", "ax4", "ax5", "ax6", "ax7", "ax8", "ax9",
    "endoftext", "msg_sep", "otro",
    "user", "assistant", "system",
    "metadata", "knowledge", "cutoff", "date", "today", "reasoning", "mode", "custom", "instructions",
}.union(AUTHOR_STOP_WORDS)

# Spanish stopwords for "informative words"
SPANISH_STOPWORDS = {
    "el","la","los","las","un","una","unos","unas",
    "yo","me","te","se","lo","le","nos","les",
    "de","que","y","o","pero","si","no","es","en","con","por","para",
    "ya","bien","eh","ah","oh","xd","jaja","jajaja"
}

def normalize_for_stats(text: str) -> str:
    """
    Prepare a text for word statistics (never used for training).

    Control tags of the form <|...|> are removed, newlines become spaces,
    the text is lowercased, digit runs are removed and whitespace is
    collapsed. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    without_tags = TAG_RE_GENERIC.sub(" ", text)
    flattened = without_tags.replace("\n", " ").lower()
    without_digits = re.sub(r"\d+", " ", flattened)
    return re.sub(r"\s+", " ", without_digits).strip()

def extract_words(text: str):
    """
    Return the words of `text` used for raw frequency counts.

    Filler words are kept; template leftovers (STOP_WORDS_EXTRA) and
    single-character tokens are discarded.
    """
    kept = []
    for token in WORD_RE.findall(normalize_for_stats(text)):
        if len(token) < 2 or token in STOP_WORDS_EXTRA:
            continue
        kept.append(token)
    return kept

def extract_informative_words(text: str):
    """Return the words of `text` that are not Spanish stopwords."""
    informative = []
    for word in extract_words(text):
        if word not in SPANISH_STOPWORDS:
            informative.append(word)
    return informative

def build_texts_from_pairs(df_pairs: pd.DataFrame):
    """
    Return one string per row: the prompt and the response joined by a
    single space. Both columns are converted to str first.
    """
    prompt_col = df_pairs["prompt"].astype(str)
    response_col = df_pairs["response"].astype(str)
    return [" ".join(pair) for pair in zip(prompt_col, response_col)]

def compute_text_stats(texts):
    """
    Compute summary statistics and the word-frequency table of `texts`.

    Blank and non-string entries are ignored.

    Returns:
        ``(stats, vocab)`` where ``stats`` is a dict with the keys
        dataset, samples, exact_duplicates, chars_avg, words_avg, words_p95,
        unique_words and vocab_richness_pct (plus tokens_avg / tokens_p95
        when a global ``tokenizer`` exists and works), and ``vocab`` is a
        Counter of the words returned by `extract_words`.
    """
    texts = [t for t in texts if isinstance(t, str) and t.strip()]
    n = len(texts)

    # Extract the words of every text once and reuse them for both the
    # length statistics and the vocabulary.
    words_per_text = [extract_words(t) for t in texts]

    char_lengths = [len(t) for t in texts]
    word_lengths = [len(words) for words in words_per_text]

    vocab = Counter()
    for words in words_per_text:
        vocab.update(words)

    stats = {
        "dataset": final_name,
        "samples": n,
        "exact_duplicates": int(pd.Series(texts).duplicated().sum()),
        "chars_avg": float(sum(char_lengths) / max(1, n)),
        "words_avg": float(sum(word_lengths) / max(1, n)),
        "words_p95": float(pd.Series(word_lengths).quantile(0.95)) if n else 0.0,
        "unique_words": int(len(vocab)),
        "vocab_richness_pct": 100.0 * len(vocab) / max(1, sum(vocab.values())),
    }

    # Token stats are optional: the tokenizer is created in a later cell and
    # may not exist yet when this one runs.
    active_tokenizer = globals().get("tokenizer")
    if active_tokenizer is not None:
        try:
            token_lengths = [
                len(active_tokenizer(t, add_special_tokens=False).input_ids)
                for t in texts
            ]
            stats.update({
                "tokens_avg": float(sum(token_lengths) / max(1, n)),
                "tokens_p95": float(pd.Series(token_lengths).quantile(0.95)) if n else 0.0,
            })
        except Exception as exc:
            # Still best-effort, but report why the token columns are missing
            print(f"No se pudieron calcular las estadísticas de tokens: {exc!r}")

    return stats, vocab

def maybe_show_wordcloud(vocab: Counter, title: str, max_words: int = 120):
    """
    Draw a word cloud from the word frequencies in `vocab`.

    Prints an informational message and returns without drawing when the
    wordcloud package is unavailable or `vocab` is empty.

    Args:
        vocab: word -> frequency.
        title: figure title.
        max_words: maximum number of words shown in the cloud.
    """
    if not WORDCLOUD_AVAILABLE:
        print("ℹ️ WordCloud no está disponible (instalá: pip install wordcloud) para ver la nube de palabras.")
        return
    if not vocab:
        print("ℹ️ No hay vocabulario suficiente para generar la nube de palabras.")
        return

    wc = WordCloud(
        width=1200,
        height=600,
        background_color="white",
        max_words=max_words,
        collocations=False
    ).generate_from_frequencies(dict(vocab))

    plt.figure(figsize=(12, 6))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()


# One text per example: prompt and response concatenated
final_texts = build_texts_from_pairs(final_df.dropna(subset=["prompt", "response"]).reset_index(drop=True))

stats, vocab_raw = compute_text_stats(final_texts)
summary_df = pd.DataFrame([stats])

print("\n📊 Resumen estadístico (dataset final)")
display(summary_df)

# Top words: with / without stopwords
TOP_K = 20
print(f"\n🔝 Top {TOP_K} palabras (dataset final)")

top_raw = pd.DataFrame(vocab_raw.most_common(TOP_K), columns=["palabra", "frecuencia"])

vocab_info = Counter()
for t in final_texts:
    vocab_info.update(extract_informative_words(t))

top_info = pd.DataFrame(vocab_info.most_common(TOP_K), columns=["palabra", "frecuencia"])

# Pad both rankings to the same height so they can sit side by side
max_len = max(len(top_raw), len(top_info))
top_raw = top_raw.reindex(range(max_len))
top_info = top_info.reindex(range(max_len))

# Empty column that visually separates the two rankings
spacer = pd.DataFrame({"": [""] * max_len})
spacer.columns = pd.MultiIndex.from_product([[""], [""]])

top_raw.columns = pd.MultiIndex.from_product([["Con muletillas"], top_raw.columns])
top_info.columns = pd.MultiIndex.from_product([["Sin muletillas"], top_info.columns])

display(pd.concat([top_raw, spacer, top_info], axis=1))

# Word cloud
print("\n🖼️ Nube de palabras (sin stopwords) - dataset final")
maybe_show_wordcloud(vocab_info, title="Nube de palabras (sin stopwords) - Dataset final")

# Brief info
print("\n🧾 Información para el informe")
print("• Modelo objetivo: GPT-2 (entrenamiento supervisado con pares prompt–response)")
print("• Tokens de control del dataset: <|endoftext|>, <|talk|>, <|ax1|>...<|ax9|> (se eliminan solo para estadísticas)")
print(f"• Dataset analizado: {final_name} → {len(final_df)} ejemplos")

## Carga del modelo base o fine-tune anterior

Esta celda carga el modelo base seleccionado, inicializa el tokenizer, registra los tokens especiales utilizados en el formato de conversación y ajusta las dimensiones del modelo para incluirlos. Finalmente, mueve el modelo al dispositivo correspondiente (CPU o GPU).

In [None]:
# ============================================================
# 7. LOAD THE BASE MODEL (OR A PREVIOUS FINE-TUNE)
# ============================================================
model_name = "DeepESP/gpt2-spanish"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register special tokens used in the conversational format
special_tokens = {
    "additional_special_tokens": [MSG_SEP, f"[{author_input.value.strip()}]", "[OTRO]"]
}

# Padding token: use a dedicated one instead of reusing EOS.
# DataCollatorForLanguageModeling (mlm=False) sets the label of every
# position holding the pad token id to -100. If pad and EOS share an id,
# the real <|endoftext|> that closes each response is masked as well and
# the model is never trained to stop.
if tokenizer.pad_token is None or tokenizer.pad_token_id == tokenizer.eos_token_id:
    special_tokens["pad_token"] = "<|pad|>"

tokenizer.add_special_tokens(special_tokens)

# Load base model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Resize embeddings to include new special tokens
model.resize_token_embeddings(len(tokenizer))

# Move model to CPU/GPU
model.to(device)

print(f"Modelo cargado: {model_name}")
print(f"Total de tokens en el tokenizer: {len(tokenizer)}")
print(f"Ejecutando en: {device}")


## Tokenización robusta para entrenamiento

Esta celda carga el dataset filtrado, separa los datos en entrenamiento y validación, y construye una función de tokenización adecuada para entrenamiento causal (SFT). Cada ejemplo se tokeniza de manera independiente (sin concatenación), y las etiquetas (labels) corresponden directamente a los input_ids.

In [None]:
# Reload the filtered pairs from disk and discard rows whose prompt or
# response is missing or blank; the cleaned table overwrites the same file.
data = pd.read_csv("filtered_train_data.csv").dropna(subset=["prompt", "response"])
data = data[data["prompt"].str.strip().ne("") & data["response"].str.strip().ne("")]
data.to_csv("filtered_train_data.csv", index=False)

# Read the cleaned CSV back as a HuggingFace dataset
dataset = load_dataset("csv", data_files="filtered_train_data.csv")

# Hold out 20% of the examples for validation
train_test = dataset["train"].train_test_split(test_size=0.2)
datasets = DatasetDict(
    {
        "train": train_test["train"],
        "validation": train_test["test"],
    }
)

def tokenize_function(examples):
    """
    Tokenize a batch of (prompt, response) pairs, one training example per pair.

    Each example is the stripped prompt and the stripped response joined by
    one space, truncated/padded to 256 tokens. Labels are a copy of
    input_ids (causal LM training).

    The output always has exactly one row per input row. A non-string
    prompt or response is treated as empty text instead of being skipped;
    returning fewer rows than the batch received would make a batched
    `datasets.map` fail, because the columns it keeps retain their length.
    """
    def _as_text(value):
        return value.strip() if isinstance(value, str) else ""

    texts = [
        f"{_as_text(p)} {_as_text(r)}"
        for p, r in zip(examples["prompt"], examples["response"])
    ]

    encodings = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Causal LM: labels = input_ids (copied per row so later edits to one
    # do not affect the other)
    encodings["labels"] = [list(ids) for ids in encodings["input_ids"]]
    return encodings

# Tokenize both splits in batches. The raw text columns are dropped from the
# result; any other column of the CSV is left in place.
tokenized = datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=["prompt", "response"],
)

# Name used by the Trainer cell for the tokenized splits
lm_datasets = tokenized
print(lm_datasets)

## Argumentos de entrenamiento

Define los hiperparámetros principales de entrenamiento para el fine-tuning del modelo (número de épocas, batch size, tasa de aprendizaje, etc.).

In [None]:
# Hyperparameters for the fine-tuning run. Checkpoints go to a directory
# named after the author typed in the interactive form (the name is used
# as-is, without sanitizing).
training_args = TrainingArguments(
    output_dir = f"./models/{author_input.value.strip()}_whatsapp_gpt2",
    num_train_epochs=3,              # few epochs for a final refinement
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # gradients accumulated over 4 batches of 2
    learning_rate=1e-5,              # small LR to preserve base style
    save_strategy="epoch",           # one checkpoint per epoch
    eval_strategy="no",        # no evaluation during training
    logging_steps=10,
    report_to=[],                    # disable external loggers
)

## Configuración del Trainer

Configura el objeto Trainer que orquesta el loop de entrenamiento, conectando el modelo, los datos tokenizados y los argumentos de entrenamiento. Se usa el DataCollatorForLanguageModeling en modo causal (sin MLM).

In [None]:
# Trainer that ties together the model, the tokenized splits and the
# training arguments. The validation split is passed in, but
# `training_args` disables evaluation during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    # Collator in causal-LM mode (no masked-LM objective)
    data_collator=DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,   # causal LM, not masked LM
    ),
)

## Entrenamiento y guardado del modelo

Esta celda entrena el modelo y guarda los pesos y el tokenizer en un directorio cuyo nombre se genera automáticamente en función del autor seleccionado en la etapa de procesamiento interactivo.


In [None]:
# Destination of the fine-tuned model. Starts from the generic folder and
# switches to an author-specific one when the form widget exists and the
# author name is not empty.
save_dir = "./bot_model"
if "author_input" in globals():
    author_name = author_input.value.strip()
    # Folder-safe name: anything other than letters, digits, "_" or "-"
    # becomes "_"
    safe_author = re.sub(r"[^A-Za-z0-9_-]", "_", author_name)
    if safe_author:
        save_dir = f"./bot_{safe_author}"

print(f"Guardando el modelo en: {save_dir}")

# Train; memory is released afterwards whether or not training succeeded
try:
    trainer.train()
finally:
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

# Persist the weights and the tokenizer side by side
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"Modelo y tokenizer guardados en: {save_dir}")

## Comparador de respuestas entre modelos

Esta celda carga el modelo base y el modelo fine-tuneado, y define un comparador para generar respuestas con ambos a partir de un mismo prompt. De esta forma se puede inspeccionar cualitativamente el efecto del fine-tuning.

In [None]:
# Ensure save_dir exists (from training step)
if "save_dir" not in globals():
    save_dir = "./bot_model"  # fallback directory if not defined

# Build author tag used during preprocessing and training
AUTHOR_TAG = f"[{author_input.value.strip()}]"

# Load the untouched base model and resize its embeddings to the in-memory
# tokenizer, so both models accept the same token ids. The rows added by the
# resize have never been trained (presumably freshly initialised).
base_model = AutoModelForCausalLM.from_pretrained("DeepESP/gpt2-spanish")
base_model.resize_token_embeddings(len(tokenizer))
base_model.to(device)
base_model.eval()

# Load fine-tuned model from the directory written by the training cell
fine_model = AutoModelForCausalLM.from_pretrained(save_dir).to(device)
fine_model.eval()


def generate_response(
    prompt: str,
    model,
    tokenizer,
    max_new_tokens: int = 60
) -> str:
    """
    Generate a reply to ``prompt`` using the conversational training format.

    The plain-text prompt (no tags) is wrapped as a message from the other
    participant (``[OTRO]``) followed by the author's tag, so the model
    continues the conversation as the author.

    Only the newly generated tokens are decoded. The prompt is never part of
    the decoded text, so control tags that happen to appear in ``prompt``
    cannot truncate or leak into the reply.

    Args:
        prompt: User message in plain text.
        model: Causal language model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The cleaned reply, or ``"(no generated response)"`` when nothing
        remains after cleanup.
    """

    # Construct formatted input according to training structure
    formatted_prompt = (
        f"<|talk|><|ax1|> [OTRO] {prompt.strip()} <|ax2|> {AUTHOR_TAG} "
    )

    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.8,
        top_p=0.92,
        repetition_penalty=1.9,
        no_repeat_ngram_size=3,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # The returned sequence starts with the prompt tokens; skip them and
    # decode only the continuation produced by the model.
    prompt_length = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=False,
    )

    # Truncate at end-of-text tag if present
    generated = generated.split("<|endoftext|>", 1)[0]

    # Remove role tags and internal separators from visible output
    generated = generated.replace(AUTHOR_TAG, "")
    generated = generated.replace("[OTRO]", "")
    generated = generated.replace(MSG_SEP, " ")

    # Final whitespace cleanup
    response = re.sub(r"\s+", " ", generated).strip()
    return response if response else "(no generated response)"


def compare_models(prompt: str):
    """Print the base and fine-tuned replies to ``prompt``, one after the other."""
    rule = "----------------------------------"

    print(f"\nPrompt: {prompt}")

    # Generate both replies before printing any of them (base model first)
    replies = (
        ("Base model", generate_response(prompt, base_model, tokenizer)),
        ("Fine-tuned model", generate_response(prompt, fine_model, tokenizer)),
    )

    print("\n" + rule)
    for title, reply in replies:
        print(f"{title}:\n", reply)
        print(rule)


# Extract prompts from validation set and sample a few
val_prompts = [ex["prompt"] for ex in datasets["validation"]]
sample_prompts = random.sample(val_prompts, min(5, len(val_prompts)))

print("\n=== Model comparison using validation samples ===")
for q in sample_prompts:
    # Remove framework tags and internal separators
    clean_q = (
        q.replace("<|talk|>", "")
         .replace("<|ax1|>", "")
         .replace("<|ax2|>", "")
         .replace("<|endoftext|>", "")
         .replace(MSG_SEP, " ")
         .replace(AUTHOR_TAG, "")
         .replace("[OTRO]", "")
         .strip()
    )
    # NOTE: comparison on validation prompts is intentionally disabled;
    # re-enable the call below to run it.
    # compare_models(clean_q)

# Hand-written prompts for the qualitative comparison. Defined at top level
# (not inside the loop above) so the list exists even when the validation
# sample is empty.
mis_preguntas = [
    "Hola, c√≥mo est√°s?",
    "Eu sale algo el finde?",
    "Qu√© opin√°s de la programaci√≥n?",
    "Me aburrooo, contame algo",
    "Nos vemos m√°s tarde?",
]

print("\n=== COMPARACI√ìN DE MODELOS ===")
for p in mis_preguntas:
    compare_models(p)


## Chat interactivo

Esta celda permite interactuar con el modelo fine-tuneado mediante entrada por consola. El usuario puede escribir mensajes y recibir respuestas del modelo. Para finalizar la sesión, escribir `salir`.

In [None]:
def clean_user_input(text: str) -> str:
    """Strip training markup from user text so it cannot break the prompt format."""
    # (marker, replacement) pairs, applied strictly in this order
    substitutions = (
        ("<|talk|>", ""),
        ("<|ax1|>", ""),
        ("<|ax2|>", ""),
        ("<|endoftext|>", ""),
        (MSG_SEP, " "),
        (AUTHOR_TAG, ""),
        ("[OTRO]", ""),
    )

    cleaned = text
    for marker, replacement in substitutions:
        cleaned = cleaned.replace(marker, replacement)
    return cleaned.strip()

print("=== Interactive chat with the fine-tuned model ===")
print("(Type 'salir' to exit)\n")

while True:
    try:
        user_msg = input("You: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted kernel: end the session cleanly
        # instead of surfacing a traceback.
        print("\nModel: session finished.")
        break

    if user_msg.lower() == "salir":
        print("Model: session finished.")
        break

    # Remove unwanted tags and tokens
    user_msg = clean_user_input(user_msg)

    # Nothing left to answer (blank line, or a message made only of tags)
    if not user_msg:
        continue

    # Generate model response
    response = generate_response(user_msg, fine_model, tokenizer)
    print(f"Model: {response}\n")

## Evaluación cuantitativa y reporte final

Esta celda evalúa cuantitativamente el modelo base y el modelo fine-tuneado sobre el conjunto de validación. Se calcula la pérdida media de cross-entropy y la perplexity aproximada, y se reporta la mejora relativa del modelo fine-tuneado. Finalmente, se muestra una comparación gráfica sencilla entre ambos modelos.

In [None]:
# Apply seaborn's "whitegrid" theme before the perplexity bar plot drawn at
# the end of this cell.
sns.set_theme(style="whitegrid")

def evaluate_model(model, dataset, max_batches: int = 200):
    """
    Evaluate a causal language model on the first examples of a dataset.

    Each example is fed on its own (batch size 1) with ``input_ids`` used as
    both inputs and labels. The reported cross-entropy is the plain average
    of the per-example losses, and perplexity is derived as ``exp(loss)``.

    Args:
        model: Causal language model returning an object with ``.loss`` when
            called with ``labels``.
        dataset: Iterable of examples, each providing ``"input_ids"``.
        max_batches: Maximum number of examples to evaluate (one example per
            forward pass).

    Returns:
        Dict with keys ``"cross_entropy"`` and ``"perplexity"``; perplexity
        is ``inf`` when ``exp`` overflows.

    Raises:
        ValueError: If no example was evaluated (empty dataset or
            ``max_batches <= 0``).
    """
    from itertools import islice

    limit = max(max_batches, 0)

    # Size the progress bar to the evaluated subset, not the whole dataset
    try:
        total = min(len(dataset), limit)
    except TypeError:
        total = None  # dataset does not expose a length

    model.eval()
    losses = []

    with torch.no_grad():
        subset = islice(dataset, limit)
        for example in tqdm(subset, total=total, desc="Evaluando..."):
            # NOTE(review): if examples are padded, padded positions also
            # count towards the loss here - confirm against preprocessing.
            input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(device)
            outputs = model(input_ids, labels=input_ids)
            losses.append(outputs.loss.item())

    if not losses:
        raise ValueError(
            "evaluate_model received no examples to evaluate "
            "(empty dataset or max_batches <= 0)"
        )

    avg_loss = sum(losses) / len(losses)
    try:
        perp = math.exp(avg_loss)
    except OverflowError:
        perp = float("inf")

    return {"cross_entropy": avg_loss, "perplexity": perp}


# Score both models on the same validation split (base first, then fine-tuned)
metrics_base, metrics_fine = (
    evaluate_model(candidate, lm_datasets["validation"])
    for candidate in (base_model, fine_model)
)

print("\nResultados de evaluaci√≥n:")
for row_label, row_metrics in (
    ("Base model", metrics_base),
    ("Fine-tuned", metrics_fine),
):
    print(
        f"{row_label}  ‚Üí Cross-Entropy: {row_metrics['cross_entropy']:.3f} | "
        f"Perplexity: {row_metrics['perplexity']:.2f}"
    )

# Perplexity drop of the fine-tuned model, as a percentage of the base value
base_ppl = metrics_base["perplexity"]
fine_ppl = metrics_fine["perplexity"]
improvement = (base_ppl - fine_ppl) / base_ppl * 100

print(f"Mejora relativa en perplexity: {improvement:.2f}%")

# Side-by-side bars: one per model
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=["Base (GPT-2)", "Fine-tuned"],
    y=[base_ppl, fine_ppl],
    palette=["#95a5a6", "#2ecc71"],
    ax=ax,
)
ax.set_title("Comparaci√≥n de perplexity")
ax.set_ylabel("Perplexity")
ax.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()