# Proyecto 4: Pipeline de preprocesamiento

Usando el dataset de 5,000 oraciones de `nlp_prueba_cc0c2_large.csv`

In [1]:
# Print the installed torchtext version to document the environment.
# NOTE(review): the saved output shows torchtext 0.17.0+cpu running next to
# NumPy 2.3.3, and a compiled dependency (presumably torch/torchtext, built
# against NumPy 1.x) emits a compatibility warning on import. The warning
# itself suggests pinning 'numpy<2' or upgrading the affected module.
import torchtext
print(torchtext.__version__)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/site-packages/tornado

0.17.0+cpu


In [2]:
# The corpus is stored in a pandas DataFrame.
import pandas as pd

df = pd.read_csv("data/nlp_prueba_cc0c2_large.csv")

# Fail early if the expected text column is missing; every later cell reads
# df["Texto"].
assert "Texto" in df.columns, "El CSV debe tener la columna 'Texto'"

# Keep the preview as the LAST expression of the cell: Jupyter only renders
# the value of the final expression, so a mid-cell df.head() is discarded.
df.head()

In [3]:
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchdata.datapipes.iter import IterableWrapper, Mapper
import torchtext


## Tres pipelines

### Crudo

In [None]:
# -*- coding: utf-8 -*-
import re
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator

# ============ 
# 1) Dataset base (ejemplo: df["Texto"])
# ============
# Drop missing rows before casting: astype(str) would turn NaN into the
# literal string "nan", which would then be tokenized and end up in the
# vocabulary as if it were a real word.
texts = df["Texto"].dropna().astype(str).tolist()

print(f"Total size (usado para vocab): {len(texts)}")

# ============ 
# 2) Tokenizador crudo (regex)
# ============
# Raw tokenizer pattern. A token is either:
#   * a run of letters (Spanish accented letters included), optionally joined
#     to further letter runs by "-" or "'", or
#   * a run of digits with an optional "." or "," decimal part.
tokenizer_regex = re.compile(
    r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+(?:[-'][A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+)*|\d+(?:[.,]\d+)?"
)


def tokenize_crudo(text: str) -> list[str]:
    """Split ``text`` into raw tokens without changing case or accents.

    Anything the pattern does not match (whitespace, punctuation, symbols)
    is simply skipped.

    Args:
        text: sentence to tokenize.

    Returns:
        The matched tokens, in order of appearance.
    """
    return [match.group(0) for match in tokenizer_regex.finditer(text)]

# ============ 
# 3) Vocabulario desde TODO el corpus
# ============
def yield_tokens(sentences):
    """Lazily yield the token list of each sentence, in input order.

    Args:
        sentences: iterable of strings.

    Yields:
        The result of ``tokenize_crudo`` for each sentence.
    """
    yield from map(tokenize_crudo, sentences)

# Special tokens: <unk> for out-of-vocabulary words, <pad> for batch padding.
specials = ["<unk>", "<pad>"]

# min_freq=1 keeps every token that appears at least once in the corpus.
vocab = build_vocab_from_iterator(
    yield_tokens(texts),
    min_freq=1,
    specials=specials,
)

# Resolve the special indices once and reuse them below.
UNK_IDX = vocab["<unk>"]
PAD_IDX = vocab["<pad>"]

# Any token that is not in the vocabulary maps to <unk> instead of raising.
vocab.set_default_index(UNK_IDX)

print("Vocab size:", len(vocab))
print("Muestra de vocab:", vocab.get_itos()[:20])

# ============ 
# 4) Dataset y DataLoader (opcional)
# ============
class CustomDataset(Dataset):
    """Map-style dataset that converts each sentence into a tensor of token ids."""

    def __init__(self, sentences, tokenizer, vocab):
        """Store the raw sentences together with the tokenizer and vocabulary.

        Args:
            sentences: indexable, sized collection of strings.
            tokenizer: callable that maps one sentence to a list of tokens.
            vocab: object exposing ``lookup_indices(tokens)``.
        """
        self.sentences, self.tokenizer, self.vocab = sentences, tokenizer, vocab

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the ids of sentence ``idx`` as a ``torch.long`` tensor."""
        sentence = self.sentences[idx]
        token_ids = self.vocab.lookup_indices(self.tokenizer(sentence))
        return torch.tensor(token_ids, dtype=torch.long)

def collate_fn(batch_tensors):
    """Pad a batch of variable-length id tensors into a single tensor.

    Shorter sequences are filled with ``PAD_IDX``; the batch dimension
    comes first in the result.
    """
    padded_batch = pad_sequence(
        batch_tensors,
        padding_value=PAD_IDX,
        batch_first=True,
    )
    return padded_batch

# Wrap the whole corpus with the raw tokenizer and the vocabulary built above.
dataset = CustomDataset(sentences=texts, tokenizer=tokenize_crudo, vocab=vocab)

# Shuffled mini-batches of 4 sentences, padded per batch by collate_fn.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)

# ============ 
# 5) Función de OOV
# ============
def oov_rate(sentences, tokenizer, vocab, unk_idx):
    """Compute the fraction of tokens that map to the unknown index.

    Args:
        sentences: iterable of strings.
        tokenizer: callable that maps one sentence to a list of tokens.
        vocab: object exposing ``lookup_indices(tokens)``.
        unk_idx: index that marks an out-of-vocabulary token.

    Returns:
        ``unknown / total`` over all sentences, or ``0.0`` when there are
        no tokens at all.
    """
    n_tokens = 0
    n_unknown = 0
    for sentence in sentences:
        token_ids = vocab.lookup_indices(tokenizer(sentence))
        n_tokens += len(token_ids)
        n_unknown += sum(token_id == unk_idx for token_id in token_ids)

    # Guard against division by zero on empty input.
    if n_tokens == 0:
        return 0.0
    return n_unknown / n_tokens

# ============ 
# 6) Ejemplos de code-switching
# ============
# Hand-written sentences that mix Spanish and English, used to see how the
# raw pipeline handles words from a second language.
code_switching_examples = [
    "The course de NLP es impresionante",
    "Los proyectos are done rápidamente",
    "Procesar text works impresionante"
]

# NOTE(review): the output saved under this cell does not match this code
# (it prints "Train size", "=== Ejemplo TEST ===" and a shorter second
# sentence) and the cell has no execution count — re-run before sharing.
print("\n=== Code-switching ===")
for s in code_switching_examples:
    # Tokenize and map to ids; tokens missing from the vocab resolve to
    # the default index set on the vocab.
    toks = tokenize_crudo(s)
    ids = vocab.lookup_indices(toks)
    # OOV rate of this single sentence (oov_rate expects a list of sentences).
    rate = oov_rate([s], tokenize_crudo, vocab, UNK_IDX)
    print("Texto: ", s)
    print("Tokens:", toks)
    print("Idxs:  ", ids)
    print("OOV rate:", f"{rate:.4f}\n")


Train size: 3003, Test size: 2002
Vocab size (train): 73
Muestra de vocab: ['<unk>', '<pad>', 'en', 'de', 'es', 'para', 'Implementar', 'proyectos', 'reales', 'Los', 'pero', 'son', 'No', 'entiendo', 'cómo', 'funciona', 'la', 'La', 'NLP', 'curso']
OOV rate (test vs train vocab): 0.0000

=== Ejemplo TEST ===
Texto:  The course de NLP es impresionante
Tokens: ['The', 'course', 'de', 'NLP', 'es', 'impresionante']
Idxs:   [0, 0, 3, 18, 4, 45] 

Texto:  Los proyectos are done
Tokens: ['Los', 'proyectos', 'are', 'done']
Idxs:   [9, 7, 0, 0] 

Texto:  Procesar text works impresionante
Tokens: ['Procesar', 'text', 'works', 'impresionante']
Idxs:   [0, 0, 0, 45] 



### Normalizado (minúsculas, sin puntuación)

### Normalizado + lematizado (spaCy en español)