<a href="https://colab.research.google.com/github/ArmandoGalaz/prueba/blob/main/Preprocesamiento.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: mount drive
# Mount Google Drive at /content/drive; the dataset paths used further down
# in this notebook all live under that mount point.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# spaCy provides the language model used for lemmatization and token vectors.
!pip install spacy



In [None]:
# num2words is used by the text normalizer to spell out digit sequences.
!pip install num2words



In [None]:
# fasttext is imported lazily by the FastText vectorization method.
!pip install fasttext



In [None]:
# transformers + torch are imported lazily by the BERT / RoBERTa / RoBERTuito vectorizers.
!pip install transformers torch



In [None]:
import numpy as np
import pandas as pd
import re
from typing import List, Union, Optional
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from num2words import num2words
import nltk

In [None]:
# NLTK resources: stop-word lists and the Punkt models used by word_tokenize.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# Recent NLTK releases load Punkt data from the 'punkt_tab' package instead of
# the pickled 'punkt' one; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab', quiet=True)

True

In [None]:
# Load the source dataset from the shared Drive folder (Drive must be mounted first).
ruta_dataset = "/content/drive/Shareddrives/Titulo 2/Dataset/dataset.csv"
df_limpio = pd.read_csv(ruta_dataset)

In [None]:
# -*- coding: utf-8 -*-
"""Preprocesamiento con FastText y RoBERTa-base

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1YcupHkdfYCImhRCOLTPxyZltdKgYd_WI
"""

class TextPreprocessor:
    """Clean, tokenize and vectorize Spanish or English text.

    The class bundles three stages:

    * normalization (contraction expansion, digits to words, symbol removal),
    * tokenization / stop-word removal / lemmatization,
    * vectorization with TF-IDF, spaCy vectors, FastText or transformer
      encoders (BERT, RoBERTa, RoBERTuito).

    Heavy models are loaded lazily on first use and cached on the instance.
    """

    # Pooling strategies understood by the transformer-based vectorizers.
    _POOLING_STRATEGIES = ("mean", "cls", "max")

    def __init__(self, lang: str = "es"):
        """Set up language resources.

        Args:
            lang: Processing language; ``"es"`` selects Spanish resources,
                any other value selects English ones.
        """
        self.lang = lang
        self.stop_words = self._get_enhanced_stopwords(lang)
        self.tokenizer = TweetTokenizer()
        self.stemmer = SnowballStemmer("spanish" if lang == "es" else "english")

        # spaCy model used for lemmatization and token vectors. spacy.load
        # raises OSError when the package is not installed; download it once
        # and retry instead of swallowing every possible exception.
        spacy_model = "es_core_news_lg" if lang == "es" else "en_core_web_lg"
        try:
            self.nlp = spacy.load(spacy_model)
        except OSError:
            spacy.cli.download(spacy_model)
            self.nlp = spacy.load(spacy_model)

        # Fitted TF-IDF vectorizer from the most recent _tfidf_vectorize call.
        self.tfidf_vectorizer = None

        # Lazily loaded BERT tokenizer/model and the name they were loaded from.
        self.bert_tokenizer = None
        self.bert_model = None
        self.current_bert_model = None

        # Lazily loaded RoBERTa tokenizer/model and the name they were loaded from.
        self.roberta_tokenizer = None
        self.roberta_model = None
        self.current_roberta_model = None

        # Lazily loaded FastText model and the path/name it was loaded from.
        self.fasttext_model = None
        self.current_fasttext_model = None

        # Lazily loaded RoBERTuito tokenizer/model and the name they were loaded from.
        self.robertuito_tokenizer = None
        self.robertuito_model = None
        self.current_robertuito_model = None

        # Informal abbreviations / contractions and their expansions, per language.
        self.contractions = {
            "es": {
                "q": "que",
                "d": "de",
                "x": "por",
                "xa": "para",
                "xk": "porque",
                "dl": "del",
                "tb": "también"
            },
            "en": {
                "don't": "do not",
                "can't": "cannot",
                "won't": "will not",
                "i'm": "I am"  # the pronoun keeps its capital letter
            }
        }

    def normalize_text(self, text: str) -> str:
        """Normalize *text* without changing its letter case.

        Steps, in order:
        1. expand the contractions configured for ``self.lang``;
        2. replace digit runs with their spelled-out form;
        3. replace every character that is not a word character, whitespace
           or one of ``. , ! ? ¿ ¡`` with a space;
        4. collapse whitespace runs and strip the ends.

        Args:
            text: Input text; non-string values are converted with ``str``.

        Returns:
            The normalized string.
        """
        text = str(text)

        for cont, exp in self.contractions.get(self.lang, {}).items():
            # A capitalized contraction opening the text keeps its capital.
            capitalized = cont.capitalize()
            if text.startswith(capitalized):
                text = re.sub(rf"\b{re.escape(capitalized)}\b",
                              lambda _m, rep=exp.capitalize(): rep, text)
            # Every remaining occurrence, in any letter case.
            text = re.sub(rf"\b{re.escape(cont)}\b",
                          lambda _m, rep=exp: rep, text, flags=re.IGNORECASE)

        def num_to_words(match):
            num = match.group()
            try:
                words = num2words(int(num), lang=self.lang)
            except Exception:
                # Best effort: keep the digits if the conversion is unavailable
                # (unsupported language, number out of range, ...).
                return num

            # A number opens a sentence when nothing but whitespace precedes it,
            # when it directly follows an opening mark, or when it follows a
            # sentence terminator separated by whitespace. Requiring the
            # whitespace keeps "3.14" from capitalizing its fractional part.
            before = text[:match.start()]
            trimmed = before.rstrip()
            starts_sentence = (
                not trimmed
                or trimmed[-1] in "¿¡"
                or (trimmed != before and trimmed[-1] in ".!?")
            )
            return words.capitalize() if starts_sentence else words

        text = re.sub(r'\d+', num_to_words, text)

        # Drop special characters but keep basic (Spanish) punctuation.
        text = re.sub(r"[^\w\s.,!?¿¡]", " ", text)

        return re.sub(r"\s+", " ", text).strip()

    def tokenize(self, text: str, advanced: bool = True) -> List[str]:
        """Split *text* into tokens.

        Args:
            text: Text to tokenize.
            advanced: Use the instance's ``TweetTokenizer`` when True,
                otherwise NLTK's ``word_tokenize`` with the Punkt model that
                matches ``self.lang``.

        Returns:
            The list of tokens.
        """
        if advanced:
            return self.tokenizer.tokenize(text)
        # word_tokenize defaults to English; pick the model for our language.
        return word_tokenize(
            text, language="spanish" if self.lang == "es" else "english"
        )

    def _get_enhanced_stopwords(self, lang: str) -> set:
        """Return the NLTK stop words for *lang* plus, for Spanish, variants.

        For ``lang == "es"`` the capitalized and upper-case form of every stop
        word is added, together with a few accented pronouns.
        """
        base_stopwords = set(stopwords.words("spanish" if lang == "es" else "english"))

        enhanced_stopwords = set(base_stopwords)
        if lang == "es":
            enhanced_stopwords.update(word.capitalize() for word in base_stopwords)
            enhanced_stopwords.update(word.upper() for word in base_stopwords)
            enhanced_stopwords.update({'él', 'ésta', 'éstas', 'éstos', 'Él'})

        return enhanced_stopwords

    def remove_stopwords(self, tokens: List[str]) -> List[str]:
        """Return *tokens* without stop words (matched as-is and lower-cased)."""
        return [token for token in tokens
                if token.lower() not in self.stop_words
                and token not in self.stop_words]

    def lemmatize(self, tokens: List[str]) -> List[str]:
        """Lemmatize *tokens* with spaCy.

        The tokens are joined with spaces and re-parsed by the spaCy pipeline,
        so the output length may differ from the input length. Empty lemmas
        are dropped.
        """
        doc = self.nlp(" ".join(tokens))
        return [token.lemma_ for token in doc if token.lemma_.strip() != ""]

    def preprocess_text(self, text: str,
                       normalize: bool = True,
                       tokenize: bool = True,
                       remove_stopwords: bool = True,
                       lemmatize: bool = True,
                       advanced_tokenize: bool = True) -> Union[str, List[str]]:
        """Run the preprocessing pipeline on a single text.

        Args:
            text: Text to process.
            normalize: Apply ``normalize_text`` first.
            tokenize: Tokenize; when False the (optionally normalized) text is
                returned and the remaining flags are ignored.
            remove_stopwords: Drop stop words from the tokens.
            lemmatize: Lemmatize the tokens.
            advanced_tokenize: TweetTokenizer (True) or word_tokenize (False).

        Returns:
            A ``str`` when ``tokenize`` is False, otherwise a list of tokens.
        """
        if normalize:
            text = self.normalize_text(text)

        if not tokenize:
            return text

        tokens = self.tokenize(text, advanced=advanced_tokenize)

        if remove_stopwords:
            tokens = self.remove_stopwords(tokens)

        if lemmatize:
            tokens = self.lemmatize(tokens)

        return tokens

    def vectorize(self, texts: Union[List[str], List[List[str]]],
                  method: str = "tfidf",
                  **kwargs) -> np.ndarray:
        """Vectorize *texts* with the selected method.

        Args:
            texts: Documents, each either a string or a list of tokens.
            method: One of 'tfidf', 'spacy', 'bert', 'roberta', 'fasttext',
                'robertuito'.
            **kwargs: Forwarded to the method-specific vectorizer.

        Returns:
            A 2-D array with one row per document.

        Raises:
            ValueError: If *method* is not supported.
        """
        handlers = {
            "tfidf": self._tfidf_vectorize,
            "spacy": self._spacy_vectorize,
            "bert": self._bert_vectorize,
            "roberta": self._roberta_vectorize,
            "fasttext": self._fasttext_vectorize,
            "robertuito": self._robertuito_vectorize,
        }
        handler = handlers.get(method)
        if handler is None:
            raise ValueError(f"Método de vectorización no soportado: {method}")
        return handler(texts, **kwargs)

    @staticmethod
    def _as_strings(texts) -> List[str]:
        """Return one string per document, joining token lists with spaces.

        Each item is inspected individually, so empty input and mixed
        string / token-list input are both handled.
        """
        return [doc if isinstance(doc, str) else " ".join(doc) for doc in texts]

    @staticmethod
    def _pool_hidden_states(hidden, attention_mask, pooling_strategy: str = "mean"):
        """Reduce per-token hidden states to one vector per sequence.

        Args:
            hidden: Tensor indexed as (batch, token, feature).
            attention_mask: Tensor indexed as (batch, token); 0 marks padding.
            pooling_strategy: 'mean', 'cls' (first token) or 'max'.

        Returns:
            Tensor indexed as (batch, feature). Padding positions never
            contribute to 'mean' or 'max'.

        Raises:
            ValueError: If the strategy is unknown.
        """
        if pooling_strategy == "cls":
            return hidden[:, 0, :]

        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        if pooling_strategy == "mean":
            # clamp avoids a division by zero for an all-padding row.
            return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        if pooling_strategy == "max":
            return hidden.masked_fill(mask == 0, float("-inf")).max(dim=1).values
        raise ValueError(f"Estrategia de pooling no soportada: {pooling_strategy}")

    def _embed_with_transformer(self, tokenizer, model, texts,
                                pooling_strategy: str = "mean",
                                tokenizer_args: Optional[dict] = None,
                                batch_size: int = 32) -> np.ndarray:
        """Encode *texts* in batches with a Hugging Face tokenizer/model pair.

        Args:
            tokenizer: Loaded tokenizer.
            model: Loaded encoder exposing ``last_hidden_state``.
            texts: Documents as strings or token lists.
            pooling_strategy: See ``_pool_hidden_states``.
            tokenizer_args: Call-time tokenizer options overriding the defaults.
            batch_size: Number of documents per forward pass.

        Returns:
            Array with one pooled embedding per document.

        Raises:
            ValueError: On an unknown pooling strategy or batch_size < 1.
        """
        import torch

        # Fail before running the model rather than after the first batch.
        if pooling_strategy not in self._POOLING_STRATEGIES:
            raise ValueError(f"Estrategia de pooling no soportada: {pooling_strategy}")
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Never truncate to more tokens than the tokenizer declares as its
        # limit; some checkpoints accept fewer than 512 positions.
        model_limit = getattr(tokenizer, "model_max_length", None) or 512
        call_args = {
            'return_tensors': 'pt',
            'padding': True,
            'truncation': True,
            'max_length': min(512, model_limit),
            'add_special_tokens': True
        }
        call_args.update(tokenizer_args or {})

        docs = self._as_strings(texts)
        if not docs:
            return np.empty((0, model.config.hidden_size), dtype=np.float32)

        pooled_batches = []
        for start in range(0, len(docs), batch_size):
            inputs = tokenizer(docs[start:start + batch_size], **call_args)
            with torch.no_grad():
                outputs = model(**inputs)
            pooled = self._pool_hidden_states(
                outputs.last_hidden_state, inputs['attention_mask'], pooling_strategy
            )
            pooled_batches.append(pooled.cpu().numpy())

        return np.concatenate(pooled_batches, axis=0)

    def _tfidf_vectorize(self, texts: Union[List[str], List[List[str]]],
                         max_features: int = 5000,
                         ngram_range: tuple = (1, 1)) -> np.ndarray:
        """Fit a TF-IDF model on *texts* and return the dense matrix.

        The fitted vectorizer is kept on ``self.tfidf_vectorizer`` so the same
        vocabulary can be applied to other texts later.
        """
        docs = self._as_strings(texts)

        vectorizer = TfidfVectorizer(max_features=max_features,
                                    ngram_range=ngram_range)
        matrix = vectorizer.fit_transform(docs).toarray()
        self.tfidf_vectorizer = vectorizer
        return matrix

    def _spacy_vectorize(self, tokens_list: List[List[str]],
                        reduce_dim: bool = True,
                        n_components: int = 300) -> np.ndarray:
        """Average the spaCy token vectors of each document.

        Args:
            tokens_list: Documents as token lists (plain strings also work).
            reduce_dim: Apply TruncatedSVD when the vector width exceeds
                *n_components*.
            n_components: Target width for the optional reduction.

        Returns:
            Array with one row per document; a document that yields no spaCy
            tokens gets a zero vector.
        """
        docs = self._as_strings(tokens_list)
        width = self.nlp.vocab.vectors_length

        if not docs:
            return np.empty((0, width))

        vectors = []
        for doc in self.nlp.pipe(docs):
            if len(doc) > 0:
                vectors.append(np.mean([token.vector for token in doc], axis=0))
            else:
                vectors.append(np.zeros(width))
        vectors = np.array(vectors)

        if reduce_dim and vectors.shape[1] > n_components:
            svd = TruncatedSVD(n_components=n_components)
            vectors = svd.fit_transform(vectors)

        return vectors

    def _bert_vectorize(self, texts: Union[List[str], List[List[str]]],
                        model_name: str = "dccuchile/bert-base-spanish-wwm-cased",
                        **kwargs) -> np.ndarray:
        """Embed *texts* with a BERT-style model.

        Args:
            texts: Documents as strings or token lists.
            model_name: Hugging Face model identifier.
            **kwargs: Optional ``tokenizer_args`` (passed both when loading and
                when calling the tokenizer), ``model_args`` (passed when
                loading the model), ``pooling_strategy`` (default 'mean') and
                ``batch_size`` (default 32).

        Returns:
            Array with one embedding per document.

        Raises:
            ImportError: If transformers/torch are not installed.
        """
        try:
            from transformers import AutoTokenizer, AutoModel
            import torch  # noqa: F401 - availability check only
        except ImportError as err:
            raise ImportError("Para usar BERT, instala transformers: pip install transformers") from err

        # (Re)load when nothing is cached or a different model is requested.
        if (self.bert_tokenizer is None or
            self.bert_model is None or
            self.current_bert_model != model_name):

            print(f"⏳ Cargando modelo {model_name}...")
            self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name, **kwargs.get('tokenizer_args', {}))
            self.bert_model = AutoModel.from_pretrained(model_name, **kwargs.get('model_args', {}))
            self.current_bert_model = model_name

        return self._embed_with_transformer(
            self.bert_tokenizer,
            self.bert_model,
            texts,
            pooling_strategy=kwargs.get('pooling_strategy', 'mean'),
            tokenizer_args=kwargs.get('tokenizer_args'),
            batch_size=kwargs.get('batch_size', 32),
        )

    def _roberta_vectorize(self, texts: Union[List[str], List[List[str]]],
                          model_name: str = "roberta-base",
                          pooling_strategy: str = "mean",
                          **kwargs) -> np.ndarray:
        """Embed *texts* with a RoBERTa model.

        Args:
            texts: Documents as strings or token lists.
            model_name: Hugging Face model identifier (default: roberta-base).
            pooling_strategy: 'mean', 'cls' or 'max'.
            **kwargs: Optional ``tokenizer_args``, ``model_args`` and
                ``batch_size`` (default 32).

        Returns:
            Array with one embedding per document.

        Raises:
            ImportError: If transformers/torch are not installed.
            ValueError: If *pooling_strategy* is unknown.
        """
        try:
            from transformers import RobertaTokenizer, RobertaModel
            import torch  # noqa: F401 - availability check only
        except ImportError as err:
            raise ImportError("Para usar RoBERTa, instala transformers: pip install transformers torch") from err

        # (Re)load when nothing is cached or a different model is requested.
        if (self.roberta_tokenizer is None or
            self.roberta_model is None or
            self.current_roberta_model != model_name):

            print(f"⏳ Cargando RoBERTa: {model_name}...")
            self.roberta_tokenizer = RobertaTokenizer.from_pretrained(
                model_name,
                **kwargs.get('tokenizer_args', {})
            )
            self.roberta_model = RobertaModel.from_pretrained(
                model_name,
                **kwargs.get('model_args', {})
            )
            self.current_roberta_model = model_name
            print(f"✅ RoBERTa cargado exitosamente")

        return self._embed_with_transformer(
            self.roberta_tokenizer,
            self.roberta_model,
            texts,
            pooling_strategy=pooling_strategy,
            tokenizer_args=kwargs.get('tokenizer_args'),
            batch_size=kwargs.get('batch_size', 32),
        )

    def _fasttext_vectorize(self, texts: Union[List[str], List[List[str]]],
                           model_path: Optional[str] = None,
                           model_name: str = "cc.es.300.bin",
                           download_if_missing: bool = True,
                           **kwargs) -> np.ndarray:
        """Embed *texts* with FastText sentence vectors.

        Args:
            texts: Documents as strings or token lists.
            model_path: Path to a local FastText model; takes precedence over
                *model_name*.
            model_name: File name of a pretrained model, e.g. 'cc.es.300.bin'.
            download_if_missing: Download the pretrained model for the language
                code embedded in *model_name* when that file is not present.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            Array with one sentence vector per document.

        Raises:
            ImportError: If fasttext is not installed.
        """
        try:
            import fasttext
            import fasttext.util
        except ImportError as err:
            raise ImportError("Para usar FastText, instala fasttext: pip install fasttext") from err
        import os

        # (Re)load when nothing is cached or a different model is requested.
        if (self.fasttext_model is None or
            self.current_fasttext_model != (model_path or model_name)):

            if model_path:
                print(f"⏳ Cargando modelo FastText desde: {model_path}")
                self.fasttext_model = fasttext.load_model(model_path)
                self.current_fasttext_model = model_path
            else:
                print(f"⏳ Cargando modelo FastText: {model_name}")
                model_file = model_name
                name_parts = model_name.split('.')
                if (download_if_missing and not os.path.isfile(model_name)
                        and len(name_parts) > 1):
                    try:
                        # download_model returns the name of the file it
                        # wrote; load that file rather than guessing a name.
                        model_file = fasttext.util.download_model(
                            name_parts[1], if_exists='ignore'
                        )
                    except Exception:
                        # Best effort (unknown language code, network error):
                        # fall back to loading model_name as given.
                        model_file = model_name
                self.fasttext_model = fasttext.load_model(model_file)
                self.current_fasttext_model = model_name

        docs = self._as_strings(texts)
        if not docs:
            return np.empty((0, self.fasttext_model.get_dimension()))

        # get_sentence_vector rejects text containing newlines, so flatten them.
        return np.array([
            self.fasttext_model.get_sentence_vector(doc.replace("\n", " "))
            for doc in docs
        ])

    def _robertuito_vectorize(self, texts: Union[List[str], List[List[str]]],
                             model_name: str = "pysentimiento/robertuito-base-uncased",
                             **kwargs) -> np.ndarray:
        """Embed *texts* with RoBERTuito.

        Args:
            texts: Documents as strings or token lists.
            model_name: Hugging Face model identifier.
            **kwargs: Optional ``tokenizer_args``, ``model_args``,
                ``pooling_strategy`` (default 'mean') and ``batch_size``
                (default 32).

        Returns:
            Array with one embedding per document.

        Raises:
            ImportError: If transformers/torch are not installed.
        """
        try:
            from transformers import AutoTokenizer, AutoModel
            import torch  # noqa: F401 - availability check only
        except ImportError as err:
            raise ImportError("Para usar RoBERTuito, instala transformers: pip install transformers") from err

        # (Re)load when nothing is cached or a different model is requested.
        if (self.robertuito_tokenizer is None or
            self.robertuito_model is None or
            self.current_robertuito_model != model_name):

            print(f"⏳ Cargando RoBERTuito: {model_name}")
            self.robertuito_tokenizer = AutoTokenizer.from_pretrained(
                model_name,
                **kwargs.get('tokenizer_args', {})
            )
            self.robertuito_model = AutoModel.from_pretrained(
                model_name,
                **kwargs.get('model_args', {})
            )
            self.current_robertuito_model = model_name

        return self._embed_with_transformer(
            self.robertuito_tokenizer,
            self.robertuito_model,
            texts,
            pooling_strategy=kwargs.get('pooling_strategy', 'mean'),
            tokenizer_args=kwargs.get('tokenizer_args'),
            batch_size=kwargs.get('batch_size', 32),
        )


def preprocess_dataframe(
    df: pd.DataFrame,
    text_col: str = "text",
    preprocessor: Optional["TextPreprocessor"] = None,
    lang: str = "es",
    normalize: bool = True,
    tokenize: bool = True,
    remove_stopwords: bool = True,
    lemmatize: bool = True,
    advanced_tokenize: bool = True,
    vectorize: bool = False,
    vector_method: str = "tfidf",
    save_vectors: bool = False,
    output_dir: str = "/content/drive/Shareddrives/Titulo 2/Dataset",
    output_filename: str = "vectorized_results",
    **vector_kwargs
) -> pd.DataFrame:
    """Preprocess a whole DataFrame and optionally vectorize and persist it.

    Args:
        df: Input DataFrame; it is copied, never modified.
        text_col: Column holding the raw text.
        preprocessor: Object providing ``preprocess_text`` and ``vectorize``;
            a ``TextPreprocessor(lang=lang)`` is created when omitted.
        lang: Language used when the preprocessor is created here.
        normalize: Apply text normalization.
        tokenize: Add a ``tokens`` column.
        remove_stopwords: Drop stop words from the tokens.
        lemmatize: Lemmatize the tokens.
        advanced_tokenize: Use the TweetTokenizer.
        vectorize: Append one column per vector component.
        vector_method: 'tfidf', 'spacy', 'bert', 'roberta', 'fasttext' or
            'robertuito'.
        save_vectors: Also write ``<output_filename>_vectors.npy`` and
            ``<output_filename>_metadata.json`` into *output_dir*.
        output_dir: Directory for the saved files (created if missing).
        output_filename: Base name for the saved files.
        **vector_kwargs: Forwarded to ``preprocessor.vectorize``.

    Returns:
        A copy of *df* with ``cleaned_text`` and, depending on the flags,
        ``tokens`` and ``<vector_method>_vec_<i>`` columns. The original index
        is preserved.

    Raises:
        ValueError: If a token-based vector method is requested with
            ``tokenize=False``, or the vectorizer returns a different number
            of rows than the DataFrame has.
    """
    import json
    import os

    if preprocessor is None:
        preprocessor = TextPreprocessor(lang=lang)

    df_processed = df.copy()

    print("⏳ Aplicando preprocesamiento básico...")
    # Missing texts become "" instead of the literal word "nan" that str(NaN)
    # would otherwise feed into the pipeline.
    raw_texts = df_processed[text_col].fillna("").astype(str)
    df_processed['cleaned_text'] = raw_texts.apply(
        lambda x: preprocessor.preprocess_text(
            x,
            normalize=normalize,
            tokenize=False,
            remove_stopwords=False,
            lemmatize=False,
            advanced_tokenize=advanced_tokenize
        )
    )

    if tokenize:
        print("⏳ Tokenizando textos...")
        # cleaned_text is already normalized, so normalization is skipped here.
        df_processed['tokens'] = df_processed['cleaned_text'].apply(
            lambda x: preprocessor.preprocess_text(
                x,
                normalize=False,
                tokenize=True,
                remove_stopwords=remove_stopwords,
                lemmatize=lemmatize,
                advanced_tokenize=advanced_tokenize
            )
        )

    if not vectorize:
        return df_processed

    print(f"⏳ Vectorizando con método: {vector_method}")

    if vector_method in ['fasttext', 'robertuito', 'roberta']:
        # These methods receive the cleaned text rather than the tokens.
        vectors = preprocessor.vectorize(
            df_processed['cleaned_text'].tolist(),
            method=vector_method,
            **vector_kwargs
        )
    else:
        # The remaining methods work on the token lists.
        if not tokenize:
            raise ValueError("La vectorización requiere tokenización previa para métodos que no sean FastText, RoBERTuito o RoBERTa")
        vectors = preprocessor.vectorize(
            df_processed['tokens'].tolist(),
            method=vector_method,
            **vector_kwargs
        )

    print(f"✅ Vectorización completada. Shape: {vectors.shape}")

    if vectors.shape[0] != len(df_processed):
        raise ValueError(
            f"Vectorizer returned {vectors.shape[0]} rows for "
            f"{len(df_processed)} documents"
        )

    # Reuse the DataFrame's own index: with a default RangeIndex here, concat
    # would align on labels and produce NaN-padded extra rows whenever the
    # input index is not 0..n-1 (e.g. after filtering rows).
    vector_cols = [f"{vector_method}_vec_{i}" for i in range(vectors.shape[1])]
    df_vectors = pd.DataFrame(vectors, columns=vector_cols, index=df_processed.index)
    df_processed = pd.concat([df_processed, df_vectors], axis=1)

    if save_vectors:
        os.makedirs(output_dir, exist_ok=True)

        np.save(os.path.join(output_dir, f"{output_filename}_vectors.npy"), vectors)

        vector_metadata = {
            'vector_method': vector_method,
            'vector_shape': list(vectors.shape),
            'columns': vector_cols,
            'parameters': vector_kwargs
        }

        metadata_path = os.path.join(output_dir, f"{output_filename}_metadata.json")
        with open(metadata_path, 'w', encoding='utf-8') as f:
            # default=str is deliberate: vector_kwargs is free-form and may
            # hold values json cannot encode; metadata is informational only.
            json.dump(vector_metadata, f, indent=2, ensure_ascii=False, default=str)

        print(f"✅ Vectores guardados en: {os.path.join(output_dir, output_filename)}_[vectors.npy|metadata.json]")

    return df_processed

In [None]:
# End-to-end usage example with RoBERTa-base
print("🚀 Iniciando preprocesamiento con RoBERTa-base...")

# Build the preprocessor once so loaded models are reused
preprocessor = TextPreprocessor(lang="es")

# Full call with every preprocessing option and the RoBERTa-specific parameters
df_roberta = preprocess_dataframe(
    df=df_limpio,
    text_col="Texto",  # adjust to the actual name of the text column
    preprocessor=preprocessor,
    lang="es",
    normalize=True,
    tokenize=True,
    remove_stopwords=True,
    lemmatize=True,
    advanced_tokenize=True,
    vectorize=True,
    vector_method="roberta",
    save_vectors=True,
    output_dir="/content/drive/Shareddrives/Titulo 2/Dataset/Vectorizaciones",
    output_filename="roberta_base_results",
    # RoBERTa-specific parameters
    model_name="roberta-base",
    pooling_strategy="mean",  # options: 'mean', 'cls', 'max'
    batch_size=16,  # tune to the available memory
    tokenizer_args={
        'max_length': 512,
        'padding': True,
        'truncation': True
    },
    model_args={}  # extra arguments for loading the model, if needed
)

print("✅ Preprocesamiento completado!")
print(f"📊 Shape del DataFrame resultante: {df_roberta.shape}")
print(f"📋 Columnas disponibles: {list(df_roberta.columns)}")

# Report how many vector components were appended
roberta_cols = [col for col in df_roberta.columns if col.startswith('roberta_vec_')]
print(f"🔢 Número de dimensiones del vector RoBERTa: {len(roberta_cols)}")

# Persist the processed DataFrame
output_path = "/content/drive/Shareddrives/Titulo 2/Dataset/df_roberta_processed.csv"
df_roberta.to_csv(output_path, index=False)
print(f"💾 DataFrame guardado en: {output_path}")

🚀 Iniciando preprocesamiento con RoBERTa-base...
⏳ Aplicando preprocesamiento básico...
⏳ Tokenizando textos...
⏳ Vectorizando con método: roberta
⏳ Cargando RoBERTa: roberta-base...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ RoBERTa cargado exitosamente
✅ Vectorización completada. Shape: (2075, 768)
✅ Vectores guardados en: /content/drive/Shareddrives/Titulo 2/Dataset/Vectorizaciones/roberta_base_results_[vectors.npy|metadata.json]
✅ Preprocesamiento completado!
📊 Shape del DataFrame resultante: (2075, 777)
📋 Columnas disponibles: ['ID', 'Categoría', 'Título', 'Tópico', 'Fuente', 'Texto', 'Sub_categoría_tópico', 'cleaned_text', 'tokens', 'roberta_vec_0', 'roberta_vec_1', 'roberta_vec_2', 'roberta_vec_3', 'roberta_vec_4', 'roberta_vec_5', 'roberta_vec_6', 'roberta_vec_7', 'roberta_vec_8', 'roberta_vec_9', 'roberta_vec_10', 'roberta_vec_11', 'roberta_vec_12', 'roberta_vec_13', 'roberta_vec_14', 'roberta_vec_15', 'roberta_vec_16', 'roberta_vec_17', 'roberta_vec_18', 'roberta_vec_19', 'roberta_vec_20', 'roberta_vec_21', 'roberta_vec_22', 'roberta_vec_23', 'roberta_vec_24', 'roberta_vec_25', 'roberta_vec_26', 'roberta_vec_27', 'roberta_vec_28', 'roberta_vec_29', 'roberta_vec_30', 'roberta_

In [None]:
import pandas as pd
import json
from collections import defaultdict

def extract_and_save_labels(df_path: str,
                           columns: "list | None" = None,
                           output_path: str = "/content/drive/Shareddrives/Titulo 2/Dataset/labels.json") -> dict:
    """Collect the sorted unique labels of selected CSV columns and save them as JSON.

    Args:
        df_path: Path of the processed CSV file.
        columns: Columns to extract labels from; defaults to
            ``["Categoría", "Fuente", "Tópico"]``.
        output_path: Destination JSON file; its directory is created if needed.

    Returns:
        Mapping of column name to the sorted list of its non-null unique values
        (the same content that is written to *output_path*).

    Raises:
        ValueError: If a requested column is missing from the CSV.
    """
    import os

    # A None default avoids sharing one mutable list between calls.
    if columns is None:
        columns = ["Categoría", "Fuente", "Tópico"]
    # Drop duplicates while keeping the caller's order.
    columns = list(dict.fromkeys(columns))

    # Validate against the header only, before reading any data.
    available = pd.read_csv(df_path, nrows=0).columns
    for col in columns:
        if col not in available:
            raise ValueError(f"Columna '{col}' no encontrada en el DataFrame")

    # Load just the requested columns; the processed CSV also carries
    # hundreds of embedding columns that are irrelevant here.
    df = pd.read_csv(df_path, usecols=columns)

    labels_dict = {
        col: sorted(df[col].dropna().unique().tolist())
        for col in columns
    }

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(labels_dict, f, ensure_ascii=False, indent=2)

    print(f"✅ Etiquetas guardadas en: {output_path}")
    return labels_dict

# Usage example; run AFTER df_roberta_processed.csv has been generated
extract_and_save_labels(
    df_path="/content/drive/Shareddrives/Titulo 2/Dataset/df_roberta_processed.csv",
    columns=["Categoría", "Fuente", "Tópico"],
    output_path="/content/drive/Shareddrives/Titulo 2/Dataset/unique_labels.json"
)

✅ Etiquetas guardadas en: /content/drive/Shareddrives/Titulo 2/Dataset/unique_labels.json
