# INSTALACIÓN E IMPORTACIÓN DE BIBLIOTECAS

In [1]:
!pip install transformers sentence_transformers

import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence_transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence_transformers)
 

# DEFINICÓN DE DE FUNCIONES

In [2]:
def texto_transformer(texto_columna):
    """
    Genera un embedding para un texto o lista de textos usando Sentence Transformers.
    Si recibe un NaN, retorna NaN.

    Parámetros:
    texto_columna (str o list): Texto o lista de textos a codificar.

    Retorna:
    numpy.ndarray o NaN: El embedding generado o NaN si el input es NaN.
    """
    # Verificar si el input es NaN
    if texto_columna is None or (isinstance(texto_columna, float) and np.isnan(texto_columna)):
        return texto_columna

    # Cargar el modelo multilingüe
    model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

    # Si es una lista, unir los comentarios en un string
    if isinstance(texto_columna, list):
        texto_columna = " ".join(texto_columna)

    # Generar el embedding
    texto_embedding = model.encode(texto_columna)

    return texto_embedding

def normalizacion_minmax(embedding):
    """
    Normaliza un array utilizando Min-Max Scaling para que los valores queden en el rango [-1, 1].
    Si recibe un NaN, retorna NaN.

    Parámetros:
    embedding (numpy.ndarray o NaN): Embedding a normalizar.

    Retorna:
    numpy.ndarray o NaN: El embedding normalizado o NaN si el input es NaN.
    """
    # Verificar si el input es NaN
    if embedding is None or (isinstance(embedding, float) and np.isnan(embedding)):
        return embedding

    min_val = np.min(embedding)
    max_val = np.max(embedding)
    return 2 * (embedding - min_val) / (max_val - min_val) - 1



# TRANSFORMACIÓN DE TEXTOS DE LOS ARTÍCULOS A EMBEDDINGS

In [6]:
# URL del archivo CSV en su versión RAW
url = "https://raw.githubusercontent.com/Fran251184/pasantia_TUIA_CIM_Francisco_J._Alomar/main/dataset_medios_narcoterrorimo_rosario.csv"

# Cargar el CSV en las variables

df = pd.read_csv(url)

df_embedding_normalizado = pd.read_csv(url)


In [7]:
# Aplicar el proceso a las columnas indicadas
for columna in ['titulo', 'bajada', 'cuerpo_texto', 'comentarios_fb']:
    df_embedding_normalizado[columna] = df[columna].apply(
        lambda x: normalizacion_minmax(texto_transformer(x)) if isinstance(x, str) else np.nan
    )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/556 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/539M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/452 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense%2Fconfig.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

In [8]:

df_embedding_normalizado.to_csv('dataset_embedding_normalizado.csv', index=False)

In [11]:
medios_narcos_embedding = pd.read_csv('dataset_embedding_normalizado.csv')


In [12]:
df_embedding_normalizado.tail()

Unnamed: 0,medio,url,fecha,titulo,bajada,cuerpo_texto,comentarios_fb
594,rosario3,https://www.rosario3.com/opinion/Milei-les-tir...,2024-03-29,"[0.2845863, -0.3128481, 0.094020605, -0.012359...","[-0.05510688, -0.24011815, 0.020127177, 0.0556...","[-0.123471975, -0.48411614, 0.24690461, -0.060...","[0.39145672, -0.012049794, 0.39878047, 0.22996..."
595,rosario3,https://www.rosario3.com/informaciongeneral/El...,2024-03-29,"[-0.25626737, -0.09766519, 0.90033793, -0.4887...","[-0.0784595, 0.07298291, 0.35658002, -0.302847...","[-0.5605185, -0.09077138, 0.7081815, -0.145975...","[0.20751071, 0.19920182, 0.39948702, -0.160336..."
596,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,"[0.18368638, -0.20616096, -0.3287453, -0.21701...","[-0.24655211, 0.1279614, 0.32438552, 0.0089941...","[-0.22667938, -0.3002618, -0.07419157, -0.1523...","[0.3225124, 0.14165509, 0.17284322, 0.08127212..."
597,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,"[-0.13323355, -0.08827621, 0.15845609, 0.09435...","[-0.57121766, -0.40314537, 0.38653028, -0.1354...","[-0.87978435, -0.25685358, 0.3622085, 0.140853...","[0.27397943, 0.1567049, 0.7693887, -0.05441421..."
598,rosario3,https://www.rosario3.com/informaciongeneral/Lo...,2024-03-28,"[0.058799148, -0.32900518, 0.11278796, -0.0619...","[-0.0045577884, 0.26709914, 0.4966669, -0.2255...","[-0.32882917, -0.05656886, 0.29903924, 0.00312...","[-0.2570384, -0.01782757, 0.8932595, 0.0035674..."


In [13]:
medios_narcos_embedding.tail()

Unnamed: 0,medio,url,fecha,titulo,bajada,cuerpo_texto,comentarios_fb
594,rosario3,https://www.rosario3.com/opinion/Milei-les-tir...,2024-03-29,[ 0.2845863 -0.3128481 0.09402061 -0.012359...,[-5.51068783e-02 -2.40118146e-01 2.01271772e-...,[-1.23471975e-01 -4.84116137e-01 2.46904612e-...,[ 0.39145672 -0.01204979 0.39878047 0.229963...
595,rosario3,https://www.rosario3.com/informaciongeneral/El...,2024-03-29,[-2.56267369e-01 -9.76651907e-02 9.00337934e-...,[-0.0784595 0.07298291 0.35658002 -0.302847...,[-5.60518503e-01 -9.07713771e-02 7.08181500e-...,[ 2.07510710e-01 1.99201822e-01 3.99487019e-...
596,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,[ 1.83686376e-01 -2.06160963e-01 -3.28745306e-...,[-2.46552110e-01 1.27961397e-01 3.24385524e-...,[-2.26679385e-01 -3.00261796e-01 -7.41915703e-...,[ 0.3225124 0.14165509 0.17284322 0.081272...
597,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,[-1.33233547e-01 -8.82762074e-02 1.58456087e-...,[-5.71217656e-01 -4.03145373e-01 3.86530280e-...,[-0.87978435 -0.25685358 0.3622085 0.140853...,[ 0.27397943 0.1567049 0.7693887 -0.054414...
598,rosario3,https://www.rosario3.com/informaciongeneral/Lo...,2024-03-28,[ 5.87991476e-02 -3.29005182e-01 1.12787962e-...,[-4.55778837e-03 2.67099142e-01 4.96666908e-...,[-3.28829169e-01 -5.65688610e-02 2.99039245e-...,[-0.2570384 -0.01782757 0.8932595 0.003567...
