# INSTALACIÓN E IMPORTACIÓN DE BIBLIOTECAS

In [18]:
!pip install transformers sentence_transformers

import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np




# DEFINICIÓN DE FUNCIONES

In [19]:
# Cache of already-loaded SentenceTransformer models, keyed by model name.
# Loading a model is expensive, and texto_transformer is called once per
# DataFrame cell, so the model must be built only once per kernel session.
_MODELOS_CARGADOS = {}


def _obtener_modelo(nombre_modelo):
    """Return the SentenceTransformer for `nombre_modelo`, loading it on first use."""
    if nombre_modelo not in _MODELOS_CARGADOS:
        _MODELOS_CARGADOS[nombre_modelo] = SentenceTransformer(nombre_modelo)
    return _MODELOS_CARGADOS[nombre_modelo]


def texto_transformer(texto_columna, nombre_modelo='distiluse-base-multilingual-cased-v1'):
    """
    Generate an embedding for a text (or list of texts) with Sentence Transformers.

    Missing values are passed through untouched so the function can be applied
    directly to DataFrame columns that contain gaps.

    Parameters:
    texto_columna (str or list): Text to encode. A list of strings is joined
        with single spaces into one string before encoding, so it yields a
        single embedding, not one per element.
    nombre_modelo (str): Name of the Sentence Transformers model to use.
        Defaults to the multilingual model this notebook has always used.

    Returns:
    The value produced by `model.encode` for the text, or the input itself
    (None / NaN) when the input is missing.
    """
    # Missing value (None or float NaN): return it unchanged
    if texto_columna is None or (isinstance(texto_columna, float) and np.isnan(texto_columna)):
        return texto_columna

    # Reuse the cached model instead of re-instantiating it on every call
    model = _obtener_modelo(nombre_modelo)

    # A list of comments is collapsed into one space-separated string
    if isinstance(texto_columna, list):
        texto_columna = " ".join(texto_columna)

    # Generate the embedding
    return model.encode(texto_columna)

def normalizacion_minmax(embedding):
    """
    Rescale an array with Min-Max scaling so its values lie in [-1, 1].

    The minimum of the array maps to -1 and the maximum to 1. Missing values
    (None or float NaN) are returned unchanged.

    Parameters:
    embedding (array-like, None or NaN): Embedding to normalize.

    Returns:
    numpy.ndarray, None or NaN: The rescaled embedding, or the input itself
    when it is missing. An empty input is returned as an empty array. A
    constant input (max == min) is returned as an array of zeros, the
    midpoint of [-1, 1], instead of dividing by zero.
    """
    # Missing value (None or float NaN): return it unchanged
    if embedding is None or (isinstance(embedding, float) and np.isnan(embedding)):
        return embedding

    # Accept lists/tuples as well as arrays; an ndarray input keeps its dtype
    valores = np.asarray(embedding)

    # np.min / np.max raise on empty input; there is nothing to scale anyway
    if valores.size == 0:
        return valores

    min_val = np.min(valores)
    rango = np.max(valores) - min_val

    # All elements equal: the scaling formula would compute 0 / 0 and yield NaN
    if rango == 0:
        dtype = valores.dtype if np.issubdtype(valores.dtype, np.floating) else np.float64
        return np.zeros(valores.shape, dtype=dtype)

    return 2 * (valores - min_val) / rango - 1



# TRANSFORMACIÓN DE TEXTOS DE LOS ARTÍCULOS A EMBEDDINGS

In [23]:
# URL of the CSV file in its RAW version
url = "https://raw.githubusercontent.com/Fran251184/pasantia_TUIA_CIM_Francisco_J._Alomar/main/dataset_medios_narcoterrorismo_rosario.csv"

# Download the CSV once; `df` keeps the original texts
df = pd.read_csv(url)

# Independent copy that will receive the normalized embeddings. Copying the
# frame instead of downloading the file a second time avoids a redundant
# network request and guarantees both frames come from the same snapshot.
df_embedding_normalizado = df.copy()


In [25]:
# Text columns that are replaced by their normalized embeddings
columnas_texto = ['titulo', 'bajada', 'cuerpo_texto', 'comentarios_fb']


def _a_embedding_normalizado(valor):
    """Embed and min-max normalize a string cell; any non-string cell becomes NaN."""
    if not isinstance(valor, str):
        return np.nan
    return normalizacion_minmax(texto_transformer(valor))


# Read each column from the untouched `df` and write the result into the copy
for columna in columnas_texto:
    df_embedding_normalizado[columna] = df[columna].apply(_a_embedding_normalizado)

In [26]:

# Save the DataFrame in Parquet format (index=False: the row index is not written)
df_embedding_normalizado.to_parquet("dataset_embedding_normalizado.parquet", index=False)


In [28]:
# Read the Parquet file (same path the notebook wrote earlier) into a separate DataFrame
medios_narcos_embedding = pd.read_parquet("dataset_embedding_normalizado.parquet")


In [29]:
# Display the last rows of the in-memory DataFrame holding the normalized embeddings
df_embedding_normalizado.tail()

Unnamed: 0,medio,url,fecha,titulo,bajada,cuerpo_texto,comentarios_fb
594,rosario3,https://www.rosario3.com/opinion/Milei-les-tir...,2024-03-29,"[0.2845863, -0.3128481, 0.094020605, -0.012359...","[-0.05510688, -0.24011815, 0.020127177, 0.0556...","[-0.123471975, -0.48411614, 0.24690461, -0.060...","[0.39145672, -0.012049794, 0.39878047, 0.22996..."
595,rosario3,https://www.rosario3.com/informaciongeneral/El...,2024-03-29,"[-0.25626737, -0.09766519, 0.90033793, -0.4887...","[-0.0784595, 0.07298291, 0.35658002, -0.302847...","[-0.5605185, -0.09077138, 0.7081815, -0.145975...","[0.20751071, 0.19920182, 0.39948702, -0.160336..."
596,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,"[0.18368638, -0.20616096, -0.3287453, -0.21701...","[-0.24655211, 0.1279614, 0.32438552, 0.0089941...","[-0.22667938, -0.3002618, -0.07419157, -0.1523...","[0.3225124, 0.14165509, 0.17284322, 0.08127212..."
597,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,"[-0.13323355, -0.08827621, 0.15845609, 0.09435...","[-0.57121766, -0.40314537, 0.38653028, -0.1354...","[-0.87978435, -0.25685358, 0.3622085, 0.140853...","[0.27397943, 0.1567049, 0.7693887, -0.05441421..."
598,rosario3,https://www.rosario3.com/informaciongeneral/Lo...,2024-03-28,"[0.058799148, -0.32900518, 0.11278796, -0.0619...","[-0.0045577884, 0.26709914, 0.4966669, -0.2255...","[-0.32882917, -0.05656886, 0.29903924, 0.00312...","[-0.2570384, -0.01782757, 0.8932595, 0.0035674..."


In [30]:
# Display the last rows of the DataFrame reloaded from the Parquet file
medios_narcos_embedding.tail()

Unnamed: 0,medio,url,fecha,titulo,bajada,cuerpo_texto,comentarios_fb
594,rosario3,https://www.rosario3.com/opinion/Milei-les-tir...,2024-03-29,"[0.2845863, -0.3128481, 0.094020605, -0.012359...","[-0.05510688, -0.24011815, 0.020127177, 0.0556...","[-0.123471975, -0.48411614, 0.24690461, -0.060...","[0.39145672, -0.012049794, 0.39878047, 0.22996..."
595,rosario3,https://www.rosario3.com/informaciongeneral/El...,2024-03-29,"[-0.25626737, -0.09766519, 0.90033793, -0.4887...","[-0.0784595, 0.07298291, 0.35658002, -0.302847...","[-0.5605185, -0.09077138, 0.7081815, -0.145975...","[0.20751071, 0.19920182, 0.39948702, -0.160336..."
596,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,"[0.18368638, -0.20616096, -0.3287453, -0.21701...","[-0.24655211, 0.1279614, 0.32438552, 0.0089941...","[-0.22667938, -0.3002618, -0.07419157, -0.1523...","[0.3225124, 0.14165509, 0.17284322, 0.08127212..."
597,rosario3,https://www.rosario3.com/informaciongeneral/Pu...,2024-03-29,"[-0.13323355, -0.08827621, 0.15845609, 0.09435...","[-0.57121766, -0.40314537, 0.38653028, -0.1354...","[-0.87978435, -0.25685358, 0.3622085, 0.140853...","[0.27397943, 0.1567049, 0.7693887, -0.05441421..."
598,rosario3,https://www.rosario3.com/informaciongeneral/Lo...,2024-03-28,"[0.058799148, -0.32900518, 0.11278796, -0.0619...","[-0.0045577884, 0.26709914, 0.4966669, -0.2255...","[-0.32882917, -0.05656886, 0.29903924, 0.00312...","[-0.2570384, -0.01782757, 0.8932595, 0.0035674..."
