In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the dataset from a CSV file (path is relative to the working directory)
df_complete = pd.read_csv('complete_without_outliers.csv')

In [3]:
# Basic text-cleaning helper
def limpieza_basica(text):
    """Normalize a raw text value.

    For a string: every run of whitespace (spaces, tabs, line breaks) is
    collapsed into a single space, leading/trailing whitespace is removed,
    and the result is lower-cased.

    Returns:
        The cleaned string, or ``None`` when ``text`` is not a ``str``.
    """
    # Guard clause: anything that is not a string cannot be cleaned.
    if not isinstance(text, str):
        return None

    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip().lower()

In [5]:
# Clean the raw 'text' column and store the result in a new 'text_cleaned' column
# (limpieza_basica returns None for non-string entries)
df_complete['text_cleaned'] = df_complete['text'].apply(limpieza_basica)

In [6]:
# Draw a 2000-row sample stratified on the 'class' column, with a fixed seed;
# the remaining rows of the split are discarded.
df_sample, _ = train_test_split(df_complete, train_size=2000, stratify=df_complete['class'], random_state=42)

In [7]:
# Definir las etiquetas y los textos
#texts = df_sample['text_cleaned'].tolist()
#labels = df_sample['class'].tolist() 

In [12]:
# Cleaned texts and class labels of the sampled rows (kept as pandas Series, not lists)
texts = df_sample['text_cleaned']
labels = df_sample['class']

In [15]:
# Split the data into train and test sets (20% test, fixed seed; no stratify argument is passed)
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [16]:
# Load the embedding model ('all-MiniLM-L6-v2' checkpoint)
model = SentenceTransformer('all-MiniLM-L6-v2')

In [17]:
# Embedding helper
def get_embeddings(text):
    """Encode ``text`` with the notebook-level ``model``.

    Falsy input (``None`` or an empty string) is never passed to the model;
    ``None`` is returned for it instead.

    Returns:
        Whatever ``model.encode(text)`` returns, or ``None`` for falsy input.
    """
    # Guard clause: nothing to encode.
    if not text:
        return None
    return model.encode(text)

In [19]:
# Apply get_embeddings to each cleaned text (one call per row) and store the
# results in a new 'Embeddings' column of the sample
df_sample['Embeddings'] = df_sample['text_cleaned'].apply(get_embeddings)

In [22]:
# Show the embeddings
print(df_sample[['text_cleaned', 'Embeddings']].head())  # Prints the first rows together with their embeddings

                                             text_cleaned  \
294978  hpe service activator is a service provisionin...   
143108  avalon or group avalon was a band consisting o...   
141836  due south is a canadian crime comedy drama tel...   
189217  gretchen jones is an american fashion designer...   
34583   silverstone's poison frog (ameerega silverston...   

                                               Embeddings  
294978  [-0.10829187, -0.08002075, -0.036951497, -0.11...  
143108  [0.04930241, -0.055457503, -0.07331795, -0.076...  
141836  [-0.06276055, -0.088093266, 0.018262984, -0.04...  
189217  [-0.10008263, 0.0064794947, -0.09542956, 0.010...  
34583   [0.020429071, -0.006813759, 0.057907984, -0.00...  


In [23]:
# Basic text-cleaning helper.
# NOTE: this re-declares the helper already defined earlier in the notebook,
# with the same behavior.
def limpieza_basica(text):
    """Return a whitespace-normalized, lower-cased copy of ``text``.

    Runs of whitespace (spaces, tabs, line breaks) become one space and the
    ends are trimmed before lower-casing. Non-string input gives ``None``.
    """
    if not isinstance(text, str):
        return None

    single_spaced = re.sub(r'\s+', ' ', text)
    trimmed = single_spaced.strip()
    return trimmed.lower()

In [24]:
# Apply the cleaning to the sample's raw 'text' column
# (overwrites df_sample['text_cleaned'])
df_sample['text_cleaned'] = df_sample['text'].apply(limpieza_basica)

In [27]:
# Define the texts and the labels as plain Python lists.
# NOTE(review): these are taken from the full df_complete, not from df_sample —
# confirm that is intended.
texts = df_complete['text_cleaned'].tolist()
labels = df_complete['class'].tolist()  

In [28]:
# Split the data into train and test sets (20% test, fixed seed; no stratify argument is passed)
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [29]:
# Sanity check: embed a single example sentence.
# `SentenceTransformer` is imported in the first cell and `model` is already
# loaded by the earlier model-loading cell with the same checkpoint, so neither
# the import nor the load is repeated here (a second load only costs time and
# memory).

# Text to turn into an embedding
texto = "Este es un ejemplo de cómo generar embeddings de un texto."

# Generate the embedding
embeddings = model.encode(texto)

# Show the embedding vector and its dimensionality
print(embeddings)
print(f"Dimensiones del embedding: {len(embeddings)}")

[-2.65024435e-02  1.71470940e-02  1.31286951e-02 -1.56637526e-03
 -2.67034061e-02  8.12192634e-02 -1.14679560e-02  3.97442747e-03
  4.83105108e-02 -4.21296358e-02  4.52797431e-05  1.93351414e-02
  3.34629826e-02 -6.32607902e-04 -2.36528311e-02  3.27744335e-02
  3.59263271e-02  7.00960308e-02  3.94431390e-02 -3.98232695e-03
  9.74113420e-02  2.16303207e-02 -2.28917273e-03  4.48175631e-02
  1.64768528e-02  6.07070001e-03 -1.80506408e-02  7.08187968e-02
 -6.11066411e-04 -7.35926926e-02  8.22000380e-04  6.28694752e-03
  3.42395455e-02  1.78526733e-02 -1.89285204e-02  4.10691276e-02
 -1.85888279e-02  3.78940045e-03 -5.54763749e-02  4.23105583e-02
 -5.39671332e-02  2.02679839e-02 -5.14474809e-02 -2.54189372e-02
  6.95178658e-02 -8.17556009e-02 -6.29748255e-02  3.81914563e-02
 -2.72701476e-02 -2.60062832e-02 -1.06740572e-01 -2.95120236e-02
 -7.26294667e-02  1.90756060e-02  3.29637853e-03 -9.23274737e-03
 -2.26643737e-02 -3.61107476e-02  3.14595476e-02 -5.97650930e-03
 -1.59684904e-02 -2.40589

In [None]:
# Apply get_embeddings to the cleaned-text column and store the embeddings in a single column.
# NOTE(review): this is the same statement as the earlier 'Embeddings' cell, so
# the column is recomputed — confirm the repeat is still needed.
df_sample['Embeddings'] = df_sample['text_cleaned'].apply(get_embeddings)

In [None]:
# Save the DataFrame that actually holds the 'Embeddings' column.
# The embeddings are only ever assigned to df_sample (never to df_complete),
# so exporting df_complete would write a file with no embeddings in it.
# NOTE(review): CSV stores each embedding array as text; presumably a binary
# format is preferable if the vectors must be reloaded as arrays — confirm.
df_sample.to_csv('complete_with_embeddings.csv', index=False)