In [12]:
import pandas as pd

## Parte 0: Carga del Corpus
Actividad
* Carga el corpus 20 Newsgroups desde sklearn.datasets.fetch_20newsgroups.
* Limita el corpus a los primeros 2000 documentos para facilitar el procesamiento.

In [13]:
from sklearn.datasets import fetch_20newsgroups

# Cargar el conjunto completo (train + test)
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
newsgroupsdocs = newsgroups.data

In [14]:
# Limitar a los primeros 2000 documentos
corpus = newsgroups.data[:2000]
labels = newsgroups.target[:2000]
categories = newsgroups.target_names

print("Documentos usados:", len(corpus))

Documentos usados: 2000


## Parte 2: Generación de Embeddings
Actividad
* Usa dos modelos de sentence-transformers. Puedes usar: 'all-MiniLM-L6-v2' (SBERT), o 'intfloat/e5-base' (E5). Cuando uses E5, antepon "passage: " a cada documento antes de codificar.
* Genera los vectores de embeddings para todos los documentos usando el modelo seleccionado.
* Guarda los embeddings en un array de NumPy para su posterior indexación.

In [15]:
!pip install -q sentence-transformers

In [16]:
#Generar embeddings con SBERT – all-MiniLM-L6-v2
from sentence_transformers import SentenceTransformer

# Modelo SBERT
model_sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Generar embeddings
emb_sbert = model_sbert.encode(corpus, show_progress_bar=True)

print("Shape SBERT:", emb_sbert.shape)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Shape SBERT: (2000, 384)


In [17]:
#Generar embeddings con E5 – intfloat/e5-base
from sentence_transformers import SentenceTransformer

# Modelo E5
model_e5 = SentenceTransformer('intfloat/e5-base')

# Agregar prefijo "passage: "
corpus_e5 = ["passage: " + doc for doc in corpus]

# Generar embeddings
emb_e5 = model_e5.encode(corpus_e5, show_progress_bar=True)

print("Shape E5:", emb_e5.shape)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

Shape E5: (2000, 768)


In [18]:
#Genera los vectores de embeddings para todos los documentos usando el modelo seleccionado. (SBERT o E5)
from sentence_transformers import SentenceTransformer

def generate_embeddings(corpus, model_name):

    # Cargar el modelo
    model = SentenceTransformer(model_name)

    # Si el modelo es E5 → agregar prefijo "passage: "
    if model_name.startswith("intfloat/e5"):
        corpus_to_encode = ["passage: " + doc for doc in corpus]
    else:
        corpus_to_encode = corpus

    # Codificar documentos
    embeddings = model.encode(corpus_to_encode, show_progress_bar=True)

    return embeddings


In [19]:
#Usar SBERT
embeddings_sbert = generate_embeddings(corpus, "all-MiniLM-L6-v2")
print("SBERT embeddings shape:", embeddings_sbert.shape)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

SBERT embeddings shape: (2000, 384)


In [20]:
#Usar E5
embeddings_e5 = generate_embeddings(corpus, "intfloat/e5-base")
print("E5 embeddings shape:", embeddings_e5.shape)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

E5 embeddings shape: (2000, 768)


In [21]:
#Convertir embeddings a un array numpy
import numpy as np

#SBERT
embeddings_np_sbert = np.array(embeddings_sbert)
print("Shape embeddings:", embeddings_np_sbert.shape)

#E5
embeddings_np_e5 = np.array(embeddings_sbert)
print("Shape embeddings:", embeddings_np_e5.shape)


Shape embeddings: (2000, 384)
Shape embeddings: (2000, 384)


In [22]:
#Guardar como archivo npy
np.save("embeddings_sbert.npy", embeddings_np_sbert)
np.save("embeddings_e5.npy", embeddings_np_e5)


## Parte 3: Consulta
Actividad
* Escribe una consulta en lenguaje natural. Ejemplos:
  "God, religion, and spirituality"
  "space exploration"
  "car maintenance"

* Codifica la consulta utilizando el mismo modelo de embeddings. Cuando uses E5, antepon "query: " a la consulta.

* Recupera los 5 documentos más relevantes con similitud coseno.

* Muestra los textos de los documentos recuperados (puedes mostrar solo los primeros 500 caracteres de cada uno).

In [23]:
#Ejemplos de consulta en lenguaje natural

#query
"Environmental consequences of climate change"
"Advances in artificial intelligence and machine learning"
"Health benefits of regular physical exercise"
"How social media affects mental health"
"Cybersecurity threats in corporate networks"

#Modelos
"all-MiniLM-L6-v2"
"intfloat/e5-base"


'intfloat/e5-base'

In [24]:
#Codificar una consulta
from sentence_transformers import SentenceTransformer

def encode_query(query, model_name):
    model = SentenceTransformer(model_name)

    # Si usas E5, anteponer "query: "
    if model_name.startswith("intfloat/e5"):
        query = "query: " + query

    embedding = model.encode([query])
    return embedding[0]


In [25]:
#Funcionalidad
query = "Advances in artificial intelligence and machine learning"
model_name = "all-MiniLM-L6-v2"

query_embedding = encode_query(query, model_name)

print("Shape del embedding de la consulta:", query_embedding.shape)


Shape del embedding de la consulta: (384,)


In [26]:
#Similitud Coseno
from sklearn.metrics.pairwise import cosine_similarity




In [27]:
# Asegurar forma correcta (1, dim)
query_vec = query_embedding.reshape(1, -1)

# Similitudes
similarities = cosine_similarity(query_vec, embeddings_np_sbert)[0]

In [28]:
# Obtener los índices de los top 5 documentos
top_k = 5
top_indices = np.argsort(similarities)[-top_k:][::-1]


In [29]:
print("Top 5 documentos más relevantes:\n")

for rank, idx in enumerate(top_indices, 1):
    print(f"Rank {rank} | Índice: {idx} | Similitud: {similarities[idx]:.4f}")
    print(corpus[idx][:300].replace("\n", " ") + "...")
    print("-" * 80)


Top 5 documentos más relevantes:

Rank 1 | Índice: 724 | Similitud: 0.2875
Australian Pattern Recognition Society                           2nd CALL FOR PAPERS                                 DICTA-93                            2nd Conference on -           DIGITAL IMAGING COMPUTING: TECHNIQUES AND APPLICATIONS   Location: Macquarie Theatre           Macquarie University  ...
--------------------------------------------------------------------------------
Rank 2 | Índice: 510 | Similitud: 0.2631
 I have a little answer:  See Foley, van Dam, Feiner, and Hughes, _Computer Graphics: Principles and Practice, Second Edition_.  [If people would *read* this book, 75 percent of the questions in this froup would disappear overnight...]  							spl...
--------------------------------------------------------------------------------
Rank 3 | Índice: 712 | Similitud: 0.2303
 Direction-finding and directional monitoring receivers. Can you say "little black bakery truck"?  :-)  David   ...
----------

In [30]:
#Top 5 documentos recuperados con sus 500 primeros caracteres
print("DOCUMENTOS RECUPERADOS (Top 5):\n")

for rank, idx in enumerate(top_indices, 1):
    print(f"Rank {rank} | Índice: {idx} | Similitud: {similarities[idx]:.4f}")
    print(corpus[idx][:500].replace("\n", " "))
    print("\n" + "-" * 120 + "\n")


DOCUMENTOS RECUPERADOS (Top 5):

Rank 1 | Índice: 724 | Similitud: 0.2875
Australian Pattern Recognition Society                           2nd CALL FOR PAPERS                                 DICTA-93                            2nd Conference on -           DIGITAL IMAGING COMPUTING: TECHNIQUES AND APPLICATIONS   Location: Macquarie Theatre           Macquarie University           Sydney  Date: 8-10 December 1993.      DICTA-93 is the second biennial national conference of the Australian Pattern Recognition Society.     This event will provide an opportunity for any pe

------------------------------------------------------------------------------------------------------------------------

Rank 2 | Índice: 510 | Similitud: 0.2631
 I have a little answer:  See Foley, van Dam, Feiner, and Hughes, _Computer Graphics: Principles and Practice, Second Edition_.  [If people would *read* this book, 75 percent of the questions in this froup would disappear overnight...]  							spl

------------

In [32]:
from google.colab import output
output.clear()