# Darlin Joel Anacicha Sanchez GR1CC
# Ejercicio 4: Modelo Probabilístico

## Objetivo de la práctica
- Comprender los componentes del modelo vectorial mediante cálculos manuales y observación directa.
- Aplicar el modelo de espacio vectorial con TF-IDF para recuperar documentos relevantes.
- Comparar la recuperación con BM25 frente a TF-IDF.
- Analizar visualmente las diferencias entre los modelos.
- Evaluar si los rankings generados son consistentes con lo que considerarías documentos relevantes.

## Parte 0: Carga del Corpus

In [11]:
from sklearn.datasets import load_files
import pandas as pd

# Cargar los datos
train_data = load_files('/kaggle/input/dataaset/20news-bydate-train')
test_data = load_files('/kaggle/input/dataaset/20news-bydate-test')

# Combinar documentos y etiquetas
newsgroupsdocs = train_data.data + test_data.data
targets = train_data.target.tolist() + test_data.target.tolist()
labels = train_data.target_names

print(len(newsgroupsdocs), "documentos cargados")

# Limpiar textos
docs_limpios = [doc.decode("latin1").encode("utf-8","ignore").decode("utf-8","ignore") for doc in newsgroupsdocs]

# Convertir a DataFrame para ver primeros documentos
df = pd.DataFrame({
    'texto': [doc[:200] for doc in docs_limpios], 
    'categoria_id': targets,
    'categoria_nombre': [labels[i] for i in targets]
})

df.head(100)


18846 documentos cargados


Unnamed: 0,texto,categoria_id,categoria_nombre
0,From: cubbie@garnet.berkeley.edu ( ...,9,rec.sport.baseball
1,From: gnelson@pion.rutgers.edu (Gregory Nelson...,4,comp.sys.mac.hardware
2,From: crypt-comments@math.ncsu.edu\nSubject: C...,11,sci.crypt
3,From: ()\nSubject: Re: Quadra SCSI Problems??...,4,comp.sys.mac.hardware
4,From: keith@cco.caltech.edu (Keith Allan Schne...,0,alt.atheism
...,...,...,...
95,From: tthiel@cs.uiuc.edu (Terry Thiel)\nSubjec...,4,comp.sys.mac.hardware
96,From: jmeyers@ecst.csuchico.edu (Jeff Meyers)\...,2,comp.os.ms-windows.misc
97,From: whit@carson.u.washington.edu (John Whitm...,12,sci.electronics
98,From: rjwade@rainbow.ecn.purdue.edu (Robert J....,7,rec.autos


## Parte 1: Cálculo de TF, DF, IDF y TF-IDF

### Actividad 
1. Utiliza el corpus cargado.
2. Construye la matriz de términos (TF), y calcula la frecuencia de documentos (DF)
3. Calcula TF-IDF utilizando sklearn.
4. Visualiza los valores en un DataFrame para analizar las diferencias entre los términos.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Usamos solo los primeros 10 documentos para que sea fácil
docs_demo = docs_limpios[:100]

# TF
vectorizador_tf = CountVectorizer(max_features=100)
matriz_tf = vectorizador_tf.fit_transform(docs_demo)

tabla_tf = pd.DataFrame(matriz_tf.toarray(), columns=vectorizador_tf.get_feature_names_out())
print("=== MATRIZ TF ===")
display(tabla_tf)


=== MATRIZ TF ===


Unnamed: 0,0t,1d9,3t,about,all,an,and,any,are,article,...,when,which,who,will,with,wm,world,would,writes,you
0,0,0,0,0,0,0,4,0,0,1,...,0,0,0,2,0,0,0,0,1,0
1,0,0,0,0,1,1,2,1,0,1,...,0,0,0,1,4,0,0,2,0,1
2,0,0,0,0,2,5,45,0,4,0,...,0,2,0,0,2,0,2,0,0,1
3,0,0,0,0,0,1,2,1,0,0,...,3,1,0,2,5,0,0,2,1,0
4,0,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,9,0,0,0,...,0,0,0,1,0,0,0,0,1,0
96,0,0,0,2,0,0,2,0,0,1,...,0,0,1,0,3,0,0,0,1,0
97,0,0,0,0,0,0,2,1,0,2,...,1,1,0,2,2,0,0,1,1,0
98,0,0,0,0,2,1,5,1,1,0,...,2,1,1,2,5,0,0,0,0,2


In [13]:
# DF (Document Frequency)
df_valores = np.sum(matriz_tf.toarray() > 0, axis=0)
tabla_df = pd.Series(df_valores, index=vectorizador_tf.get_feature_names_out(), name="DF")
print("\n=== FRECUENCIA DE DOCUMENTOS (DF) ===")
display(tabla_df)


=== FRECUENCIA DE DOCUMENTOS (DF) ===


0t         1
1d9        1
3t         1
about     33
all       41
          ..
wm         2
world     17
would     41
writes    50
you       49
Name: DF, Length: 100, dtype: int64

In [14]:
# TF-IDF
vectorizador_tfidf = TfidfVectorizer(max_features=100)
matriz_tfidf = vectorizador_tfidf.fit_transform(docs_demo)

tabla_tfidf = pd.DataFrame(matriz_tfidf.toarray(), columns=vectorizador_tfidf.get_feature_names_out())
print("\n=== MATRIZ TF-IDF ===")
display(tabla_tfidf)


=== MATRIZ TF-IDF ===


Unnamed: 0,0t,1d9,3t,about,all,an,and,any,are,article,...,when,which,who,will,with,wm,world,would,writes,you
0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.284781,0.000000,0.000000,0.109123,...,0.000000,0.000000,0.000000,0.232013,0.000000,0.0,0.000000,0.000000,0.105330,0.000000
1,0.0,0.0,0.0,0.000000,0.060933,0.055930,0.073854,0.060170,0.000000,0.056600,...,0.000000,0.000000,0.000000,0.060170,0.199611,0.0,0.000000,0.121867,0.000000,0.055275
2,0.0,0.0,0.0,0.000000,0.031168,0.071523,0.425000,0.000000,0.055246,0.000000,...,0.000000,0.036754,0.000000,0.000000,0.025526,0.0,0.045235,0.000000,0.000000,0.014137
3,0.0,0.0,0.0,0.000000,0.000000,0.045326,0.059852,0.048762,0.000000,0.000000,...,0.189079,0.058231,0.000000,0.097524,0.202208,0.0,0.000000,0.098762,0.044274,0.000000
4,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.079233,0.000000,0.115870,0.000000,...,0.000000,0.000000,0.000000,0.129104,0.000000,0.0,0.000000,0.000000,0.117222,0.118601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.581133,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.105212,0.000000,0.0,0.000000,0.000000,0.095529,0.000000
96,0.0,0.0,0.0,0.225199,0.000000,0.000000,0.122670,0.000000,0.000000,0.094010,...,0.000000,0.000000,0.125027,0.000000,0.248661,0.0,0.000000,0.000000,0.090742,0.000000
97,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.083904,0.068357,0.000000,0.128603,...,0.088354,0.081631,0.000000,0.136715,0.113387,0.0,0.000000,0.069225,0.062066,0.000000
98,0.0,0.0,0.0,0.000000,0.114241,0.052430,0.173082,0.056404,0.050623,0.000000,...,0.145809,0.067357,0.070563,0.112809,0.233901,0.0,0.000000,0.000000,0.000000,0.103632


## Parte 2: Ranking de documentos usando TF-IDF

### Actividad 

1. Dada una consulta, construye el vector de consulta
2. Calcula la similitud coseno entre la consulta y cada documento usando los vectores TF-IDF
3. Genera un ranking de los documentos ordenados por relevancia.
4. Muestra los resultados en una tabla.

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

consulta = "computadoras y gráficos"
vector_consulta = vectorizador_tfidf.transform([consulta])

In [16]:
similitud = cosine_similarity(vector_consulta, matriz_tfidf).flatten()

In [17]:
ranking = similitud.argsort()[::-1][:10]  # top 10C documentos

resultado = pd.DataFrame({
    "Documento": ranking,
    "Puntaje": similitud[ranking],
    "Categoria": [labels[targets[i]] for i in ranking],
    "Texto": [docs_demo[i][:120] + "..." for i in ranking]
})

print("\n=== RANKING DE DOCUMENTOS POR RELEVANCIA ===")
display(resultado)


=== RANKING DE DOCUMENTOS POR RELEVANCIA ===


Unnamed: 0,Documento,Puntaje,Categoria,Texto
0,99,0.0,talk.politics.misc,From: chaudhary-amar@yale.edu (Amar Chaudhary)...
1,36,0.0,sci.med,From: wcsbeau@alfred.carleton.ca (OPIRG)\nSubj...
2,26,0.0,sci.space,From: aws@iti.org (Allen W. Sherzer)\nSubject:...
3,27,0.0,rec.autos,From: sheinfel@ssd.comm.mot.com (Aviad Sheinfe...
4,28,0.0,comp.os.ms-windows.misc,From: ac151@Freenet.carleton.ca (David Clarke)...
5,29,0.0,comp.graphics,From: davidr@rincon.ema.rockwell.com (David J....
6,30,0.0,sci.crypt,From: silly@ugcs.caltech.edu (Brad Threatt)\nS...
7,31,0.0,comp.windows.x,From: aa894@Freenet.carleton.ca (Terry MacLean...
8,32,0.0,comp.sys.mac.hardware,Distribution: world\nFrom: Thomas_n.a._Krebs@m...
9,33,0.0,alt.atheism,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...


## Forma Manual

In [22]:
import numpy as np
import pandas as pd
from math import log
import re
from sklearn.datasets import load_files

# === 1. Cargar datos ===
train_data = load_files('/kaggle/input/dataaset/20news-bydate-train')
test_data = load_files('/kaggle/input/dataaset/20news-bydate-test')

newsgroupsdocs = train_data.data + test_data.data
targets = train_data.target.tolist() + test_data.target.tolist()
labels = train_data.target_names

print(len(newsgroupsdocs), "documentos cargados")

# === 2. Limpiar texto ===
docs_limpios = []
for doc in newsgroupsdocs:
    # Intentamos decodificar
    try:
        texto = doc.decode("latin1")
    except:
        texto = str(doc)
    # Expresión regular: solo letras y espacios
    texto = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑ\s]', ' ', texto)
    texto = texto.lower()
    texto = re.sub(r'\b[a-zñáéíóú]{1,2}\b', ' ', texto)
    texto = re.sub(r'\s+', ' ', texto).strip()
    docs_limpios.append(texto)

# Para hacerlo más liviano, usamos solo los primeros 100 documentos
corpus = docs_limpios[:100]
print("Documentos en corpus:", len(corpus))

# === 3. Crear vocabulario ===
texto_total = " ".join(corpus)
vocabulario = list(set(texto_total.split()))
print("Tamaño del vocabulario:", len(vocabulario))

# === 4. Calcular TF ===
def calcular_tf(doc, vocabulario):
    palabras = doc.split()
    tf = []
    for palabra in vocabulario:
        tf.append(palabras.count(palabra))
    return np.array(tf)

matriz_tf = np.array([calcular_tf(doc, vocabulario) for doc in corpus])
print("Matriz TF lista:", matriz_tf.shape)

# === 5. Calcular DF ===
df = np.sum(matriz_tf > 0, axis=0)
print("Frecuencia DF calculada:", df.shape)

# === 6. Calcular IDF ===
N = len(corpus)
idf = np.log(N / (df + 1))  # +1 para evitar dividir por cero

# === 7. Calcular TF-IDF manualmente ===
matriz_tfidf = matriz_tf * idf

# === 8. Mostrar resultados ===
df_tfidf = pd.DataFrame(matriz_tfidf, columns=vocabulario)
print("\n=== MATRIZ TF-IDF (primeras filas) ===")
display(df_tfidf.head())

print("\nTF del primer documento:\n", matriz_tf[0][:100])
print("\nIDF de las primeras 20 palabras:\n", idf[:100])
print("\nTF-IDF del primer documento:\n", matriz_tfidf[0][:100])



18846 documentos cargados
Documentos en corpus: 100
Tamaño del vocabulario: 6334
Matriz TF lista: (100, 6334)
Frecuencia DF calculada: (6334,)

=== MATRIZ TF-IDF (primeras filas) ===


Unnamed: 0,excitotoxic,happens,eisenhower,contain,randy,disk,nus,scam,charley,complicated,...,relativly,reality,county,departement,myths,via,surviving,cam,ftpmd,smart
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,3.506558,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.65926,0.0,7.824046,3.912023,0.0
3,0.0,2.995732,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



TF del primer documento:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

IDF de las primeras 20 palabras:
 [3.91202301 2.99573227 3.91202301 3.5065579  3.91202301 3.21887582
 3.91202301 3.91202301 3.91202301 3.5065579  3.21887582 3.21887582
 3.91202301 3.91202301 3.21887582 3.91202301 3.91202301 3.91202301
 3.91202301 3.91202301]

TF-IDF del primer documento:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [23]:
#  Vectorizar la consulta (manual)

consulta_texto = "computadoras y gráficos"   # tu consulta
consulta_palabras = consulta_texto.lower().replace(".", " ").replace(",", " ").split()

# TF de la consulta (en el vocabulario)
tf_consulta = np.array([consulta_palabras.count(pal) for pal in vocabulario], dtype=float)

# TF-IDF de la consulta: TF * IDF (misma fórmula que usamos para docs)
vec_consulta = tf_consulta * idf  # vector tamaño = vocabulario


#  Similitud coseno (manual) entre vec_consulta y cada documento

# norma del vector consulta
norm_consulta = np.linalg.norm(vec_consulta)

# norma de cada documento (usamos matriz_tfidf ya calculada)
norms_docs = np.linalg.norm(matriz_tfidf, axis=1)

# evitar división por cero: si alguna norma es 0, la ponemos muy pequeña
eps = 1e-12
norms_docs = np.where(norms_docs == 0, eps, norms_docs)
norm_consulta = norm_consulta if norm_consulta != 0 else eps

# producto punto entre consulta y cada documento
puntos = matriz_tfidf.dot(vec_consulta)  # (n_docs,)

# similitud coseno
similitudes = puntos / (norms_docs * norm_consulta)


#  Ranking: ordenar documentos por similitud (de mayor a menor)

top_k = 10
indices_ordenados = np.argsort(similitudes)[::-1][:top_k]

# Para mostrar categoría y texto, necesitamos los índices reales en el dataset.
# En nuestro ejemplo usamos `corpus = docs_limpios[:10]`, por lo tanto los índices reales son 0..9.
# Si tu corpus es una porción distinta, ajusta `indice_real = indice_offset + i`.
# Aquí asumimos offset = 0:
offset = 0

filas_resultado = []
for idx in indices_ordenados:
    idx_real = offset + idx
    categoria = labels[targets[idx_real]] if idx_real < len(targets) else "desconocida"
    texto_snippet = corpus[idx][:200]  # usa corpus (documentos usados en la demo)
    filas_resultado.append({
        "Indice_local": int(idx),               # índice dentro de corpus usado
        "Indice_global": int(idx_real),         # índice relativo al dataset completo
        "Puntaje_coseno": float(similitudes[idx]),
        "Categoria": categoria,
        "Texto (inicio)": texto_snippet
    })

tabla_resultados = pd.DataFrame(filas_resultado)
print("=== RANKING MANUAL (similitud coseno) ===")
display(tabla_resultados)

=== RANKING MANUAL (similitud coseno) ===


Unnamed: 0,Indice_local,Indice_global,Puntaje_coseno,Categoria,Texto (inicio)
0,99,99,0.0,talk.politics.misc,from chaudhary amar yale edu amar chaudhary su...
1,36,36,0.0,sci.med,from wcsbeau alfred carleton opirg subject msg...
2,26,26,0.0,sci.space,from aws iti org allen sherzer subject sixty t...
3,27,27,0.0,rec.autos,from sheinfel ssd comm mot com aviad sheinfeld...
4,28,28,0.0,comp.os.ms-windows.misc,from freenet carleton david clarke subject dos...
5,29,29,0.0,comp.graphics,from davidr rincon ema rockwell com david ray ...
6,30,30,0.0,sci.crypt,from silly ugcs caltech edu brad threatt subje...
7,31,31,0.0,comp.windows.x,from freenet carleton terry maclean subject ho...
8,32,32,0.0,comp.sys.mac.hardware,distribution world from thomas krebs mcontent ...
9,33,33,0.0,alt.atheism,from dbstu benedikt rosenau subject anecdote a...
