<table style="width:100%; border-collapse: collapse;">
  <tr>
    <td style="width:20%; vertical-align:middle;">
      <img src="LogoUVG.png" width="400"/>
    </td>
    <td style="text-align:left; vertical-align:middle;">
      <h2 style="margin-bottom: 0;">Universidad del Valle de Guatemala - UVG</h2>
      <h3 style="margin-top: 0;">Facultad de Ingeniería - Computación</h3>
      <p style="font-size: 16px; margin-bottom: 0; margin-top: -20px">
        <strong>Curso:</strong> Procesamiento de Lenguaje Natural 
        <strong>Sección:</strong> 10
      </p>
      <p style="font-size: 16px; margin: 0;"><strong>Laboratorio 5:</strong> Representaciones Vectoriales de Texto
 (PPMI, TF-IDF y Word2Vec)</p>
      <br>
      <p style="font-size: 15px; margin: 0;"><strong>Autor:</strong></p>
      <ul style="margin-top: 5px; padding-left: 20px; font-size: 15px;">
        <li>Diego Alexander Hernández Silvestre - <strong>21270</strong></li>
      </ul>
    </td>
  </tr>
</table>

Librerias

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_20newsgroups
from scipy.sparse import coo_matrix, csr_matrix
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import random
import re

Carga de los datos

In [2]:
categories = [
    'rec.autos',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
]

train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True, random_state=42
)
test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    shuffle=True, random_state=42
)

print(len(train.data), len(test.data), train.target_names)

2169 1446 ['rec.autos', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']


# Preprocesamiento del corpus 

Para determinar qué acciones se tomaran, se decide hacer una exploración inicial del dataset para verificar qué estandarizaciones son útiles y requeridas de realizar. 

In [3]:
def rawTokenizerWithPunct(text):
    return re.findall(r"\S+", text)

def normalizeBase(token):
    return re.sub(r"^[^\w]+|[^\w]+$", "", token.lower())

variants = defaultdict(set)

for doc in train.data[:1000]:
    for token in rawTokenizerWithPunct(doc):
        base = normalizeBase(token)
        if base:
            variants[base].add(token)

ambiguous = {k: v for k, v in variants.items() if len(v) > 1}

for i, (base, forms) in enumerate(list(ambiguous.items())[:20], 1):
    print(f"{i}. base='{base}' → variantes={forms}")

1. base='news' → variantes={'news', '"news"', 'News.', 'NEWS', 'News', 'News,', 'News:', 'news.', 'News)'}
2. base='you' → variantes={'"You', 'You', 'you!', 'you"', 'you', 'you."', '(You', 'YOU', '**You', '(you,', 'you;', '*you*', 'YOU!"', 'YOU.', 'you...', 'you:', '"you', '"You,', '>you', 'you!)', '=you=', '(you', 'you.', '*you*.', 'YOU,', 'you,"', 'you,', "'You", 'you...............', 'you?"', 'You,', 'you....', 'you).', '>You', 'you?', '*You*'}
3. base='may' → variantes={'may,', 'may."', 'May', 'may', 'MAY', '(May'}
4. base='have' → variantes={'Have', 'have.', 'have!', '**have', '*have*', 'have', '"have', 'have?', '(have', 'have,', 'HAVE'}
5. base='missed' → variantes={'MISSED,', 'missed'}
6. base='apr' → variantes={'Apr', 'APR'}
7. base='19' → variantes={'19', '(19).', '19,'}
8. base='1993' → variantes={'1993,', '1993:', '1993', '1993.', '1993)'}
9. base='not' → variantes={'NOT!)', ':not', '*not*', 'not;', 'not.', 'NOT', '"Not', 'not?', 'not,', 'not."', '(Not', 'not*', 'not)', '(no

Centrandonos en el subconjunto de autor y política, se logró identificar que existen numerosas variaciones superficiales que afectan en la representación de las palabras. Por ejemplo, la palabra news aparece en varias formas como news, News y NEWS, mientras que may se encuentra como may, MAY y may. incluyendo el signo de puntuación. Estas variantes son interpretadas como distintos tokens por los vectorizadores, aunque en la práctica, representan lo mismo.

Las variantes identificables corresponden a mayúsculas, minúsculas y signos de puntuación. Por esta razón, se aplica un preprocesamiento que incluye la normalización a minúsculas y la eliminación de signos de puntuación. De esta forma se logrará una mejor representación del contenido existente. 

In [4]:
def preprocessText(text: str) -> str:
    text = text.lower() # minúsculas
    text = re.sub(r"[^a-z\s]", " ", text) # dejar solo letras y espacios (elimina puntuación y dígitos)
    text = re.sub(r"\s+", " ", text).strip() # espacios múltiples → uno
    return text

def tokenize(text: str):
    return text.split()

trainClean = [preprocessText(doc) for doc in train.data]
testClean  = [preprocessText(doc) for doc in test.data]
trainTokens = [tokenize(doc) for doc in trainClean]
testTokens  = [tokenize(doc) for doc in testClean]

trainTokens[0][:25]

['news',
 'you',
 'may',
 'have',
 'missed',
 'apr',
 'not',
 'because',
 'you',
 'were',
 'too',
 'busy',
 'but',
 'because',
 'israelists',
 'in',
 'the',
 'us',
 'media',
 'spiked',
 'it',
 'those',
 'intrepid',
 'israeli',
 'soldiers']

In [5]:
def statsTokens(docs):
    toks = [t for doc in docs for t in doc]
    return {
        "Vocabulary size": len(set(toks)),
        "Tokens": len(toks),
    }

def crudeTokens(texts):
    return [re.findall(r"\w+|\S", t) for t in texts] 

rawDocs = crudeTokens(train.data)
rawStats = statsTokens(rawDocs)
cleanStats = statsTokens(trainTokens)
print("== RAW ==")
print({k: v for k, v in rawStats.items() if k != "top20"})
print("\n== CLEAN ==")
print({k: v for k, v in cleanStats.items() if k != "top20"})


== RAW ==
{'Vocabulary size': 35506, 'Tokens': 700444}

== CLEAN ==
{'Vocabulary size': 26187, 'Tokens': 526858}


Se logra observar que la aplicación de la limpieza de la información es efectiva ya que existe una reducción en el tamaño del vocabulario y la cantidad del tokens generados. 

# Construcción de representación TF-IDF

Se busca representar cada documento como un vector de importancia de términos para poder comparar textos y alimentar posteriores modelos. TF-IDF combina cuánto aparece un término el documento (TF) con qué tan raro es en el corpus (IDF).

In [47]:
trainDocs = [" ".join(toks) for toks in trainTokens]
testDocs  = [" ".join(toks) for toks in testTokens]

tfidf = TfidfVectorizer(
    ngram_range=(1,1),
    min_df=5,
    lowercase=False,                     
    token_pattern=r"(?u)\b[a-z]{2,}\b"  
)

X_train_tfidf = tfidf.fit_transform(trainDocs)
X_test_tfidf  = tfidf.transform(testDocs)

vocab = tfidf.get_feature_names_out()
print("Shape TRAIN:", X_train_tfidf.shape)
print("Shape TEST :", X_test_tfidf.shape)
print("Tamaño de vocabulario:", len(vocab))

for doc_id in range(0, 5): # Top términos por documento
    row = X_train_tfidf[doc_id]
    if row.nnz == 0:  # si el doc quedó vacío con min_df
        print(f"\nDoc {doc_id} → [sin términos]")
        continue

    arr = row.toarray().ravel()
    top_idx = arr.argsort()[-3:][::-1]  # top 3
    print(f"\nDoc {doc_id} (clase='{train.target_names[train.target[doc_id]]}')")
    for i in top_idx:
        print(f"   {vocab[i]:<20} {arr[i]:.4f}")


Shape TRAIN: (2169, 6228)
Shape TEST : (1446, 6228)
Tamaño de vocabulario: 6228

Doc 0 (clase='talk.politics.mideast')
   soldiers             0.3405
   the                  0.3204
   girls                0.1803

Doc 1 (clase='talk.politics.guns')
   the                  0.2126
   effect               0.2010
   to                   0.1926

Doc 2 (clase='rec.autos')
   accident             0.3390
   insurance            0.3144
   company              0.3101

Doc 3 (clase='talk.politics.misc')
   tax                  0.3290
   income               0.2628
   you                  0.2437

Doc 4 → [sin términos]


# Construcción de representación PPMI

#### Matriz de co-ocurrencia

In [21]:
windowSize = 4       # ventana ±k
minCount   = 5       # freq mínima para entrar al vocabulario
maxVocab   = 20000   # tope por memoria/tiempo

def buildVocab(tokensList, minCount=5, maxVocab=20000):
    freq = Counter(t for doc in tokensList for t in doc)
    words = [w for w, c in freq.items() if c >= minCount]
    words.sort(key=lambda w: -freq[w])
    words = words[:maxVocab]
    vocab = {w:i for i, w in enumerate(words)}
    id2tok = np.array(words)
    return vocab, id2tok

vocab, id2tok = buildVocab(trainTokens, minCount=minCount, maxVocab=maxVocab)
V = len(vocab)
print("PPMI | vocabSize:", V)

PPMI | vocabSize: 7420


In [22]:
def buildCoocCsr(tokensList, vocab, windowSize=4):
    rows, cols, data = [], [], []
    for doc in tokensList:
        idxs = [vocab[t] for t in doc if t in vocab]
        L = len(idxs)
        for i, wi in enumerate(idxs):
            s = max(0, i - windowSize)
            e = min(L, i + windowSize + 1)
            for j in range(s, e):
                if j == i: 
                    continue
                rows.append(wi)
                cols.append(idxs[j])
                data.append(1.0)
    C = coo_matrix((data, (rows, cols)), shape=(V, V), dtype=np.float64)
    return C.tocsr()

C = buildCoocCsr(trainTokens, vocab, windowSize=windowSize)
print("Cooc | shape:", C.shape, "| nnz:", C.nnz)

Cooc | shape: (7420, 7420) | nnz: 1104492


### Calcule la matriz PPMI

In [37]:
def computePpmiCsr(C: csr_matrix, useLog2=True, eps=1e-12):
    total = C.sum()
    sumW  = np.array(C.sum(axis=1)).ravel()   # totales por palabra (filas)
    sumC  = np.array(C.sum(axis=0)).ravel()   # totales por contexto (columnas)

    Ccoo = C.tocoo(copy=True)                 # trabajamos sobre los no-cero
    pWc  = Ccoo.data / total
    pW   = sumW[Ccoo.row] / total
    pC   = sumC[Ccoo.col] / total

    logFn = np.log2 if useLog2 else np.log
    pmi   = logFn((pWc + eps) / (pW * pC + eps))
    pmi[pmi < 0] = 0.0                        # PPMI = max(PMI, 0)

    PPMI = coo_matrix((pmi, (Ccoo.row, Ccoo.col)), shape=C.shape).tocsr()
    return PPMI

PPMI = computePpmiCsr(C, useLog2=True)       # log2 como en la slide
print("PPMI | shape:", PPMI.shape, "| nnz:", PPMI.nnz)

PPMI | shape: (7420, 7420) | nnz: 1104492


In [None]:
def docsToPpmiDocs(tokensList, vocab, ppmiCsr):
    V = ppmiCsr.shape[0]
    X = np.zeros((len(tokensList), V), dtype=np.float32)
    for i, doc in enumerate(tokensList):
        idxs = [vocab[t] for t in doc if t in vocab]
        if not idxs:
            continue
        # promedio de las filas PPMI correspondientes a las palabras del Qdoc
        X[i] = ppmiCsr[idxs].mean(axis=0).A1
    return X

PPMI_train_docs = docsToPpmiDocs(trainTokens, vocab, PPMI)
PPMI_test_docs  = docsToPpmiDocs(testTokens,  vocab, PPMI)
print("Docs PPMI | train:", PPMI_train_docs.shape, "| test:", PPMI_test_docs.shape)

Docs PPMI | train: (2169, 7420) | test: (1446, 7420)


In [39]:
def topContexts(word, k=10):
    if word not in vocab:
        print(f"'{word}' no está en el vocab."); return
    i = vocab[word]
    row = PPMI[i].toarray().ravel()
    top = row.argsort()[-k:][::-1]
    for j in top:
        if row[j] > 0:
            print(f"{id2tok[j]:<20} {row[j]:.4f}")

print("\nTop contextos por PPMI")
print("→ gun")
topContexts("gun", 10)
print("→ car")
topContexts("car", 10)


Top contextos por PPMI
→ gun
strict               6.0626
melbourne            5.9922
defenses             5.8403
prevalence           5.8402
strictest            5.8402
stricter             5.5772
zones                5.5772
ownership            5.5292
zip                  5.4397
control              5.4111
→ car
favorite             5.5602
cruisers             5.5056
crawl                5.4661
sports               5.4354
swap                 5.3537
museums              5.3537
screwing             5.3537
collections          5.3537
overturned           5.2606
wax                  5.1108


# Construcción de representación Word2Vec

In [20]:
def setSeed(seed=42):
    np.random.seed(seed)
    random.seed(seed)

def trainWord2Vec(sentences, vectorSize=200, window=5, minCount=5, sg=1, workers=4, seed=42):
    """
    Entrena Word2Vec (gensim) sobre listas de tokens.
    - sentences: lista de listas de tokens (trainTokens)
    """
    setSeed(seed)
    model = Word2Vec(
        sentences=sentences,
        vector_size=vectorSize,
        window=window,
        min_count=minCount,
        sg=sg,                 # 1 = skip-gram, 0 = CBOW
        workers=workers,
        negative=10,           # negative sampling
        epochs=5,
        seed=seed
    )
    return model

w2vModel = trainWord2Vec(trainTokens, vectorSize=200, window=5, minCount=5, sg=1)
print("vocabSize (w2v):", len(w2vModel.wv))
print("vectorSize:", w2vModel.vector_size)


vocabSize (w2v): 7420
vectorSize: 200


In [12]:
def docEmbedding(tokens, model):
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not vecs:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vecs, axis=0)

def docsToEmbeddings(tokensList, model):
    X = np.zeros((len(tokensList), model.vector_size), dtype=np.float32)
    for i, toks in enumerate(tokensList):
        X[i] = docEmbedding(toks, model)
    return X

XTrainW2V = docsToEmbeddings(trainTokens, w2vModel)
XTestW2V  = docsToEmbeddings(testTokens,  w2vModel)
print("XTrainW2V shape:", XTrainW2V.shape, "| XTestW2V shape:", XTestW2V.shape)

XTrainW2V shape: (2169, 200) | XTestW2V shape: (1446, 200)


In [16]:
def mostSimilarWords(model, word, topn=10):
    if word not in model.wv:
        print(f"'{word}' no está en el vocabulario del modelo")
        return
    for w, sim in model.wv.most_similar(word, topn=topn):
        print(f"{w:<20} {sim:.4f}")

def leastSimilarWords(model, word, k=10, minFreq=50):
    """
    Busca palabras 'lejanas' a 'word' entre términos con frecuencia >= minFreq.
    Nota: es meramente ilustrativo; la 'lejanía' puede estar sesgada por rareza.
    """
    if word not in model.wv:
        print(f"'{word}' no está en el vocabulario del modelo")
        return
    target = model.wv[word]
    # candidatos frecuentes
    cand = [w for w in model.wv.index_to_key if model.wv.get_vecattr(w, "count") >= minFreq and w != word]
    if not cand:
        print("No hay candidatos con esa frecuencia mínima")
        return
    sims = np.array([model.wv.similarity(word, w) for w in cand])
    idx = np.argsort(sims)[:k]   # los k más bajos
    for i in idx:
        print(f"{cand[i]:<20} {sims[i]:.4f}")

# ejemplos típicos del dataset:
print("--------------- Most similar words (car) ---------------")
mostSimilarWords(w2vModel, "car", topn=10)
print("--------------- Least similar words (car) ---------------")
leastSimilarWords(w2vModel, "car", k=10, minFreq=50)

--------------- Most similar words (car) ---------------
tires                0.8471
engine               0.8230
manual               0.8165
test                 0.8160
dealer               0.8158
taurus               0.8110
rear                 0.8110
ford                 0.8096
pedal                0.8066
honda                0.8013
--------------- Least similar words (car) ---------------
azerbaijan           -0.0369
international        -0.0054
anti                 0.0045
soviet               0.0155
against              0.0162
leaders              0.0239
rights               0.0494
union                0.0642
by                   0.0645
united               0.0662


In [40]:
print("\nShapes finales de documentos:")
print("TF-IDF:", X_train_tfidf.shape)
print("PPMI:", PPMI_train_docs.shape)
print("Word2Vec:", XTrainW2V.shape)


Shapes finales de documentos:
TF-IDF: (2169, 26161)
PPMI: (2169, 7420)
Word2Vec: (2169, 200)


# Evaluación comparativa

In [33]:
def trainAndEval(X_train, y_train, X_test, y_test, name):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name}: {acc:.4f}")
    return acc

### TF-IDF

In [48]:
accTfidf = trainAndEval(X_train_tfidf, train.target, X_test_tfidf, test.target, "TF-IDF")

TF-IDF: 0.7635


### PPMI

In [None]:
accPpmi = trainAndEval(PPMI_train_docs, train.target, PPMI_test_docs, test.target, "PPMI")

PPMI (SVD): 0.7607


### Word2Vec

In [44]:
accW2v = trainAndEval(XTrainW2V, train.target, XTestW2V, test.target, "Word2Vec")

Word2Vec: 0.6902


In [50]:
results = pd.DataFrame({
    "Representación": ["TF-IDF", "PPMI (SVD)", "Word2Vec mean"],
    "Accuracy": [accTfidf, accPpmi, accW2v]
})
print("\nResultados comparativos:")
print(results)


Resultados comparativos:
  Representación  Accuracy
0         TF-IDF  0.763485
1     PPMI (SVD)  0.760719
2  Word2Vec mean  0.690180
