In [5]:
# Questão 11
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, silhouette_score
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

# a
dataset = fetch_20newsgroups(remove=("headers", "footers", "quotes"))
X_texts = dataset.data
y       = dataset.target

vectorizer = TfidfVectorizer(max_df=0.5, stop_words="english")
X_tfidf = vectorizer.fit_transform(X_texts)

kmeans = KMeans(n_clusters=20, random_state=42)
labels_kmeans = kmeans.fit_predict(X_tfidf)

ari = adjusted_rand_score(y, labels_kmeans)
nmi = normalized_mutual_info_score(y, labels_kmeans)
sil = silhouette_score(X_tfidf, labels_kmeans, sample_size=1000)  

print(f"K-Means: ARI={ari:.3f}, NMI={nmi:.3f}, Silhouette={sil:.3f}")


K-Means: ARI=0.073, NMI=0.321, Silhouette=0.003


In [6]:
# b
nmf = NMF(n_components=20, random_state=42)
W = nmf.fit_transform(X_tfidf)
H = nmf.components_

termos = vectorizer.get_feature_names_out()
for i, topico in enumerate(H):
    top_words = [termos[x] for x in topico.argsort()[-10:][::-1]]
    print(f"Tópico {i}: {', '.join(top_words)}")


Tópico 0: people, government, gun, law, right, guns, state, rights, crime, make
Tópico 1: windows, dos, ms, version, running, os, microsoft, nt, drivers, driver
Tópico 2: god, jesus, bible, believe, faith, christ, christian, christians, church, life
Tópico 3: geb, dsl, chastity, n3jxp, cadre, pitt, shameful, intellect, skepticism, surrender
Tópico 4: key, chip, encryption, clipper, keys, escrow, algorithm, government, secure, security
Tópico 5: drive, scsi, ide, drives, disk, hard, controller, floppy, hd, cd
Tópico 6: game, team, games, year, players, season, play, hockey, win, league
Tópico 7: thanks, advance, hi, looking, mail, info, help, appreciated, appreciate, information
Tópico 8: space, nasa, shuttle, launch, orbit, moon, earth, station, lunar, program
Tópico 9: card, video, monitor, bus, vga, drivers, cards, driver, color, ati
Tópico 10: armenian, armenians, turkish, genocide, armenia, turkey, turks, said, soviet, muslim
Tópico 11: 00, 10, sale, 50, 20, shipping, 15, new, pric

In [8]:
# c 
lda = LatentDirichletAllocation(n_components=20, random_state=42, learning_method="batch")
lda.fit(X_tfidf)

for i, topico in enumerate(lda.components_):
    top_words = [termos[x] for x in topico.argsort()[-10:][::-1]]
    print(f"Tópico {i}: {', '.join(top_words)}")

Tópico 0: alomar, wins, __, baerga, park, _______, mars, adl, ____, _____
Tópico 1: ax, ifas, ufl, gnv, arens, caps, _incredibly_, loopback, mywinobj, sigh
Tópico 2: telix, kermit, vcr, catalog, olwm, adb, nutek, navy, allergic, tsk
Tópico 3: deletion, ripem, slot, quicktime, lc, luke, pds, tt, adaptor, iisi
Tópico 4: banks, pitt, geb, shameful, cadre, gordon, skepticism, intellect, chastity, n3jxp
Tópico 5: god, jesus, armenian, turkish, christ, armenians, sin, islam, greek, heaven
Tópico 6: mahan, tgv, ver, gainey, cornerstone, lm, washer, syquest, charley, main_win
Tópico 7: lc, printers, ink, bj, deskjet, hernia, davewood, buyer, boulder, pages
Tópico 8: murray, centaur, francis, gm, jagr, nixon, proton, kingman, db, finnish
Tópico 9: sox, morris, champs, keller, sas, quakers, kkeller, ivy, male, upenn
Tópico 10: vesa, helmet, eisa, vlb, cview, spacecraft, megs, captain, caps, switches
Tópico 11: ditto, gl, wheelie, shaft, tires, rle, saab, corn, vgalogo, clinic
Tópico 12: app, pol

Seguindo as etapas de:

1.Pré-processamento -> conversão em vetores TF-IDF.

2.Aplicação dos algoritmos (K-Means, NMF e LDA).

3.Extração de palavras-chave por tópico (no caso de NMF e LDA).

4.Avaliação da coerência dos tópicos com base na interpretabilidade.

O K-Means foi aplicado sobre representações TF-IDF e permitiu explorar a segmentação textual, mas apresentou resultados limitados devido à alta dimensionalidade e sobreposição dos dados.

O NMF gerou tópicos semanticamente mais claros e interpretáveis, enquanto o LDA também identificou temas relevantes, mas com maior difusão entre termos.