In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score , normalized_mutual_info_score
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Cluster a 2000-document subset of 20 Newsgroups:
# TF-IDF features -> PCA reduction -> k-means -> ARI / NMI against true labels.

# Tunable settings for this cell, kept in one place.
N_DOCS = 2000        # subset size; used for BOTH documents and labels so they stay aligned
N_COMPONENTS = 20    # number of principal components kept
SEED = 42            # shared seed for every stochastic step below

# Load the 20 Newsgroups dataset without headers, footers and quoted replies.
ng20 = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
corpus = ng20.data[:N_DOCS]

# NLTK English stop-word set. Not used in this cell: the vectorizer below relies
# on scikit-learn's built-in 'english' list instead. Kept in case other cells
# read it. NOTE(review): this call needs the NLTK 'stopwords' corpus to be
# available locally -- confirm, or drop the line if nothing else uses it.
stop_words = set(stopwords.words('english'))

# TF-IDF vectorisation using the built-in 'english' stop-word list
# (rather than the NLTK `stop_words` set above).
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus)

# Dimensionality reduction with PCA. The sparse TF-IDF matrix is densified
# before being passed to PCA. `random_state` pins the randomized SVD solver
# that PCA may select for an input of this size, so the projection (and the
# scores printed below) are reproducible from one run to the next.
pca = PCA(n_components=N_COMPONENTS, random_state=SEED)
X_pca = pca.fit_transform(X.toarray())

# k-means clustering on the reduced data.
k = 20  # number of clusters; adjust as needed (20 Newsgroups has 20 categories)
# `n_init` is set explicitly so the number of restarts does not depend on the
# default of the installed scikit-learn version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=SEED)
labels = kmeans.fit_predict(X_pca)

# Clustering evaluation against the true newsgroup labels
# (adjusted Rand index and normalized mutual information).
ground_truth_labels = ng20.target[:N_DOCS]
ari = adjusted_rand_score(ground_truth_labels, labels)
nmi = normalized_mutual_info_score(ground_truth_labels, labels)
print(f"Adjusted Rand Index: {ari}")
print(f"Normalized Mutual Information: {nmi}")

Adjusted Rand Index: 0.059522186481681644
Normalized Mutual Information: 0.28109560296315783
