In [1]:
import pandas as pd

In [2]:
# The dataset is read straight from the Hugging Face Hub; authenticate first
# (e.g. `huggingface-cli login`) so the hf:// path can be resolved.
splits = dict(train='train.csv', validation='dev.csv', test='test.csv')
df = pd.read_csv(f"hf://datasets/ibm-research/argument_quality_ranking_30k/{splits['train']}")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline once; `nlp` is the shared pipeline
# object that preprocess() calls for every argument text.
nlp = spacy.load("en_core_web_sm")

def _join_lemmas(doc):
    """Return the space-joined, lowercased lemmas of a parsed spaCy document.

    Tokens flagged as stop words and tokens that are not purely alphabetic
    are dropped.
    """
    return " ".join(
        token.lemma_.lower() for token in doc if token.is_alpha and not token.is_stop
    )


def preprocess(text):
    """Normalise one text: lemmatise, lowercase, drop stop words and non-alphabetic tokens.

    Args:
        text: Raw text to run through the module-level `nlp` pipeline.

    Returns:
        A single string of the kept lemmas separated by spaces.
    """
    return _join_lemmas(nlp(text))


# Stream the whole column through nlp.pipe instead of calling nlp() row by row:
# spaCy batches the documents internally, and the parser and NER components
# (whose annotations are never read here) are switched off for this call only.
df["argument_clean"] = [
    _join_lemmas(doc)
    for doc in nlp.pipe(df["argument"], batch_size=256, disable=["parser", "ner"])
]


In [4]:
from sentence_transformers import SentenceTransformer

# Encode every cleaned argument into a dense sentence embedding.
# The column is converted to a plain list first: encode() indexes its input by
# integer position, while a pandas Series indexes by label, so passing the
# Series only works as long as the frame's index is exactly 0..n-1.
# NOTE(review): the model is fed the lemmatised, stop-word-stripped text; it may
# embed the raw `argument` column better - confirm which input is intended.
model = SentenceTransformer('all-MiniLM-L6-v2')
X_embeds = model.encode(df["argument_clean"].tolist(), show_progress_bar=True)


2025-05-05 16:57:21.632736: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Batches: 100%|██████████| 656/656 [00:44<00:00, 14.79it/s]


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Bag-of-words alternative to the sentence embeddings: fit a TF-IDF vectorizer
# on the cleaned arguments, with the vocabulary capped via max_features=1000.
vectorizer = TfidfVectorizer(max_features=1000)
# Sparse document-term matrix; it is densified with .toarray() before PCA.
X_tfidf = vectorizer.fit_transform(df["argument_clean"])


In [9]:
from sklearn.decomposition import PCA

# Reduce the (densified) TF-IDF matrix to 50 principal components before
# clustering. random_state pins the projection for the cases where scikit-learn
# picks its randomized SVD solver, so a fresh Run All reproduces the same array.
X_reduced = PCA(n_components=50, random_state=42).fit_transform(X_tfidf.toarray())  # or X_embeds


In [10]:
from sklearn.cluster import KMeans

# Partition the reduced vectors into 10 clusters.
# n_init is spelled out because its default changed between scikit-learn
# releases (10 restarts -> 'auto'); leaving it implicit makes the clustering
# depend on the installed version and emits a FutureWarning on some of them.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_reduced)

# Keep the cluster id next to each argument for later inspection and plotting.
df["cluster"] = clusters




In [11]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# The recorded run of t-SNE over every row ended in a KeyboardInterrupt, so the
# 2-D embedding is computed on a seeded random subset of rows instead.
# Set TSNE_MAX_POINTS = None to embed all rows again.
TSNE_MAX_POINTS = 5000
row_ids = pd.Series(range(len(X_reduced)))
if TSNE_MAX_POINTS is not None and len(row_ids) > TSNE_MAX_POINTS:
    row_ids = row_ids.sample(n=TSNE_MAX_POINTS, random_state=42).sort_values()
row_ids = row_ids.to_numpy()

# X_vis holds one 2-D point per selected row (same order as row_ids).
X_vis = TSNE(n_components=2, random_state=42).fit_transform(X_reduced[row_ids])

# Colour each point by its KMeans cluster; positional selection keeps the
# colours aligned with the sampled rows regardless of the frame's index labels.
fig, ax = plt.subplots()
ax.scatter(X_vis[:, 0], X_vis[:, 1], c=df["cluster"].to_numpy()[row_ids], cmap="tab10")
ax.set_title("Clusters d'arguments de débat")
ax.set_xlabel("t-SNE 1")
ax.set_ylabel("t-SNE 2")
plt.show()


KeyboardInterrupt: 

In [12]:
# Display the PCA-reduced feature matrix as a sanity check (NumPy abbreviates
# the repr of large arrays with "...").
X_reduced

array([[ 0.01019771, -0.02306673, -0.00852955, ..., -0.06405784,
         0.06542808,  0.05409661],
       [-0.01425483, -0.05037529,  0.00445846, ..., -0.01302394,
         0.01066638, -0.01649903],
       [-0.0487009 ,  0.03601257, -0.00620004, ..., -0.04520742,
         0.04608115, -0.01047283],
       ...,
       [-0.01005397, -0.01204734,  0.01343818, ...,  0.015892  ,
         0.02986212,  0.08771468],
       [-0.03285988, -0.02094517, -0.02969811, ..., -0.0088034 ,
         0.01073365,  0.08460943],
       [-0.01535118, -0.04115308,  0.01402405, ...,  0.00737495,
        -0.00196081,  0.0506932 ]])