## C) ABSA (Aspect-Based Sentiment Analysis)

In [1]:
# Cell: imports
import os, json, math, re
import pandas as pd, numpy as np
from collections import Counter, defaultdict

# NLP & embeddings
import spacy                     
from sentence_transformers import SentenceTransformer  
import hdbscan                   
from transformers import pipeline 

# utilities
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Cell: extraire candidats d'aspects (noun chunks + regex simple)
# Extraction candidates — utilise review_body 
import re
import spacy

# Choix de la colonne texte : adapte ici
# Option A : utiliser uniquement le corps
TEXT_COL = 'review_body'

# Option B : concaténer title + body (souvent utile)
# df['text_combined'] = df[['review_title','review_body']].fillna('').agg(' '.join, axis=1)
# TEXT_COL = 'text_combined'

# Charger spaCy (choisir modèle FR/EN selon ta data)
try:
    nlp = spacy.load("en_core_web_md")   # si anglais
except Exception:
    try:
        nlp = spacy.load("fr_core_news_md")  # si français
    except Exception:
        raise RuntimeError("spaCy model not found. Installe 'en_core_web_sm' ou 'fr_core_news_md' puis relance.")

def extract_candidates_spacy(text):
    doc = nlp(text)
    candidates = []
    for nc in doc.noun_chunks:
        tok = nc.text.strip().lower()
        # nettoyage basique
        tok = re.sub(r"^[^\w]+|[^\w]+$", "", tok)
        if len(tok) < 2 or len(tok) > 60:
            continue
        # filtrer tokens très génériques (optionnel)
        if tok in {'the product','this product','it','this','product'}:
            continue
        candidates.append(tok)
    return candidates

# Assure-toi que la colonne existe, sinon alerte et propose fallback
if TEXT_COL not in df.columns:
    raise KeyError(f"Colonne '{TEXT_COL}' non trouvée. Vérifie les colonnes du DataFrame (voir diagnostic).")

# Appliquer sur un échantillon si dataset volumineux
sample_df = df.sample(min(2000, len(df)), random_state=42)  # ajuste sample si tu veux tout
sample_df['candidates'] = sample_df[TEXT_COL].fillna("").astype(str).apply(lambda s: extract_candidates_spacy(s))

# Comptage et affichage des candidats fréquents
from collections import Counter
cand_counter = Counter([c for lst in sample_df['candidates'] for c in lst])
most_common = cand_counter.most_common(200)
pd.DataFrame(most_common, columns=['candidate','count']).head(40)

NameError: name 'df' is not defined

Principes rapides :

Beaucoup de candidats sont du bruit (you, that, they, the, a lot, etc.). Il faut filtrer les pronoms / stopwords et garder les noms / groupes nominaux utiles (battery, screen, sound quality, price, case, camera...).

Normaliser : lower(), lemme (ou stemming), retirer articles (the, a), ponctuation.

Regrouper variantes proches (sound, sound quality, the sound quality) en une forme canonique (sound quality).

Ensuite : SBERT embeddings + clustering (HDBSCAN) pour former clusters sémantiques et créer une table cluster_id -> canonical_aspect.

Enfin : mapper chaque review aux clusters / aspects et bootstrap le sentiment.

In [3]:
# Cell: nettoyage / canonicalisation des candidats extraits (produit top_candidates puis top_candidates_clean)
import re
from collections import Counter
import spacy

# charge un petit modèle spacy (anglais ici, adapte si FR)
nlp = spacy.load("en_core_web_sm")

# supposons que most_common est une liste de tuples (candidate, count) issue précédemment
# if not present, rebuild from sample_df['candidates'] like before
# most_common = cand_counter.most_common(200)

# blacklist de tokens évidents à retirer
BLACKLIST = {
    'you','that','they','me','them','which','we','these','what','all','something','everything',
    'amazon','who','anyone','any','some','nothing','both','this one','the one','the product','product',
    'it','this','one','there','br'
}
ARTICLES = re.compile(r'^(the|a|an)\s+', flags=re.I)

def clean_candidate(cand):
    s = cand.strip().lower()
    s = ARTICLES.sub('', s)                # remove leading article
    s = re.sub(r'[^a-z0-9\s\-]', '', s)    # keep letters, numbers, spaces, hyphens
    s = re.sub(r'\s{2,}', ' ', s).strip()
    # lemma + POS filter: keep noun chunks / nouns
    doc = nlp(s)
    # if candidate is pronoun or only stopwords, drop
    if len(doc)==0: 
        return None
    # if it's a pronoun or stop word entirely -> drop
    if all(tok.pos_ in ('PRON','DET','PART','SCONJ','ADP') or tok.is_stop for tok in doc):
        return None
    # build normalized lemma form for multi-token
    lemmas = [tok.lemma_ for tok in doc if not tok.is_stop]
    if not lemmas:
        lemmas = [tok.lemma_ for tok in doc]
    s_norm = " ".join(lemmas).strip()
    # final blacklist check
    if s_norm in BLACKLIST or len(s_norm) <= 1:
        return None
    return s_norm

# apply cleaning on top candidates
top_k = 300
top_candidates = [c for c,_ in most_common[:top_k]]
cleaned = []
counts = {}
for c,count in most_common[:top_k]:
    c_clean = clean_candidate(c)
    if c_clean is None:
        continue
    cleaned.append(c_clean)
    counts[c_clean] = counts.get(c_clean, 0) + count

# deduplicate and sort by aggregated counts
cand_counter_clean = Counter(counts)
top_candidates_clean = [c for c,_ in cand_counter_clean.most_common(200)]
print("Top cleaned candidates (sample):", top_candidates_clean[:40])
# prepare list and counts for next step
top_candidates_clean_counts = cand_counter_clean.most_common()

NameError: name 'most_common' is not defined

In [4]:
# Cell: embeddings + HDBSCAN clustering des candidates nettoyés
from sentence_transformers import SentenceTransformer
import hdbscan
import numpy as np
from collections import defaultdict

# modèle SBERT multi-lingue (rapide & efficace)
sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # ou 'all-mpnet-base-v2' si ressources ok

# encode candidates
cand_list = top_candidates_clean  # issue du cell précédent
embs = sbert.encode(cand_list, convert_to_numpy=True, show_progress_bar=True)

# HDBSCAN clustering (robuste, pas besoin de fixer nb clusters)
clusterer = hdbscan.HDBSCAN(min_cluster_size=6, metric='euclidean', cluster_selection_epsilon=0.0)
labels = clusterer.fit_predict(embs)  # -1 = noise

# build clusters dict
clusters = defaultdict(list)
for cand, lab in zip(cand_list, labels):
    clusters[lab].append(cand)

# summarize clusters (size + top members)
cluster_summary = []
for lab, items in sorted(clusters.items(), key=lambda x: (-len(x[1]) if x[0]!=-1 else 999, x[0])):
    cluster_summary.append({"cluster": lab, "size": len(items), "examples": items[:10]})
import pandas as pd
display(pd.DataFrame(cluster_summary).head(40))

# Save artefacts for manual inspection
import joblib, os
os.makedirs("artifacts", exist_ok=True)
joblib.dump({"candidates": cand_list, "embeddings": embs, "labels": labels}, "artifacts/absa_candidates_embeddings.joblib")
print("Saved embeddings+labels -> artifacts/absa_candidates_embeddings.joblib")



NameError: name 'top_candidates_clean' is not defined

Lecture rapide (pour l’oral)

-1 = points non assignés à un cluster.

Causes probables : candidats trop bruités (pronoms, mots génériques), embeddings peu discriminants pour ces candidats, ou paramètres HDBSCAN inadaptés (min_cluster_size trop élevé, pas de réduction UMAP préalable).

Objectif : rendre HDBSCAN capable d’identifier des noyaux (ou utiliser un fallback comme KMeans si besoin).

In [5]:
# Diagnostic embeddings
import numpy as np
from sklearn.metrics import pairwise_distances
emb = embs  # variable issue de l'étape SBERT
print("shape embeddings:", emb.shape)
d = pairwise_distances(emb, metric="cosine")
# distribution des distances moyennes par point
mean_d = d.mean(axis=1)
import numpy as np
print("mean distance (mean of means):", float(mean_d.mean()))
print("min/median/max mean-distance:", float(mean_d.min()), float(np.median(mean_d)), float(mean_d.max()))
# montrer 5 nearest neighbors exemples pour le 1er candidat
from sklearn.neighbors import NearestNeighbors
nn = NearestNeighbors(n_neighbors=6, metric="cosine").fit(emb)
dist, idx = nn.kneighbors(emb[:10])
for i in range(10):
    print("cand:", cand_list[i], "neighbors:", [cand_list[j] for j in idx[i]], "dists:", dist[i])

NameError: name 'embs' is not defined

Interprétation rapide de tes résultats (Option A)

embs.shape = (109, 384) → tu as 109 candidats encodés (après nettoyage).

mean distance ≈ 0.676 (median ≈ 0.675) → les embeddings sont plutôt dispersés, pas ultra-denses — normal pour des phrases courtes et candidates hétérogènes.

Les voisins proches pour plusieurs candidats sont très sensés (price → cost, good price, camera → photo, picture, sound → sound quality, speaker) — signe que SBERT capture bien la sémantique pour des candidats utiles.

Certains candidats ont voisins distants (ex. lot, time) — signes de candidats vagues/génériques qui font du bruit.

Conclusion : les embeddings sont ok pour clustering, mais il reste du bruit et HDBSCAN a considéré tout comme noise avec les paramètres précédents (min_cluster_size trop élevé / pas de réduction UMAP).

In [6]:
# Préfiltre : ne garder que candidats contenant NOUN ou PROPN (améliore qualité clustering)
top_candidates_filtered = []
for cand in cand_list:
    doc = nlp(cand)
    if any(tok.pos_ in ("NOUN","PROPN") for tok in doc):
        top_candidates_filtered.append(cand)
print("Avant:", len(cand_list), "Après filtrage POS:", len(top_candidates_filtered))
# remplace cand_list par top_candidates_filtered pour la suite si ok

NameError: name 'cand_list' is not defined

In [7]:
top_candidates_filtered

[]

In [8]:
# Robust UMAP pipeline with fallback to PCA, and HDBSCAN clustering
# - Re-encodes candidates if you used top_candidates_filtered
# - Tries different imports for UMAP, falls back to PCA if not present
# - Runs HDBSCAN on the reduced embeddings

import numpy as np
from collections import defaultdict, Counter

# 1) ensure cand_list and top_candidates_filtered exist
if 'top_candidates_filtered' in globals() and len(top_candidates_filtered)>0:
    cand_list_use = top_candidates_filtered
else:
    cand_list_use = cand_list  # fallback to original list

print("Using", len(cand_list_use), "candidates for embedding/clustering.")

# 2) (Re-)compute embeddings if necessary
# If 'embs' exists and was computed for exactly cand_list_use, reuse it, else re-encode.
recompute = True
if 'embs' in globals():
    try:
        # simple heuristic: if lengths match, assume emb corresponds to cand_list_use
        if len(embs) == len(cand_list_use):
            recompute = False
    except Exception:
        recompute = True

if recompute:
    print("Encoding candidates with SBERT (this may take a few seconds)...")
    embs = sbert.encode(cand_list_use, convert_to_numpy=True, show_progress_bar=True)
else:
    print("Reusing existing embeddings (length matches).")

# 3) Try to import UMAP safely, else fallback to PCA
UMAP_cls = None
try:
    # preferred import for umap-learn
    from umap import UMAP as UMAP_cls  # type: ignore
    print("Imported UMAP via `from umap import UMAP`")
    UMAP_cls = UMAP_cls
except Exception:
    try:
        # some installs expose it under umap.umap_
        import umap.umap_ as umap_mod
        UMAP_cls = umap_mod.UMAP
        print("Imported UMAP via `umap.umap_.UMAP`")
    except Exception:
        UMAP_cls = None
        print("UMAP (umap-learn) not available in this environment. Will fallback to PCA.")

# 4) create reducer
if UMAP_cls is not None:
    reducer = UMAP_cls(n_neighbors=15, min_dist=0.0, n_components=5, random_state=42)
else:
    from sklearn.decomposition import PCA
    reducer = PCA(n_components=5)
    print("Using PCA as reducer (fallback).")

# 5) reduce dims
emb_reduced = reducer.fit_transform(embs)
print("Reduced embeddings shape:", getattr(emb_reduced, "shape", None))

# 6) run HDBSCAN (adjust parameters if many -1)
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_epsilon=0.0)
labels = clusterer.fit_predict(emb_reduced)
print("HDBSCAN label counts:", Counter(labels))

# 7) show clusters
clusters = defaultdict(list)
for cand, lab in zip(cand_list_use, labels):
    clusters[lab].append(cand)

for lab, items in sorted(clusters.items(), key=lambda x: (-len(x[1]) if x[0] != -1 else 999, x[0]))[:30]:
    print("CLUSTER", lab, "size", len(items), "->", items[:10])

# 8) save artifacts for inspection
import joblib, os
os.makedirs("artifacts", exist_ok=True)
joblib.dump({'candidates': cand_list_use, 'embeddings': embs, 'reduced': emb_reduced, 'labels': labels},
            "artifacts/absa_candidates_embs_reduced_labels.joblib")
print("Saved artifacts -> artifacts/absa_candidates_embs_reduced_labels.joblib")

NameError: name 'cand_list' is not defined

In [9]:
# === Re-run: SBERT embeddings (filtered) -> UMAP -> HDBSCAN ===
# Assumptions: top_candidates_filtered exists (from POS filtering).
# If not, fallback to cand_list. sbert model must already be loaded as `sbert` (else it will be loaded).

from collections import defaultdict, Counter
import joblib, os, time
import numpy as np

# 1) choose candidate list
if 'top_candidates_filtered' in globals() and len(top_candidates_filtered) > 0:
    cand_list_use = top_candidates_filtered
else:
    cand_list_use = globals().get('cand_list', [])
print("Candidates used:", len(cand_list_use))

# 2) re-encode candidates with SBERT (ensure alignment)
t0 = time.time()
if 'sbert' in globals():
    sbert_model = sbert
else:
    from sentence_transformers import SentenceTransformer
    sbert_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # fallback
print("Encoding with SBERT...")
embs = sbert_model.encode(cand_list_use, convert_to_numpy=True, show_progress_bar=True, batch_size=64)
print("Embeddings shape:", embs.shape, "took {:.1f}s".format(time.time()-t0))

# 3) UMAP reduction (now that umap-learn is installed)
try:
    from umap import UMAP
    print("Using UMAP from umap-learn.")
except Exception as e:
    raise RuntimeError("umap-learn not found despite installation. Error: " + str(e))

reducer = UMAP(n_neighbors=15, min_dist=0.0, n_components=5, random_state=42)
emb_reduced = reducer.fit_transform(embs)
print("Reduced embeddings shape:", emb_reduced.shape)

# 4) HDBSCAN clustering (default params; adjust min_cluster_size if many -1)
import hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=4, metric='euclidean', cluster_selection_epsilon=0.0)
labels = clusterer.fit_predict(emb_reduced)

print("HDBSCAN label counts:", Counter(labels))

# 5) show clusters summary (top clusters first)
clusters = defaultdict(list)
for cand, lab in zip(cand_list_use, labels):
    clusters[lab].append(cand)

for lab, items in sorted(clusters.items(), key=lambda x: (-len(x[1]) if x[0] != -1 else 999, x[0]))[:40]:
    print("CLUSTER", lab, "size", len(items), "->", items[:12])

# 6) save artifacts for manual inspection
os.makedirs("artifacts", exist_ok=True)
joblib.dump({'candidates': cand_list_use, 'embeddings': embs, 'reduced': emb_reduced, 'labels': labels},
            "artifacts/absa_candidates_umap_embs_labels.joblib")
print("Saved -> artifacts/absa_candidates_umap_embs_labels.joblib")

Candidates used: 0
Encoding with SBERT...


Batches: 0it [00:00, ?it/s]

Embeddings shape: (0,) took 0.0s





RuntimeError: umap-learn not found despite installation. Error: Could not find/load shared object file

## Interprétation rapide des clusters (UMAP + HDBSCAN)

- **Cluster 0 (size 17)** : regroupe des tokens liés au matériel / affichage / batterie (ex. `camera`, `tv`, `screen`, `battery`, `phone`, `laptop`, `tablet`). -> *aspect matériel / device / display / battery*.
- **Cluster 2 (size 17)** : regroupe des tokens autour du prix/qualité/audio (ex. `price`, `quality`, `money`, `sound quality`, `speaker`, `performance`). -> *aspect prix / qualité / son*.
- **Cluster 1 (size 15)** : contient câbles, problèmes et termes de contexte (ex. `cable`, `problem`, `home`, `place`) — plus hétérogène, nécessite nettoyage manuel.
- **Cluster -1 (noise, size 43)** : beaucoup de tokens génériques (`case`, `time`, `lot`, `box`, `thing`, `photo`) — soit on blacklist ces tokens, soit on ré-affine le clustering.

Conclusion : on a plusieurs clusters exploitables (0 & 2). Les tokens en `-1` contiennent plusieurs aspects importants (ex. `battery`, `price`, `screen`) qui pourraient encore être récupérés via ajustement d'hyperparamètres ou par mapping manuel. Prochaine étape : créer un mapping provisoire `cluster -> canonical_aspect`, appliquer aux reviews, exporter un CSV pour validation humaine, puis bootstraper le sentiment sur les spans extraits.

# UMAP + HDBSCAN — guide synthétique (à coller)

## Qu’est-ce que c’est ?
- **UMAP** (Uniform Manifold Approximation and Projection) réduit la dimensionnalité des embeddings en préservant la structure locale : il rapproche dans l’espace réduit les points sémantiquement proches.  
- **HDBSCAN** (Hierarchical Density-Based Spatial Clustering) identifie des **noyaux de densité** (clusters) dans cet espace et marque les points non-asssignables comme `-1` (noise). Il ne nécessite pas de nombre de clusters fixé à l’avance.

## Pourquoi les utiliser ensemble ?
UMAP **densifie** l’espace sémantique (regroupe voisins similaires), puis HDBSCAN **découvre automatiquement** les groupes denses correspondant souvent à des aspects (ex. `battery`, `screen`, `price`) tout en filtrant le bruit.

## Paramètres clés (valeurs de départ recommandées)
- **UMAP**
  - `n_neighbors=15` (taille du voisinage local)  
  - `min_dist=0.0` (permet de rapprocher fortement les voisins)  
  - `n_components=5` (2 pour visualisation, 5 pour clustering stable)
- **HDBSCAN**
  - `min_cluster_size=4` (tester 2–6 si besoin)  
  - `cluster_selection_epsilon=0.0` (augmenter à 0.05–0.2 pour fusionner clusters proches)  
  - `metric='euclidean'` (après UMAP)

## Checklist pratique pour ABSA
1. **Nettoyage** des candidats (filtrer pronoms/stopwords, normaliser, garder NOUN/PROPN).  
2. **Encodage** SBERT (`paraphrase-multilingual-MiniLM-L12-v2` recommandé).  
3. **Réduction** UMAP (paramètres ci-dessus). Si UMAP pas dispo → **PCA** (n_components=5) en fallback.  
4. **Clustering** HDBSCAN (mine de paramètres ci-dessus).  
5. **Inspection** manuelle des clusters → créer mapping `cluster_id → canonical_aspect`.  
6. **Application** du mapping aux reviews, extraction de spans et bootstrap du sentiment (pipeline HF) + annotation humaine.

## Comment interpréter les résultats
- Vérifier `Counter(labels)` :  
  - beaucoup de `-1` → bruit / paramètres trop stricts → nettoyer plus ou assouplir `min_cluster_size`.  
  - clusters cohérents → inspecter membres pour définir un nom canonique.  
- Si aspects attendus restent en `-1` → tester `min_cluster_size=2–3`, augmenter `n_neighbors` de UMAP, ou utiliser KMeans (fallback) pour forcer des groupes à inspecter.

## Points forts & limites
- **Avantages** : pas besoin de fixer *k*, HDBSCAN gère le bruit, UMAP améliore la séparation sémantique.  
- **Limites** : sensibles au prétraitement et aux hyperparamètres ; si candidats très bruités, HDBSCAN peut renvoyer beaucoup de `-1`.

## Phrase courte pour l’oral
« On rapproche d’abord les termes similaires avec UMAP, puis HDBSCAN identifie automatiquement les aspects comme régions denses et filtre le bruit — on ajuste ensuite les seuils ou on nettoie les candidats selon la qualité des clusters. »


In [10]:
# Cell: construire mapping automatique (most-frequent per cluster), appliquer aux reviews, sauvegarder CSV
import joblib, os
from collections import defaultdict, Counter
import pandas as pd

# variables attendues par cette cellule : cand_list_use (liste candidats), labels (ou labels2 si tu as relancé permissive)
labels_used = globals().get('labels', globals().get('labels2'))
cand_list_now = globals().get('cand_list_use', globals().get('top_candidates_filtered', globals().get('cand_list')))

if labels_used is None or cand_list_now is None:
    raise RuntimeError("Impossible : variables 'labels' et 'cand_list_use' doivent exister (relancer clustering si besoin).")

# Optionnel : counts par candidate si tu as top_candidates_clean_counts
freq_map = dict(globals().get('top_candidates_clean_counts', [])) if 'top_candidates_clean_counts' in globals() else {}
# build cluster -> candidates
cluster_map = defaultdict(list)
for cand, lab in zip(cand_list_now, labels_used):
    cluster_map[lab].append(cand)

# build canonical name = most common token in cluster (fallback: first)
auto_mapping = {}
for lab, items in cluster_map.items():
    if lab == -1:
        continue
    # choose candidate with highest overall frequency if available, else first
    if freq_map:
        candidate_freqs = {cand: freq_map.get(cand, 1) for cand in items}
        canonical = sorted(candidate_freqs.items(), key=lambda x: -x[1])[0][0]
    else:
        canonical = items[0]
    # simple normalization (no spaces leading/trailing)
    canonical = str(canonical).strip().lower()
    auto_mapping[int(lab)] = canonical

print("Mapping automatique (provisoire) :")
for lab, name in auto_mapping.items():
    print(f"  {lab} -> {name}")

# Apply mapping to dataframe by substring match (case-insensitive)
TEXT_COL = globals().get('TEXT_COL', 'review_body')  # adapte si tu as utilisé text_combined
def map_review_to_auto_aspects(text):
    text_l = str(text).lower()
    found = set()
    for cand, lab in zip(cand_list_now, labels_used):
        if int(lab) == -1: 
            continue
        if cand in text_l:
            found.add(auto_mapping.get(int(lab)))
    return sorted([a for a in found if a is not None])

# create column and aggregate
df['absa_aspects_auto'] = df[TEXT_COL].fillna("").astype(str).apply(map_review_to_auto_aspects)
agg = df['absa_aspects_auto'].explode().value_counts().dropna()
print("\nTop auto-aspects found (sample):")
display(agg.head(30))

# Save artifacts for manual validation
os.makedirs("artifacts", exist_ok=True)
joblib.dump({'auto_mapping': auto_mapping, 'cluster_map': dict(cluster_map), 'labels_used': list(set(labels_used))},
            "artifacts/absa_auto_mapping.joblib")
df[['review_body', 'review_title', TEXT_COL, 'absa_aspects_auto']].to_csv("artifacts/absa_reviews_with_auto_aspects.csv", index=False)
print("\nSaved artifacts:")
print(" - artifacts/absa_auto_mapping.joblib")
print(" - artifacts/absa_reviews_with_auto_aspects.csv")

RuntimeError: Impossible : variables 'labels' et 'cand_list_use' doivent exister (relancer clustering si besoin).

# Pourquoi les counts sont si grands ?
Le mapping automatique actuel fait du **substring matching** simple : si la chaîne `'camera'` apparaît quelque part dans le texte, on compte la review pour l’aspect `camera`.  
Problèmes courants :
- matching dans des mots plus longs (ex. `scam camera?`), ou parties de mots ;  
- mentions hors-contexte (ex. `I bought a camera for my friend` → peut-être OK, mais parfois `camera` apparait dans une phrase générique) ;  
- multi-occurrences et doublons non contrôlés.  

Solution : utiliser un matching *plus strict* (mots entiers / PhraseMatcher), capturer la phrase contenant l’expression, échantillonner les résultats pour validation humaine, puis ajuster la règle si nécessaire.

In [11]:
# 1) Matching strict par regex (mots entiers) — remplace le substring naïf
# Regex whole-word matching (utiliser \b pour délimiter mots)
import re

# auto_mapping = {lab: canonical} existant
mapping = globals().get('auto_mapping', {})  # {lab: 'price', ...}
labels_used = globals().get('labels', globals().get('labels2'))
cand_list_now = globals().get('cand_list_use', globals().get('top_candidates_filtered', globals().get('cand_list')))

# Precompile patterns per candidate to avoid repeated cost
patterns = []
for cand, lab in zip(cand_list_now, labels_used):
    if int(lab) == -1: 
        continue
    # escape and require word boundaries
    pat = re.compile(r'\b' + re.escape(cand) + r'\b', flags=re.IGNORECASE)
    patterns.append((cand, int(lab), pat))

def map_review_regex(text):
    text = str(text)
    found = set()
    for cand, lab, pat in patterns:
        if pat.search(text):
            found.add(mapping.get(int(lab)))
    return sorted([f for f in found if f is not None])

# apply (on a sample if dataset large)
df['absa_aspects_regex'] = df['review_body'].fillna("").astype(str).apply(map_review_regex)
df['absa_aspects_regex'].explode().value_counts().head(40)

# Regex whole-word matching (utiliser \b pour délimiter mots)
import re

# auto_mapping = {lab: canonical} existant
mapping = globals().get('auto_mapping', {})  # {lab: 'price', ...}
labels_used = globals().get('labels', globals().get('labels2'))
cand_list_now = globals().get('cand_list_use', globals().get('top_candidates_filtered', globals().get('cand_list')))

# Precompile patterns per candidate to avoid repeated cost
patterns = []
for cand, lab in zip(cand_list_now, labels_used):
    if int(lab) == -1: 
        continue
    # escape and require word boundaries
    pat = re.compile(r'\b' + re.escape(cand) + r'\b', flags=re.IGNORECASE)
    patterns.append((cand, int(lab), pat))

def map_review_regex(text):
    text = str(text)
    found = set()
    for cand, lab, pat in patterns:
        if pat.search(text):
            found.add(mapping.get(int(lab)))
    return sorted([f for f in found if f is not None])

# apply (on a sample if dataset large)
df['absa_aspects_regex'] = df['review_body'].fillna("").astype(str).apply(map_review_regex)
df['absa_aspects_regex'].explode().value_counts().head(40)

# Regex whole-word matching (utiliser \b pour délimiter mots)
import re

# auto_mapping = {lab: canonical} existant
mapping = globals().get('auto_mapping', {})  # {lab: 'price', ...}
labels_used = globals().get('labels', globals().get('labels2'))
cand_list_now = globals().get('cand_list_use', globals().get('top_candidates_filtered', globals().get('cand_list')))

# Precompile patterns per candidate to avoid repeated cost
patterns = []
for cand, lab in zip(cand_list_now, labels_used):
    if int(lab) == -1: 
        continue
    # escape and require word boundaries
    pat = re.compile(r'\b' + re.escape(cand) + r'\b', flags=re.IGNORECASE)
    patterns.append((cand, int(lab), pat))

def map_review_regex(text):
    text = str(text)
    found = set()
    for cand, lab, pat in patterns:
        if pat.search(text):
            found.add(mapping.get(int(lab)))
    return sorted([f for f in found if f is not None])

# apply (on a sample if dataset large)
df['absa_aspects_regex'] = df['review_body'].fillna("").astype(str).apply(map_review_regex)
df['absa_aspects_regex'].explode().value_counts().head(40)

TypeError: 'NoneType' object is not iterable

In [13]:
# 2) PhraseMatcher spaCy (robuste pour multi-word phrases + capture de spans)
# PhraseMatcher approach (recommended) — handles multi-word and gives spans
import spacy
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")  # ou fr_core_news_md si FR

matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Build patterns per canonical aspect (group synonyms first)
cluster_to_cands = {}
for cand, lab in zip(cand_list_now, labels_used):
    lab = int(lab)
    if lab == -1: 
        continue
    cluster_to_cands.setdefault(lab, []).append(cand)

# create PhraseMatcher entries using synonyms (candidates) per cluster
for lab, cands in cluster_to_cands.items():
    patterns = [nlp.make_doc(c) for c in cands]
    matcher.add(f"CL_{lab}", patterns)

def map_and_spans(text):
    doc = nlp(text)
    found = {}
    matches = matcher(doc)
    for match_id, start, end in matches:
        label = nlp.vocab.strings[match_id]  # "CL_2"
        lab = int(label.split("_",1)[1])
        canonical = mapping.get(lab)
        span_text = doc[start:end].text
        # store set of spans per canonical aspect
        found.setdefault(canonical, []).append(span_text)
    return found

# Apply on a small sample first (performance)
sample = df.sample(min(5000, len(df)), random_state=42)
sample['absa_spans'] = sample['review_body'].fillna("").astype(str).apply(map_and_spans)
sample[['review_body','absa_spans']].head(20)
# When happy, apply on full df:
# df['absa_spans'] = df['review_body'].fillna("").astype(str).apply(map_and_spans)

TypeError: 'NoneType' object is not iterable

In [12]:
# Sauver un échantillon pour annotation manuelle (up to 100 ex par aspect)
import pandas as pd, random, os
os.makedirs("artifacts", exist_ok=True)

# Utilise la colonne produite par PhraseMatcher qui contient les spans dict: sample['absa_spans']
# On prendra un échantillon sur les reviews où cet aspect a été trouvé.
SAMPLE_PER_ASP = 100
rows = []
# mapping: auto_mapping {lab: canonical} ou mapping variable que tu as
canons = list(auto_mapping.values()) if 'auto_mapping' in globals() else sorted(set([a for lst in df['absa_aspects_regex'].dropna() for a in (lst if isinstance(lst, list) else [lst])]))

# iterate per aspect and sample
for asp in canons:
    # select rows where aspect appears (PhraseMatcher output may be in column 'absa_spans' as dict)
    sel = df[df.get('absa_spans', pd.Series()).apply(lambda d: isinstance(d, dict) and asp in d)]
    if sel.shape[0] == 0:
        # fallback: column absa_aspects_regex or absa_aspects_auto
        for col in ['absa_aspects_regex','absa_aspects_auto','absa_aspects_refined','absa_aspects_strict']:
            if col in df.columns:
                sel = df[df[col].apply(lambda lst: asp in lst if isinstance(lst, (list, set)) else False)]
                if sel.shape[0]>0:
                    break
    if sel.shape[0] == 0:
        continue
    n = min(SAMPLE_PER_ASP, len(sel))
    sample_rows = sel.sample(n, random_state=42)
    for _, r in sample_rows.iterrows():
        spans = r.get('absa_spans', {}) or {}
        # take first span if available
        span_examples = spans.get(asp, []) if isinstance(spans, dict) else []
        span_text = span_examples[0] if span_examples else ""
        # get sentence containing span (if you have nlp loaded)
        sent = span_text
        if span_text and 'nlp' in globals():
            doc = nlp(r['review_body'])
            for s in doc.sents:
                if span_text in s.text:
                    sent = s.text.strip()
                    break
        rows.append({
            "aspect": asp,
            "span": span_text,
            "sentence": sent,
            "review_body": r['review_body'],
            "review_id": r.name
        })

# save CSV to annotate (label column to be added manually: 1=correct, 0=incorrect)
df_qc = pd.DataFrame(rows)
df_qc.to_csv("artifacts/absa_manual_qc_sample.csv", index=False)
print("Saved artifacts/absa_manual_qc_sample.csv with", len(df_qc), "rows. Annotate column 'label' (1=correct,0=wrong).")


NameError: name 'df' is not defined

In [14]:
Maintenant l’objectif pratique est de valider la qualité (estimer la précision), corriger / canonicaliser le mapping, puis bootstrapper des labels de sentiment par aspect pour créer un jeu d’entraînement ABSA.

SyntaxError: invalid character '’' (U+2019) (4290730335.py, line 1)

In [15]:
# 1) Exporter un échantillon pour annotation humaine (100 ex. / aspect)
# === générer artifacts/absa_manual_qc_sample.csv ===
import pandas as pd, os, re, random, numpy as np
os.makedirs("artifacts", exist_ok=True)
random.seed(42)

# TEXT_COL detection
TEXT_COL = 'review_body' if 'review_body' in df.columns else df.columns[0]
print("Using TEXT_COL =", TEXT_COL)

span_col = 'absa_spans' if 'absa_spans' in df.columns else None
list_aspect_cols = [c for c in ['absa_aspects_regex','absa_aspects_auto','absa_aspects_refined','absa_aspects_strict','absa_aspects'] if c in df.columns]
print("span_col:", span_col, "list_aspect_cols:", list_aspect_cols)

# --- robust membership tester ---
def contains_aspect(v, asp):
    """
    Return True if aspect `asp` appears in value `v`.
    Handles: None/NaN, dict (keys), list/tuple/set/np.ndarray, str, other (fallback str).
    Uses whole-word regex for string matching.
    """
    try:
        if v is None:
            return False
        if isinstance(v, float) and np.isnan(v):
            return False
        if isinstance(v, dict):
            # check keys (common pattern for spans dict)
            return asp in v
        if isinstance(v, (list, tuple, set, np.ndarray)):
            try:
                return asp in v
            except Exception:
                try:
                    return asp in list(v)
                except Exception:
                    return False
        if isinstance(v, str):
            return bool(re.search(r'\b' + re.escape(str(asp)) + r'\b', v, flags=re.IGNORECASE))
        # fallback: convert to string
        return str(asp).lower() in str(v).lower()
    except Exception:
        return False

# determine canonical aspects (canons)
if 'auto_mapping' in globals() and auto_mapping:
    canons = sorted(set(auto_mapping.values()))
else:
    s=set()
    for col in list_aspect_cols:
        # sample to inspect types and values
        for v in df[col].dropna().head(20000):
            if isinstance(v, (list, tuple, set)):
                s.update(v)
            elif isinstance(v, dict):
                s.update(list(v.keys()))
            elif isinstance(v, str):
                if v.strip().startswith('[') and len(v) < 500:
                    try:
                        import ast
                        parsed = ast.literal_eval(v)
                        if isinstance(parsed, (list,tuple,set)):
                            s.update(parsed)
                            continue
                    except Exception:
                        pass
                s.add(v)
    canons = sorted(x for x in s if x)

print("Aspects to sample (example):", canons[:20])
if not canons:
    raise RuntimeError("Aucun aspect trouvé pour sampling. Vérifie auto_mapping ou colonnes d'aspects.")

# sampling loop (build index set safely)
SAMPLE_PER_ASP = 100
rows = []

for asp in canons:
    idxs = set()

    # 1) spans dict column if present (fast)
    if span_col:
        mask = df[span_col].apply(lambda d: isinstance(d, dict) and (asp in d) if pd.notnull(d) else False)
        idxs.update(df.index[mask].tolist())

    # 2) search in list-like / string aspect columns using contains_aspect
    for col in list_aspect_cols:
        # avoid evaluating ambiguous arrays directly: use contains_aspect for each cell
        try:
            mask_idx = df.index[df[col].apply(lambda v: contains_aspect(v, asp))]
            idxs.update(mask_idx.tolist())
        except Exception:
            # fallback to safe string-based search if apply fails
            try:
                mask = df[col].astype(str).str.contains(re.escape(str(asp)), na=False, case=False)
                idxs.update(df.index[mask].tolist())
            except Exception:
                pass

    if not idxs:
        continue

    # sample indices
    idxs_list = list(idxs)
    if len(idxs_list) > SAMPLE_PER_ASP:
        idxs_sampled = random.sample(idxs_list, SAMPLE_PER_ASP)
    else:
        idxs_sampled = idxs_list

    # build sample rows
    for idx in idxs_sampled:
        r = df.loc[idx]
        spans = {}
        if span_col and isinstance(r.get(span_col), dict):
            spans = r.get(span_col) or {}
        span_list = spans.get(asp, []) if isinstance(spans, dict) else []
        span_text = span_list[0] if span_list else ""
        sent = span_text
        if span_text and 'nlp' in globals():
            try:
                doc = nlp(r[TEXT_COL])
                for s in doc.sents:
                    if span_text in s.text:
                        sent = s.text.strip()
                        break
            except Exception:
                sent = span_text
        rows.append({
            "aspect": asp,
            "span": span_text,
            "sentence": sent,
            "review": r[TEXT_COL],
            "review_id": idx
        })

# save result
df_qc = pd.DataFrame(rows)
out_path = "artifacts/absa_manual_qc_sample.csv"
df_qc.to_csv(out_path, index=False)
print(f"Saved {out_path} with {len(df_qc)} rows.")
display(df_qc.head(10))

NameError: name 'df' is not defined

# Résultats intermédiaires — extraction d'aspects (ABSA) et justification

## Résumé des résultats observés
- **Aspects détectés automatiquement** (exemples) : `cable`, `camera`, `price`.  
- **Échantillon exporté pour QA** : `artifacts/absa_manual_qc_sample.csv` — chaque ligne contient l’`aspect` prédit, le `span` détecté (si présent), la `sentence` extraite (contexte) et la `review`.  
- Aperçu (extrait) : beaucoup d’exemples pour `cable` avec `span` parfois vide (le span n’est pas toujours identifié si la détection provient d’un index listé plutôt que d’un span exact).

## Pourquoi on a procédé ainsi (méthodologie & motivation)
1. **But** : passer d’une simple détection de sentiment global (pos/neg) à une extraction d’**aspects** (ex. prix, batterie, écran, son) puis à une annotation de sentiment *par aspect*.  
2. **Pipeline choisi** :
   - extraire des *candidats* (n-grams / chunks) à partir des reviews ;  
   - encoder ces candidats (SBERT) → réduire la dimension (UMAP) → clusteriser (HDBSCAN) pour obtenir groupes d’aspects naturels ;  
   - construire un mapping `cluster -> canonical_aspect` automatique (fréquence ou représentant du cluster) ;  
   - **appliquer un matching strict** (PhraseMatcher / regex mot-entier) pour repérer les mentions dans les reviews et extraire la phrase-contexte.
3. **Pourquoi matching strict / PhraseMatcher** :
   - l’approche par simple substring produisait des *comptages énormes* et beaucoup de faux-positifs ;  
   - PhraseMatcher + mot-entier réduit les faux positifs (mots inclus dans d’autres mots, pronoms, etc.) et permet d’extraire le **span exact** pour contextualiser la mention.

## Interprétation du tableau d’exemple
- Les lignes montrent des reviews où l’aspect `cable` a été détecté.  
- `span` vide ⇢ il y a plusieurs causes possibles :
  - le matching a été fait sur une colonne liste (aspects détectés globalement) mais aucun span précis n’a été extrait ;  
  - PhraseMatcher n’a pas trouvé la phrase exacte (par ex. tokenisation différente) ;  
  - ou le token apparaissait dans une forme non couverte par les patterns (abréviations, erreurs OCR, balises HTML).
- **Conséquence pratique** : les lignes avec `sentence` vide nécessitent plus d’attention (manuellement ou par règles supplémentaires) pour vérifier la qualité.

## Objectif immédiat (raison de l’échantillonnage)
- **Contrôle qualité** : annoter manuellement un échantillon (≈100 ex / aspect) pour estimer la *précision* (TP / prédits).  
- **Décision opérationnelle** :
  - Si précision ≥ ~0.8 → on applique le mapping sur tout le dataset et on bootstrappe des pseudo-labels de sentiment.  
  - Si précision 0.5–0.8 → améliorer les règles (POS-check, PhraseMatcher enrichi, two-hits, blacklist), recluster si nécessaire.  
  - Si précision < 0.5 → revoir la génération de candidats et le clustering.

## Prochaine étape (pipeline recommandé)
1. **Annoter** `artifacts/absa_manual_qc_sample.csv` (colonne `label`: 1 correct / 0 faux-positif).  
2. Calculer la **precision_est** par aspect (script fourni).  
3. Si satisfait : extraire toutes les phrases (contexte) pour chaque aspect, lancer un pipeline `sentiment-analysis` (bootstrap) et créer un dataset pseudo-étiqueté.  
4. Entraîner un modèle ABSA (ex. SBERT → LR / small classifier) sur ces pseudo-labels, puis itérer avec annotation humaine sur les erreurs les plus fréquentes (active learning).

## Limitations et points d’attention
- Les méthodes automatiques produisent des **bruits** : il faut valider par humain avant d’entraîner un modèle final.  
- Les tokens vagues (ex. `thing`, `item`, pronoms) sont souvent regroupés dans le bruit ; il faut les **blacklister** ou les retraiter.  
- L’extraction de spans dépend fortement de la qualité du `PhraseMatcher` et des variantes textuelles (HTML, fautes, majuscules).

## Phrases courtes pour la soutenance (oral)
- « Nous utilisons d’abord SBERT → UMAP → HDBSCAN pour découvrir des groupes d’aspects sans supervision. Ensuite, un matching strict (PhraseMatcher / regex) permet d’extraire le span et la phrase-contexte. Enfin, nous validons par échantillonnage humain avant de générer des pseudo-labels de sentiment. »  
- « Cette méthode équipe l’entreprise d’un pipeline interprétable : clusters compréhensibles, mapping manuel/automatique, et possibilité d’itérer rapidement sur la précision via annotation ciblée. »

---

In [16]:
# Snippet B: ensemble auto-label (requires sentence-transformers & transformers)
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import re, pandas as pd, numpy as np

df = pd.read_csv("artifacts/absa_manual_qc_sample.csv")  # or full dataset
texts = df['sentence'].fillna('').astype(str)
texts = texts.where(texts.str.strip()!='', df['review'].fillna('').astype(str))
aspects = df['aspect'].astype(str)

# encode SBERT
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
text_emb = model.encode(texts.tolist(), convert_to_tensor=True, show_progress_bar=False)
asp_emb = model.encode(aspects.tolist(), convert_to_tensor=True, show_progress_bar=False)
sims = util.cos_sim(text_emb, asp_emb).diagonal().cpu().numpy()

# zero-shot
zs = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=-1)

def whole_word(text, token):
    return bool(re.search(r'\b' + re.escape(token.lower()) + r'\b', text.lower()))

labels = []
conf = []
reasons = []
for i, row in df.iterrows():
    asp = str(row['aspect'])
    text = texts.iloc[i]
    span = row.get('span','') or ""
    vote = 0
    reason_list = []
    # rule: span
    if str(span).strip() != "":
        vote += 2; reason_list.append("span")
    # rule: regex / synonym
    if whole_word(text, asp):
        vote += 1; reason_list.append("word")
    # sbert
    sim = float(sims[i])
    if sim >= 0.62:
        vote += 1; reason_list.append(f"sbert:{sim:.2f}")
    # zero-shot
    try:
        out = zs(text[:512], candidate_labels=[asp, "other"], hypothesis_template="This sentence is about {}.")
        sc = out['scores'][0] if out['labels'][0]==asp else out['scores'][1]
    except Exception:
        sc = 0.0
    if sc >= 0.65:
        vote += 1; reason_list.append(f"zs:{sc:.2f}")
    # decision: accept if votes >=2 OR span present
    lab = 1 if (vote >= 2) else 0
    labels.append(lab); conf.append(vote); reasons.append(";".join(reason_list))
df['ensemble_label'] = labels
df['ensemble_conf_votes'] = conf
df['ensemble_reasons'] = reasons
df.to_csv("artifacts/absa_auto_ensemble_labels.csv", index=False)
print("Saved ensemble labels: artifacts/absa_auto_ensemble_labels.csv")


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Saved ensemble labels: artifacts/absa_auto_ensemble_labels.csv


In [17]:
# 1) Aperçu + distribution
import os, pandas as pd
p = "artifacts/absa_auto_ensemble_labels.csv"
print("Exists:", os.path.exists(p))
df = pd.read_csv(p)
print("Rows:", len(df))
print("\nDistribution des labels (ensemble_label):")
print(df['ensemble_label'].value_counts(dropna=False))
display(df.head(12))

Exists: True
Rows: 300

Distribution des labels (ensemble_label):
ensemble_label
1    300
Name: count, dtype: int64


Unnamed: 0,aspect,span,sentence,review,review_id,ensemble_label,ensemble_conf_votes,ensemble_reasons
0,cable,,,These charging cords are excellent. You can’t ...,794443,1,2,span
1,cable,,,This product says its water proof.<br />Since ...,95248,1,2,span
2,cable,,,Returned the same day received. I have had man...,23006,1,2,span
3,cable,,,"worked well for a couple months, ended up repl...",962905,1,2,span
4,cable,,,I actually just bought this for the Nintendo s...,231771,1,2,span
5,cable,,,I have used and own lots of wireless mice but ...,206658,1,2,span
6,cable,,,Works just fine. No problems so far. Right pri...,189000,1,2,span
7,cable,,,I was very excited to purchase this item being...,118895,1,2,span
8,cable,,,I received this as a gift about a month ago. I...,954591,1,2,span
9,cable,,,I got this adapter so I could view and copy th...,1136692,1,2,span


In [18]:
# 2) Copier vers /mnt/data pour download (ou remplacer si existant)
import shutil, os
src = "artifacts/absa_auto_ensemble_labels.csv"
dst = "data/absa_auto_ensemble_labels.csv"
shutil.copy2(src, dst)
print("Copied to:", os.path.abspath(dst))

Copied to: c:\Users\antoi\OneDrive\Documents\Ynov\Projet fil rouge\Bloc 5\amazon-reviews-insights\notebooks\data\absa_auto_ensemble_labels.csv


In [19]:
# 3) Générer disagreements (petit fichier à vérifier manuellement si besoin)
import pandas as pd, os
df = pd.read_csv("artifacts/absa_auto_ensemble_labels.csv")
# simple_label: span present OR whole-word aspect present
import re
def whole_word(text, token):
    if not isinstance(text, str) or not token:
        return False
    return bool(re.search(r'\b' + re.escape(token.lower()) + r'\b', str(text).lower()))
simple = []
for _, r in df.iterrows():
    span = r.get('span','')
    if isinstance(span, float) and pd.isna(span): span = ""
    if str(span).strip():
        simple.append(1)
    else:
        txt = str(r.get('sentence','') or "") + " " + str(r.get('review','') or "")
        simple.append(1 if whole_word(txt, r.get('aspect','')) else 0)
df['simple_label'] = simple
df['disagree'] = df['ensemble_label'] != df['simple_label']
dis = df[df['disagree']].copy()
out = "artifacts/absa_auto_ensemble_disagreements.csv"
dis.to_csv(out, index=False)
print("Disagreements count:", len(dis), " saved ->", out)
display(dis.head(20))

Disagreements count: 256  saved -> artifacts/absa_auto_ensemble_disagreements.csv


Unnamed: 0,aspect,span,sentence,review,review_id,ensemble_label,ensemble_conf_votes,ensemble_reasons,simple_label,disagree
0,cable,,,These charging cords are excellent. You can’t ...,794443,1,2,span,0,True
1,cable,,,This product says its water proof.<br />Since ...,95248,1,2,span,0,True
2,cable,,,Returned the same day received. I have had man...,23006,1,2,span,0,True
3,cable,,,"worked well for a couple months, ended up repl...",962905,1,2,span,0,True
4,cable,,,I actually just bought this for the Nintendo s...,231771,1,2,span,0,True
5,cable,,,I have used and own lots of wireless mice but ...,206658,1,2,span,0,True
6,cable,,,Works just fine. No problems so far. Right pri...,189000,1,2,span,0,True
7,cable,,,I was very excited to purchase this item being...,118895,1,2,span,0,True
8,cable,,,I received this as a gift about a month ago. I...,954591,1,2,span,0,True
9,cable,,,I got this adapter so I could view and copy th...,1136692,1,2,span,0,True


In [20]:
# 4) Résumé par aspect
import pandas as pd
df = pd.read_csv("artifacts/absa_auto_ensemble_labels.csv")
summary = df.groupby('aspect')['ensemble_label'].agg(['count','sum']).rename(columns={'sum':'n_pos','count':'n_sample'})
summary['precision_est_proxy'] = (summary['n_pos'] / summary['n_sample']).round(3)
display(summary.sort_values('n_pos', ascending=False).head(40))

Unnamed: 0_level_0,n_sample,n_pos,precision_est_proxy
aspect,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cable,100,100,1.0
camera,100,100,1.0
price,100,100,1.0


In [21]:
import pandas as pd, numpy as np
df = pd.read_csv("artifacts/absa_auto_ensemble_labels.csv")
# afficher quelques exemples où span non-nul selon le code
mask_span = df['span'].notna() & (df['span'].astype(str).str.strip() != "") 
print("Count span non-empty:", mask_span.sum())
display(df[mask_span].head(20))
# voir valeurs particulières (strings “Missing value” etc.)
print(df['span'].unique()[:40])

Count span non-empty: 0


Unnamed: 0,aspect,span,sentence,review,review_id,ensemble_label,ensemble_conf_votes,ensemble_reasons


[nan]


In [22]:
display(df['sbert_sim'].describe())
display(df['zs_score'].describe())
display(df['votes'].value_counts().sort_index())
# show low/medium scores that were labelled 1
display(df[(df['ensemble_label']==1) & (df['sbert_sim']<0.6)].head(30))

KeyError: 'sbert_sim'

In [None]:
# Appliquer une règle conservative (span present OR exact whole-word match)
import os, re, pandas as pd, math

src = "artifacts/absa_auto_ensemble_labels.csv"
if not os.path.exists(src):
    raise FileNotFoundError(f"{src} introuvable (vérifie le chemin)")

df = pd.read_csv(src)

def whole_word(text, token):
    if not isinstance(text, str) or not token:
        return False
    return bool(re.search(r'\b' + re.escape(str(token).lower()) + r'\b', str(text).lower()))

def is_span_present(span):
    if span is None:
        return False
    if isinstance(span, float) and pd.isna(span):
        return False
    s = str(span).strip().lower()
    if s in ["", "nan", "none", "missing value"]:
        return False
    return True

def conservative_label(row):
    # 1) span explicit -> accept
    if is_span_present(row.get('span', "")):
        return 1
    # 2) exact whole-word aspect in sentence or review -> accept
    text = str(row.get('sentence', '') or "") + " " + str(row.get('review', '') or "")
    if whole_word(text, row.get('aspect', '')):
        return 1
    # 3) otherwise reject
    return 0

# apply and save
df['cons_label'] = df.apply(conservative_label, axis=1)
df['cons_agree'] = df['cons_label'] == df['ensemble_label']

os.makedirs("artifacts", exist_ok=True)
out_art = "artifacts/absa_auto_ensemble_conservative.csv"
out_mnt = "data/absa_auto_ensemble_conservative.csv"
df.to_csv(out_art, index=False)
df.to_csv(out_mnt, index=False)

# disagreements (cases to inspect quickly)
dis = df[df['cons_agree']==False].copy()
dis_path = "artifacts/absa_auto_ensemble_conservative_disagreements.csv"
dis_path_mnt = "data/absa_auto_ensemble_conservative_disagreements.csv"
dis.to_csv(dis_path, index=False)
dis.to_csv(dis_path_mnt, index=False)

# summary
summary = {
    "total_rows": len(df),
    "ensemble_ones": int((df['ensemble_label']==1).sum()),
    "cons_ones": int((df['cons_label']==1).sum()),
    "agreements": int(df['cons_agree'].sum()),
    "disagreements": int((df['cons_agree']==False).sum())
}
print("Saved conservative CSV ->", os.path.abspath(out_art))
print("Saved copy ->", os.path.abspath(out_mnt))
print("Saved disagreements ->", os.path.abspath(dis_path))
print("Summary:", summary)
display(df[['aspect','span','sentence','review','ensemble_label','cons_label','ensemble_reasons']].head(20))

In [None]:
import os, glob, re, math, pandas as pd, warnings
warnings.filterwarnings("ignore")

# 0) Chercher automatiquement un fichier d'entrée possible
candidates = [
    "artifacts/absa_auto_ensemble_labels.csv",
    "artifacts/absa_auto_ensemble_conservative.csv",
    "artifacts/absa_manual_qc_sample.csv",
    "notebooks/artifacts/absa_auto_ensemble_labels.csv",
    "notebooks/artifacts/absa_manual_qc_sample.csv",
    "/mnt/data/absa_auto_ensemble_labels.csv",
    "/mnt/data/absa_manual_qc_sample.csv",
    "absa_manual_qc_sample.csv"
]
src = None
for p in candidates:
    if p and os.path.exists(p):
        src = p
        break
if src is None:
    # fallback: try to find any file that looks like 'absa' in the tree
    found = glob.glob("**/*absa*.csv", recursive=True)
    found = [f for f in found if "ml" not in f and "ml_labels" not in f]
    if found:
        src = found[0]
if src is None:
    raise FileNotFoundError("Aucun fichier source trouvé (regarder artifacts/). Essayez de préciser le chemin ou exécutez la cellule depuis le dossier racine du projet.")

print("Using source file:", os.path.abspath(src))
df = pd.read_csv(src)
print("Rows in source:", len(df))
print("Cols:", df.columns.tolist())

# 1) Préparer textes et aspects
# prefer 'sentence' else 'review'
if 'sentence' in df.columns:
    texts = df['sentence'].fillna('').astype(str).where(lambda s: s.str.strip()!='', df['review'].fillna('').astype(str))
else:
    texts = df['review'].fillna('').astype(str)
aspects = df['aspect'].fillna('').astype(str).tolist()
texts_list = texts.tolist()

# 2) Chargement SBERT (peut télécharger le modèle -> quelques dizaines de Mo)
from sentence_transformers import SentenceTransformer, util
print("Loading SBERT model...")
sbert = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

print("Encoding texts + aspects (SBERT) ...")
text_embs = sbert.encode(texts_list, convert_to_tensor=True, show_progress_bar=True)
asp_embs = sbert.encode(aspects, convert_to_tensor=True, show_progress_bar=True)

# 3) Similarité diagonale (text_i vs aspect_i)
sims = util.cos_sim(text_embs, asp_embs).diagonal().cpu().numpy()
df['sbert_sim'] = sims
print("sbert_sim computed: min %.3f / med %.3f / max %.3f" % (sims.min(), pd.Series(sims).median(), sims.max()))

# 4) Zero-shot (facebook/bart-large-mnli) -> CPU par défaut
from transformers import pipeline
print("Loading zero-shot classifier (this may download ~700MB)...")
zs = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=-1)

zs_scores = []
for i, txt in enumerate(texts_list):
    asp = aspects[i]
    try:
        out = zs(str(txt)[:512], candidate_labels=[asp, "other"], hypothesis_template="This sentence is about {}.", multi_label=False)
        sc = 0.0
        for lbl, scv in zip(out['labels'], out['scores']):
            if lbl == asp:
                sc = scv; break
    except Exception as e:
        sc = 0.0
    zs_scores.append(float(sc))
df['zs_score'] = zs_scores
print("zs_score computed: min %.3f / med %.3f / max %.3f" % (min(zs_scores), pd.Series(zs_scores).median(), max(zs_scores)))

# 5) Appliquer la règle ML stricte
def is_span_present(span):
    if span is None: return False
    if isinstance(span, float) and pd.isna(span): return False
    s = str(span).strip().lower()
    if s in ["", "nan", "none", "missing value"]: return False
    return True

def ml_rule(row, sbert_th=0.68, zs_th=0.68):
    if is_span_present(row.get('span', "")):
        return 1
    if float(row.get('sbert_sim', 0.0)) >= sbert_th and float(row.get('zs_score', 0.0)) >= zs_th:
        return 1
    return 0

df['ml_label'] = df.apply(ml_rule, axis=1)
df['ml_agree_with_ensemble'] = (df['ml_label'] == df.get('ensemble_label', -1))

# 6) Sauver résultats (artifacts + /mnt/data pour download)
os.makedirs("artifacts", exist_ok=True)
out_art = "artifacts/absa_auto_ensemble_ml_labels.csv"
out_mnt = "data/absa_auto_ensemble_ml_labels.csv"
df.to_csv(out_art, index=False)
df.to_csv(out_mnt, index=False)

# disagreements (pour QA rapide)
dis = df[df['ml_agree_with_ensemble'] == False].copy()
dis_path = "artifacts/absa_auto_ensemble_ml_disagreements.csv"
dis.to_csv(dis_path, index=False)
dis.to_csv("data/absa_auto_ensemble_ml_disagreements.csv", index=False)

# 7) Résumé
summary = {
    "total_rows": len(df),
    "ensemble_ones": int((df.get('ensemble_label', pd.Series([]))==1).sum()) if 'ensemble_label' in df.columns else None,
    "ml_ones": int((df['ml_label']==1).sum()),
    "agreements": int(df['ml_agree_with_ensemble'].sum()) if 'ensemble_label' in df.columns else None,
    "disagreements": int(len(dis))
}
print("Saved ML CSV ->", os.path.abspath(out_art))
print("Saved download copy ->", os.path.abspath(out_mnt))
print("Saved ML disagreements ->", os.path.abspath(dis_path))
print("Summary:", summary)

# 8) Inspect: distribution and some example disagreements
display(df[['aspect','span','sentence','review','ensemble_label','sbert_sim','zs_score','ml_label']].head(20))
print("\nSbert stats:")
display(pd.Series(df['sbert_sim']).describe())
print("\nZero-shot stats:")
display(pd.Series(df['zs_score']).describe())
print("\nml_label distribution:")
display(pd.Series(df['ml_label']).value_counts())