In [1]:
# dans une cellule Colab
!pip install --upgrade "transformers>=4.48.0"
!pip install gliclass

Collecting gliclass
  Using cached gliclass-0.1.11-py3-none-any.whl.metadata (4.9 kB)
Collecting numpy<2.0.0,>=1.26.4 (from gliclass)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting transformers<=4.48.2,>=4.37.2 (from gliclass)
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading gliclass-0.1.11-py3-none-any.whl (32 kB)
Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m79.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, transformers, gliclass
  A

In [12]:
# ---------- 1. Loading & mapping ----------
import pandas as pd

# Names imposed on the taxonomy columns, in file order.
taxonomy_columns = [
    'Unique ID', 'Parent', 'Name',
    'Tier 1', 'Tier 2', 'Tier 3', 'Tier 4',
    'Extension',
]

# Tab-separated sheet; the first three lines of the file are skipped.
df = pd.read_csv('Content Taxonomy 3.1.tsv', sep='\t', skiprows=3, engine='python')
# Positional rename: fails loudly if the sheet does not have exactly 8 columns.
df = df.set_axis(taxonomy_columns, axis=1)

# Build one readable label per row by concatenating its tier names.
tier_columns = ['Tier 1', 'Tier 2', 'Tier 3', 'Tier 4']
tier_names = df[tier_columns].fillna('')
joined_tiers = tier_names.agg(' '.join, axis=1)
# Trim the padding left by empty tiers, then collapse inner whitespace runs.
df['label_text'] = joined_tiers.str.strip().str.replace(r'\s+', ' ', regex=True)

# Keep the original ID (as a string) for every row that actually has one.
id2label = {}
for raw_id, label in zip(df['Unique ID'], df['label_text']):
    if pd.isna(raw_id):
        continue
    key = str(raw_id).strip()
    if key:
        id2label[key] = label

# Reverse lookup; if two IDs share the same label, the later one wins.
label2id = dict(zip(id2label.values(), id2label.keys()))

# ---------- 2. Hierarchy ----------
from collections import defaultdict

# A Tier 1 row names a Tier 1 category and nothing below it. Requiring a
# non-null 'Tier 1' keeps blank rows (whose label_text is '') out of the
# candidate labels sent to the classifier.
tier1_mask   = df['Tier 1'].notna() & df['Tier 2'].isna()
tier1_labels = df.loc[tier1_mask, 'label_text'].tolist()
# Stripped so these IDs agree with the keys of id2label.
tier1_ids    = df.loc[tier1_mask, 'Unique ID'].astype(str).str.strip().tolist()
tier1_map    = dict(zip(tier1_labels, tier1_ids))

# Quick index: Tier 1 label -> label_text of every row filed under it.
# Keys receive the same whitespace normalisation as 'label_text' (trim +
# collapse runs), so a label taken from tier1_labels always finds its entry.
# Note: the Tier 1 row itself is appended too, so each Tier 1 label is also
# a member of its own list.
descendants = defaultdict(list)
for parent_lbl, label in zip(df['Tier 1'], df['label_text']):
    if pd.notna(parent_lbl):
        descendants[' '.join(str(parent_lbl).split())].append(label)

In [21]:
# Read the whole article to classify into a single UTF-8 string.
article_path = '/content/article2.txt'
with open(article_path, encoding='utf-8') as article_file:
    text = article_file.read()

In [22]:
# ---------- 3. Hierarchical GLiClass ----------
from gliclass import GLiClassModel, ZeroShotClassificationPipeline
from transformers import AutoTokenizer
import torch

# Checkpoint shared by the model and its tokenizer.
model_name = "knowledgator/gliclass-modern-base-v3.0"
tokenizer  = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)
model      = GLiClassModel.from_pretrained(model_name)

# Use the first GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    gliclass_device = 'cuda:0'
else:
    gliclass_device = 'cpu'

# Multi-label zero-shot pipeline reused by the taxonomy and content-type passes.
pipe = ZeroShotClassificationPipeline(
    model,
    tokenizer,
    classification_type='multi-label',
    device=gliclass_device,
)

# --- two-pass inference ---
# Pass 1: score the Tier 1 categories only.
tier1_res = pipe(text, tier1_labels, threshold=0.1)[0]

# max() raises ValueError on an empty list, so fall back to None when no
# Tier 1 label was returned (presumably because none cleared the threshold —
# confirm against the GLiClass pipeline).
best_t1_hit = max(tier1_res, key=lambda x: x["score"], default=None)
best_t1     = best_t1_hit["label"] if best_t1_hit is not None else None

# Pass 2: re-score only the labels indexed under the winning Tier 1.
if best_t1 is None:
    candidates = []
    final_res  = []
else:
    # .get() avoids inserting an empty entry into the defaultdict on a miss;
    # a Tier 1 with nothing indexed under it is scored on its own.
    candidates = descendants.get(best_t1) or [best_t1]
    final_res  = pipe(text, candidates, threshold=0.3)[0]

top5 = sorted(final_res, key=lambda x: x["score"], reverse=True)[:5]

# 3. Attach the IAB code to each retained label
for r in top5:
    r["iab_code"] = f"IAB{label2id[r['label']]}"


# ---------- 4. Sentiment & content-type ----------
from transformers import pipeline as sent_pipe, AutoTokenizer as SentTok

sent_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sent_max_tokens = 512

sent_tok = SentTok.from_pretrained(sent_model_name)
sent_clf = sent_pipe(
    "text-classification",
    model=sent_model_name,
    tokenizer=sent_tok,
    truncation=True,           # force truncation
    max_length=sent_max_tokens,
    device='cuda:0' if torch.cuda.is_available() else 'cpu'
)

# Pass the full text and let the tokenizer truncate it to sent_max_tokens
# tokens. Slicing the string by a token limit counted characters instead of
# tokens and discarded most of the text before the model ever saw it.
# The truncation arguments are repeated at call time so the limit applies
# regardless of how the pipeline constructor forwards them.
sent = sent_clf(text, truncation=True, max_length=sent_max_tokens)[0]["label"].lower()

# Content-type: zero-shot choice among a fixed list of candidate types.
ctypes = ["product recommendation", "news article", "travel guide", "opinion", "review"]
ctype_res = pipe(text, ctypes, threshold=0.3)[0]
# max() raises ValueError on an empty list; when no content type is returned
# (presumably none cleared the threshold — confirm against the GLiClass
# pipeline), content_type is None and is serialised as JSON null.
best_ctype = max(ctype_res, key=lambda x: x["score"], default=None)
content_type = best_ctype["label"] if best_ctype is not None else None


100%|██████████| 1/1 [00:09<00:00,  9.97s/it]
100%|██████████| 1/1 [00:05<00:00,  5.40s/it]
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
100%|██████████| 1/1 [00:03<00:00,  3.90s/it]


In [23]:
# ---------- 5. Final display ----------
import json

# Collect the code of every retained label, then keep at most 4 of them.
all_iab_codes = [hit["iab_code"] for hit in top5]
iab_codes = all_iab_codes[:4]

# Assemble the summary in the order it should appear in the printed JSON.
result_json = dict(
    iab_categories=iab_codes,
    sentiment=sent,
    content_type=content_type,
)
print(json.dumps(result_json, indent=2, ensure_ascii=False))

{
  "iab_categories": [
    "IAB653",
    "IAB670",
    "IAB655",
    "IAB664"
  ],
  "sentiment": "neutral",
  "content_type": "opinion"
}
