In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# 1️⃣ Load dataset
# NOTE(review): "train" and "test" both point at the same JSON file, so the
# "test" split is not a held-out set — confirm this is intended.
dataset = load_dataset(
    "json",
    data_files=dict(train="test_fine_tune.json", test="test_fine_tune.json"),
    field="train",
)

# 2️⃣ Load tokenizer and model (both come from the same checkpoint name)
model_name = "facebook/bart-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 3️⃣ Preprocessing
def preprocess(examples):
    """Tokenize a batch of premise/hypothesis pairs.

    Reads the 'premise' and 'hypothesis' entries of `examples` and passes them
    to the module-level `tokenizer` as a sentence pair, with truncation and
    padding enabled. Returns whatever the tokenizer returns.
    """
    premises = examples['premise']
    hypotheses = examples['hypothesis']
    encoded = tokenizer(premises, hypotheses, truncation=True, padding=True)
    return encoded

dataset = (
    dataset
    # Run `preprocess` over every split, one batch of rows at a time.
    .map(preprocess, batched=True)
    # 4️⃣ Remove unused columns for Trainer (the raw text is no longer needed)
    .remove_columns(["premise", "hypothesis"])
)

# 5️⃣ Fine-tuning with Trainer
# Run configuration handed to the Trainer below.
training_args = TrainingArguments(
    output_dir="./results",
    # --- optimisation ---
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    # --- logging / checkpointing ---
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    push_to_hub=False,
)

# The Trainer receives the tokenized "train" split for training and the
# "test" split as its evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

trainer.train()

# 6️⃣ Test fine-tuned model with zero-shot pipeline
from transformers import pipeline

# The model has just been trained above; make sure dropout is disabled
# before it is used for inference.
model.eval()

classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

# Smoke-test sentences, one per expected class.
test_texts = [
    "I am looking for a product for my child",
    "I want to adopt a cat",
    "I need assistance for a disabled person",
    "I just want something else"
]

candidate_labels = ["child", "handicap", "pet", "other"]

for text in test_texts:
    result = classifier(text, candidate_labels)
    print(f"\nText: {text}")
    # result['labels'] is ordered by descending score, so index 0 is the prediction.
    print("Predicted class:", result['labels'][0])
    # result['scores'] follows the order of result['labels'], NOT the order of
    # candidate_labels; print them paired so each score is attributed correctly.
    print("Scores:", dict(zip(result['labels'], result['scores'])))




Step,Training Loss


Device set to use cpu



Text: I am looking for a product for my child
Predicted class: child
Scores: [0.9959827065467834, 0.00175605365075171, 0.001264249556697905, 0.000997065333649516]

Text: I want to adopt a cat
Predicted class: pet
Scores: [0.9968515038490295, 0.001390138640999794, 0.000978102209046483, 0.0007802379550412297]

Text: I need assistance for a disabled person
Predicted class: handicap
Scores: [0.9981399774551392, 0.000879904895555228, 0.0005158438580110669, 0.00046425481559708714]

Text: I just want something else
Predicted class: other
Scores: [0.8058078289031982, 0.1796957403421402, 0.012700246647000313, 0.0017961935373023152]


# Test

In [6]:
import pandas as pd
import torch
import torch.nn as nn
from datasets import Dataset
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler, GPT2Tokenizer

# ==== PARAMETERS ====
MODEL_NAME = "microsoft/deberta-v3-base"
NUM_LABELS = 4  # adapt to your dataset
BATCH_SIZE = 4
EPOCHS = 10
LR = 2e-5
MAX_LEN = 512

# ==== DATA ====
df = pd.read_csv("finetune_dataset.csv")

# 80 % train, then the remaining 20 % is split in half: 10 % validation, 10 % test.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Hugging Face dataset
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

# The tokenizer must come from the same checkpoint as the model: a GPT-2
# tokenizer produces ids from a different vocabulary than the one the
# DeBERTa embedding matrix was trained with. The checkpoint's own tokenizer
# already defines a padding token, so no pad_token override is needed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Tokenize the "review" column of `batch`.

    Every example is truncated and padded to exactly MAX_LEN tokens using the
    module-level `tokenizer`; the tokenizer's output is returned unchanged.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=MAX_LEN)
    return tokenizer(batch["review"], **encoding_options)

# Tokenize each split in batches.
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# PyTorch format: only the model inputs and the label columns are returned as tensors
label_cols = ["handicap", "pet", "child", "other"]

train_ds.set_format("torch", columns=["input_ids", "attention_mask"] + label_cols)
val_ds.set_format("torch", columns=["input_ids", "attention_mask"] + label_cols)
test_ds.set_format("torch", columns=["input_ids", "attention_mask"] + label_cols)

# ==== MODEL ====
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

# ==== DEVICE / OPTIMIZER / SCHEDULER ====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR)

# The scheduler is stepped once per batch, so its horizon is the number of
# batches, not the number of examples. Ceiling division is used because the
# training DataLoader does not drop the final partial batch.
steps_per_epoch = (len(train_ds) + BATCH_SIZE - 1) // BATCH_SIZE
num_training_steps = EPOCHS * steps_per_epoch
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

def sigmoid(x):
    """Element-wise logistic function of a torch tensor.

    Delegates to ``torch.sigmoid`` instead of evaluating ``1 / (1 + exp(-x))``
    by hand: the built-in returns the same values and keeps the gradient
    finite for very negative inputs, where ``exp(-x)`` overflows to ``inf``.
    """
    return torch.sigmoid(x)

def compute_f1(y_true, y_pred, threshold=0.5):
    """Micro-averaged F1 between targets and thresholded scores.

    Args:
        y_true: ground-truth indicators, passed to ``f1_score`` unchanged.
        y_pred: NumPy array of scores; entries strictly greater than
            ``threshold`` are counted as positive predictions.
        threshold: decision cut-off. Defaults to 0.5, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The result of ``f1_score(y_true, binarized, average="micro")``.
    """
    y_pred = (y_pred > threshold).astype(int)
    return f1_score(y_true, y_pred, average="micro")

model.train()
for epoch in range(EPOCHS):
    # A new shuffled DataLoader is built for each epoch and wrapped in a progress bar.
    loop = tqdm(DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE))
    for batch in loop:
        # Model inputs: only the tokenizer outputs, moved to the training device.
        inputs = {
            key: tensor.to(device)
            for key, tensor in batch.items()
            if key in ("input_ids", "attention_mask")
        }
        # Targets: the label columns stacked along dim 1 and cast to float.
        labels = torch.stack([batch[name] for name in label_cols], dim=1).float().to(device)

        # Forward pass with labels so the model output carries the loss, then backprop.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Parameter update, scheduler tick, then reset gradients for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Progress-bar bookkeeping.
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

# ==== EVAL ====
import numpy as np

model.eval()
preds, trues = [], []
for batch in DataLoader(val_ds, batch_size=BATCH_SIZE):
    # Labels are only used for scoring, so they are converted straight to a
    # NumPy array instead of being moved to `device`; np.vstack below cannot
    # consume tensors that live on a CUDA device.
    labels = torch.stack([batch[col] for col in label_cols], dim=1).float().numpy()
    inputs = {k: v.to(device) for k, v in batch.items() if k in ["input_ids", "attention_mask"]}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Logits -> probabilities on the CPU, stored as NumPy for the metric.
    preds.append(sigmoid(logits.cpu()).numpy())
    trues.append(labels)

f1 = compute_f1(np.vstack(trues), np.vstack(preds))
print("Validation F1:", f1)


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/4 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Exception ignored in: <function tqdm.__del__ at 0x0000019AB9659940>
Traceback (most recent call last):
  File "c:\Users\emma\Desktop\project\large-project\venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\emma\Desktop\project\large-project\venv\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

## LoRA

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
# AdamW is taken from torch.optim, as in the full fine-tuning cell above; the
# AdamW re-export in transformers is deprecated and missing from recent releases.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler

# ==== PARAMETERS ====
MODEL_NAME = "microsoft/deberta-v3-base"
NUM_LABELS = 4
BATCH_SIZE = 4
EPOCHS = 3
LR = 2e-5
MAX_LEN = 512

# ==== DATA ====
df = pd.read_csv("finetune_dataset.csv")

# 80 % train, then the remaining 20 % is split in half: 10 % validation, 10 % test.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

# === TOKENIZER ===
# The checkpoint's tokenizer already defines its own padding token, so it is
# used as-is. Re-pointing pad_token at eos_token is a workaround for
# tokenizers without a pad token and would replace the real one here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    """Encode the "review" texts of `batch` with the module-level tokenizer.

    Each example is truncated and padded to a fixed length of MAX_LEN tokens;
    the tokenizer's return value is passed back as-is.
    """
    reviews = batch["review"]
    encoded = tokenizer(
        reviews,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length",
    )
    return encoded

# Tokenize each split in batches.
train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

label_cols = ["handicap", "pet", "child", "other"]

# Only the model inputs and the label columns are returned as torch tensors.
train_ds.set_format("torch", columns=["input_ids", "attention_mask"] + label_cols)
val_ds.set_format("torch", columns=["input_ids", "attention_mask"] + label_cols)
test_ds.set_format("torch", columns=["input_ids", "attention_mask"] + label_cols)

# ==== MODEL LoRA ====
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification"
)

# LoRA config
lora_config = LoraConfig(
    r=8,                        # low rank
    lora_alpha=32,               # scaling factor
    # DeBERTa-v2/v3 names its attention projections query_proj / key_proj /
    # value_proj; plain "query" / "value" match no module in this model.
    target_modules=["query_proj", "value_proj"],
    lora_dropout=0.1,
    bias="none",
    # PEFT has no multi-label task type: sequence classification is "SEQ_CLS".
    # The multi-label loss is already selected by problem_type on the base model.
    task_type="SEQ_CLS",
    # The classification head and the pooler are newly initialised for this
    # checkpoint, so they are trained in full and saved alongside the adapters.
    modules_to_save=["classifier", "pooler"],
)

model = get_peft_model(base_model, lora_config)

# ==== DEVICE ====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ==== OPTIMIZER + SCHEDULER ====
optimizer = AdamW(model.parameters(), lr=LR)

# One scheduler step is taken per batch. Ceiling division counts the final
# partial batch too (the training DataLoader does not drop it); otherwise the
# linear schedule would reach zero before the last optimizer steps.
steps_per_epoch = (len(train_ds) + BATCH_SIZE - 1) // BATCH_SIZE
num_training_steps = EPOCHS * steps_per_epoch
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# ==== UTILITY FUNCTIONS ====
def sigmoid(x):
    """Element-wise logistic function of a torch tensor.

    Delegates to ``torch.sigmoid`` instead of evaluating ``1 / (1 + exp(-x))``
    by hand: the built-in returns the same values and keeps the gradient
    finite for very negative inputs, where ``exp(-x)`` overflows to ``inf``.
    """
    return torch.sigmoid(x)

def compute_f1(y_true, y_pred, threshold=0.5):
    """Micro-averaged F1 between targets and thresholded scores.

    Args:
        y_true: ground-truth indicators, passed to ``f1_score`` unchanged.
        y_pred: NumPy array of scores; entries strictly greater than
            ``threshold`` are counted as positive predictions.
        threshold: decision cut-off. Defaults to 0.5, the value that was
            previously hard-coded, so existing calls behave the same.

    Returns:
        The result of ``f1_score(y_true, binarized, average="micro")``.
    """
    y_pred = (y_pred > threshold).astype(int)
    return f1_score(y_true, y_pred, average="micro")

# ==== TRAIN ====
model.train()
for epoch in range(EPOCHS):
    # Each epoch iterates a freshly built, shuffled DataLoader behind a progress bar.
    loop = tqdm(DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE))
    for batch in loop:
        # Keep only the tokenizer outputs as model inputs and move them to `device`.
        inputs = {
            key: tensor.to(device)
            for key, tensor in batch.items()
            if key in ("input_ids", "attention_mask")
        }
        # Stack the label columns along dim 1 and cast to float for the loss.
        labels = torch.stack([batch[name] for name in label_cols], dim=1).float().to(device)

        # Passing labels makes the model output carry the loss.
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()

        # Update weights, advance the LR schedule, clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Show the epoch index and the latest batch loss on the progress bar.
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

# ==== EVAL ====
model.eval()
preds, trues = [], []
for batch in DataLoader(val_ds, batch_size=BATCH_SIZE):
    # Ground truth stays on the CPU: label columns stacked along dim 1, as NumPy.
    stacked_labels = torch.stack([batch[name] for name in label_cols], dim=1)
    labels = stacked_labels.numpy()

    # Only the tokenizer outputs are sent to the model's device.
    inputs = {
        key: tensor.to(device)
        for key, tensor in batch.items()
        if key in ("input_ids", "attention_mask")
    }

    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Convert logits to probabilities on the CPU and collect both sides.
    probabilities = sigmoid(logits.cpu())
    preds.append(probabilities.numpy())
    trues.append(labels)

# Concatenate the per-batch arrays row-wise and score them.
f1 = compute_f1(np.vstack(trues), np.vstack(preds))
print("Validation F1:", f1)
