In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
# Report whether PyTorch can see a CUDA device in this runtime.
gpu_available = torch.cuda.is_available()
print("GPU activé ?", gpu_available)

GPU activé ? True


In [None]:
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Single checkpoint name shared by the classifier and its tokenizer.
# NOTE(review): this looks like an English-only, lower-casing checkpoint while
# the sample sentences in this notebook are French — confirm it is the
# intended model.
checkpoint = 'bert-base-uncased'

# 1. BERT sequence classifier configured with two output labels.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# 2. Matching tokenizer (turns text into the numeric inputs the model takes).
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# 3. Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

print("✅ Tout est chargé ! Device utilisé :", device)

In [None]:
import pandas as pd

# 1. Tiny hand-written sample with two classes: 1 = positive, 0 = negative.
labelled_examples = [
    ("J'ai adoré ce film, les acteurs sont excellents !", 1),  # positive
    ("Une pure déception, l'intrigue est nulle.", 0),          # negative
    ("Superbe réalisation, bien que un peu long.", 1),         # positive
    ("Je ne recommande pas du tout.", 0),                      # negative
]

# Column-oriented view of the same pairs, then the DataFrame built from it.
data = {
    "text": [sentence for sentence, _ in labelled_examples],
    "label": [sentiment for _, sentiment in labelled_examples],
}
df = pd.DataFrame(data)
print("📊 Dataset créé :\n", df)

In [None]:
# 1. Tokenize the texts
def tokenize(text):
    """Encode ``text`` with the notebook-level ``tokenizer``.

    The input is truncated to at most 128 tokens and the result is requested
    as PyTorch tensors (``return_tensors="pt"``). ``padding=True`` is passed
    through unchanged.
    """
    encode_options = dict(
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    return tokenizer(text, **encode_options)

# 2. Tokenize every row and keep the encodings in a new "tokens" column.
df["tokens"] = df["text"].apply(tokenize)

# Sanity check: show the encoding produced for the first text.
print("\n🔡 Tokenisation réussie ! Exemple :")
first_encoding = df["tokens"][0]
print(first_encoding)

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset yielding ``(encoded_inputs, label)`` pairs.

    Each text is tokenized on access and padded/truncated to a fixed
    ``max_length`` so that every sample has the same tensor shape.

    Args:
        texts: Iterable of raw strings (list, tuple, pandas Series, ...).
        labels: Iterable of class ids, aligned position-by-position with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that
            returns a mapping of tensors with a leading dimension of 1.
        max_length: Length every sample is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialize as plain lists so ``idx`` is always positional. Indexing
        # a pandas Series with an int is label-based, which breaks (KeyError
        # or wrong row) as soon as the frame was filtered, shuffled or split.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length  # fixed length of every encoded sample

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at position ``idx``.

        ``features`` maps each tokenizer output name to a tensor whose leading
        size-1 dimension has been removed; ``label`` is a scalar tensor.
        """
        # Pad to max_length (not to the longest item) so all samples share
        # one shape regardless of which texts end up in the same batch.
        tokens = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )
        # return_tensors="pt" yields a leading dimension of 1; drop it.
        features = {k: v.squeeze(0) for k, v in tokens.items()}
        return features, torch.tensor(self.labels[idx])

In [None]:
# Wrap the text/label columns in the dataset, then serve it in shuffled
# batches of two samples.
train_dataset = SentimentDataset(
    texts=df["text"],
    labels=df["label"],
    tokenizer=tokenizer,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

In [None]:
from tqdm.auto import tqdm

# 1. Hyper-parameters
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
epochs = 3  # small number, just for testing

# 2. Training loop
model.train()
for epoch in range(epochs):
    running_loss = 0
    n_batches = len(train_loader)
    pbar = tqdm(train_loader, desc=f"Époque {epoch + 1}/{epochs}")

    for inputs, targets in pbar:
        # Move the batch onto the same device as the model.
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        targets = targets.to(device)

        # Forward pass; the loss is read from the model output.
        loss = model(**inputs, labels=targets).loss

        # Backward pass, parameter update, then clear gradients for the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        # Read the scalar once and reuse it for the running total and the bar.
        step_loss = loss.item()
        running_loss += step_loss
        pbar.set_postfix(Loss=step_loss)

    print(f"Loss moyen : {running_loss / n_batches:.4f}")

print("\n🎉 Entraînement terminé !")

In [None]:
# Switch to evaluation mode and count correct predictions over the training
# samples, served in order in batches of two.
model.eval()
total_correct = 0
eval_loader = DataLoader(train_dataset, batch_size=2)

with torch.no_grad():  # no gradient tracking during evaluation
    for inputs, targets in tqdm(eval_loader, desc="Évaluation"):
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        targets = targets.to(device)

        # Predicted class = index of the largest logit per sample.
        logits = model(**inputs).logits
        predicted = torch.argmax(logits, dim=1)
        hits = (predicted == targets).sum().item()
        total_correct += hits

accuracy = total_correct / len(train_dataset)
print(f"\n📊 Précision sur le training set : {accuracy * 100:.2f}%")

In [None]:
def predict(text):
    """Classify one text as positive or negative with the notebook's model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The sentence to classify (a single string).

    Returns:
        ``"👍 Positif"`` when class 1 has the highest logit, otherwise
        ``"👎 Négatif"``.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)

    # The result must not depend on which cell ran last: if the model was left
    # in training mode, dropout would make predictions non-deterministic.
    # Force eval mode for the forward pass, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    # argmax over the class dimension of the logits (softmax would not change
    # the winner); [0] selects the single input sentence.
    predicted_class = logits.argmax(dim=-1)[0].item()
    return "👍 Positif" if predicted_class == 1 else "👎 Négatif"

# Example sentences; the trailing emoji marks the sentiment a reader would expect.
test_phrases = [
    "Ce film m'a bouleversé, une pure merveille !",  # 👍
    "Je n'ai pas aimé l'acting, trop forcé.",        # 👎
    "Les effets spéciaux étaient incroyables",       # 👍
    "Scénario confus et dialogues faibles.",         # 👎
]

# Print each sentence next to the label returned by predict().
for phrase in test_phrases:
    verdict = predict(phrase)
    print(f'"{phrase}" → {verdict}')

In [None]:
# Save the model and then the tokenizer into the same directory.
model_path = "my_bert_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)
print(f"✅ Modèle sauvegardé dans '{model_path}/'")

In [None]:
from google.colab import files
# Recursively zip the saved model directory; {model_path} is interpolated
# from the Python variable of the same name.
!zip -r model.zip {model_path}
# Hand the archive to Colab's download helper.
files.download("model.zip")