In [None]:
# Import des librairies
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import create_optimizer

In [None]:
# Configuration: every value a reader might want to tune lives here.
MODEL_NAME = 'bert-base-uncased'   # pretrained checkpoint used for both tokenizer and model
NUM_LABELS = 7                     # size of the classification head
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
MAX_LEN = 128                      # sequences are padded / truncated to this many tokens
SEED = 42

# Seed TensorFlow's global random generator so that dataset shuffling and
# randomly initialised weights are repeatable across "Restart & Run All".
tf.random.set_seed(SEED)

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained(
    MODEL_NAME,
)

In [None]:
# Load the dataset from CSV, then preview five randomly chosen rows
dataset = pd.read_csv("dataset_psycho_clean.csv")
dataset.sample(n=5)

In [None]:
# Label encoding
# Convert the labels (y) into integer class ids.
# The original label values are kept in 'status_name' so that re-running this
# cell re-fits the encoder on the original values instead of on the integers
# written by a previous run (which would lose the class names).
if 'status_name' not in dataset.columns:
    dataset['status_name'] = dataset['status']

label_encoder = LabelEncoder()
dataset['status'] = label_encoder.fit_transform(dataset['status_name'])

# The classifier is created with NUM_LABELS outputs, so the number of distinct
# labels found in the data has to agree with it.
assert len(label_encoder.classes_) == NUM_LABELS, (
    f"{len(label_encoder.classes_)} classes found in 'status', "
    f"but NUM_LABELS is {NUM_LABELS}"
)

In [None]:
# Vérification de l'encodage des labels
print("Labels encodés :", dataset['status'].value_counts())
print("Correspondance des classes :", label_encoder.classes_)

In [None]:
# Train / validation split (80 % / 20 %)
all_texts = dataset['statement'].tolist()
all_labels = dataset['status'].tolist()

# Stratify on the label column because the classes are imbalanced.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    random_state=42,
    stratify=dataset['status'],
    test_size=0.2,
)

## Prétraitement des données

In [None]:
# Tokenization
def encode(texts):
    """Tokenize a list of texts with the notebook's tokenizer.

    Every text is truncated and padded to exactly MAX_LEN tokens, and the
    result is returned as TensorFlow tensors.
    """
    tokenizer_options = dict(
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
    )
    return tokenizer(texts, **tokenizer_options)


# Tokenize the training split first, then the validation split
train_encodings, val_encodings = encode(train_texts), encode(val_texts)

In [None]:
# Build the TensorFlow datasets
# The shuffle buffer spans the whole training set: a buffer smaller than the
# dataset only shuffles locally, so batches would largely follow the original
# row order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    .shuffle(len(train_labels))
    .batch(BATCH_SIZE)
)

# Validation data is only evaluated, so it is batched without shuffling.
val_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))
    .batch(BATCH_SIZE)
)

In [None]:
# Load the pretrained model with a classification head of NUM_LABELS outputs
model = TFBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

In [None]:
# Optimizer and loss
# The schedule needs the total number of optimisation steps:
# batches per epoch times the number of epochs.
steps_per_epoch = len(train_dataset)
num_train_steps = EPOCHS * steps_per_epoch

# No warm-up steps are used; create_optimizer returns the optimizer together
# with its learning-rate schedule.
optimizer, schedule = create_optimizer(
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
    init_lr=LEARNING_RATE,
)

# Labels are integer class ids and the loss is fed logits (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
metrics = [accuracy_metric]

In [None]:
# Compile the model with the optimizer, loss and metrics defined earlier
model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

In [None]:
# Training
# Keep the object returned by fit() so the per-epoch loss / accuracy can be
# inspected or plotted later (its `history` attribute), instead of letting it
# be echoed as an opaque cell output and discarded.
history = model.fit(train_dataset, validation_data=val_dataset, epochs=EPOCHS)

## Test du modèle

In [None]:
# Prediction example
def predict_statement(statement):
    """Return the predicted class index for a single statement.

    The statement is tokenized with `encode`, passed through `model`, and the
    index of the largest logit is returned.
    """
    encoded = encode([statement])
    logits = model(encoded).logits
    # The batch holds a single statement, so take the first (only) prediction.
    return tf.argmax(logits, axis=1).numpy()[0]

# Try the model on one example
example = "This is amazing!"
status = predict_statement(example)
print(f"Predicted status: {status}")

# The integer id alone is hard to interpret; map it back to the class the
# label encoder assigned to that id.
status_label = label_encoder.inverse_transform([status])[0]
print(f"Predicted label: {status_label}")

## Évaluation du modèle