In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import CamembertModel, CamembertTokenizer
import pandas as pd
import torch
from torchinfo import summary
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from sklearn.utils.class_weight import compute_class_weight

# Importation des données

Import data

In [None]:
import json

# Load the intents JSON file.
# The encoding is given explicitly: without it, open() falls back to the
# platform default, which can mis-decode accented French text.
with open('data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Display the list stored under the "intentions" key.
data['intentions']

In [None]:
# Flatten the intents into two parallel lists: one entry per example question,
# paired with the tag of the intent that question belongs to.
questions, labels = [], []

for intent in data["intentions"]:
    intent_tag = intent["tag"]
    intent_questions = intent["questions"]
    questions.extend(intent_questions)
    labels.extend([intent_tag] * len(intent_questions))

# One row per (question, tag) pair.
df = pd.DataFrame({"texte": questions, "label": labels})

# Number of example questions available for each tag.
df["label"].value_counts()

In [None]:
# Save the DataFrame as CSV. index=False keeps the row index out of the file,
# so reading it back does not add a spurious "Unnamed: 0" column.
df.to_csv('data_intension.csv', index=False)

In [None]:
# Reload the dataset from the CSV file and display it.
df = pd.read_csv("data_intension.csv")
df

# Prétraitement

In [None]:
# Replace the string tags in df['label'] with integer codes.
# `le` is kept so the fitted encoder stays available to later cells.
le = LabelEncoder()
encoded_labels = le.fit_transform(df['label'])
df['label'] = encoded_labels

# Number of distinct classes present after encoding.
df['label'].nunique()

In [None]:
# Word count (whitespace-separated tokens) of every text in the dataset.
seq_len = list(map(lambda text: len(text.split()), df['texte']))

# Histogram of the word counts, using 10 bins.
word_counts = pd.Series(seq_len)
word_counts.hist(bins=10)

# Maximum sequence length setting.
# NOTE(review): no other cell visible in this notebook reads max_seq_len;
# confirm whether it was meant to be passed to the tokenizer.
max_seq_len = 250

# Prétraitement des données

In [None]:
from sklearn.model_selection import train_test_split

# Same seed for both splits so the partition is reproducible.
SPLIT_SEED = 42

# First split: hold out 20% of the rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df["texte"], df["label"], test_size=0.2, random_state=SPLIT_SEED
)

# Second split: hold out 10% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.1, random_state=SPLIT_SEED
)


In [None]:
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')


def tokenize_split(texts):
    """Tokenize one data split and return the tokenizer output.

    The texts are converted to a list and encoded with padding and truncation
    enabled; the result holds PyTorch tensors (return_tensors='pt').
    """
    return tokenizer(list(texts), padding=True, truncation=True, return_tensors='pt')


# Each split is tokenized in its own call.
encoded_input_train = tokenize_split(X_train)
input_ids_train = encoded_input_train['input_ids']
attention_mask_train = encoded_input_train['attention_mask']

encoded_input_val = tokenize_split(X_val)
input_ids_val = encoded_input_val['input_ids']
attention_mask_val = encoded_input_val['attention_mask']

encoded_input_test = tokenize_split(X_test)
input_ids_test = encoded_input_test['input_ids']
attention_mask_test = encoded_input_test['attention_mask']


In [None]:
# Display the training-split labels (taken from df["label"] by the split cell).
y_train

In [None]:
# The tokenizer outputs are already tensors (return_tensors='pt'). Copy them
# with clone().detach() instead of torch.tensor(), which emits a UserWarning
# when it is handed an existing tensor.
train_seq = input_ids_train.clone().detach()
train_mask = attention_mask_train.clone().detach()
# Labels are pandas Series; convert them to plain lists before building tensors.
train_y = torch.tensor(y_train.tolist())

val_seq = input_ids_val.clone().detach()
val_mask = attention_mask_val.clone().detach()
val_y = torch.tensor(y_val.tolist())

test_seq = input_ids_test.clone().detach()
test_mask = attention_mask_test.clone().detach()
test_y = torch.tensor(y_test.tolist())


In [None]:

batch_size = 16


def build_loader(sequences, masks, targets, sampler_cls):
    """Wrap one split's tensors in a TensorDataset and a DataLoader.

    Returns the dataset, the sampler (an instance of ``sampler_cls`` built on
    that dataset) and a DataLoader yielding batches of ``batch_size`` items.
    """
    dataset = TensorDataset(sequences, masks, targets)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Validation and test batches are read in order; training batches are drawn
# through a RandomSampler.
val_data, val_sampler, val_dataloader = build_loader(val_seq, val_mask, val_y, SequentialSampler)
test_data, test_sampler, test_dataloader = build_loader(test_seq, test_mask, test_y, SequentialSampler)
train_data, train_sampler, train_dataloader = build_loader(train_seq, train_mask, train_y, RandomSampler)

# Modèle

In [None]:


class CamembertClassifier(nn.Module):
    """CamemBERT encoder followed by a three-layer classification head.

    The head maps the 768-dimensional vector at sequence position 0 through
    768 -> 512 -> 256 -> ``num_classes`` linear layers and returns
    log-probabilities (``nn.LogSoftmax`` over dim 1).
    """

    def __init__(self, num_classes):
        """Load the pretrained 'camembert-base' encoder and create the head.

        Args:
            num_classes: size of the output layer.
        """
        super().__init__()
        self.camembert = CamembertModel.from_pretrained('camembert-base')
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(768, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, num_classes)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return per-class log-probabilities for a batch.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.

        Returns:
            Output of ``nn.LogSoftmax(dim=1)`` applied to the last linear layer.
        """
        encoder_out = self.camembert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 of every example in the batch.
        hidden = encoder_out.last_hidden_state[:, 0]
        # Two hidden layers, each: linear -> dropout -> ReLU.
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(self.dropout(layer(hidden)))
        return self.softmax(self.fc3(hidden))



In [None]:
# Model initialisation.
# Size the output layer from the fitted LabelEncoder instead of a hard-coded
# count, so it always matches the number of distinct tags in the data.
num_classes = len(le.classes_)
model = CamembertClassifier(num_classes)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# The model's forward() already ends with nn.LogSoftmax, so NLLLoss is the
# matching criterion; CrossEntropyLoss would apply log-softmax a second time.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)



# Entraînement du modèle

In [None]:
# Training schedule:
#   num_epochs    - upper bound on the number of passes over the training set
#   eval_interval - run validation every `eval_interval` epochs
#   patience      - evaluations without improvement tolerated before stopping
num_epochs, eval_interval, patience = 100, 1, 3

# Metric histories; each is a separate, initially empty list.
train_losses, val_losses, test_losses = [], [], []
train_accuracies, val_accuracies, test_accuracies = [], [], []

In [None]:
# Train with early stopping: stop once the validation loss has failed to
# improve for `patience` consecutive evaluations.
best_val_loss = float('inf')
no_improvement_count = 0

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    correct_train = 0
    total_train = 0

    for input_ids, attention_mask, labels in train_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        output = model(input_ids, attention_mask)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Predicted class = index of the highest score in each row.
        predicted = output.argmax(dim=1)
        total_train += labels.size(0)
        correct_train += (predicted == labels).sum().item()

    # Mean of the per-batch losses over the epoch.
    average_loss = total_loss / len(train_dataloader)
    train_losses.append(average_loss)

    accuracy_train = 100 * correct_train / total_train
    train_accuracies.append(accuracy_train)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss (Train): {average_loss:.4f}, Accuracy (Train): {accuracy_train:.2f}%')

    # Validation only runs every `eval_interval` epochs.
    if (epoch + 1) % eval_interval != 0:
        continue

    # ---- Validation pass ----
    model.eval()
    total_val_loss = 0
    correct_val = 0
    total_val = 0
    val_predictions = []

    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            output = model(input_ids, attention_mask)
            # Accumulate the loss of EVERY validation batch; using only the
            # last batch makes early stopping depend on a handful of samples.
            total_val_loss += criterion(output, labels).item()

            predicted = output.argmax(dim=1)
            total_val += labels.size(0)
            correct_val += (predicted == labels).sum().item()

            val_predictions.extend(predicted.tolist())

    accuracy_val = 100 * correct_val / total_val
    val_accuracies.append(accuracy_val)

    # Mean of the per-batch losses, computed the same way as the training loss.
    val_loss = total_val_loss / len(val_dataloader)
    val_losses.append(val_loss)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss (Val): {val_loss:.4f}, Accuracy (Val): {accuracy_val:.2f}%')

    # Early-stopping bookkeeping.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        no_improvement_count = 0
    else:
        no_improvement_count += 1

    if no_improvement_count >= patience:
        print(f'Arrêt précoce à l\'époque {epoch+1}...')
        break




# Plot

In [None]:

# Plot the metrics recorded during training.
import matplotlib.pyplot as plt

# One figure with two side-by-side panels: losses on the left, accuracies on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: training and validation loss, x-axis numbered from 1.
loss_ax.plot(range(1, len(train_losses)+1), train_losses, label='Train Loss', color='blue')
loss_ax.plot(range(1, len(val_losses)+1), val_losses, label='Validation Loss', color='red')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
loss_ax.set_title('Training and Validation Loss')

# Right panel: training and validation accuracy.
acc_ax.plot(range(1, len(train_accuracies)+1), train_accuracies, label='Train Accuracy', color='blue')
acc_ax.plot(range(1, len(val_accuracies)+1), val_accuracies, label='Validation Accuracy', color='red')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()
acc_ax.set_title('Training and Validation Accuracy')

fig.tight_layout()
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate the model on the test set.
model.eval()
correct = 0
total = 0
predictions = []

# Gradients are not needed for inference.
with torch.no_grad():
    for input_ids, attention_mask, labels in test_dataloader:
        # Move the batch to the same device as the model.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        output = model(input_ids, attention_mask)
        # Predicted class = index of the highest score in each row.
        predicted = torch.max(output.data, 1).indices

        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        predictions.extend(predicted.tolist())

# Test accuracy as a percentage.
accuracy_test = 100 * correct / total



In [None]:
# True test labels next to the predictions collected during test evaluation.
y_true, y_pred = y_test.tolist(), predictions

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_true, y_pred)

# Print the matrix under a heading.
print("Matrice de Confusion :")
print(cm)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Precision, recall and F1 all use the same averaging mode.
weighted_avg = {'average': 'weighted'}

# Compute the metrics on the test predictions.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **weighted_avg)
recall = recall_score(y_true, y_pred, **weighted_avg)
f1 = f1_score(y_true, y_pred, **weighted_avg)

# Report each metric with two decimals.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')


# Sauvegarde du modèle

In [None]:
import os

# Create the output directory; exist_ok=True makes this a no-op if it exists.
output_dir = 'ModelChatBot'
os.makedirs(output_dir, exist_ok=True)

# Save the model's state_dict (not the module object itself).
model_state = model.state_dict()
torch.save(model_state, 'ModelChatBot/model.pth')

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions.
matrice_confusion = confusion_matrix(y_true, y_pred)
print("Matrice de confusion:\n", matrice_confusion)

# Text classification report for the same predictions.
# classification_report is imported by the test-evaluation cell.
rapport_classification = classification_report(y_true, y_pred)
print("Rapport de classification:\n", rapport_classification)
