### 1. Bibliothèques

pip install --upgrade pip
pip install pandas 
pip install scikit-learn
pip install datasets
pip install transformers
pip install sentencepiece # pour camembert
pip install tf-keras

In [1]:
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import numpy as np
import tensorflow as tf


  from .autonotebook import tqdm as notebook_tqdm


### 2. Chargement des donnéees


In [2]:
# Lecture du fichier 
file1='Data clear/fichier_combiné.csv'
df = pd.read_csv(file1)
print(df.shape,df.head())

(33471, 2)                                                Block  Label
0  ﻿The Project Gutenberg eBook of Contes Françai...      3
1  You may copy it, give it away or re-use it und...      3
2  If you are not located in the United States, y...      3
3  Title: Contes Français Editor: Douglas Labaree...      3
4                                                 D.      3


In [3]:
# Mélanager le dataframe
df_suffle = df.sample(frac=0.05, random_state=42).reset_index(drop=True) # On prend 5% du dataFrame
print(df_suffle.shape)
print(df_suffle.head())

(1674, 2)
                                               Block  Label
0  Un cordon de grosses pierres fixait tout autou...      1
1  Vous dirai-je pourtant que j'ai perdu quelques...      2
2  lèvre, _f._ lip; lèvres en fleur, full-blown l...      3
3  Bonaparte (Napoléon), the French Emperor (1769...      3
4  mengheldichten: fyghes noeper;  Bacchus-Cortryck.      2


In [4]:
# Stocker la data sous forme de liste
texts = df_suffle['Block'].tolist()
labels = df_suffle['Label'].tolist()

### 4. Chargement du model 

In [5]:
# Charger le modèle pré-entraîné et le tokenizer
model_name = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4) 


All PyTorch model weights were used when initializing TFCamembertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFCamembertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 5. Tokenizer le dataset

In [6]:
# Tokeniser les textes
encoded_inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='tf')


In [7]:
# Convertir les labels en tenseur TensorFlow
labels = tf.convert_to_tensor(labels)

### 6. Entrainement du model 

In [8]:
# Entraîner le modèle (ajuster les hyperparamètres si nécessaire)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(encoded_inputs, labels, epochs=3, batch_size=1)


Epoch 1/3
  51/1674 [..............................] - ETA: 1:03:24 - loss: 7.5136 - accuracy: 0.3529

KeyboardInterrupt: 

### 7. Prediction

In [None]:
# Prédiction sur une nouvelle phrase
new_text = "Votre nouvelle phrase à classifier"
encoded_input = tokenizer(new_text, return_tensors='tf')
output = model(**encoded_input)
scores = output.logits.numpy()
predicted_class = np.argmax(scores)
print("La classe prédite est :", predicted_class)