In [None]:
# Transformers installation
! pip install transformers datasets evaluate accelerate


Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.1

# Text classification

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; being signed in is needed later because
# the training run is configured with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load tweet_sentiment_multilingual dataset

Start by loading the French subset of the `cardiffnlp/tweet_sentiment_multilingual` dataset from the 🤗 Datasets library:

In [None]:
from datasets import load_dataset

# Fetch the "french" configuration of the multilingual tweet sentiment dataset
# and display the resulting splits.
data = load_dataset(
    path="cardiffnlp/tweet_sentiment_multilingual",
    name="french",
)
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1839
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 324
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 870
    })
})

Then take a look at an example:

In [None]:
# Display the DatasetDict again (splits, features and row counts).
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1839
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 324
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 870
    })
})

In [None]:
# Look at the first example of the test split.
data["test"][0]

{'text': 'Royal: le président n\'aime pas les pauvres? "c\'est n\'importe quoi" http …',
 'label': 0}

There are two fields in this dataset:

- `text`: the tweet text.
- `label`: the sentiment class: `0` for a negative tweet, `1` for a neutral tweet, and `2` for a positive tweet.

## Preprocess

The next step is to load a CamemBERT tokenizer to preprocess the `text` field:

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that ships with the camembert-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than CamemBERT's maximum input length:

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Uses the notebook-level ``tokenizer`` with ``truncation=True`` so that
    over-long sequences are cut down instead of being passed through whole.

    Args:
        examples: mapping with a ``"text"`` entry (a batch handed over by
            ``Dataset.map``).

    Returns:
        Whatever the tokenizer returns for the given texts.
    """
    batch_texts = examples["text"]
    encoded = tokenizer(batch_texts, truncation=True)
    return encoded

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once:

In [None]:
# Tokenize every split of the dataset, processing examples in batches.
# NOTE(review): despite the name `tokenized_emotions`, the labels are sentiment
# classes; the name is kept because the training cell refers to it.
tokenized_emotions = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/1839 [00:00<?, ? examples/s]

Map:   0%|          | 0/324 [00:00<?, ? examples/s]

Map:   0%|          | 0/870 [00:00<?, ? examples/s]

Now create a batch of examples using [DataCollatorWithPadding](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorWithPadding). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch at collation time, using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [None]:
import evaluate

# Load the accuracy metric; it is used by compute_metrics during evaluation.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the accuracy:

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: two-item sequence unpacked as ``(predictions, labels)``.
            Presumably ``predictions`` holds per-class scores with classes on
            axis 1 (TODO confirm against the Trainer output).

    Returns:
        The result of ``accuracy.compute`` for the arg-max class ids.
    """
    scores, references = eval_pred
    # Turn per-class scores into the index of the best-scoring class.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.

## Train

Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`:

In [None]:
# Class ids as they appear in the dataset's `label` column, mapped to names;
# the reverse lookup is derived so the two maps always agree.
id2label = dict(enumerate(["negative", "neutral", "positive"]))
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

</Tip>

You're ready to start training your model now! Load CamemBERT with [AutoModelForSequenceClassification](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSequenceClassification) along with the number of expected labels, and the label mappings:

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained camembert-base checkpoint with a sequence-classification head.
# The number of labels is taken from the label map so the head size and the
# id/label mappings cannot disagree.
model = AutoModelForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)



model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


At this point, only three steps remain:

1. Define your training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the accuracy and save the training checkpoint.
2. Pass the training arguments to [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [None]:
# Same batch size for training and evaluation.
per_device_batch_size = 16

# Evaluate and save a checkpoint once per epoch, reload the best checkpoint
# when training ends, and push the result to the Hub.
# NOTE(review): no metric_for_best_model is given, so the library default decides
# which checkpoint counts as "best" — confirm that is the intended criterion.
training_args = TrainingArguments(
    output_dir="camembert_model",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# Train on the tokenized train split and evaluate on the validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_emotions["train"],
    eval_dataset=tokenized_emotions["validation"],
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.851034,0.626543
2,No log,0.762737,0.712963
3,No log,0.696552,0.716049
4,No log,0.686212,0.743827
5,0.712600,0.663667,0.75
6,0.712600,0.712141,0.765432
7,0.712600,0.764125,0.743827
8,0.712600,0.766245,0.765432
9,0.293200,0.776473,0.774691
10,0.293200,0.787677,0.765432


TrainOutput(global_step=1150, training_loss=0.46266832600469177, metrics={'train_runtime': 8009.0363, 'train_samples_per_second': 2.296, 'train_steps_per_second': 0.144, 'total_flos': 363464978028138.0, 'train_loss': 0.46266832600469177, 'epoch': 10.0})

<Tip>

[Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) applies dynamic padding by default when you pass `tokenizer` to it. In this case, you don't need to specify a data collator explicitly.

</Tip>

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [None]:
# Upload the trained model to the Hugging Face Hub (requires being signed in).
trainer.push_to_hub()

events.out.tfevents.1702818264.95aa1f210e94.1065.0:   0%|          | 0.00/8.29k [00:00<?, ?B/s]

'https://huggingface.co/ac0hik/camembert_model/tree/main/'

<Tip>

For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb).

</Tip>

## Inference

Great, now that you've finetuned a model, you can use it for inference!

Grab some text you'd like to run inference on:

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [None]:
# Reload the fine-tuned tokenizer and model from the training output directory.
# Both loads share one path so they cannot drift apart.
# NOTE(review): presumably the absolute form of output_dir="camembert_model" on
# Colab — confirm if the notebook is run elsewhere.
model_dir = "/content/camembert_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

In [None]:
# Example sentence to classify.
text = "ton clavier est blanc"

In [None]:
# Tokenize the sentence and get PyTorch tensors back.
inputs = tokenizer(text, return_tensors="pt")

In [None]:
# Inference only: run the forward pass without gradient tracking and keep the logits.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Take the arg-max along the class dimension (not over the flattened tensor), so the
# index is always a class id; `.item()` then fails loudly if more than one
# example is present instead of returning a misleading flat index.
predicted_class_id = logits.argmax(dim=-1).item()
model.config.id2label[predicted_class_id]

'neutral'

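The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline) (instantiate a `pipeline` for sentiment analysis with your model, and pass your text to it). Here, inference is instead run manually on several texts so that the intermediate steps stay visible. Start with a few texts and tokenize each one, returning PyTorch tensors: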

In [None]:
# Example sentences to classify.
texts = [
    "bah c'est nul je trouve",
    "merci c'est tres gentil de ta part",
    "ton clavier est noir",
]

In [None]:
# Tokenize each sentence on its own, returning PyTorch tensors.
# A comprehension keeps the loop variable local, so the notebook-level `text`
# example defined earlier is not overwritten as a side effect.
multipule_inputs = [tokenizer(sentence, return_tensors="pt") for sentence in texts]

Pass your inputs to the model and return the `logits`. Then, in the cell after that, get the class with the highest probability, and use the model's `id2label` mapping to convert it to a text label:

In [None]:
# Run every tokenized sentence through the model without gradient tracking and
# collect the logits. The comprehension avoids rebinding the notebook-level
# `inputs` and `logits` used by the single-sentence example.
with torch.no_grad():
    multi_logitss = [model(**encoded).logits for encoded in multipule_inputs]

In [None]:
# Convert each logits tensor to its label name: arg-max along the class
# dimension, then look the id up in the model's id2label mapping.
output = [
    model.config.id2label[sentence_logits.argmax(dim=-1).item()]
    for sentence_logits in multi_logitss
]
output

['negative', 'positive', 'negative']

In [None]:
# Archive the whole training output directory into camembert_SA.zip.
# NOTE(review): this also packs the checkpoint-* subfolders (optimizer state
# included), as the listing below shows — confirm that is intended.

!zip -r camembert_SA.zip /content/camembert_model


  adding: content/camembert_model/ (stored 0%)
  adding: content/camembert_model/checkpoint-1035/ (stored 0%)
  adding: content/camembert_model/checkpoint-1035/model.safetensors (deflated 12%)
  adding: content/camembert_model/checkpoint-1035/special_tokens_map.json (deflated 52%)
  adding: content/camembert_model/checkpoint-1035/rng_state.pth (deflated 24%)
  adding: content/camembert_model/checkpoint-1035/training_args.bin (deflated 51%)
  adding: content/camembert_model/checkpoint-1035/config.json (deflated 53%)
  adding: content/camembert_model/checkpoint-1035/tokenizer.json (deflated 75%)
  adding: content/camembert_model/checkpoint-1035/tokenizer_config.json (deflated 82%)
  adding: content/camembert_model/checkpoint-1035/trainer_state.json (deflated 75%)
  adding: content/camembert_model/checkpoint-1035/scheduler.pt (deflated 55%)
  adding: content/camembert_model/checkpoint-1035/optimizer.pt (deflated 24%)
  adding: content/camembert_model/checkpoint-575/ (stored 0%)
  adding: 