In [None]:
# Transformers installation
! pip install transformers datasets evaluate accelerate

# Text classification

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login. Credentials are needed later in this
# notebook because training is configured with push_to_hub=True and the model
# is uploaded with trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load Sarcasm dataset

Start by loading the sarcasm dataset from a local CSV file with pandas:

In [None]:
import pandas as pd

# Raw sarcasm CSV; the column listing displayed below is the cell's output.
raw_csv_path = '/content/sarcasm_datafinal.csv'
df = pd.read_csv(raw_csv_path)
df.columns


Index(['Titre', 'Sous-titre', 'label'], dtype='object')

In [None]:
# Build a single `text` column from the title and the subtitle.
# Missing values are replaced with empty strings first: `str + NaN` is NaN in
# pandas, so one missing subtitle would otherwise blank out the whole text.
# A NaN text is written to the CSV as an empty cell.
combined_text = df['Titre'].fillna('') + ' ' + df['Sous-titre'].fillna('')
df['text'] = combined_text

# Keep only the model input and its target, and persist them for load_dataset.
new_df = df[['text', 'label']]
new_df.to_csv('sarcasm_merged.csv', index=False)

# Preview goes last so the notebook actually displays it.
new_df.head()

In [None]:
from datasets import load_dataset, DatasetDict

# Read the merged CSV; its rows are accessed through the 'train' split below.
dataset = load_dataset('csv', data_files='sarcasm_merged.csv')

# Share of the rows used for training; the remainder is held out for testing.
train_percentage = 0.8

# Shuffled split with a fixed seed so the partition is reproducible.
splits = dataset['train'].train_test_split(
    test_size=1 - train_percentage,
    shuffle=True,
    seed=42,
)
train_data = splits['train']
test_data = splits['test']

# Bundle both partitions under the names used by the rest of the notebook.
data = DatasetDict({'train': train_data, 'test': test_data})

Generating train split: 0 examples [00:00, ? examples/s]

Then take a look at an example:

In [None]:
# Show the DatasetDict to check the split names, columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 11152
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2788
    })
})

In [None]:
# Look at a single held-out example (index 10 of the test split).
data["test"][10]

{'text': 'Le gouvernement demande aux Français d’arrêter de se placer dans la trajectoire des balles des chasseurs Après un nouveau décès en marge d’une battue, le gouvernement a demandé, de manière plus solennelle, aux Français d’arrêter de se trouver systématiquement dans la trajectoire des balles des chasseurs.',
 'label': 1}

There are two fields in this dataset:

- `text`: the Title and subtitle combined
- `label`: either `0` or `1`; `1` marks a sarcastic headline and `0` a non-sarcastic one

## Preprocess

The next step is to load a CamemBERT tokenizer to preprocess the `text` field:

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the base checkpoint that is fine-tuned later on.
tokenizer_checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Create a preprocessing function to tokenize `text` and truncate sequences to be no longer than CamemBERT's maximum input length:

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts when
        called with ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Apply preprocess_function to every split of `data`; batched=True hands the
# function batches of examples instead of one example at a time.
tokenized_text = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/11152 [00:00<?, ? examples/s]

Map:   0%|          | 0/2788 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the same tokenizer used for preprocessing; it is passed
# to the Trainer below so padding is handled when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

In [None]:
import evaluate

# Accuracy metric object; compute_metrics calls its .compute() method.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Turn raw model scores into class ids and score them with ``accuracy``.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are reduced
            with an argmax over axis 1 before being compared to the labels.

    Returns:
        The result of ``accuracy.compute`` for the predicted ids and labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.

## Train

Before you start training your model, create a map of the expected ids to their labels with `id2label` and `label2id`:

In [None]:
# Class id -> human-readable label.
id2label = {
    1: "sarcasm",
    0: "not_sarcasm",
}
# Inverse lookup, derived from id2label so the two maps cannot drift apart.
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Base checkpoint with a two-class classification head; the label maps are
# stored in the model config so predictions can be reported by name.
model = AutoModelForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: one epoch, with evaluation and checkpoint saving both
# scheduled per epoch.
# NOTE(review): recent transformers releases appear to rename
# `evaluation_strategy` (-> `eval_strategy`) and the Trainer's `tokenizer`
# argument (-> `processing_class`) — confirm against the installed version,
# since the install cell does not pin one.
training_args = TrainingArguments(
    output_dir="sarcasm_camembertfineTuned_model",  # also the directory name the inference cell loads from
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,  # selection metric is not set here, so the library default applies
    push_to_hub=True,  # requires the Hub login performed at the top of the notebook
)

# Wire together the model, the tokenized splits, the padding collator and the
# accuracy callback defined in the earlier cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_text["train"],
    eval_dataset=tokenized_text["test"],  # the held-out split doubles as the evaluation set
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the last expression, the returned training summary is
# displayed as the cell output.
trainer.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1733,0.054364,0.986011


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


TrainOutput(global_step=697, training_loss=0.14336048410817917, metrics={'train_runtime': 11736.2835, 'train_samples_per_second': 0.95, 'train_steps_per_second': 0.059, 'total_flos': 637723642649280.0, 'train_loss': 0.14336048410817917, 'epoch': 1.0})

<Tip>

[Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) applies dynamic padding by default when you pass `tokenizer` to it. In this case, you don't need to specify a data collator explicitly.

</Tip>

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [None]:
# Upload the trained model to the Hugging Face Hub; the returned commit info is
# displayed as the cell output.
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/ac0hik/sarcasm_camembertfineTuned_model/commit/249a1c54c0ad044e739ef9a12eac87be11fe5756', commit_message='End of training', commit_description='', oid='249a1c54c0ad044e739ef9a12eac87be11fe5756', pr_url=None, pr_revision=None, pr_num=None)

<Tip>

For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb).

</Tip>

## Inference

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [None]:
# Reload the fine-tuned tokenizer and model from the training output folder.
# Both loads share one path variable so they cannot point at different places
# (the model path previously carried a stray leading "//").
finetuned_model_dir = "/content/sarcasm_camembertfineTuned_model"
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)
model = AutoModelForSequenceClassification.from_pretrained(finetuned_model_dir)

In [None]:
# Sample input assembled like the training texts: title, one space, subtitle.
sample_title = "Frontières, expulsions, nombre de clandestins... La Cour des Comptes juge sévèrement la politique de lutte contre l’immigration illégale"
sample_subtitle = "Dans un rapport dévoilé ce jeudi, l’institution de la rue Cambon chiffre à 1,8 milliard le coût de cette politique dont les résultats sont mitigés. Les administrations en charge de ce contentieux sont notamment «saturées»."
text = " ".join([sample_title, sample_subtitle])

In [None]:
# Encode the sample as PyTorch tensors. truncation=True mirrors
# preprocess_function, so inference inputs are capped the same way the training
# inputs were instead of failing in the forward pass on an overly long text.
inputs = tokenizer(text, truncation=True, return_tensors="pt")

In [None]:
# Forward pass with gradient tracking disabled (inference only); only the
# logits of the model output are kept.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# Index of the highest logit, then its readable name from the model config.
predicted_class_id = logits.argmax().item()
predicted_label = model.config.id2label[predicted_class_id]
predicted_label

'not_sarcasm'