In [None]:
# Transformers installation
! pip install transformers datasets evaluate accelerate


# Text classification

In [None]:
from huggingface_hub import notebook_login

# Shows the Hugging Face login widget. Presumably the stored token is what the
# later `push_to_hub` calls rely on — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load emotions dataset

In [None]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

# Fraction of rows kept for training; the remainder becomes the test split.
train_percentage = 0.8

# Read the CSV into pandas, then wrap the frame as a Hugging Face `Dataset`.
df = pd.read_csv('/content/dataFinal.csv')
dataset = Dataset.from_pandas(df)

# Shuffled split with a fixed seed.
splits = dataset.train_test_split(
    test_size=1 - train_percentage,
    shuffle=True,
    seed=42,
)
train_data = splits['train']
test_data = splits['test']

# Bundle both splits under the 'train' / 'test' keys used by the later cells.
data = DatasetDict({'train': train_data, 'test': test_data})

Then inspect the resulting splits and look at one example from the test set:

In [None]:
# Bare expression: the notebook renders the DatasetDict summary (splits, features, row counts).
data

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10224
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2556
    })
})

In [None]:
# Look at a single test row (index 20) to see the text/label layout.
data["test"][20]

{'text': 'Je me sens inquiet en ouvrant la porte bleue', 'label': 4}

## Preprocess

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the same "camembert-base" checkpoint that the model is loaded from later.
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Sequences longer than the tokenizer's maximum length are truncated. No
    padding is applied here: padding at map time would pad every example to the
    longest sequence of its map batch and store those pad tokens in the
    dataset. Padding is left to the ``DataCollatorWithPadding`` passed to the
    ``Trainer``, which pads each training batch to its own longest sequence.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; only
            ``examples["text"]`` is read.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Tokenize every split; batched=True passes many rows to preprocess_function per call.
tokenized_emotions = data.map(preprocess_function, batched=True)

Map:   0%|          | 0/10224 [00:00<?, ? examples/s]

Map:   0%|          | 0/2556 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Collator handed to the Trainer later on; it pads the examples of a batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics calls accuracy.compute(...) on it.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the ``accuracy`` metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``. The predictions are
            reduced with ``argmax`` over axis 1 to get one class index per row.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class indices
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Train

In [None]:
# Class index -> emotion name, handed to the model config so predictions can be
# reported by name.
id2label = {
    0: "tristesse",
    1: "joie",
    2: "degout",
    3: "colere",
    4: "peur",
    5: "surpris",
}
# Reverse mapping (emotion name -> class index), built by inverting id2label.
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained "camembert-base" weights for sequence classification.
# The number of labels is taken from id2label so it cannot disagree with the
# label mappings stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration. Evaluation and checkpoint saving are both set to run
# once per epoch, and load_best_model_at_end asks for the best checkpoint to be
# reloaded when training finishes.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`, and the install cell does not pin a version — confirm this
# argument name against the installed release.
training_args = TrainingArguments(
    output_dir="translateddata_emotion_classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

# NOTE(review): the "test" split doubles as the evaluation set that drives
# best-checkpoint selection, so the accuracy reported during training is not
# measured on data held out from model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_emotions["train"],
    eval_dataset=tokenized_emotions["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the per-epoch loss/accuracy table is rendered as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.177,0.645832,0.814163
2,0.6088,0.417184,0.872848
3,0.4064,0.364712,0.893192


TrainOutput(global_step=1917, training_loss=0.6456454838696531, metrics={'train_runtime': 25511.3929, 'train_samples_per_second': 1.202, 'train_steps_per_second': 0.075, 'total_flos': 1392096588424128.0, 'train_loss': 0.6456454838696531, 'epoch': 3.0})

In [None]:
# Upload the trained model to the Hugging Face Hub; the call returns the repository URL.
trainer.push_to_hub()

'https://huggingface.co/ac0hik/translateddata_emotion_classifier/tree/main/'

## Inference

In [None]:
# Sample French sentence classified in the inference cells.
text = "aufait, je suis resté impressionné par plus de quelques entreprises"

In [None]:
from transformers import AutoTokenizer


In [None]:
import torch
from transformers import AutoModelForSequenceClassification

In [None]:
# Reload the tokenizer from a local directory (this rebinds the global `tokenizer`).
# NOTE(review): presumably this is the Trainer's output_dir resolved under a
# Colab /content working directory — confirm before running elsewhere.
tokenizer = AutoTokenizer.from_pretrained("/content/translateddata_emotion_classifier")
# Encode the sample sentence as PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

In [None]:
# Load the classifier from the same local directory (this rebinds the global `model`).
model = AutoModelForSequenceClassification.from_pretrained("/content/translateddata_emotion_classifier")
# Gradients are not needed for a forward pass used only for prediction.
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
# argmax() with no axis works on the flattened logits, which is fine for the
# single sentence encoded above; the index is then mapped to its label name.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

'surpris'