In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import pandas as pd
from sklearn.model_selection import train_test_split


In [2]:
# Sentiment name <-> integer class id. Both mappings are also handed to the
# classifier when it is loaded further down.
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {v: k for k, v in label2id.items()}

# Build the frame in a single chain so that no step writes a column into a
# subset of another frame (the pattern behind pandas' SettingWithCopyWarning).
df = (
    pd.read_csv("../data/feedbacks_enriched.csv")[["clean_text", "rating_sentiment"]]
    # Series.map yields NaN for any sentiment that is not a key of label2id.
    .assign(label=lambda frame: frame["rating_sentiment"].map(label2id))
    # A row with no text or no known sentiment cannot be used as a training example.
    .dropna(subset=["clean_text", "label"])
    # Any NaN above would have made the column float; store integer class ids.
    .astype({"label": int})
)


In [3]:
# Stratified 80/20 hold-out on the label column, with a fixed seed so the
# partition is the same on every run.
train_df, test_df = train_test_split(
    df,
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)


In [4]:
# Checkpoint identifier, used for the tokenizer here and for the classifier
# that is loaded later in the notebook.
model_name = "distilbert-base-uncased"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
def tokenize(batch):
    """Tokenize the ``clean_text`` entries of ``batch``.

    Args:
        batch: Rows to encode. Only ``batch["clean_text"]`` is read, so a
            DataFrame (as passed by ``FeedbackDataset``) or a dict of lists
            both work.

    Returns:
        The tokenizer's output for those texts, in the same order as the rows
        of ``batch``, each sequence truncated or padded to 128 tokens.
    """
    # Read the texts from the argument, not from the notebook-level ``df``.
    # Otherwise every call encodes the whole corpus, and encoding i no longer
    # belongs to row i of the subset whose labels it is paired with.
    texts = list(batch["clean_text"])
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,
    )


In [10]:
# Minimal PyTorch dataset
import torch

class FeedbackDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized feedback text with its label.

    The frame given to the constructor is tokenized once, up front, through
    ``tokenize``; its ``"label"`` column is kept as a plain Python list.
    """

    def __init__(self, df):
        """Tokenize ``df`` and store the encodings next to the labels."""
        self.encodings = tokenize(df)
        self.labels = df["label"].tolist()

    def __len__(self):
        """Return the number of stored labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict has one entry per field of the stored encodings, plus a
        ``"labels"`` entry holding the label at the same position.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


In [11]:
# Only the first 200 training rows and the first 50 test rows are used,
# to avoid slowdowns caused by RAM usage.
train_dataset = FeedbackDataset(train_df.iloc[:200])
test_dataset = FeedbackDataset(test_df.iloc[:50])


In [None]:
# Load the checkpoint with a 3-way sequence-classification head and attach
# both label mappings to it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=3,
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# A single short epoch with small batches.
training_args = TrainingArguments(
    output_dir="./bert_results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_steps=20,    # log every 20 steps
    save_strategy="no",  # no intermediate checkpoints
)


In [None]:
# Only the training subset is given to the Trainer; no evaluation dataset is passed.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
)

# Kept as the last expression so the cell displays the training result.
trainer.train()


In [None]:
# Write the model and its tokenizer to the same directory.
save_dir = "../models/bert_sentiment"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)