In [12]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

train_df = pd.read_csv('training_data_xxs.csv')
test_df = pd.read_csv('test_data.csv')

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4) 

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()

trainer.save_model("./fine-tuned-distilbert")
tokenizer.save_pretrained("./fine-tuned-distilbert")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1499 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]



  0%|          | 0/564 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.7495878338813782, 'eval_runtime': 7.9435, 'eval_samples_per_second': 2.518, 'eval_steps_per_second': 0.378, 'epoch': 1.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.8534401059150696, 'eval_runtime': 10.2162, 'eval_samples_per_second': 1.958, 'eval_steps_per_second': 0.294, 'epoch': 2.0}
{'loss': 0.3849, 'grad_norm': 0.2515682280063629, 'learning_rate': 2.269503546099291e-06, 'epoch': 2.66}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.9557962417602539, 'eval_runtime': 7.359, 'eval_samples_per_second': 2.718, 'eval_steps_per_second': 0.408, 'epoch': 3.0}
{'train_runtime': 4325.5127, 'train_samples_per_second': 1.04, 'train_steps_per_second': 0.13, 'train_loss': 0.3568641243251503, 'epoch': 3.0}


('./fine-tuned-distilbert\\tokenizer_config.json',
 './fine-tuned-distilbert\\special_tokens_map.json',
 './fine-tuned-distilbert\\vocab.txt',
 './fine-tuned-distilbert\\added_tokens.json',
 './fine-tuned-distilbert\\tokenizer.json')

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_dir = "./fine-tuned-distilbert"
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

test_df = pd.read_csv('test_data.csv')

unique_labels = test_df['label'].unique()
print(f"Unique labels in the test set: {unique_labels}")

num_labels = model.config.num_labels
if any(label >= num_labels or label < 0 for label in unique_labels):
    raise ValueError(f"Test dataset contains labels outside the range [0, {num_labels - 1}]")

test_dataset = Dataset.from_pandas(test_df)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

tokenized_test_dataset = tokenized_test_dataset.rename_column("label", "labels")
tokenized_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

trainer = Trainer(
    model=model,
)

predictions = trainer.predict(tokenized_test_dataset)

predictions_tensor = torch.tensor(predictions.predictions)

preds = torch.argmax(predictions_tensor, axis=1)

labels = torch.tensor(test_df['label'].values)
accuracy = accuracy_score(labels, preds)

precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Unique labels in the test set: [2 0 1 3]


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Accuracy: 0.7
Precision: 0.8125
Recall: 0.7
F1 Score: 0.6984126984126984
