In [1]:
import pandas as pd
import datasets
import os
import json
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from datasets import Dataset, DatasetDict
from transformers import (Trainer, 
                          TrainingArguments, 
                          AutoTokenizer, 
                          AutoModelForSequenceClassification)
from evaluate import load
from utils.preprocessor import Preprocessor

2024-10-30 14:25:44.909466: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-30 14:25:44.918110: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-30 14:25:44.926627: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-30 14:25:44.929326: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-30 14:25:44.937369: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Project-local preprocessor built over the raw CSV; the dataset-building cells
# below call its `get_*` methods on this same instance.
preprocessor = Preprocessor(path2dataset="data/data.csv")

In [3]:
# Build the pair table used for sentence-pair classification. The result is
# indexed like a DataFrame and exposes a `label` column (inspected in the next cell).
# NOTE(review): judging by the method name this adds negative (non-matching)
# pairs; confirm against utils/preprocessor.py. `save_dataset=False` presumably
# means nothing is written under `path_save` -- verify.
data = preprocessor.get_dataset_with_negative_samples(
    path_save="data/data_for_sentence_pair_classification/data",
    question_col="service_name",
    context_col="local_name",
    save_dataset=False,
)

In [4]:
# Class-balance check: relative frequency of each `label` value, shown as the cell output.
data['label'].value_counts(normalize=True)

0    0.83318
1    0.16682
Name: label, dtype: float64

In [5]:
# Split into train/validation (80/20, fixed random_state) and display the result.
# NOTE(review): the returned DatasetDict is only displayed, never bound to a
# name, and `save_dict=False` presumably skips writing it to `save_path`.
# The training cells further down read `train.json` / `valid.json` instead --
# confirm those files were produced from this same split.
preprocessor.get_dict_for_training(
    test_size=0.2,
    random_state=42,
    save_path="data/data_for_sentence_pair_classification",
    save_dict=False,
)

DatasetDict({
    train: Dataset({
        features: ['service_name', 'local_name', 'label', '__index_level_0__'],
        num_rows: 69646
    })
    validation: Dataset({
        features: ['service_name', 'local_name', 'label', '__index_level_0__'],
        num_rows: 17412
    })
})

In [None]:
# Tokenizer for the same "roberta-base" checkpoint that the model cell loads;
# `preprocess_function` reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [12]:
def preprocess_function(batch):
    """Tokenize (local_name, service_name) text pairs with the global tokenizer.

    Intended for ``dataset.map(..., batched=True)``: ``batch`` is indexed by
    column name, and the two columns are passed to the tokenizer as the first
    and second sequence of each pair.

    Sequences are truncated and padded with ``padding="max_length"``, i.e.
    every example is padded to a fixed length rather than to the longest
    example in its batch.

    Returns whatever the tokenizer call returns for the pair batch.
    """
    first_sequences = batch["local_name"]
    second_sequences = batch["service_name"]
    encoded = tokenizer(
        first_sequences,
        second_sequences,
        truncation=True,
        padding="max_length",
    )
    return encoded
 
# Load the train/valid splits from JSON files in the working directory.
# NOTE(review): the split is named "valid" here, while the DatasetDict shown
# by the preprocessor uses "validation"; confirm where train.json / valid.json
# come from, since no earlier cell writes them.
dataset = datasets.load_dataset(
    "json",
    data_files={"train": "train.json", "valid": "valid.json"},
)

In [13]:
# Apply the tokenizer to every split; batched=True hands preprocess_function
# whole batches of rows instead of one row at a time.
tokenized_data = dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np

from functools import lru_cache


@lru_cache(maxsize=1)
def _get_f1_metric():
    """Load the `f1` metric once and reuse it for every evaluation pass."""
    return load("f1")


def compute_metrics(eval_pred):
    """Compute F1 for one evaluation pass of the Trainer.

    Args:
        eval_pred: a ``(logits, labels)`` pair; ``logits`` is reduced to class
            ids with an argmax over its last axis.

    Returns:
        The dict produced by the metric's ``compute`` call.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Evaluation runs every `eval_steps`, so the metric is fetched from the
    # cached helper instead of being re-loaded on each call. Passing the
    # predictions straight to `compute` replaces the add_batch + compute pair.
    return _get_f1_metric().compute(predictions=predictions, references=labels)

# roberta-base with a sequence-classification head sized for two labels,
# matching the 0/1 values seen in the `label` column above.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

# Fine-tuning configuration. Evaluation and checkpointing share the same
# 100-step cadence, so every saved checkpoint has a matching F1 score.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    # With only two checkpoints retained, the best-scoring one would otherwise
    # be rotated away. Track the "f1" key returned by compute_metrics, keep the
    # best checkpoint, and reload it when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Wire the model, arguments, tokenized splits and F1 callback together.
# NOTE(review): no data collator or tokenizer is passed here, so batching
# presumably relies on every example already having the same padded length
# from preprocess_function -- confirm before changing the padding strategy.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["valid"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration above; checkpoints go to ./results
# and logs to ./logs as set in training_args.
trainer.train()