In [38]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [39]:
def load_data(file_paths):
    dataframes = [pd.read_csv(file, delimiter=';') for file in file_paths]
    data = pd.concat(dataframes, ignore_index=True)
    return data

In [40]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable, sized collection of raw text strings.
        labels: Indexable, sized collection of integer class ids, aligned
            position by position with ``texts``.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in ``__getitem__`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Catch misaligned inputs here rather than as an IndexError mid-training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})."
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize example ``item`` and return it as a dict of model inputs.

        Returns:
            dict: ``'text'`` (the raw string), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors) and ``'labels'``
            (a ``torch.long`` scalar tensor).
        """
        text = self.texts[item]
        label = self.labels[item]
        # Call the tokenizer directly; encode_plus is deprecated in favour of __call__.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # flatten() collapses the tokenizer output to 1-D per example.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [41]:
# CSV files that make up the dataset; every path listed here is loaded and
# concatenated into one DataFrame.
file_paths = ['SpacialFor_Gosha.csv']

In [42]:
# Read every listed file into a single DataFrame.
data = load_data(file_paths=file_paths)

In [43]:
print("Columns in the DataFrame:", data.columns)

# The rest of the notebook reads the text from 'data' and the label from 'class'.
has_required_columns = all(column in data.columns for column in ('data', 'class'))

# Stop right away when the schema is wrong instead of failing later.
if not has_required_columns:
    print("Columns are missing or misnamed. Please check the column names.")
    raise KeyError("Required columns ('data', 'class') are not present in the DataFrame.")

print("'data' and 'class' columns are present.")
texts = data['data'].tolist()
labels = data['class'].tolist()

Columns in the DataFrame: Index(['data', 'class'], dtype='object')
'data' and 'class' columns are present.


In [44]:
# Count unusable labels before cleaning: first missing ones, then infinities.
missing_label_count = data['class'].isna().sum()
print("Missing values in 'class' column:", missing_label_count)
infinite_label_count = np.isinf(data['class']).sum()
print("Infinite values in 'class' column:", infinite_label_count)

Missing values in 'class' column: 1000
Infinite values in 'class' column: 0


In [45]:
# Drop rows that lack a label or a text. A missing text would otherwise be
# handed to the tokenizer as NaN (a float) once the texts are tokenized.
data = data.dropna(subset=['data', 'class'])
# Remove infinite labels; .copy() yields an independent frame so that later
# column assignments do not operate on a slice of the loaded data.
data = data[~np.isinf(data['class'])].copy()

In [46]:
# Labels must be integer class ids for the classification head.
labels_as_int = data['class'].astype(int)
data['class'] = labels_as_int
print("Unique labels in the dataset:", data['class'].unique())

# Only the three classes 0, 1 and 2 are supported.
labels_in_range = data['class'].between(0, 2)
if not labels_in_range.all():
    raise ValueError("Labels must be in the range [0, 1, 2]")

Unique labels in the dataset: [0 2 1]


In [47]:
# Load the tokenizer from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [48]:
# Every example is padded or truncated to this many tokens by TextDataset.
max_len = 128

# Take the columns from the current `data` frame as plain Python lists.
texts = data['data'].tolist()
labels = data['class'].tolist()

# Hold out 20% of the examples for validation; the fixed seed keeps the split stable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

train_dataset = TextDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_len=max_len,
)
val_dataset = TextDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_len=max_len,
)

In [49]:
# Load 'bert-base-uncased' with a sequence-classification head sized for three labels.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Prerequisite noted by the original author: pip install accelerate -U
#
# NOTE(review): this cell carries execution count 12 while its neighbours are
# 49 and 50, and the following cell rebinds `training_args` before the Trainer
# is built, so on a top-to-bottom run these settings never reach training.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Epochs and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Optimisation
    warmup_steps=500,
    weight_decay=0.01,
    # Logging, evaluation and checkpoint cadence
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
)

In [50]:
# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Epochs and batch sizes
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.01,
    # Logging, evaluation and checkpoint cadence
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
)



In [51]:
def compute_eval_metrics(p):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    ``p`` is the object the Trainer passes to ``compute_metrics``; it is read
    through ``p.predictions`` and ``p.label_ids``.
    """
    # argmax over the last axis selects the predicted class for each example.
    predicted_labels = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, predicted_labels)
    report = classification_report(p.label_ids, predicted_labels, output_dict=True)
    return {
        'accuracy': accuracy,
        'f1': report['macro avg']['f1-score'],
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_eval_metrics,
)

In [52]:
# Fine-tune the model using the configuration held by `trainer`. The call is
# left as the cell's last expression so the notebook displays its return value.
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=2660, training_loss=0.6693605626436104, metrics={'train_runtime': 9840.9606, 'train_samples_per_second': 2.159, 'train_steps_per_second': 0.27, 'total_flos': 1397461139907840.0, 'train_loss': 0.6693605626436104, 'epoch': 5.0})

In [53]:
# Run an evaluation pass through the trainer and print the metrics dict it
# returns (loss, the accuracy/F1 values from compute_metrics, and timing stats).
evaluation_results = trainer.evaluate()
print("Evaluation results:", evaluation_results)

Evaluation results: {'eval_loss': 0.811968982219696, 'eval_accuracy': 0.7347130761994356, 'eval_f1': 0.7344273743911741, 'eval_runtime': 114.7601, 'eval_samples_per_second': 9.263, 'eval_steps_per_second': 1.159, 'epoch': 5.0}


In [55]:
# Directory that receives the fine-tuned model and the tokenizer files.
model_save_path = './saved_model'

# Create the target directory up front; exist_ok makes re-running the cell safe.
os.makedirs(model_save_path, exist_ok=True)

# Store the model and the tokenizer in the same directory.
model.save_pretrained(model_save_path)
# Kept as the last expression so the notebook displays the written file paths.
tokenizer.save_pretrained(model_save_path)

('./saved_model\\tokenizer_config.json',
 './saved_model\\special_tokens_map.json',
 './saved_model\\vocab.txt',
 './saved_model\\added_tokens.json')