In [None]:
!pip install datasets



In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ

In [None]:
import pandas as pd
import ast
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import evaluate
import os
from evaluate import load
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification
from transformers import BertTokenizer, BertForTokenClassification

In [None]:
# Load the annotated data from the CSV file into a DataFrame.
df = pd.read_csv('eo.csv')

def safe_literal_eval(x):
    """Parse a string holding a Python literal, returning NaN when parsing fails.

    Args:
        x: Value to parse; normally the string form of a Python literal.
            Non-string values (for example a float NaN from a missing CSV
            cell) are treated as unparseable.

    Returns:
        The object produced by ``ast.literal_eval(x)``, or ``np.nan`` when the
        input is not a valid literal.
    """
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals that parse but cannot be built, such as a
        # dict with an unhashable key; one bad row should not abort the load.
        return np.nan  # an empty list ([]) could be used here instead

# Convert the string values of the 'labeled' column into Python objects.
df['labeled'] = df['labeled'].apply(safe_literal_eval)

In [None]:
def extract_tokens_labels(labeled):
    """Split a list of (token, label) pairs into two parallel lists.

    Args:
        labeled: A list of two-item pairs, or any non-list value (for example
            NaN for a row that could not be parsed).

    Returns:
        A ``(tokens, labels)`` tuple of lists; both are empty when ``labeled``
        is not a list.
    """
    # Anything that is not a list (e.g. NaN) yields empty results.
    if not isinstance(labeled, list):
        return [], []

    tokens, labels = [], []
    for token, label in labeled:
        tokens.append(token)
        labels.append(label)
    return tokens, labels

In [None]:
# Expand each row's (tokens, labels) pair into the 'tokens' and 'labels' columns.
df[['tokens', 'labels']] = df['labeled'].apply(lambda x: pd.Series(extract_tokens_labels(x)))

In [None]:
# Fit an encoder that maps every label value found in the data to an integer id.
label_encoder = LabelEncoder()
all_labels = []
for row_labels in df['labels']:
    all_labels.extend(row_labels)
label_encoder.fit(all_labels)


def _encode_row(row_labels):
    """Encode one row of labels to integer ids, then shift positive ids down by one."""
    encoded = label_encoder.transform(row_labels).tolist()
    # NOTE(review): LabelEncoder ids already start at 0, so this shift maps
    # encoder ids 0 and 1 to the same value and the stored ids no longer match
    # label_encoder.classes_ -- confirm that merging those two classes is intended.
    return [code if code <= 0 else code - 1 for code in encoded]


# Replace the label values with their (shifted) integer ids.
df['labels'] = df['labels'].apply(_encode_row)

In [None]:
# Build a Dataset from the token and label columns.
dataset = Dataset.from_pandas(df[['tokens', 'labels']])

In [None]:
def prepare_data(df):
    """Return the 'tokens' and 'labels' columns of ``df`` as plain Python lists.

    Args:
        df: DataFrame containing 'tokens' and 'labels' columns.

    Returns:
        A dict with keys 'tokens' and 'labels', each holding that column's
        values as a list in row order.
    """
    return {column: df[column].tolist() for column in ('tokens', 'labels')}

In [None]:
data = prepare_data(df)

# Fail fast when there are no rows to split.
if len(data['tokens']) == 0:
    raise ValueError("No data available to split.")

# Split tokens and labels into training (70%) and test (30%) parts.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['tokens'],
    data['labels'],
    random_state=42,
    test_size=0.3,
)

In [None]:
# Split the rows 70/30, then halve the 30% hold-out into validation and test
# parts; the fixed random_state keeps both splits reproducible.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.3)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Wrap each DataFrame split in a Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Checkpoint shared by the tokenizer and the token-classification model.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Size the classification head from the fitted label encoder instead of a
# hard-coded constant: every id stored in df['labels'] is below
# len(label_encoder.classes_), so the head always covers all label ids.
num_labels = len(label_encoder.classes_)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level labels with sub-word tokens.

    Only the first sub-word token of each word receives that word's label.
    Every other position (special tokens, padding, continuation sub-words, and
    words with no label) gets -100, the value the rest of this notebook
    filters out as "ignore".

    Args:
        examples: A batch with 'tokens' (one list of words per example) and
            'labels' (one list of integer label ids per example).

    Returns:
        The tokenizer output for the batch with an added 'labels' entry
        holding one list of label ids per example, one id per token.
    """
    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation=True,
        is_split_into_words=True,
        padding='max_length',
        return_offsets_mapping=True  # no longer used for alignment; kept so the returned columns are unchanged
    )

    labels = []
    for i, label_list in enumerate(examples['labels']):
        # word_ids() gives, for each token, the index of the source word it
        # came from, or None for special/padding tokens. Character offsets are
        # relative to each word when is_split_into_words=True, so they cannot
        # be used to recover the word index.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None or word_idx >= len(label_list):
                label_ids.append(-100)  # Special/padding token, or word without a label
            elif word_idx != previous_word_idx:
                label_ids.append(label_list[word_idx])  # First sub-word of a word
            else:
                label_ids.append(-100)  # Continuation sub-word of the same word
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs['labels'] = labels
    return tokenized_inputs

In [None]:
def remove_ignore_labels(labels):
    """Return the labels in their original order without the -100 ignore marker."""
    kept = []
    for label in labels:
        if label == -100:
            continue
        kept.append(label)
    return kept

In [None]:
# Tokenize every split in batches and attach the aligned labels.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/2946 [00:00<?, ? examples/s]

Map:   0%|          | 0/631 [00:00<?, ? examples/s]

Map:   0%|          | 0/632 [00:00<?, ? examples/s]

In [None]:
# Drop the raw 'tokens' column from every split. The validation result was
# previously assigned to a misspelled name (vsl_dataset), which left
# val_dataset unchanged.
train_dataset = train_dataset.remove_columns(['tokens'])
val_dataset = val_dataset.remove_columns(['tokens'])
test_dataset = test_dataset.remove_columns(['tokens'])

In [None]:
# Load the four evaluation metrics used by compute_metrics.
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

In [None]:
def compute_metrics(pred):
    """Compute token-level accuracy, precision, recall and F1 for a prediction batch.

    Positions whose reference label is -100 are excluded from every metric.
    Precision, recall and F1 are computed with ``average="weighted"``.

    Args:
        pred: Object exposing ``predictions`` (per-token class scores, reduced
            with argmax over the last axis) and ``label_ids`` (reference label
            ids laid out like the argmax result).

    Returns:
        A dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    # Pick the highest-scoring class id for every token.
    predicted_ids = pred.predictions.argmax(axis=-1)

    # Walk predictions and references together, keeping only labelled positions.
    valid_predictions, valid_labels = [], []
    for predicted_row, label_row in zip(predicted_ids, pred.label_ids):
        for predicted, gold in zip(predicted_row, label_row):
            if gold == -100:
                continue
            valid_predictions.append(predicted)
            valid_labels.append(gold)

    scores = {
        "accuracy": accuracy_metric.compute(
            predictions=valid_predictions, references=valid_labels
        )["accuracy"],
    }
    weighted_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, metric in weighted_metrics:
        scores[key] = metric.compute(
            predictions=valid_predictions, references=valid_labels, average="weighted"
        )[key]
    return scores

In [None]:
# Training configuration.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Per-device batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Disable external experiment trackers such as W&B.
    report_to="none"
)



In [None]:
# Wire the model, configuration, data splits and metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Run training; the returned training summary is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.446374,0.888629,0.815241,0.888629,0.837411
2,0.457600,0.433816,0.886442,0.812121,0.886442,0.842895
3,0.390800,0.448671,0.889097,0.818567,0.889097,0.842419


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=1107, training_loss=0.41681755569030704, metrics={'train_runtime': 1190.1881, 'train_samples_per_second': 7.426, 'train_steps_per_second': 0.93, 'total_flos': 2309361614641152.0, 'train_loss': 0.41681755569030704, 'epoch': 3.0})

In [None]:
# Evaluate the trained model on the held-out test split and print the metrics.
final_metrics = trainer.evaluate(test_dataset)
print("–ò—Ç–æ–≥–æ–≤—ã–µ –º–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–º –Ω–∞–±–æ—Ä–µ:", final_metrics)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


–ò—Ç–æ–≥–æ–≤—ã–µ –º–µ—Ç—Ä–∏–∫–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤–æ–º –Ω–∞–±–æ—Ä–µ: {'eval_loss': 0.4498614966869354, 'eval_accuracy': 0.8815151040851463, 'eval_precision': 0.807306214502757, 'eval_recall': 0.8815151040851463, 'eval_f1': 0.8366245508646941, 'eval_runtime': 18.3214, 'eval_samples_per_second': 34.495, 'eval_steps_per_second': 4.312, 'epoch': 3.0}


In [None]:
# Save the trained model and its tokenizer into the same directory.
model.save_pretrained('./train_model')
tokenizer.save_pretrained('./train_model')

('./train_model/tokenizer_config.json',
 './train_model/special_tokens_map.json',
 './train_model/vocab.txt',
 './train_model/added_tokens.json',
 './train_model/tokenizer.json')