In [1]:
import pyarrow.parquet as pa


In [2]:
# Read the translated training set as a pyarrow Table and display it
# (schema plus a truncated preview of each column).
TRAIN_PARQUET_PATH = '/kaggle/input/translated-train-unlp-2025/translated_train.parquet'
table = pa.read_table(TRAIN_PARQUET_PATH)
table


pyarrow.Table
id: string
content: string
lang: string
manipulative: bool
techniques: list<element: string>
  child 0, element: string
trigger_words: list<element: list<element: int64>>
  child 0, element: list<element: int64>
      child 0, element: int64
translated_content: string
----
id: [["0bb0c7fa-101b-4583-a5f9-9d503339141c","7159f802-6f99-4e9d-97bd-6f565a4a0fae","e6a427f1-211f-405f-bd8b-70798458d656","1647a352-4cd3-40f6-bfa1-d87d42e34eea","9c01de00-841f-4b50-9407-104e9ffb03bf",...,"0e5dd135-ef41-48d3-b274-faedf3a2126c","08e6772a-9793-4ec9-babd-2a9e0e8b31f9","d7cfa984-46f2-450d-b4ec-28a0b5d93756","4256b2b8-43bc-4d90-95c4-5fb25f1ab0e3","d7700072-24d9-443c-8bdb-b5cdd5530d86"]]
content: [["Новий огляд мапи DeepState від російського військового експерта, кухара путіна 2 розряду, спеціаліста по снарядному голоду та ректора музичної академії міноборони рф Євгєнія Пригожина. 
Пригожин прогнозує, що невдовзі настане день звільнення Криму і день розпаду росії. Каже, що передумови цього вж

In [3]:
import os
import ast
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

import wandb

# SECURITY: never commit an API key to a notebook. The key is read from the
# WANDB_API_KEY environment variable instead (on Kaggle, populate it from a
# notebook secret before running this cell). Any key that was previously
# hard-coded here must be treated as leaked and revoked in the W&B settings.
# When the variable is unset, key=None is passed and wandb falls back to its
# own credential lookup (see the wandb.login documentation).
wandb.login(key=os.environ.get("WANDB_API_KEY"))


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshah1st-work-ua[0m ([33mshah1st-work-ua-igor-sikorsky-kyiv-polytechnic-institute[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Names of the ten manipulation techniques used as classification targets.
# len(technique_labels) is what train_model passes as num_labels.
# NOTE(review): the label vectors are built from the sample-submission columns,
# not from this list; the printed labels suggest both share this order — confirm.
technique_labels = [
    "straw_man", "appeal_to_fear", "fud", "bandwagon", "whataboutism",
    "loaded_language", "glittering_generalities", "euphoria",
    "cherry_picking", "cliche",
]

In [5]:
# Load the training set into pandas and inspect the raw `techniques` column
# (a list of technique names per row, or None when no technique is annotated).
TRAIN_PARQUET_PATH = '/kaggle/input/translated-train-unlp-2025/translated_train.parquet'
df = pd.read_parquet(TRAIN_PARQUET_PATH, engine='pyarrow')
df['techniques']

0                   [euphoria, loaded_language]
1             [loaded_language, cherry_picking]
2                   [loaded_language, euphoria]
3                                          None
4                             [loaded_language]
                         ...                   
3817                [loaded_language, euphoria]
3818                          [loaded_language]
3819                                       None
3820                                       None
3821    [loaded_language, whataboutism, cliche]
Name: techniques, Length: 3822, dtype: object

In [6]:
# The sample submission defines which technique columns exist and their order.
ssubmission = pd.read_csv('/kaggle/input/unlp-2025-shared-task-classification-techniques/sample_submission.csv')
targets = ssubmission.set_index('id').columns

from collections.abc import Iterable

# Build the multi-hot label matrix in one pass instead of writing single cells
# with df.loc inside iterrows(): that pattern is very slow on DataFrames and
# addresses rows by index label rather than by position.
target_position = {name: j for j, name in enumerate(targets)}
label_matrix = np.zeros((len(df), len(targets)), dtype=np.int64)

for i, techniques in enumerate(df['techniques']):
    # Rows without annotations (None / non-iterable values) keep an all-zero vector.
    # A bare string is iterable too, but iterating it would yield single characters,
    # so it is skipped explicitly.
    if techniques is None or isinstance(techniques, str) or not isinstance(techniques, Iterable):
        continue
    for t in techniques:
        if not isinstance(t, str):
            continue  # ignore malformed entries instead of failing on .strip()
        j = target_position.get(t.strip().lower())
        if j is not None:
            label_matrix[i, j] = 1

# One 0/1 integer column per technique, assigned positionally.
for j, col in enumerate(targets):
    df[col] = label_matrix[:, j]

# One label vector (1-D integer array) per row, in `targets` order.
df['labels'] = list(label_matrix)

print(df[['id', 'techniques', 'labels']])

                                        id  \
0     0bb0c7fa-101b-4583-a5f9-9d503339141c   
1     7159f802-6f99-4e9d-97bd-6f565a4a0fae   
2     e6a427f1-211f-405f-bd8b-70798458d656   
3     1647a352-4cd3-40f6-bfa1-d87d42e34eea   
4     9c01de00-841f-4b50-9407-104e9ffb03bf   
...                                    ...   
3817  0e5dd135-ef41-48d3-b274-faedf3a2126c   
3818  08e6772a-9793-4ec9-babd-2a9e0e8b31f9   
3819  d7cfa984-46f2-450d-b4ec-28a0b5d93756   
3820  4256b2b8-43bc-4d90-95c4-5fb25f1ab0e3   
3821  d7700072-24d9-443c-8bdb-b5cdd5530d86   

                                   techniques                          labels  
0                 [euphoria, loaded_language]  [0, 0, 0, 0, 0, 1, 0, 1, 0, 0]  
1           [loaded_language, cherry_picking]  [0, 0, 0, 0, 0, 1, 0, 0, 1, 0]  
2                 [loaded_language, euphoria]  [0, 0, 0, 0, 0, 1, 0, 1, 0, 0]  
3                                        None  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]  
4                           [loaded_language]  

In [7]:
def _as_float_list(label_list):
    """Convert one label vector into a plain list of Python floats."""
    return [float(value) for value in label_list]


# Store every label vector as a list of floats.
df['labels'] = df['labels'].apply(_as_float_list)

In [8]:
SPLIT_SEED = 42


def _as_text_frame(source_column):
    """Return the id / text / labels view of ``df`` with ``source_column`` renamed to ``text``."""
    return df[['id', source_column, 'labels']].rename(columns={source_column: 'text'})


# Two parallel views of the same rows: original text and its translation.
df_uk = _as_text_frame('content')
df_en = _as_text_frame('translated_content')

# 80/20 train/test split, then 90/10 train/validation split of the training part.
# Both views go through the same calls with the same seed.
train_uk0, test_uk = train_test_split(df_uk, test_size=0.2, random_state=SPLIT_SEED)
train_en0, test_en = train_test_split(df_en, test_size=0.2, random_state=SPLIT_SEED)

train_uk, valid_uk = train_test_split(train_uk0, test_size=0.1, random_state=SPLIT_SEED)
train_en, valid_en = train_test_split(train_en0, test_size=0.1, random_state=SPLIT_SEED)

# Wrap every pandas split in a Hugging Face Dataset.
dataset_uk_train, dataset_uk_valid, dataset_uk_test = (
    Dataset.from_pandas(part) for part in (train_uk, valid_uk, test_uk)
)
dataset_en_train, dataset_en_valid, dataset_en_test = (
    Dataset.from_pandas(part) for part in (train_en, valid_en, test_en)
)

In [9]:
def tokenize_function(examples, tokenizer, max_length=128):
    """Tokenize the ``"text"`` entry of ``examples`` to a fixed length.

    Args:
        examples: Mapping with a ``"text"`` entry; it is passed to the
            tokenizer unchanged.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=...)``.
        max_length: Length every sequence is truncated / padded to.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        # Pad every example to max_length so all rows have the same length.
        padding="max_length",
        max_length=max_length,
    )

def prepare_dataset(dataset, tokenizer):
    """Tokenize ``dataset`` and expose it as torch tensors.

    Runs two batched ``map`` passes (tokenization, then casting every label
    to ``float``) and finally restricts the output format to the
    ``input_ids``, ``attention_mask`` and ``labels`` columns as torch tensors.

    Args:
        dataset: Object providing ``map`` and ``set_format``; it must have
            ``"text"`` and ``"labels"`` columns.
        tokenizer: Tokenizer forwarded to ``tokenize_function``.

    Returns:
        The mapped dataset with the torch format applied.
    """
    def _tokenize_batch(batch):
        return tokenize_function(batch, tokenizer)

    def _labels_to_float(batch):
        batch["labels"] = [list(map(float, row)) for row in batch["labels"]]
        return batch

    tokenized = dataset.map(_tokenize_batch, batched=True)
    prepared = tokenized.map(_labels_to_float, batched=True)

    prepared.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return prepared


def compute_metrics(eval_pred):
    """Compute macro-averaged F1, precision and recall for multi-label predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds the raw model
            outputs and ``labels`` the 0/1 reference matrix.

    Returns:
        dict with the keys ``"f1"``, ``"precision"`` and ``"recall"``.
    """
    logits, labels = eval_pred
    # NOTE(review): if predictions arrive as a tuple of outputs, this assumes
    # the logits are its first element — confirm for the model in use.
    if isinstance(logits, tuple):
        logits = logits[0]
    # sigmoid(x) > 0.5 holds exactly when x > 0, so threshold the logits directly.
    # This avoids the overflow np.exp(-x) hits for strongly negative logits.
    preds = (np.asarray(logits) > 0).astype(int)
    f1 = f1_score(labels, preds, average='macro', zero_division=0)
    precision = precision_score(labels, preds, average='macro', zero_division=0)
    recall = recall_score(labels, preds, average='macro', zero_division=0)
    return {"f1": f1, "precision": precision, "recall": recall}

def train_model(model_name, train_dataset, eval_dataset, output_dir,
                num_train_epochs=5, learning_rate=2e-5, batch_size=16,
                weight_decay=0.01):
    """Fine-tune a sequence-classification checkpoint for multi-label prediction.

    Both datasets are tokenized with the checkpoint's own tokenizer via
    ``prepare_dataset``. The model is evaluated and checkpointed once per
    epoch, and the checkpoint with the best ``f1`` is reloaded at the end.

    Args:
        model_name: Checkpoint name passed to ``from_pretrained`` for both
            the tokenizer and the model.
        train_dataset: Untokenized training dataset.
        eval_dataset: Untokenized evaluation dataset.
        output_dir: Directory for checkpoints; logs go to ``<output_dir>/logs``.
        num_train_epochs: Number of training epochs (default 5).
        learning_rate: Optimizer learning rate (default 2e-5).
        batch_size: Per-device batch size for both training and evaluation
            (default 16).
        weight_decay: Weight decay coefficient (default 0.01).

    Returns:
        Tuple ``(trainer, metrics)`` where ``metrics`` is the result of a
        final ``trainer.evaluate()`` on ``eval_dataset``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_dataset = prepare_dataset(train_dataset, tokenizer)
    eval_dataset = prepare_dataset(eval_dataset, tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        # One output per technique name in the module-level technique_labels list.
        num_labels=len(technique_labels),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        # Model selection uses the "f1" value reported by compute_metrics.
        load_best_model_at_end=True,
        metric_for_best_model="f1"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    metrics = trainer.evaluate()
    return trainer, metrics



In [10]:
# Fine-tune multilingual BERT on the original (untranslated) texts.
print("Обучение модели на украинских оригинальных текстах...")
trainer_uk, metrics_uk = train_model(
    "bert-base-multilingual-cased",
    dataset_uk_train,
    dataset_uk_valid,
    "model_uk",
)


Обучение модели на украинских оригинальных текстах...


Map:   0%|          | 0/2751 [00:00<?, ? examples/s]

Map:   0%|          | 0/2751 [00:00<?, ? examples/s]

Map:   0%|          | 0/306 [00:00<?, ? examples/s]

Map:   0%|          | 0/306 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3179,0.305397,0.07967,0.07513,0.084795
2,0.2642,0.298392,0.113739,0.155588,0.091228
3,0.2588,0.302267,0.129183,0.369608,0.095881
4,0.2222,0.300201,0.155996,0.347165,0.119654
5,0.2068,0.304399,0.157937,0.347733,0.11679


In [11]:
# Fine-tune English BERT on the translated texts.
print("Обучение модели на переведённых текстах...")
trainer_en, metrics_en = train_model(
    "bert-base-uncased",
    dataset_en_train,
    dataset_en_valid,
    "model_en",
)

Обучение модели на переведённых текстах...


Map:   0%|          | 0/2751 [00:00<?, ? examples/s]

Map:   0%|          | 0/2751 [00:00<?, ? examples/s]

Map:   0%|          | 0/306 [00:00<?, ? examples/s]

Map:   0%|          | 0/306 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3237,0.320968,0.071667,0.068254,0.075439
2,0.2579,0.302038,0.117261,0.153686,0.100292
3,0.26,0.302558,0.141513,0.259872,0.109732
4,0.2213,0.295017,0.175064,0.403242,0.134886
5,0.2068,0.296363,0.173345,0.351901,0.138328


In [12]:
# Tokenize each held-out test split with the tokenizer of the model that will
# score it, then report the test metrics of both models.
tokenizer1 = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
dataset_uk_test = prepare_dataset(dataset_uk_test, tokenizer1)
tokenizer2 = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset_en_test = prepare_dataset(dataset_en_test, tokenizer2)

print("Сравнение моделей:")
test_metrics_uk = trainer_uk.evaluate(dataset_uk_test)
print("Модель на украинских текстах:", test_metrics_uk)
test_metrics_en = trainer_en.evaluate(dataset_en_test)
print("Модель на переведённых текстах:", test_metrics_en)

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Map:   0%|          | 0/765 [00:00<?, ? examples/s]

Сравнение моделей:
Модель на украинских текстах: {'eval_loss': 0.27367255091667175, 'eval_f1': 0.17656205186042148, 'eval_precision': 0.2980612671871993, 'eval_recall': 0.13792460583108596, 'eval_runtime': 2.7011, 'eval_samples_per_second': 283.219, 'eval_steps_per_second': 17.771, 'epoch': 5.0}
Модель на переведённых текстах: {'eval_loss': 0.27202582359313965, 'eval_f1': 0.18919659881143192, 'eval_precision': 0.3497547974413647, 'eval_recall': 0.148049206561725, 'eval_runtime': 2.7094, 'eval_samples_per_second': 282.354, 'eval_steps_per_second': 17.716, 'epoch': 5.0}


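### Модель на переведённых текстах показывает немного более высокие значения всех метрик на тестовой выборке (macro-F1 0.189 против 0.177, precision 0.350 против 0.298, recall 0.148 против 0.138), но результаты сопоставимы: разница невелика и получена на одном запуске с одним разбиением данных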