In [36]:
import pandas as pd
import numpy as np
import tensorflow as tf

DATA_FILE = "parsed_data.csv"
SPLIT_SEED = 42  # fixed so the train/test split survives Restart & Run All unchanged
POV_TO_LABEL = {"UA": 0, "RU": 1, "NONE": 2}

# Load only the two columns the classifier needs, from the configured path.
df = pd.read_csv(DATA_FILE, usecols=["title", "pov"])
titles = df["title"]
# pov values that are missing or not in POV_TO_LABEL map to NaN and are
# removed by the dropna() below.
povs = df["pov"].map(POV_TO_LABEL).rename("label")
df = df.drop("pov", axis=1)
df = df.rename(columns={"title": "text"})
df = df.join(povs)

df = df.dropna(axis=0)
# Any NaN produced by map() turns the label column into floats; after the
# NaN rows are gone, restore integer class ids.
df = df.astype({"label": "int64"})

# 80/20 split. The frames get their own names so that train_ds / test_ds
# only ever refer to Dataset objects.
train_df = df.sample(frac=0.8, random_state=SPLIT_SEED)
test_df = df.drop(train_df.index)
assert len(train_df) + len(test_df) == len(df), "train/test split lost or duplicated rows"

from datasets import Dataset

train_ds = Dataset.from_pandas(train_df, split="train")
test_ds = Dataset.from_pandas(test_df, split="test")

print(df)



                                                    text  label
0      NYT Kherson braces for battle as RF administra...      0
1      Russia’s Wagner Group commanders cut off convi...      1
2      Convict recruits are tortured and executed in ...      1
3                                    love me some himars      0
4      3 Ukrainian jets Allegedly MiG29 flying Eastwa...      0
...                                                  ...    ...
12497       destroying the positions of Ukrainian Forces      1
12498                                            Kharkov      2
12499  Geotag on photos and video and how to take the...      2
12500                         update, useful links, more      2
12501    rUkraineWarVideoReport  Sub in work in progress      2

[12500 rows x 2 columns]


In [38]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_dataset(data, max_length=50):
    """Tokenize the ``"text"`` field of a dataset example or batch.

    Args:
        data: Mapping with a ``"text"`` entry (one string, or a list of
            strings when called through ``Dataset.map(..., batched=True)``).
        max_length: Length every sequence is truncated and padded to.
            Defaults to 50, the value this notebook has always used.

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    return tokenizer(
        data["text"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

# batched=True passes lists of texts to the tokenizer instead of invoking it
# once per row; the resulting columns are the same.
train_ds_tokenized = train_ds.map(tokenize_dataset, batched=True)
test_ds_tokenized = test_ds.map(tokenize_dataset, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [39]:
from transformers import AutoModelForSequenceClassification

# Pretrained BERT encoder with a sequence-classification head sized for the
# three label ids (0, 1, 2) produced during data preparation.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=3,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args = TrainingArguments(
    output_dir="./urr_transformer/",
    logging_dir="./urr_transformer/logs",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-6,
    seed=42,
    # Log, evaluate and checkpoint once per epoch. The former logging_steps /
    # save_steps / eval_steps values were dropped: they only apply to the
    # "steps" strategies and contradicted the per-epoch settings.
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Reload the best checkpoint after training. The selection metric is
    # spelled out (lowest eval loss) instead of relying on the implicit
    # default, so it is clear what "best" means here.
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

import evaluate

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is an array whose
            last axis holds per-class scores, or a tuple whose first element
            is that array; ``labels`` holds the integer class ids.

    Returns:
        dict: ``{"accuracy": <fraction of correct predictions as float>}``.
    """
    logits, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy so the metric is not re-loaded on every
    # evaluation call.
    accuracy = np.mean(predictions == np.asarray(labels))
    return {"accuracy": float(accuracy)}

# Stop training once the monitored evaluation metric has failed to improve
# for a single evaluation round.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# NOTE(review): the held-out test split doubles as the evaluation set that
# drives early stopping and best-checkpoint selection, so scores reported on
# it are not an unbiased estimate — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds_tokenized,
    eval_dataset=test_ds_tokenized,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()