In [1]:
import sys
import comet_ml
import torch

import numpy as np

from sklearn.metrics import (
    f1_score, confusion_matrix, classification_report,
    balanced_accuracy_score, accuracy_score,
)
from transformers.integrations import CometCallback

sys.path.append('..')
from lib.utils import set_seed, get_device
from lib.utils.constants import RANDOM_SEED
from lib.data.loading import pd_read_jsonl_file

comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
# import os
# from dotenv import load_dotenv

# load_dotenv()

# comet_api_key = os.getenv("COMET_API_KEY")
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [28]:
import torch

torch.cuda.empty_cache()

In [3]:
set_seed(RANDOM_SEED)

DEVICE = get_device()
print(f'Using device: {DEVICE}')

Using device: cuda


In [4]:
df_en_train = pd_read_jsonl_file('../data/en_train.jsonl')
df_en_test = pd_read_jsonl_file('../data/en_dev.jsonl')
df_en_test_no_label = pd_read_jsonl_file('../data/en_devtest_text_id_only.jsonl')

In [5]:
from sklearn.model_selection import train_test_split

df_en_train, df_en_dev = train_test_split(df_en_train, test_size=0.3, random_state=RANDOM_SEED, stratify=df_en_train["label"])

In [54]:
df_en_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 427536 entries, 386827 to 73522
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          427536 non-null  object
 1   source      427536 non-null  object
 2   sub_source  427536 non-null  object
 3   lang        427536 non-null  object
 4   model       427536 non-null  object
 5   label       427536 non-null  int64 
 6   text        427536 non-null  object
dtypes: int64(1), object(6)
memory usage: 26.1+ MB


In [55]:
df_en_train.head()

Unnamed: 0,id,source,sub_source,lang,model,label,text
386827,c923eeb6-485f-426a-8d92-e097e7d314ff,m4gt,reddit,en,human,0,Virgin Islander here.\n\nAs the wiki entry not...
447629,f963f997-4bba-4ff7-9089-3eeb23b69f00,m4gt,outfox,en,llama3-8b,1,No I think that the Electoral College does not...
527410,035cdd43-a558-4071-b561-09ab5a046654,mage,yelp,en,human,0,"Based on Yelp reviews, I thought I had found m..."
19183,18571e52-70d5-4fe8-8059-b388a48657c6,m4gt,reddit,en,gpt-3.5-turbo,1,Contrary to what you may have seen in western ...
551034,4b6a8cb8-1d78-42a3-a5a3-ab863a7fb844,m4gt,reddit,en,human,0,"That's a bad question. It's like asking ""how b..."


In [7]:
MODEL_NAME = "allenai/longformer-base-4096"

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [9]:
# Count the number of tokens in each sentence
df_en_train['num_tokens'] = df_en_train['text'].apply(lambda x: len(tokenizer(x)['input_ids']))
df_en_dev['num_tokens'] = df_en_dev['text'].apply(lambda x: len(tokenizer(x)['input_ids']))
df_en_test['num_tokens'] = df_en_test['text'].apply(lambda x: len(tokenizer(x)['input_ids']))

Token indices sequence length is longer than the specified maximum sequence length for this model (1457 > 512). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 

In [None]:
df_en_train['num_tokens'].describe()

In [None]:
df_en_dev['num_tokens'].describe()

In [None]:
df_en_test['num_tokens'].describe()

In [None]:
# Create a distribution plot over the number of tokens grouped by dataset and label
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="whitegrid")

fig, ax = plt.subplots(2, 3, figsize=(25, 10))

sns.histplot(df_en_train, x='num_tokens', hue='label', kde=True, ax=ax[0][0])
sns.histplot(df_en_dev, x='num_tokens', hue='label', kde=True, ax=ax[0][1])
sns.histplot(df_en_test, x='num_tokens', kde=True, ax=ax[0][2])

sns.histplot(df_en_train, x='num_tokens', hue='label', kde=True, ax=ax[1][0])
sns.histplot(df_en_dev, x='num_tokens', hue='label', kde=True, ax=ax[1][1])
sns.histplot(df_en_test, x='num_tokens', kde=True, ax=ax[1][2])

ax[0][0].set_title('Train')
ax[0][0].set_xlim(left=-50)
ax[0][1].set_title('Dev')
ax[0][1].set_xlim(left=-50)
ax[0][2].set_title('Test')
ax[0][2].set_xlim(left=-50)

ax[1][0].set_title('Train')
ax[1][0].set_xlim(left=-10, right=2500)
ax[1][1].set_title('Dev')
ax[1][1].set_xlim(left=-10, right=2500)
ax[1][2].set_title('Test')
ax[1][2].set_xlim(left=-10, right=2500)

plt.show()

# Load model

In [30]:
MODEL_NAME = "allenai/longformer-base-4096"

In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight'

In [32]:
model

LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

In [33]:
for param in model.parameters():
    param.requires_grad = False

for param in model.longformer.encoder.layer[-1].parameters():
    param.requires_grad = True

for param in model.classifier.parameters():
    param.requires_grad = True

In [34]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

longformer.encoder.layer.11.attention.self.query.weight
longformer.encoder.layer.11.attention.self.query.bias
longformer.encoder.layer.11.attention.self.key.weight
longformer.encoder.layer.11.attention.self.key.bias
longformer.encoder.layer.11.attention.self.value.weight
longformer.encoder.layer.11.attention.self.value.bias
longformer.encoder.layer.11.attention.self.query_global.weight
longformer.encoder.layer.11.attention.self.query_global.bias
longformer.encoder.layer.11.attention.self.key_global.weight
longformer.encoder.layer.11.attention.self.key_global.bias
longformer.encoder.layer.11.attention.self.value_global.weight
longformer.encoder.layer.11.attention.self.value_global.bias
longformer.encoder.layer.11.attention.output.dense.weight
longformer.encoder.layer.11.attention.output.dense.bias
longformer.encoder.layer.11.attention.output.LayerNorm.weight
longformer.encoder.layer.11.attention.output.LayerNorm.bias
longformer.encoder.layer.11.intermediate.dense.weight
longformer.encod

In [35]:
def get_num_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


trainable_params = get_num_parameters(model)
print(f'Trainable parameters: {trainable_params} ({trainable_params / model.num_parameters():.2%})')

Trainable parameters: 9451778 (6.36%)


# Measure performance before training/fine-tuning the model

In [11]:
import torch

from tqdm import tqdm


def get_performance_metrics(y_true, y_pred):
    f1 = f1_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)
    bal_acc = balanced_accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)
    cr = classification_report(y_true, y_pred)

    return {
        'f1': f1,
        'accuracy': acc,
        'balanced_accuracy': bal_acc,
        'confusion_matrix': cm,
        'classification_report': cr
    }


def measure_initial_performance(model, tokenizer, df_test, device, max_length=512, batch_size=32):
    model.to(device)
    model.eval()

    test_samples = df_test.text.tolist()

    outputs = []
    for i in tqdm(range(0, len(test_samples), batch_size)):
        batch = test_samples[i:i + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output = model(**inputs)
            outputs.append(output.logits)

    outputs = torch.cat(outputs, dim=0)
    predictions = torch.argmax(outputs, dim=1).cpu().numpy()

    performance = get_performance_metrics(df_test.label, predictions)
    return performance


initial_performance = measure_initial_performance(model, tokenizer, df_en_dev, DEVICE)
initial_performance

100%|██████████| 8180/8180 [08:46<00:00, 15.53it/s]


{'f1': 0.38856221156147674,
 'accuracy': 0.6211347886215512,
 'balanced_accuracy': 0.498585638605751,
 'confusion_matrix': array([[   575,  97753],
        [  1418, 162012]]),
 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.29      0.01      0.01     98328\n           1       0.62      0.99      0.77    163430\n\n    accuracy                           0.62    261758\n   macro avg       0.46      0.50      0.39    261758\nweighted avg       0.50      0.62      0.48    261758\n'}

In [11]:
def test_samples(model, tokenizer, df, device, max_length=512):
    model.to(device)
    model.eval()

    test_samples = df.text.tolist()[:5]
    inputs = tokenizer(
        test_samples,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs).logits

    # print(outputs.shape)
    print(outputs)

In [12]:
test_samples(model, tokenizer, df_en_train, DEVICE)

tensor([[-0.0314,  0.0181],
        [-0.0319,  0.0190],
        [-0.0338,  0.0227],
        [-0.0318,  0.0197],
        [-0.0323,  0.0195]], device='cuda:0')


# Fine-tune the model

In [12]:
class_weights = (
    1 / df_en_train.label.value_counts(normalize=True).sort_index()
).tolist()
class_weights = torch.tensor(class_weights)
class_weights = class_weights / class_weights.sum()

class_weights

tensor([0.6252, 0.3748])

In [13]:
from datasets import Dataset, DatasetDict

ds_train = Dataset.from_pandas(df_en_train)
ds_dev = Dataset.from_pandas(df_en_dev)
ds_test = Dataset.from_pandas(df_en_test)

In [14]:
ds_train

Dataset({
    features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text', '__index_level_0__'],
    num_rows: 427536
})

In [15]:
ds_train_shuffled = ds_train.shuffle(seed=RANDOM_SEED)

In [16]:
dataset = DatasetDict({
    'train': ds_train_shuffled,
    'dev': ds_dev,
    'test': ds_test
})

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text', '__index_level_0__'],
        num_rows: 427536
    })
    dev: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text', '__index_level_0__'],
        num_rows: 183231
    })
    test: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 261758
    })
})

In [18]:
MAX_LEN = 2048
# col_to_delete = ['source', 'sub_source', 'lang', 'model']


def preprocess_dataset(ds):
    return tokenizer(ds['text'], truncation=True, max_length=MAX_LEN)


tokenized_datasets = dataset.map(preprocess_dataset, batched=True)
tokenized_datasets.set_format("torch")
     

Map:   0%|          | 0/427536 [00:00<?, ? examples/s]

Map:   0%|          | 0/183231 [00:00<?, ? examples/s]

Map:   0%|          | 0/261758 [00:00<?, ? examples/s]

In [19]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 427536
    })
    dev: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 183231
    })
    test: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 261758
    })
})

In [36]:
from transformers import DataCollatorWithPadding

collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [37]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    return {
        'macro_f1': f1_score(predictions, labels, average='macro'),
        'accuracy': accuracy_score(predictions,labels),
        'balanced_accuracy': balanced_accuracy_score(predictions, labels),
    }

In [38]:
import torch
import torch.nn.functional as F

from transformers import Trainer


class WeightedCrossEntropyTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = torch.tensor(
                class_weights, dtype=torch.float32
        ).to(self.args.device)
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        # Extract labels and convert them to long type for cross_entropy
        labels = inputs.pop("labels").long()

        # Forward pass
        outputs = model(**inputs)

        # Extract logits assuming they are directly outputted by the model
        logits = outputs.get('logits')

        # Compute custom loss with class weights for imbalanced data handling
        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [22]:
# from comet_ml import Experiment
# from comet_ml.integration.pytorch import log_model

# experiment = Experiment(
#     api_key=comet_api_key,
#     project_name="mgt-detection-coling-2025",
#     workspace="unibuc-phd",
# )

In [39]:
params = {
    "learning_rate": 5e-3,
    "num_train_epochs": 2,
    "weight_decay": 1e-2,
    "seed": RANDOM_SEED,
}

# experiment.log_parameters(params)

In [40]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='../results/subtask_a/003_longformer-base-4096',
    learning_rate=params["learning_rate"],
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=params["num_train_epochs"],
    weight_decay=params["weight_decay"],
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # report_to="comet_ml",
    seed=params["seed"],
)

In [41]:
trainer = WeightedCrossEntropyTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['dev'],
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    class_weights=class_weights,
    # callbacks=[CometCallback()],
)

  self.class_weights = torch.tensor(


In [None]:
train_result = trainer.train()

You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Macro F1,Accuracy,Balanced Accuracy
1,0.7975,0.897694,0.384687,0.625189,0.625189




In [None]:
metrics = train_result.metrics
max_train_samples = len(ds_train)
metrics["train_samples"] = min(max_train_samples, len(ds_train))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
trainer.save_model("../results/subtask_a/003_longformer-base-4096/longformer-base-4096")

In [None]:
import pandas as pd
from tqdm import tqdm


def make_predictions(model, tokenizer, df, device, file_path=None, max_len=MAX_LEN, batch_size=16):
    model.eval()

    has_labels = ("label" in df.columns)

    all_ids = []
    all_true = []
    all_predictions = []
    with torch.no_grad():
        for i in tqdm(range(0, len(df), batch_size)):
            ids = df[i:i + batch_size].id.tolist()
            texts = df[i:i + batch_size].text.tolist()
            labels = None
            if has_labels:
                labels = df[i:i + batch_size].label.tolist()

            inputs = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors="pt")
            
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            outputs = model(input_ids, attention_mask).logits
            preds = torch.argmax(outputs, dim=1).cpu().numpy()

            all_ids.extend(ids)
            if has_labels:
                all_true.extend(labels)
            all_predictions.extend(preds)

    df_predictions = pd.DataFrame({
        "id": all_ids,
        "label": all_predictions,
    })

    if has_labels:
        df_predictions["true"] = all_true

    if file_path is not None:
        df_predictions.to_json(
            file_path,
            orient="records",
            lines=True,
        )

    return df_predictions
    

In [None]:
df_test_predictions = make_predictions(
    trainer.model,
    tokenizer,
    df_en_test,
    DEVICE,
)

In [None]:
df_en_train.label.value_counts(normalize=True)

In [None]:
df_test_predictions.label.value_counts(normalize=True)

In [None]:
# Compute metrics for test dataset
print(
    f1_score(df_test_predictions.true, df_test_predictions.label, average='macro')
)

In [None]:
df_test_submission_predictions = make_predictions(
    trainer.model,
    tokenizer,
    df_en_test_no_label,
    DEVICE,
    "../results/subtask_a/003_longformer-base-4096/subtask_a.jsonl",
)

In [None]:
df_test_submission_predictions.label.value_counts(normalize=True)

In [43]:
# experiment.end()