In [1]:
import torch
import numpy as np
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, set_seed
from transformers import AutoModelForSequenceClassification, Trainer
from sklearn.metrics import f1_score, classification_report
import os

SEED = 2387
set_seed(SEED)
import logging
#logging.getLogger("transformers").setLevel(logging.ERROR)

# Params & Settings

In [2]:
ex = 23
num_epochs = 5
checkpoint = 'bert-base-uncased'

# Helper Funcs

In [3]:
def sigmoid(X):
    return 1 / (1 + np.exp(-X))

def heaviside(X):
    return np.heaviside(X - 0.5, 0)

def f1(preds):
    logits, y_true = preds
    y_pred = heaviside(sigmoid(logits))
    return {'f1':f1_score(y_true, y_pred, average='macro')}

# Loading datasets (train, val, zhihu)

In [4]:
custom_tokens = pd.read_csv(f'experiments/ex{ex}/custom-tokens-mlabel.csv')

In [5]:
# custom_tokens = pd.read_csv(f'experiments/ex{ex}/custom-tokens-mlabel.csv')
# custom_tokens = custom_tokens['0'].tolist()
custom_tokens = []

In [6]:
#ds_url = 'https://zenodo.org/record/7550385/files/'
ds_url = f'experiments/ex{ex}/'
ds_files = {
    'train': ds_url + 'train-mlabel.tsv',
    'val': ds_url + 'val-mlabel.tsv',
    'val.zhihu': ds_url + 'zhihu-mlabel.tsv'
}

ds = load_dataset('csv', data_files=ds_files, delimiter='\t')
ds = ds.rename_columns({'Text':'text', 'Labels':'label', 'Argument ID': 'id'})
ds

Using custom data configuration default-e54f585afe5ee5ee


Downloading and preparing dataset csv/default to /home/erfan/.cache/huggingface/datasets/csv/default-e54f585afe5ee5ee/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Generating val split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Generating val.zhihu split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/erfan/.cache/huggingface/datasets/csv/default-e54f585afe5ee5ee/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 5393
    })
    val: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 1896
    })
    val.zhihu: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 100
    })
})

In [7]:
import ast
def convert_labels(example):
    example["label"] = [float(i) for i in ast.literal_eval(example["label"])]
    return example

ds = ds.map(convert_labels)


  0%|          | 0/5393 [00:00<?, ?ex/s]

  0%|          | 0/1896 [00:00<?, ?ex/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

# Loading Model & Tokenizer & Collator

In [8]:
custom_tokens

[]

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenizer.add_tokens(custom_tokens)
# vocab_size = tokenizer.vocab_size

def tokenize_function(example):
    return tokenizer(example['text'], truncation=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [10]:
training_args = TrainingArguments(
    f'experiments/ex{ex}/models', True,
    num_train_epochs=num_epochs,
    
    evaluation_strategy='epoch',
    save_strategy='epoch'
)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=20, problem_type="multi_label_classification")
# model.resize_token_embeddings(vocab_size)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=f1,
)

# Training the Model

In [12]:
train_output = trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5393
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3375
  Number of trainable parameters = 109497620


  0%|          | 0/3375 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3872, 'learning_rate': 4.259259259259259e-05, 'epoch': 0.74}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

Saving model checkpoint to experiments/ex23/models/checkpoint-675
Configuration saved in experiments/ex23/models/checkpoint-675/config.json


{'eval_loss': 0.34467318654060364, 'eval_f1': 0.25278144758708027, 'eval_runtime': 14.5999, 'eval_samples_per_second': 129.864, 'eval_steps_per_second': 16.233, 'epoch': 1.0}


Model weights saved in experiments/ex23/models/checkpoint-675/pytorch_model.bin
tokenizer config file saved in experiments/ex23/models/checkpoint-675/tokenizer_config.json
Special tokens file saved in experiments/ex23/models/checkpoint-675/special_tokens_map.json


{'loss': 0.3195, 'learning_rate': 3.518518518518519e-05, 'epoch': 1.48}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

Saving model checkpoint to experiments/ex23/models/checkpoint-1350
Configuration saved in experiments/ex23/models/checkpoint-1350/config.json


{'eval_loss': 0.3370417356491089, 'eval_f1': 0.3225185940350258, 'eval_runtime': 14.7641, 'eval_samples_per_second': 128.42, 'eval_steps_per_second': 16.052, 'epoch': 2.0}


Model weights saved in experiments/ex23/models/checkpoint-1350/pytorch_model.bin
tokenizer config file saved in experiments/ex23/models/checkpoint-1350/tokenizer_config.json
Special tokens file saved in experiments/ex23/models/checkpoint-1350/special_tokens_map.json


{'loss': 0.2855, 'learning_rate': 2.777777777777778e-05, 'epoch': 2.22}
{'loss': 0.2481, 'learning_rate': 2.037037037037037e-05, 'epoch': 2.96}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

Saving model checkpoint to experiments/ex23/models/checkpoint-2025
Configuration saved in experiments/ex23/models/checkpoint-2025/config.json


{'eval_loss': 0.3370012640953064, 'eval_f1': 0.36227846346238846, 'eval_runtime': 14.9813, 'eval_samples_per_second': 126.558, 'eval_steps_per_second': 15.82, 'epoch': 3.0}


Model weights saved in experiments/ex23/models/checkpoint-2025/pytorch_model.bin
tokenizer config file saved in experiments/ex23/models/checkpoint-2025/tokenizer_config.json
Special tokens file saved in experiments/ex23/models/checkpoint-2025/special_tokens_map.json


{'loss': 0.2072, 'learning_rate': 1.2962962962962962e-05, 'epoch': 3.7}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

Saving model checkpoint to experiments/ex23/models/checkpoint-2700
Configuration saved in experiments/ex23/models/checkpoint-2700/config.json


{'eval_loss': 0.347305029630661, 'eval_f1': 0.372829420267183, 'eval_runtime': 15.0421, 'eval_samples_per_second': 126.046, 'eval_steps_per_second': 15.756, 'epoch': 4.0}


Model weights saved in experiments/ex23/models/checkpoint-2700/pytorch_model.bin
tokenizer config file saved in experiments/ex23/models/checkpoint-2700/tokenizer_config.json
Special tokens file saved in experiments/ex23/models/checkpoint-2700/special_tokens_map.json


{'loss': 0.1854, 'learning_rate': 5.555555555555556e-06, 'epoch': 4.44}


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

Saving model checkpoint to experiments/ex23/models/checkpoint-3375
Configuration saved in experiments/ex23/models/checkpoint-3375/config.json


{'eval_loss': 0.35196682810783386, 'eval_f1': 0.37969985402511675, 'eval_runtime': 15.1267, 'eval_samples_per_second': 125.341, 'eval_steps_per_second': 15.668, 'epoch': 5.0}


Model weights saved in experiments/ex23/models/checkpoint-3375/pytorch_model.bin
tokenizer config file saved in experiments/ex23/models/checkpoint-3375/tokenizer_config.json
Special tokens file saved in experiments/ex23/models/checkpoint-3375/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 1029.2592, 'train_samples_per_second': 26.198, 'train_steps_per_second': 3.279, 'train_loss': 0.26091371437355326, 'epoch': 5.0}


In [13]:
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
device_name = torch.cuda.get_device_name(torch.device('cuda'))

repline = f'{ex},{SEED},{checkpoint},{trainable_params},{train_output[2]["train_runtime"]},{device_name},{len(tokenized_datasets["train"])},{num_epochs}'

In [14]:
res_train = trainer.evaluate(tokenized_datasets['train'])
res_val = trainer.evaluate(tokenized_datasets['val'])
res_zhihu = trainer.evaluate(tokenized_datasets['val.zhihu'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 5393
  Batch size = 8


  0%|          | 0/675 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


  0%|          | 0/13 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [15]:
repline += f',{res_train["eval_f1"]},{res_val["eval_f1"]},{res_zhihu["eval_f1"]}'
repline += f',{res_train["eval_loss"]},{res_val["eval_loss"]},{res_zhihu["eval_loss"]}'

In [16]:
repline

'23,2387,bert-base-uncased,109497620,1029.2592,NVIDIA GeForce GTX 1650,5393,5,0.7548028542096653,0.37969985402511675,0.2622371134761659,0.14891977608203888,0.35196682810783386,0.30424949526786804'

In [17]:
with open('experiments/report.csv', 'a') as file:
    file.write(f'{repline}\n')

In [18]:
preds_train = trainer.predict(tokenized_datasets['train'])
preds_val = trainer.predict(tokenized_datasets['val'])
preds_zhihu = trainer.predict(tokenized_datasets['val.zhihu'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 5393
  Batch size = 8


  0%|          | 0/675 [00:00<?, ?it/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1896
  Batch size = 8


  0%|          | 0/237 [00:00<?, ?it/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 100
  Batch size = 8


  0%|          | 0/13 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [19]:
res_folder = f'experiments/ex{ex}/results'
os.mkdir(res_folder)

In [20]:
pd.DataFrame(preds_train[0]).to_csv(f'{res_folder}/train_pred.csv', index=False)
pd.DataFrame(preds_train[1]).to_csv(f'{res_folder}/train_true.csv', index=False)

pd.DataFrame(preds_val[0]).to_csv(f'{res_folder}/val_pred.csv', index=False)
pd.DataFrame(preds_val[1]).to_csv(f'{res_folder}/val_true.csv', index=False)

pd.DataFrame(preds_zhihu[0]).to_csv(f'{res_folder}/zhihu_pred.csv', index=False)
pd.DataFrame(preds_zhihu[1]).to_csv(f'{res_folder}/zhihu_true.csv', index=False)

In [21]:
#ds_url = 'https://zenodo.org/record/7550385/files/'
ds_url = f'experiments/ex{ex}/'
ds_files = {
    'test': ds_url + 'test-mlabel.tsv',
    'test.nahj': ds_url + 'nahj-mlabel.tsv',
    'test.nyt': ds_url + 'nyt-mlabel.tsv'
}

ds = load_dataset('csv', data_files=ds_files, delimiter='\t')
ds = ds.rename_columns({'Text':'text', 'Argument ID': 'id'})
ds

Using custom data configuration default-5af5d53bb11c0fc0


Downloading and preparing dataset csv/default to /home/erfan/.cache/huggingface/datasets/csv/default-5af5d53bb11c0fc0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Generating test.nahj split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


Generating test.nyt split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /home/erfan/.cache/huggingface/datasets/csv/default-5af5d53bb11c0fc0/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'text'],
        num_rows: 1576
    })
    test.nahj: Dataset({
        features: ['id', 'text'],
        num_rows: 279
    })
    test.nyt: Dataset({
        features: ['id', 'text'],
        num_rows: 80
    })
})

In [22]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# tokenizer.add_tokens(custom_tokens)
# vocab_size = tokenizer.vocab_size

def tokenize_function(example):
    return tokenizer(example['text'], truncation=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loading configuration file config.json from cache at /home/erfan/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/erfan/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loa

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [23]:
preds_test = trainer.predict(tokenized_datasets['test'])
preds_nahj = trainer.predict(tokenized_datasets['test.nahj'])
preds_nyt = trainer.predict(tokenized_datasets['test.nyt'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1576
  Batch size = 8


  0%|          | 0/197 [00:00<?, ?it/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 279
  Batch size = 8


  0%|          | 0/35 [00:00<?, ?it/s]

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: id, text. If id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 80
  Batch size = 8


  0%|          | 0/10 [00:00<?, ?it/s]

In [24]:
cols = ['Self-direction: thought','Self-direction: action','Stimulation','Hedonism','Achievement','Power: dominance','Power: resources','Face','Security: personal','Security: societal','Tradition','Conformity: rules','Conformity: interpersonal','Humility','Benevolence: caring','Benevolence: dependability','Universalism: concern','Universalism: nature','Universalism: tolerance','Universalism: objectivity']

In [25]:
pd.DataFrame(heaviside(sigmoid(preds_test[0])), index=ds['test']['id'], columns=cols).astype('int').to_csv(f'experiments/ex{ex}/BERT-test.tsv', sep='\t')
pd.DataFrame(heaviside(sigmoid(preds_nahj[0])), index=ds['test.nahj']['id'], columns=cols).astype('int').to_csv(f'experiments/ex{ex}/BERT-nahj.tsv', sep='\t')
pd.DataFrame(heaviside(sigmoid(preds_nyt[0])), index=ds['test.nyt']['id'], columns=cols).astype('int').to_csv(f'experiments/ex{ex}/BERT-nyt.tsv', sep='\t')

NameError: name 'vals' is not defined