### Imports and Config

In [63]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import numpy as np

In [17]:
checkpoint = "bert-base-uncased"

### Loading the data

In [10]:
raw_dataset = load_dataset("my_dataset_loading_script", "condemnation")

Downloading and preparing dataset condemnation_dataset/condemnation (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/geev/.cache/huggingface/datasets/condemnation_dataset/condemnation/1.1.0/ef891eb2986445dbe69883df48d98d7039da591e0c35cb211978c71951a3c83e...


0 examples [00:00, ? examples/s]

filepath my_dataset_loading_script/condemnation_train.csv


0 examples [00:00, ? examples/s]

filepath my_dataset_loading_script/condemnation_test.csv
Dataset condemnation_dataset downloaded and prepared to /home/geev/.cache/huggingface/datasets/condemnation_dataset/condemnation/1.1.0/ef891eb2986445dbe69883df48d98d7039da591e0c35cb211978c71951a3c83e. Subsequent calls will reuse this data.


In [11]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1344
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 336
    })
})

In [14]:
raw_train_dataset = raw_dataset["train"]
raw_train_dataset[0]

{'text': 'As multiple sexual allegations come to light with people like <OTHER TARGET 1>, NPR’s -JOHN DOE-, and Michael Fallon, just to name a few...',
 'label': 1}

In [15]:
raw_train_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['no_condemnation', 'condemnation'], names_file=None, id=None)}

### Classification

In [39]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/geev/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /home/geev/.cache/huggingface/transform

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 1344
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 336
    })
})

In [40]:
training_args = TrainingArguments("test_trainer")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/geev/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
 

In [60]:
trainer = Trainer(
model,
training_args,
train_dataset = tokenized_datasets["train"],
data_collator = data_collator,
tokenizer = tokenizer,
)

In [61]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 1344
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 252


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=252, training_loss=0.12003489146156916, metrics={'train_runtime': 86.4583, 'train_samples_per_second': 46.635, 'train_steps_per_second': 2.915, 'total_flos': 132517527476160.0, 'train_loss': 0.12003489146156916, 'epoch': 3.0})

In [62]:
predictions = trainer.predict(tokenized_datasets["test"])
print(predictions.predictions.shape, predictions.label_ids.shape)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 336
  Batch size = 16


(336, 2) (336,)


In [59]:
metric = load_metric("f1", "accuracy")
preds = np.argmax(predictions.predictions, axis=1)
preds
metric.compute(predictions=preds, references=predictions.label_ids)

{'f1': 0.8377723970944311}

In [65]:
def get_metrics(y_pred, y_true):
    metrics ={}
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["precision"] = precision_score(y_true, y_pred)
    metrics["recall"] = recall_score(y_true, y_pred)
    metrics["f1"] = f1_score(y_true, y_pred)
    returin m

SyntaxError: unexpected EOF while parsing (<ipython-input-65-04adf2624862>, line 1)