**Claim extraction from text documents**

By fine tuning transformers

In [1]:
# load the dataset to huggingface class
from datasets import load_dataset

"""the data consists of 395057 entries with
 - 209612 being no claims
 - 185445 being claims
 """

# 70 / 30 split
train = load_dataset('csv', data_files='data/combined_dataset.csv', split='train[:70%]').rename_column("y", "labels")
test = load_dataset('csv', data_files='data/combined_dataset.csv', split='train[70%:]').rename_column("y", "labels")

print(train, test)

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-97b16aeddd44bfca
Found cached dataset csv (/home/cas/.cache/huggingface/datasets/csv/default-97b16aeddd44bfca/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Using custom data configuration default-97b16aeddd44bfca
Found cached dataset csv (/home/cas/.cache/huggingface/datasets/csv/default-97b16aeddd44bfca/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Dataset({
    features: ['Unnamed: 0', 'claim', 'labels'],
    num_rows: 276540
}) Dataset({
    features: ['Unnamed: 0', 'claim', 'labels'],
    num_rows: 118517
})


In [2]:
# get tokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM


tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(batch):
    tokens = tokenizer(
        batch["claim"], 
        padding="max_length", 
        truncation=True, 
        return_tensors='pt'
    )
    
    return {
        "labels": batch["labels"],
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

In [3]:
# tokenize all train claims in advance
tokenized_train = train.map(tokenize_function, remove_columns=["Unnamed: 0", "claim"], batched=True)

Loading cached processed dataset at /home/cas/.cache/huggingface/datasets/csv/default-97b16aeddd44bfca/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-517a89e030824f6a.arrow


In [4]:
tokenized_test = test.map(tokenize_function, remove_columns=["Unnamed: 0", "claim"], batched=True)

Loading cached processed dataset at /home/cas/.cache/huggingface/datasets/csv/default-97b16aeddd44bfca/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-086d40cc9938543d.arrow


In [5]:
# create smaller subsets for computation efficency
small_train_dataset = tokenized_train.shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_test.shuffle(seed=42).select(range(1000))

In [6]:
tokenized_test

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 118517
})

In [7]:
# load the distilbert to finetune from huggingface
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")

In [8]:
# load class for hyperparameters
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

In [9]:
# define evaluation metrics
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 2.06MB/s]


In [15]:
# copy pasta 🍝 (no idea what this does)
# maybe error at axis?
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [16]:
# create the trainer to finetune with
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train, #small_train_dataset,
    eval_dataset=tokenized_test, #small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [17]:
# start fine tuning! 🍿
trainer.train()

***** Running training *****
  Num examples = 276540
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 103704
  Number of trainable parameters = 66985530

[A

ValueError: Expected input batch_size (4096) to match target batch_size (8).