In [97]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset
import evaluate
import numpy as np


In [9]:
train = load_dataset("csv",data_files = "/Users/Sarah/Documents/TD/Clara Project/bankingFAQdata.csv", split='train[:90%]')
test = load_dataset("csv",data_files = "/Users/Sarah/Documents/TD/Clara Project/bankingFAQdata.csv", split='train[10%:]')
# could do this too train_dataset, validation_dataset= train_dataset.train_test_split(test_size=0.1).values()

Using custom data configuration default-cf00f77e6b2126f4
Found cached dataset csv (/Users/Sarah/.cache/huggingface/datasets/csv/default-cf00f77e6b2126f4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Using custom data configuration default-cf00f77e6b2126f4
Found cached dataset csv (/Users/Sarah/.cache/huggingface/datasets/csv/default-cf00f77e6b2126f4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


Dataset({
    features: ['Question', 'Answer', 'Class'],
    num_rows: 1588
})


In [35]:
train[0]

{'Question': 'Do I need to enter ‘#’ after keying in my Card number/ Card expiry date/ CVV number',
 'Answer': 'Please listen to the recorded message and follow the instructions while entering your card details.',
 'Class': 'security'}

In [36]:
data = DatasetDict({"train":train,"test":test})
data = data.remove_columns('Class')
data = data.remove_columns('Answer')

In [37]:
data

DatasetDict({
    train: Dataset({
        features: ['Question'],
        num_rows: 1588
    })
    test: Dataset({
        features: ['Question'],
        num_rows: 1588
    })
})

In [31]:

tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')


In [38]:
context_length = 10
def tokenize(element):
    outputs = tokenizer(
        element['Question'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True
           )
    input_batch = []
    for length, input_ids in zip(outputs['length'], outputs['input_ids']):
        if length == context_length:
            input_batch.append(input_ids)

    return {'input_ids':input_batch}

In [39]:
tokenized_datasets = data.map(tokenize, batched=True, remove_columns=data['train'].column_names)

100%|██████████| 2/2 [00:00<00:00, 43.84ba/s]
100%|██████████| 2/2 [00:00<00:00, 45.68ba/s]


In [45]:
# add data collator to do dynamic padding - pads based on batch size to save time/memory
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [46]:
out = data_collator([tokenized_datasets['train'][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 10])
attention_mask shape: torch.Size([5, 10])
labels shape: torch.Size([5, 10])


In [61]:
#set hyperparams and where to store weights and architecture
training_args = TrainingArguments('Checkpoints', evaluation_strategy='epoch')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [99]:
def compute_metrics(eval_preds):
    # metric = evaluate.load('glue','mrpc')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    f1_score = f1_score(y_true = labels, y_pred = predictions)
    # return metric.compute(predictions=predictions, references=labels)
    return {'f1':f1_score}

In [100]:
#Define trainer and pass all arguments
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics
)

In [101]:
trainer.train()

***** Running training *****
  Num examples = 1226
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 462
  Number of trainable parameters = 124439808
  5%|▍         | 23/462 [00:32<11:36,  1.59s/it]

In [73]:
predictions = trainer.predict(tokenized_datasets['test'])
print(predictions.predictions.shape, predictions.label_ids.shape)



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
279it [03:28,  1.34it/s]

(1175, 10, 50257) (1175, 10)





In [74]:
preds = np.argmax(predictions.predictions,axis=-1)

In [86]:
tokenizer.decode(preds[1])

' is be done with the post dated cheques after'

In [85]:
tokenizer.decode(tokenized_datasets['test'][1]['input_ids'])

'What will be done with the post dated cheques'

In [82]:
tokenized_datasets['test'][0]

{'input_ids': [2437, 460, 314, 651, 616, 2209, 3421, 287, 616, 8063]}

# Test on the fly

In [91]:
test =[ 'My money is','My money is']

In [92]:
input_ids_test = tokenizer(test)

In [98]:
input_ids_test
input_ids_data = Dataset(input_ids_test)

TypeError: Expected a pyarrow.Table or a datasets.table.Table object, but got {'input_ids': [[3666, 1637, 318], [3666, 1637, 318]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}.

In [96]:
preds_test = trainer.predict(input_ids_test)

TypeError: 'DatasetDict' object is not callable

Dataset({
    features: ['input_ids'],
    num_rows: 1175
})