# Imports

In [159]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset, concatenate_datasets, load_metric
import evaluate
import numpy as np
from sklearn.metrics import f1_score


In [140]:
train = load_dataset("csv",data_files = "/Users/Sarah/Documents/TD/Clara Project/bankingFAQdata.csv", split='train[:90%]')
test = load_dataset("csv",data_files = "/Users/Sarah/Documents/TD/Clara Project/bankingFAQdata.csv", split='train[10%:]')
# could do this too train_dataset, validation_dataset= train_dataset.train_test_split(test_size=0.1).values()
train[0]

Using custom data configuration default-cf00f77e6b2126f4
Found cached dataset csv (/Users/Sarah/.cache/huggingface/datasets/csv/default-cf00f77e6b2126f4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Using custom data configuration default-cf00f77e6b2126f4
Found cached dataset csv (/Users/Sarah/.cache/huggingface/datasets/csv/default-cf00f77e6b2126f4/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


{'Question': 'Do I need to enter ‘#’ after keying in my Card number/ Card expiry date/ CVV number',
 'Answer': 'Please listen to the recorded message and follow the instructions while entering your card details.',
 'Class': 'security'}

# Preprocess

In [148]:
train_q = train.remove_columns('Class')
train_q = train_q.remove_columns('Answer')

train_a = train.remove_columns('Class')
train_a = train_a.remove_columns('Question')
train_a = train_a.rename_column('Answer','Question')

train1 = concatenate_datasets([train_q,train_a])

In [150]:
test_q = test.remove_columns('Class')
test_q = test_q.remove_columns('Answer')

test_a = test.remove_columns('Class')
test_a = test_a.remove_columns('Question')
test_a = test_a.rename_column('Answer','Question')

test1 = concatenate_datasets([test_q,test_a])

### Dataset dictionary

In [152]:
data = DatasetDict({"train":train1,"test":test1})

### Same case

In [112]:
def lowercase_question(example):
    return {"Question": example['Question'].lower()}

data = data.map(lowercase_question)

In [123]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained('gpt2')

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file config.json from cache at /Users/Sarah/.cache/huggingface/hub/models--gpt2/snapshots/f27b190eeac4c2302d24068eabf5e9d6044389ae/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    

In [38]:
context_length = 10
def tokenize(element):
    outputs = tokenizer(
        element['Question'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True
           )
    input_batch = []
    for length, input_ids in zip(outputs['length'], outputs['input_ids']):
        if length == context_length:
            input_batch.append(input_ids)

    return {'input_ids':input_batch}

In [39]:
tokenized_datasets = data.map(tokenize, batched=True, remove_columns=data['train'].column_names)

100%|██████████| 2/2 [00:00<00:00, 43.84ba/s]
100%|██████████| 2/2 [00:00<00:00, 45.68ba/s]


In [175]:
# add data collator to do dynamic padding and create labels (shifted inputs) - pads based on batch size to save time/memory
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [46]:
out = data_collator([tokenized_datasets['train'][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 10])
attention_mask shape: torch.Size([5, 10])
labels shape: torch.Size([5, 10])


In [124]:
#set hyperparams and where to store weights and architecture
training_args = TrainingArguments('/Users/Sarah/Documents/TD/Clara Project/Checkpoints', evaluation_strategy='epoch')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [160]:
def compute_metrics(eval_preds):
    # metric = evaluate.load('glue','mrpc')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(y_true = labels, y_pred = predictions)
    # return metric.compute(predictions=predictions, references=labels)
    return f1

In [None]:
# def compute_metrics(eval_preds):
#     # metric = evaluate.load('glue','mrpc')
#     metric = load_metric('f1')
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     #f1_score = f1_score(y_true = labels, y_pred = predictions)
#     return metric.compute(predictions=predictions, references=labels)
#     #return f1_score

In [165]:
#Define trainer and pass all arguments
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
predictions = trainer.predict(tokenized_datasets['test'])
print(predictions.predictions.shape, predictions.label_ids.shape)


In [166]:
preds = np.argmax(predictions.predictions,axis=-1)

In [167]:
tokenizer.decode(preds[1])

' is be done with the post dated cheques after'

In [168]:
tokenizer.decode(tokenized_datasets['test'][1]['input_ids'])

'What will be done with the post dated cheques'

In [82]:
tokenized_datasets['test'][0]

{'input_ids': [2437, 460, 314, 651, 616, 2209, 3421, 287, 616, 8063]}

# Test on the fly

In [236]:
input_sentence = "My money is"
def on_the_fly(input_sentence,iters):
    
    for iter in range(iters):
        encoding = tokenizer([input_sentence], return_tensors='pt')
        preds_test = trainer.predict(encoding['input_ids'])
        preds_test = np.argmax(preds_test.predictions,axis=-1)
        generated = tokenizer.decode(preds_test[0][-1])
        input_sentence = input_sentence+generated
    
    return input_sentence

In [241]:
out = on_the_fly(input_sentence,60)

***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 647.67it/s]
***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 878.02it/s]
***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 569.88it/s]
***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 598.08it/s]
***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 510.94it/s]
***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 497.25it/s]
***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 723.53it/s]
***** Running Prediction *****
  Num examples = 1
  Batch size = 8




100%|██████████| 1/1 [00:00<00:00, 503.46it/s]
***** Running Prediction *****
  Num examples = 1
  Batc

In [242]:
out

'My money is not credited to my account as soon as I open a new account, will my existing one be blocked for KYC/CRS/MSC/MSC/MSC/MSC/MSC/MSC/MSC/MSC/MSC/MSC/MSC/'