In [50]:
import pandas as pd

def _first_answer(answer_entry):
    """Return element 0 of one row's `answers` value."""
    return answer_entry[0]


# Read both splits and replace each row's `answers` value with its first element.
train = pd.read_json('data/trainmodel.json').assign(
    answers=lambda frame: frame['answers'].apply(_first_answer)
)
validate = pd.read_json('data/val.json').assign(
    answers=lambda frame: frame['answers'].apply(_first_answer)
)

In [51]:
# Pull the question text and the answer column out of each split as arrays.
questions, answers = (train[column].values for column in ('qText', 'answers'))
questions_val, answers_val = (validate[column].values for column in ('qText', 'answers'))


In [53]:
from transformers import BertTokenizer

# Shared tokenizer, loaded by name from the 'bert-base-uncased' checkpoint.
# Later cells use it both to encode text and to decode model output.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize(batch, max_length=64):
    """Encode a collection of strings with the module-level `tokenizer`.

    Args:
        batch: Iterable of texts; it is materialized into a list before encoding.
        max_length: Upper bound passed to the tokenizer; with truncation enabled,
            longer inputs are cut to this many tokens. Defaults to 64, the value
            that was previously hard-coded.

    Returns:
        Whatever the tokenizer returns for these arguments; tensors are
        requested in PyTorch format (`return_tensors='pt'`), with padding and
        truncation enabled.
    """
    return tokenizer(
        list(batch),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Encode every text array, in the order: train questions, train answers,
# validation questions, validation answers.
(
    questions_tokens,
    answers_tokens,
    questions_val_tokens,
    answers_val_tokens,
) = (tokenize(texts) for texts in (questions, answers, questions_val, answers_val))

In [54]:
import torch
from torch.utils.data import Dataset, DataLoader


class QADataset(Dataset):
    """Dataset that pairs encoded questions with encoded answers by index.

    Both constructor arguments are mappings from a field name to an indexable
    sequence; item ``i`` of the dataset is built from entry ``i`` of every field.
    """

    def __init__(self, questions_tokens, answers_tokens):
        """Keep references to the two field mappings (nothing is copied)."""
        self.questions_tokens = questions_tokens
        self.answers_tokens = answers_tokens

    def __len__(self):
        """Number of examples, taken from the questions' 'input_ids' field."""
        input_ids = self.questions_tokens['input_ids']
        return len(input_ids)

    @staticmethod
    def _select(tokens, idx):
        """Build a plain dict holding entry `idx` of every field in `tokens`."""
        return {field: values[idx] for field, values in tokens.items()}

    def __getitem__(self, idx):
        """Return a `(question, answer)` pair of per-field dicts for `idx`."""
        question = self._select(self.questions_tokens, idx)
        answer = self._select(self.answers_tokens, idx)
        return question, answer

In [60]:
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

# Encoder and decoder both start from an argument-less BertConfig.
config_encoder, config_decoder = BertConfig(), BertConfig()
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

# The model is constructed from the config alone (no `from_pretrained` call here).
model = EncoderDecoderModel(config=config)

# Point the model config at the tokenizer's special tokens.
special_token_ids = {
    'decoder_start_token_id': tokenizer.cls_token_id,
    'eos_token_id': tokenizer.sep_token_id,
    'pad_token_id': tokenizer.pad_token_id,
}
for config_field, token_id in special_token_ids.items():
    setattr(model.config, config_field, token_id)

In [None]:
learning_rate = 5e-5
batch_size = 16
epochs = 5
train_subset_size = 256  # number of leading examples used for training

# Slice every field explicitly rather than slicing the tokenizer output object,
# so the result is always a plain dict of tensors regardless of tokenizer
# backend or library version.
train_dataset = QADataset(
    {key: val[:train_subset_size] for key, val in questions_tokens.items()},
    {key: val[:train_subset_size] for key, val in answers_tokens.items()},
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
history = []

# Ensure training mode even if an evaluation cell called model.eval() earlier.
model.train()
for epoch in range(epochs):
    total_loss = 0.0
    # Batch-local names: do not overwrite the raw `questions` / `answers`
    # arrays that the tokenization cell reads.
    for question_batch, answer_batch in train_loader:
        labels = answer_batch['input_ids'].clone()
        # -100 is the index ignored by the loss, so padding positions in the
        # target do not contribute to it.
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids=question_batch['input_ids'],
            attention_mask=question_batch['attention_mask'],
            labels=labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Every batch of the epoch is processed (the early `break` was removed),
    # so dividing by the number of batches gives the true mean batch loss.
    epoch_loss = total_loss / len(train_loader)
    history.append(epoch_loss)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss}')




In [None]:
model.eval()

# Batch-local names: do not overwrite the raw `questions` / `answers` arrays.
# NOTE(review): this samples from `train_loader`; switch to a validation loader
# to measure generalization.
question_batch, answer_batch = next(iter(train_loader))

# Decode autoregressively from the question alone. Feeding the gold answers as
# `decoder_input_ids` would show the decoder the target it is meant to predict.
# The special-token ids are passed explicitly so generation does not depend on
# how the installed library version maps `model.config` to generation settings.
predictions = model.generate(
    input_ids=question_batch['input_ids'],
    attention_mask=question_batch['attention_mask'],
    decoder_start_token_id=tokenizer.cls_token_id,
    eos_token_id=tokenizer.sep_token_id,
    pad_token_id=tokenizer.pad_token_id,
    max_length=64,  # same cap used when tokenizing the answers
)
predicted_text = tokenizer.batch_decode(predictions, skip_special_tokens=True)
predicted_text