In [14]:
!pip install transformers

In [15]:
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, AutoTokenizer
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AdamW

In [50]:
model = BertForQuestionAnswering.from_pretrained("salti/bert-base-multilingual-cased-finetuned-squad")
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# model = BertForQuestionAnswering.from_pretrained('jcblaise/bert-tagalog-base-cased')
# tokenizer = AutoTokenizer.from_pretrained('jcblaise/bert-tagalog-base-cased')

# model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es')
# tokenizer = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-uncased')

In [51]:
def get_answer_dict(text, start, end):
  retVal = dict()
  retVal["text"] = text
  retVal["answer_start"] = start
  retVal["answer_end"] = end
  return retVal

In [52]:
def read_data(filename):
  df = pd.read_csv(filename)
  print(df)
  df["answers"] = df.apply(lambda row: get_answer_dict(row["answer"], row["start"], row["end"]), axis=1)
  context = list(df["context"])
  questions = list(df["question"])
  answers = list(df["answers"])
  return context, questions, answers

In [53]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(answers[i]['answer_start'])
        end_positions.append(answers[i]['answer_end'] - 1)

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [54]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

# Read in our custom dataset, get encodings
# dataset1: '/content/wiki_manual_tagalog_translated.csv'
contexts, questions, answers = read_data('qa_onetwenty.csv')
train_encodings = tokenizer(contexts, questions, truncation=True, padding=True)
add_token_positions(train_encodings, answers)
train_dataset = CustomDataset(train_encodings)

In [59]:
# Train
batchsize = 16
num_epochs = 25
learning_rate = 5e-4

train_loader = DataLoader(train_dataset, batch_size=batchsize, shuffle=True)
optim = AdamW(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  print(epoch)
  for batch in train_loader:
      optim.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      loss = outputs[0]
      loss.backward()
      optim.step()



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [60]:
# Save model in different forms
model_path = '/content/drive/MyDrive/mnlp/final/power_120'
model.save_pretrained(model_path)