In [5]:
# from huggingface_hub import notebook_login
# notebook_login()

from datasets import load_dataset
import pandas as pd

squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2)


def get_start_end(train):
    ans_start = []
    ans_end = []

    for ans in train['answers']:
        start = ans['answer_start'][0]
        end = start + len(ans['text'][0].strip())

        ans_start.append(start)
        ans_end.append(end)
        
    return {
        "start": ans_start,
        "end": ans_end   
    }

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [6]:
# Convert the dataset to a dictionary
data_dict = squad["train"].to_dict()
# Create a DataFrame from the dictionary
df = pd.DataFrame.from_dict(data_dict)
df.head()
train = df.iloc[:1000, :].copy()
val = df.iloc[1000:1500, :].copy()

In [7]:
train_answers = get_start_end(train)
val_answers = get_start_end(val)

In [12]:
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [13]:
tokenizer = AutoTokenizer.from_pretrained("amandyk/KazakhBERTmulti")
model = AutoModelForMaskedLM.from_pretrained("amandyk/KazakhBERTmulti")

In [14]:
questions = [q.strip() for q in train["question"]]
context = [q.strip() for q in train["context"]]
train_encodings = tokenizer(context, questions, max_length=512, truncation=True, padding=True)

questions = [q.strip() for q in val["question"]]
context = [q.strip() for q in val["context"]]
val_encodings = tokenizer(context, questions, max_length=512, truncation=True, padding=True)

In [15]:
train_encodings.char_to_token(0, train['answers'][0]['answer_start'][0])

In [16]:
train['answers'][0]['answer_start'][0]

1432

In [17]:
train_encodings.char_to_token(0, train_answers['end'][0])

In [18]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers['start'])):
        start = encodings.char_to_token(i, answers['start'][i])
        end = encodings.char_to_token(i, answers['end'][i])
        
        
        if start is None:
            # start = tokenizer.model_max_length
            start = 512
        
        go_back = 1
        while end is None:
            end = encodings.char_to_token(i, answers['end'][i] - go_back)
            go_back += 1
            
#         if end is None:
#             print('yes')
#             end = tokenizer.model_max_length
            
        start_positions.append(start)
        end_positions.append(end)
    
    encodings.update({
        'start_positions': start_positions,
        'end_positions': end_positions
    })
    

In [19]:
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [20]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [21]:
len(train_encodings['input_ids'][0])

512

In [22]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __len__(self, ):
        return len(self.encodings.input_ids)
    def __getitem__(self, x):
        return {key: torch.tensor(val[x]) for key, val in self.encodings.items()}

In [23]:
next(iter(SquadDataset(train_encodings))).keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [24]:
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [25]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=5e-5)



In [27]:
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=True)

In [28]:
for epoch in range(3):
    loop = tqdm(train_loader)
    for batch in loop:
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, 
                        start_positions=start_positions,
                        end_positions=end_positions)
        
        loss = outputs[0]
        loss.backward()
        optim.step()
        
        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

  0%|                                                                                          | 0/500 [00:00<?, ?it/s]


TypeError: forward() got an unexpected keyword argument 'start_positions'