In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir("./drive/MyDrive/nlp/")

In [3]:
! pip install sentencepiece transformers datasets python-utils 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 26.0 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 53.6 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 74.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 74.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.

In [9]:
import json
import pandas as pd
import numpy as np

from datasets import load_dataset, load_from_disk, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments, Trainer
from tqdm import tqdm


import torch
from torch import nn

In [10]:
! pwd

/content/drive/MyDrive/nlp


In [11]:
! ls

 albert.ipynb		     dev_merged.json    train_merged.json
'Copy of albert (1).ipynb'   model
'Copy of albert.ipynb'	     test_merged.json


# LOADING DATA

In [13]:
train_path = './train_merged.json'
dev_path = './dev_merged.json'
test_path = './test_merged.json'

with open(train_path,'r',encoding='utf-8') as f:
  train_data = json.load(f)

with open(dev_path,'r',encoding='utf-8') as f:
  dev_data = json.load(f)

with open(test_path,'r',encoding='utf-8') as f:
  test_data = json.load(f)


train_df = pd.DataFrame(columns=['context','question','answers'])
for row in train_data:
  context = row['paragraphs'][0]['context']
  qas = row['paragraphs'][0]['qas']
  for qa in qas:
    question = qa['question']
    if len( qa['answers']) != 0:
      answer_start = [qa['answers'][0]['answer_start']]
      answer_text = [qa['answers'][0]['text']]
    else:
      answer_start = []
      answer_text = []
    train_df = train_df.append({'context': context, 'question': question, 'answers': {'answer_start': answer_start, 'text': answer_text}}, ignore_index=True)

dev_df = pd.DataFrame(columns=['context','question','answers'])
for row in dev_data:
  context = row['paragraphs'][0]['context']
  qas = row['paragraphs'][0]['qas']
  for qa in qas:
    question = qa['question']
    if len( qa['answers']) != 0:
      answer_start = [qa['answers'][0]['answer_start']]
      answer_text = [qa['answers'][0]['text']]
    else:
      answer_start = []
      answer_text = []
    dev_df = dev_df.append({'context': context, 'question': question, 'answers': {'answer_start': answer_start, 'text': answer_text}}, ignore_index=True)

test_df = pd.DataFrame(columns=['context','question','answers'])
for row in test_data:
  context = row['paragraphs'][0]['context']
  qas = row['paragraphs'][0]['qas']
  for qa in qas:
    question = qa['question']
    if len( qa['answers']) != 0:
      answer_start = [qa['answers'][0]['answer_start']]
      answer_text = [qa['answers'][0]['text']]
    else:
      answer_start = []
      answer_text = []
    test_df = test_df.append({'context': context, 'question': question, 'answers': {'answer_start': answer_start, 'text': answer_text}}, ignore_index=True)

In [25]:
train_dataset = Dataset.from_pandas(train_df).shuffle(seed=0)
dev_dataset = Dataset.from_pandas(dev_df).shuffle(seed=0)
test_dataset = Dataset.from_pandas(test_df).shuffle(seed=0)

In [26]:
train_dataset[2]

{'context': 'مدرن تاکینگ یک گروهِ آلمانی بود که متشکل از خواننده توماس آندرس و تنظیم کننده، ترانه\u200cسرا و تهیه\u200cکننده دیتر بولن با مشارکت لوئیس رودریگز در تولید بود. از آنها به عنوان موفق\u200cترین دوتایی پاپ آلمان یاد شده\u200cاست، و تعدادی تک آهنگ هیت داشته\u200cاند که در بسیاری از کشورها به مقام پنجم رسیده\u200cاند. برخی از محبوب\u200cترین و مشهورترین تک\u200cآهنگها عبارتند از: قلب منی، روح منی، بخواهی برنده می\u200cشوی، شری، شری لیدی، برادر لویی، آتلانتیس بانگ می\u200cزند (درخواست کمک برای عشق) و کادیلاک جرونیمو.بلافاصله پس از دومین ترانه موفق، Modern Talking تک آهنگ "Cheri , Cheri Lady" را منتشر کرد.به دلیل آلبوم ششم دریافت نشده آنها، بولن پایان مصاحبه را هنگام مصاحبه اعلام کرد، در حالی که اندرس در لس آنجلس بود. این مسبب خصومت\u200cهای بیشتر بین این دو شد که حتی در کنار هم بودن، رابطه ای اغوا کننده و نزاع داشته\u200cاند. به گفته بولن، دلیل اصلی شکستن این گروه، همسر سابق آندرس، نورا بود که از مصاحبه شوهرش توسط خبرنگاران زن امتناع ورزید، و دائماً خواهان تغییر عظیمی در نمایش، 

# LOADING MODEL

In [16]:
# important parameters
batch_size = 6
epoch = 1

model_name = 'm3hrdadfi/albert-fa-base-v2'
max_length = 512 
doc_stride = 256 
lr = 1e-5

In [17]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
device = torch.device("cuda")

Downloading config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/69.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at m3hrdadfi/albert-fa-base-v2 were not used when initializing AlbertForQuestionAnswering: ['predictions.decoder.weight', 'predictions.LayerNorm.bias', 'sop_classifier.classifier.bias', 'predictions.dense.bias', 'predictions.decoder.bias', 'predictions.bias', 'sop_classifier.classifier.weight', 'predictions.LayerNorm.weight', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at m3hrdadfi/albert-fa-base-v2

In [18]:
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,)
    
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [19]:
tokenized_train = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_dev = dev_dataset.map(prepare_train_features, batched=True, remove_columns=dev_dataset.column_names)
tokenized_test = test_dataset.map(prepare_train_features, batched=True, remove_columns=test_dataset.column_names)

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [22]:
tokenized_train[0].keys() ,  tokenized_dev[0].keys() , tokenized_test[0].keys()

(dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']),
 dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']),
 dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions']))

In [23]:
args = TrainingArguments(
    "./model",
    save_strategy = 'epoch',
    evaluation_strategy = "epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.0001)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 10192
  Num Epochs = 1
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 1699


Epoch,Training Loss,Validation Loss
1,1.5817,2.059556


***** Running Evaluation *****
  Num examples = 1131
  Batch size = 6
Saving model checkpoint to ./model/checkpoint-1699
Configuration saved in ./model/checkpoint-1699/config.json
Model weights saved in ./model/checkpoint-1699/pytorch_model.bin
tokenizer config file saved in ./model/checkpoint-1699/tokenizer_config.json
Special tokens file saved in ./model/checkpoint-1699/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1699, training_loss=1.7263397522152277, metrics={'train_runtime': 642.4143, 'train_samples_per_second': 15.865, 'train_steps_per_second': 2.645, 'total_flos': 131992217133336.0, 'train_loss': 1.7263397522152277, 'epoch': 1.0})

Example

In [28]:

path_to_model = './model/checkpoint-1699/'
tokenizer = AutoTokenizer.from_pretrained(path_to_model)
model = AutoModelForQuestionAnswering.from_pretrained(path_to_model)
device = 'cuda'
model.eval().to(device)

AlbertForQuestionAnswering(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(80000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias

In [32]:
def model_pred(model, questions, contexts, tokenizer,n_best=20,stride=256,no_answer=True):
    answer_max_len = 100
    n = len(contexts)
    tokens = tokenizer(questions, contexts, add_special_tokens=True, 
                            return_token_type_ids=True, return_tensors="pt", padding=True, 
                            return_offsets_mapping=True, truncation="only_second", 
                            max_length=max_length, stride=stride)

    start_logits, end_logits = [], []
    for i in tqdm(range(0, n-batch_size+1, batch_size)):
        with torch.no_grad():
            out = model(tokens['input_ids'][i:i+batch_size].to(device), 
                        tokens['attention_mask'][i:i+batch_size].to(device), 
                        tokens['token_type_ids'][i:i+batch_size].to(device))

            start_logits.append(out.start_logits)
            end_logits.append(out.end_logits)

    tokens, starts, ends = tokens, torch.stack(start_logits).view(n, -1), torch.stack(end_logits).view(n, -1)
    start_indexes = starts.argsort(dim=-1, descending=True)[:, :n_best]
    end_indexes = ends.argsort(dim=-1, descending=True)[:, :n_best]

    preds = {}
    for i, (c, q) in enumerate(zip(contexts, questions)):  
        min_null_score = starts[i][0] + ends[i][0]
        start_context = tokens['input_ids'][i].tolist().index(tokenizer.sep_token_id)
        
        offset = tokens['offset_mapping'][i]
        valid_answers = []
        for start_index in start_indexes[i]:

            if start_index<start_context:
                continue
            for end_index in end_indexes[i]:
               
                if (start_index >= len(offset) or end_index >= len(offset)
                    or offset[start_index] is None or offset[end_index] is None):
                    continue
 
                if end_index < start_index or (end_index-start_index+1) > answer_max_len:
                  continue

                start_char = offset[start_index][0]
                end_char = offset[end_index][1]
                valid_answers.append({"score": (starts[i][start_index] + ends[i][end_index]).item(),
                                    "text": c[start_char: end_char],
                                    "loc": [start_char , end_char]})
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"text": "", "score": min_null_score,"loc": [torch.tensor(0) , torch.tensor(0)]}

        if no_answer:
            preds[i] = best_answer if best_answer["score"] >= min_null_score else {"text": "", "score": min_null_score
                                                                                   ,"loc": [torch.tensor(0) , torch.tensor(0)]}
        else:
            preds[i] = best_answer

    return preds

In [33]:
contexts = test_dataset[:8]['context'] 
questions = test_dataset[:8]['question'] 

preds = model_pred(model, questions, contexts, tokenizer,n_best=20,stride=256,no_answer=True)


100%|██████████| 1/1 [00:02<00:00,  2.45s/it]


In [34]:
preds

{0: {'score': 11.723699569702148,
  'text': ' جمهوری\u200cخواه',
  'loc': [tensor(768), tensor(780)]},
 1: {'score': 5.329157829284668,
  'text': '،',
  'loc': [tensor(264), tensor(265)]},
 2: {'score': 7.334939956665039,
  'text': ' رئا از نژاد تیتان\u200cها بود. معادل آن در اسطوره\u200cهای رومی ژوپیتر و در دین کرت',
  'loc': [tensor(115), tensor(189)]},
 3: {'score': 9.489328384399414,
  'text': ' به کار رفته است',
  'loc': [tensor(306), tensor(322)]},
 4: {'score': 10.123342514038086,
  'text': ' لاتین "imperium" گرفته شده',
  'loc': [tensor(56), tensor(83)]},
 5: {'score': 9.03083610534668,
  'text': 'گرچه عملکرد آنها بیشتر شبیه به',
  'loc': [tensor(602), tensor(632)]},
 6: {'score': 9.621089935302734,
  'text': ' نظارت بر بازی\u200cهای جهانی احساس شد، بنابراین فیفا در ۲۱ می۱۹۰۴ از سوی انجمن ملی فوتبال کشورهای بلژیک، دانمارک، فرانسه، هلند، اسپانیا، سوئد و سوئیس در پاریس تأسیس شد. (اسم فرانسوی فیفا هم به همین دلیل است و حتی',
  'loc': [tensor(604), tensor(812)]},
 7: {'score': -0.6