In [5]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Using cached fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [None]:
!pip install datasets

In [1]:
import pandas as pd

In [2]:
from tqdm.notebook import trange, tqdm
import json

In [3]:
import torch
import gc
from functools import reduce

In [4]:
from sklearn.utils import shuffle
from fuzzywuzzy import fuzz



In [5]:
def standardize(txt):
    return txt.replace('\n', '').replace('@highlight', '')

In [6]:
import codecs
import json
def read_jsonl(path):
    with codecs.open(path, encoding='utf-8-sig') as reader:
            lines = reader.read().split("\n")
            lines = list(map(json.loads, filter(None, lines)))
    return lines

In [7]:
def save_solution(arr, name=None):
    name = (str(int(time.time())) if name == None else name) + ".jsonl"
    with open(name, 'w') as fp:
          for i, o in enumerate(arr):
              slovar = {"idx":i, "text":o}
              fp.write(json.dumps(slovar, ensure_ascii=False)+"\n")
    print("solution was saved to ", name)

In [8]:
def to_t5_format(item, translate=False):
    entities = list(set([item["passage"]["text"][e["start"]:e["end"]] for e in item["passage"]["entities"]]))
    en_entities = [answers_en[e] if translate else e for e in entities]
    translated_text = texts_en[standardize(item["passage"]["text"])] if translate else standardize(item["passage"]["text"])
    tanslated_query = questions_en[standardize(item["qas"][0]["query"])] if translate else standardize(item["qas"][0]["query"])
    final_text = "record query: " + (tanslated_query + ". entities: " if tanslated_query[-1] != "." else tanslated_query + " entities: ") + ", ".join(en_entities) + " passage: " + (translated_text + "." if translated_text[-1] != "." else translated_text)
    return final_text

In [9]:
def to_finetune_format(item, entity, translate=False):
    translated_text = texts_en[standardize(item["passage"]["text"])] if translate else standardize(item["passage"]["text"])
    tanslated_query = questions_en[standardize(item["qas"][0]["query"])] if translate else standardize(item["qas"][0]["query"])
    final_text = "record query: " + (tanslated_query + ". entity: " if tanslated_query[-1] != "." else tanslated_query + " entity: ") + entity + " passage: " + (translated_text + "." if translated_text[-1] != "." else translated_text)
    return final_text, entity in set([k["text"] for k in item["qas"][0]["answers"]])

In [10]:
def find_closest_string(string, arr, r=0):
    maxx = r
    most_simillar = ""
    for w in arr:
        if w == string:
            return string
        ratio = fuzz.ratio(string, w)
        if ratio > maxx:
            most_simillar = w
            maxx = ratio
    return most_simillar


In [11]:
!pip install transformers



In [11]:
record_train = json.loads(open("train.json").read())

In [None]:
train = read_jsonl("/content/drive/MyDrive/final/rucos_train.jsonl")
val = read_jsonl("/content/drive/MyDrive/final/rucos_val.jsonl")
test = read_jsonl("/content/drive/MyDrive/final/rucos_test.jsonl")
answers_en = json.loads(open("/content/drive/MyDrive/final/answers_all.json").read())
questions_en = json.loads(open("/content/drive/MyDrive/final/questions_test.json").read())
texts_en = json.loads(open("/content/drive/MyDrive/final/texts_test.json").read())

In [18]:
!pip install transformers



In [12]:
from transformers import DebertaTokenizer, DebertaForSequenceClassification
import torch

tokenizer = DebertaTokenizer.from_pretrained('microsoft/deberta-base')
model = DebertaForSequenceClassification.from_pretrained('microsoft/deberta-base')

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'config', 'deberta.embeddings.position_embeddings.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base

In [20]:
train = record_train["data"][:10000]
val = record_train["data"][10000:11000]

In [21]:
torch.cuda.empty_cache()
gc.collect()

63

In [22]:
from collections import Counter

In [23]:
def process_to_learning_data(part):
    X_train = []
    y_train = []

    for item in tqdm(part):
        for entity in item["passage"]["entities"]:
            new_text, is_true = to_finetune_format(item, item["passage"]["text"][entity["start"]:entity["end"]+1])
            X_train.append(new_text)
            y_train.append(int(is_true))
    
    return X_train, y_train


In [24]:
X_train, y_train = process_to_learning_data(train)
X_val, y_val = process_to_learning_data(val)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




In [25]:
X_train, y_train = shuffle(X_train, y_train)
X_val, y_val = shuffle(X_val, y_val)

In [26]:
train_encodings = tokenizer(list(X_train), padding=True, truncation=True)

In [27]:
val_encodings = tokenizer(list(X_val), padding=True, truncation=True)

In [28]:
import torch

class MuSeRCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MuSeRCDataset(train_encodings, y_train)
val_dataset = MuSeRCDataset(val_encodings, y_val)

In [29]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=0.00001, weight_decay=0.01)

from transformers import get_cosine_with_hard_restarts_schedule_with_warmup
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, 500, 15618)

In [30]:
from transformers import Trainer, TrainingArguments

# gradient_accumulation_steps 
# max_grad_norm 

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    max_grad_norm=0.5,
    do_train = True,
    overwrite_output_dir = True
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    optimizers = (optimizer, scheduler)
)

trainer.train()



Step,Training Loss
10,0.7059
20,0.7001
30,0.7003
40,0.7004
50,0.6953
60,0.6944
70,0.6892
80,0.6832
90,0.6787
100,0.6612


  from collections import Container


KeyboardInterrupt: 

In [34]:
from scipy.special import softmax

In [31]:
preds = trainer.predict(val_dataset)

  and should_run_async(code)


In [45]:
import numpy as np

In [47]:
from sklearn.metrics import f1_score

In [None]:
f1_score([np.round(i[1]) for i in list(map(softmax, preds.predictions))], y_val)