In [42]:
import torch
import transformers
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import os
import re
from sklearn.model_selection import train_test_split
from datasets import load_metric
import numpy as np
import gc

In [4]:
def read_train_set(train_set_file_path):
  """Read a tab-separated CoNLL-style file into parallel token and tag lists.

  Each non-blank line must hold one ``token<TAB>tag`` pair. Documents are
  separated by a blank line or by a line that contains a single tab.

  Args:
    train_set_file_path: Path of the file to read.

  Returns:
    A ``(token_docs, tag_docs)`` tuple. Both are lists with one inner list
    per document; the two inner lists of a document have the same length.
    An empty file yields ``([], [])``.

  Raises:
    ValueError: If a non-blank line does not contain exactly one tab.
  """
  # Decode explicitly so the result does not depend on the platform locale.
  with open(train_set_file_path, "r", encoding="utf-8") as fd:
    raw_text = fd.read().strip()

  token_docs = []
  tag_docs = []
  for doc in re.split(r"\n\t?\n", raw_text):
    tokens = []
    tags = []
    for line in doc.split("\n"):
      # Runs of separator lines leave empty fragments behind; skip them
      # rather than failing on the unpacking below.
      if not line.strip():
        continue
      fields = line.split("\t")
      if len(fields) != 2:
        raise ValueError(f"Expected 'token<TAB>tag' but got {line!r}")
      token, tag = fields
      tokens.append(token)
      tags.append(tag)
    # A fragment made only of blank lines is not a document.
    if tokens:
      token_docs.append(tokens)
      tag_docs.append(tags)
  return token_docs, tag_docs

In [5]:
# Complete annotated corpus: parallel token/tag lists, one inner list per document.
full_texts, full_tags = read_train_set(os.path.join(os.getcwd(), "data", "conll", "full_ner_dataset.conll"))

# Second corpus, read from "ner_dataset_one_left_out.conll"; this is the one
# that gets split into train/validation parts further down.
partial_texts, partial_tags = read_train_set(os.path.join(os.getcwd(), "data", "conll", "ner_dataset_one_left_out.conll"))

In [6]:
# Hold out 20% of the partial corpus for validation. The seed is fixed so that
# re-running the notebook reproduces the same split (and comparable metrics).
partial_train_texts, partial_val_texts, partial_train_tags, partial_val_tags = train_test_split(
    partial_texts, partial_tags, test_size=0.2, random_state=42
)

# Plain-text inputs that are tagged later in the notebook.
own_test_txt_path = os.path.join(os.getcwd(), "data", "own_test_set", "XLNet.txt")
actual_test_txt_path = os.path.join(os.getcwd(), "data", "final_test_set", "anlp-sciner-test.txt")

# Placeholders; both are overwritten by the reads below.
own_test_paragraphs = None
actual_test_paragraphs = None

# Each file is read whole and split on "\n", giving one list entry per line
# (a trailing newline in the file leaves an empty last entry).
with open(own_test_txt_path, "r") as fd:
  content = fd.read()
  own_test_paragraphs = content.split("\n")

with open(actual_test_txt_path, "r") as fd:
  content = fd.read()
  actual_test_paragraphs = content.split("\n")

In [7]:
# Label vocabulary, collected from every document of the full corpus.
unique_tags = set(tag for doc in full_tags for tag in doc)
# Number the tags in sorted order. Iterating a set of strings directly can give
# a different order in each interpreter session (string hashing is randomised),
# which would silently change the label ids between runs and saved models.
tag2id = {tag: idx for idx, tag in enumerate(sorted(unique_tags))}
id2tag = {idx: tag for tag, idx in tag2id.items()}

In [None]:
# Cased SciBERT tokenizer, with lower-casing switched off.
# (The uncased alternative would be "allenai/scibert_scivocab_uncased".)
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased", do_lower_case=False)

# All three corpora are encoded with identical settings: inputs are already
# split into words, and sequences are truncated/padded with a 512-token cap.
_encode_kwargs = dict(is_split_into_words=True, truncation=True, padding=True, max_length=512)

partial_train_encodings = tokenizer(partial_train_texts, **_encode_kwargs)
partial_val_encodings = tokenizer(partial_val_texts, **_encode_kwargs)

full_train_encodings = tokenizer(full_texts, **_encode_kwargs)

In [169]:
def align_labels(tags, encodings, tag_to_id=None):
  """Expand word-level tags to token-level label ids.

  Only the first token of every word receives the id of the word's tag.
  Tokens that belong to no word (``word_ids`` entry is ``None``) and the
  remaining tokens of a word receive ``-100``, the value that
  ``compute_metrics`` filters out.

  Args:
    tags: One list of tag strings per document.
    encodings: Object exposing ``word_ids(batch_index=i)`` for document ``i``.
    tag_to_id: Optional mapping from tag string to integer id. When omitted,
      the notebook-level ``tag2id`` mapping is used.

  Returns:
    A list with one list of integer label ids per document, each as long as
    the corresponding ``word_ids`` sequence.
  """
  if tag_to_id is None:
    # Keep the original behaviour for callers that pass two arguments.
    tag_to_id = tag2id

  labels = []
  for i, doc_tags in enumerate(tags):
    previous_word_idx = None
    label_ids = []
    for word_idx in encodings.word_ids(batch_index=i):
      if word_idx is None or word_idx == previous_word_idx:
        # Special/padding token, or a continuation piece of the same word.
        label_ids.append(-100)
      else:
        label_ids.append(tag_to_id[doc_tags[word_idx]])
      previous_word_idx = word_idx
    labels.append(label_ids)
  return labels

In [170]:
partial_train_labels = align_labels(partial_train_tags, partial_train_encodings)
partial_val_labels = align_labels(partial_val_tags, partial_val_encodings)

full_train_labels = align_labels(full_tags, full_train_encodings)

In [171]:
# seqeval scorer; compute_metrics below calls metric.compute(...) on it.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval")

def compute_metrics(eval_preds):
  """Turn raw evaluation output into overall seqeval scores.

  Args:
    eval_preds: A ``(logits, labels)`` pair. The arg-max over the last axis
      of ``logits`` is taken as the predicted label id.

  Returns:
    A dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``,
    filled from the ``overall_*`` entries returned by ``metric.compute``.
  """
  logits, labels = eval_preds
  predicted_ids = np.argmax(logits, axis=-1)

  # Reference tags: positions labelled -100 are dropped.
  true_labels = []
  for label_row in labels:
    true_labels.append([id2tag[label_id] for label_id in label_row if label_id != -100])

  # Predicted tags, kept only where the reference label is not -100.
  true_predictions = []
  for pred_row, label_row in zip(predicted_ids, labels):
    kept = [id2tag[pred_id] for pred_id, label_id in zip(pred_row, label_row) if label_id != -100]
    true_predictions.append(kept)

  scores = metric.compute(predictions=true_predictions, references=true_labels)
  report = {
      "precision": scores["overall_precision"],
      "recall": scores["overall_recall"],
      "f1": scores["overall_f1"],
      "accuracy": scores["overall_accuracy"],
  }
  return report

In [172]:
# Collator built around the shared tokenizer; it is passed to both Trainer instances below.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Cased SciBERT with a token-classification head sized to the tag set.
# The label maps are passed along so they are stored in the model config and
# travel with any checkpoint saved later, instead of living only in this kernel.
# (The uncased alternative would be "allenai/scibert_scivocab_uncased".)
model = AutoModelForTokenClassification.from_pretrained(
    "allenai/scibert_scivocab_cased",
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
    ignore_mismatched_sizes=True,
)

In [175]:
class NERDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with per-token label ids.

    ``encodings`` must support ``.items()`` and yield, per key, a sequence
    indexable by sample position; ``labels`` holds one label sequence per
    sample and determines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field plus its labels, as tensors.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Train/validation datasets for the partial-corpus run.
partial_train_dataset = NERDataset(partial_train_encodings, partial_train_labels)
partial_val_dataset = NERDataset(partial_val_encodings, partial_val_labels)

# Dataset over the whole corpus, used by the second training run.
full_train_dataset = NERDataset(full_train_encodings, full_train_labels)

In [176]:
# Run a garbage-collection pass and ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Partial-corpus run: train on the partial training split and evaluate on the
# validation split after every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=25,
    weight_decay=0.01,
    # The string "none" is what disables logging integrations; in
    # transformers 4.x a literal None is treated as "not set" and falls back
    # to the default (all installed integrations).
    # NOTE(review): assumes the intent was to disable reporting — confirm.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=partial_train_dataset,
    eval_dataset=partial_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune on the partial training split, then run one more evaluation on the validation split.
trainer.train()
trainer.evaluate()

In [None]:
# Save the model trained on the partial split.
trainer.save_model('./partial_saved_model')

In [180]:
# NOTE(review): model_checkpoint is assigned but never read in the visible
# notebook; inference below uses the in-memory `model`, not a reloaded checkpoint.
model_checkpoint = "./partial_saved_model"

In [181]:
def process_test_set(paragraphs):
  """Tag every space-separated word of each paragraph with the current model.

  Relies on the notebook-level ``tokenizer``, ``model`` and ``id2tag``.

  Each paragraph is split on single spaces and encoded once as pre-split
  words, so the token ids fed to the model and the ``word_ids`` used to map
  predictions back to words always come from the same (truncated) encoding.
  A word receives the tag predicted for its first sub-word token.

  Args:
    paragraphs: Iterable of strings, one paragraph each.

  Returns:
    A list of strings: one ``word<TAB>tag<NEWLINE>`` entry per word, followed
    by a bare newline entry after each paragraph. Paragraphs that produce no
    word tokens are skipped entirely. Words without any token in the
    encoding (empty strings, or words cut off by the 512-token limit) are
    tagged ``O``.
  """
  test_set_result = []

  # Inference must run with dropout disabled; a training run without an
  # evaluation step can leave the model in training mode.
  model.eval()
  # Follow the model instead of assuming a GPU is present.
  device = next(model.parameters()).device

  for paragraph in paragraphs:
    paragraph_words = paragraph.split(" ")

    encoding = tokenizer(
        paragraph_words,
        is_split_into_words=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    word_ids = encoding.word_ids()

    if all(word_id is None for word_id in word_ids):
      continue

    with torch.no_grad():
      outputs = model(encoding["input_ids"].to(device))

    # One predicted label id per token position, special tokens included, so
    # positions line up with word_ids without any offset.
    predictions = outputs[0].argmax(dim=-1)[0].tolist()

    # Position of the first sub-word token of every word present in the encoding.
    first_token_position = {}
    for position, word_id in enumerate(word_ids):
      if word_id is not None and word_id not in first_token_position:
        first_token_position[word_id] = position

    for j, word in enumerate(paragraph_words):
      position = first_token_position.get(j)
      if position is None:
        test_set_result.append(f"{word}\tO\n")
      else:
        first_subword_tag = id2tag[predictions[position]]
        test_set_result.append(f"{word}\t{first_subword_tag}\n")

    test_set_result.append("\n")

  return test_set_result

In [182]:
# Tag our own test article with the model as it stands after the partial-corpus run.
own_test_set_result = process_test_set(own_test_paragraphs)

In [183]:
# Write the predictions for our own test article into the working directory.
output_s_conll_file_path = os.path.join(os.getcwd(), "XLNet-result.conll")
content = "".join(own_test_set_result)
with open(output_s_conll_file_path, mode="w") as fd:
  fd.write(content)

In [184]:
# Free Python garbage and cached CUDA memory again before the second training run.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Full-corpus run: same hyper-parameters as the first run, but without an
# evaluation dataset or metric callback.
# NOTE(review): `model` is the same object that the first Trainer already
# fine-tuned, so this run continues from those weights rather than from the
# pretrained checkpoint, and it writes to the same output_dir — confirm that
# both are intended.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=25,
    weight_decay=0.01,
    # The string "none" is what disables logging integrations; in
    # transformers 4.x a literal None is treated as "not set" and falls back
    # to the default (all installed integrations).
    # NOTE(review): assumes the intent was to disable reporting — confirm.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=full_train_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Train on the full corpus (this Trainer was built without an evaluation dataset).
trainer.train()

In [None]:
# Save the model trained on the full corpus.
trainer.save_model('./full_saved_model')
# NOTE(review): model_checkpoint is not read anywhere in the visible notebook.
model_checkpoint = "./full_saved_model"

In [188]:
# Tag the final test set with the model as it stands after the full-corpus run.
actual_test_set_result = process_test_set(actual_test_paragraphs)

In [189]:
# Write the final test-set predictions into the working directory.
output_s_conll_file_path = os.path.join(os.getcwd(), "sciner-mysys.conll")
content = "".join(actual_test_set_result)
with open(output_s_conll_file_path, mode="w") as fd:
  fd.write(content)