In [1]:
!pip install -qqq torch pandas numpy transformers[torch] pytorch_lightning wandb datasets lightning[extra]

In [2]:
!git clone https://github.com/BartlomiejOlber/xlnet_ists.git /content/repo

In [4]:
import os
import time
import wandb

# --- Weights & Biases configuration (Colab form parameters) ---
WANDB_PROJECT = "ENTER_YOUR_VALUE" # @param {type: "string"}
WANDB_ENTITY = "ENTER_YOUR_VALUE" # @param {type: "string"}
WANDB_EXPERIMENT_RUN_NAME = "run " # @param {type: "string"}
WANDB_EXPERIMENT_RUN_NAME_APPEND_TIME = True # @param {type: "boolean"}

# Run name used by the Hugging Face Trainer; optionally suffixed with the current time.
WANDB_RUN_FULLNAME = f"{WANDB_EXPERIMENT_RUN_NAME}{time.ctime() if WANDB_EXPERIMENT_RUN_NAME_APPEND_TIME else ''}"

os.environ["WANDB_PROJECT"] = WANDB_PROJECT
os.environ["WANDB_NOTEBOOK_NAME"] = "./NLP_Proj.ipynb"
os.environ["WANDB_ENTITY"] = WANDB_ENTITY
# The API key is intentionally NOT assigned here: a key pasted into the notebook
# ends up in version control, and a placeholder value would overwrite a valid
# WANDB_API_KEY that is already exported. Export WANDB_API_KEY before starting the
# kernel, or let wandb.login() below ask for it.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msafednn[0m (use `wandb login --relogin` to force relogin)


True

In [5]:
import math

from datasets import load_dataset

from transformers.trainer_utils import get_last_checkpoint
from transformers import (
    AutoConfig,
    AutoTokenizer,
    DataCollatorForPermutationLanguageModeling,
    XLNetLMHeadModel,
    XLNetModel
)
from transformers import Trainer as Trainer_hf
from torch.utils.data import random_split
from transformers.training_args import TrainingArguments

# Output directory of the unsupervised (permutation LM) Trainer; load_pretrained
# later reads `pytorch_model.bin` from this directory.
CHECKPOINT_DIR = "../test-plm"
# Passed as `truncation=` to every tokenizer call in this notebook.
TRUNCATION = True # @param {type: "boolean"}
# Padded/truncated token length used by every tokenizer call.
# NOTE(review): the value 16 is below the slider's declared minimum of 64 — confirm which is intended.
MAX_LENGTH = 16 # @param {type: "slider", min: 64, max: 512}
# Hugging Face model id used for the tokenizer and all XLNet weights.
BASE_MODEL = "xlnet-base-cased" # @param ["xlnet-base-cased"]

def split(dataset, prop):
    """Randomly split `dataset` into two subsets.

    The first subset receives `int(len(dataset) * prop)` items and the second
    receives all remaining items, so no element is dropped.

    Returns the pair of subsets produced by `random_split`.
    """
    total = len(dataset)
    head_size = int(total * prop)
    return random_split(dataset, (head_size, total - head_size))

def pretrain_unsupervised():
  """Continue XLNet permutation-language-model pretraining on the BeIR/quora corpus.

  Steps:
    1. Load the "corpus" configuration of BeIR/quora and the BASE_MODEL LM-head model.
    2. Tokenize every non-blank text line to a fixed length (MAX_LENGTH, capped by
       the tokenizer's own maximum).
    3. Split 80:20 into train/eval subsets.
    4. Train with the Hugging Face Trainer (resuming from the last checkpoint in
       CHECKPOINT_DIR when one exists), then evaluate and log perplexity.

  The trained model, metrics and trainer state are written to CHECKPOINT_DIR.
  Returns None.
  """
  raw_datasets = load_dataset("BeIR/quora", "corpus")

  config = AutoConfig.from_pretrained(BASE_MODEL)
  model = XLNetLMHeadModel.from_pretrained(
      BASE_MODEL,
      config=config
  )

  # Only grow the embedding matrix when the tokenizer knows more tokens than the model.
  embedding_size = model.get_input_embeddings().weight.shape[0]
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
  if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))

  text_column_name = "text"
  max_seq_length = min(MAX_LENGTH, tokenizer.model_max_length)

  def tokenize_function(examples):
    # Skip empty / whitespace-only lines before tokenizing.
    lines = [line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()]
    return tokenizer(lines, padding="max_length", truncation=TRUNCATION, max_length=max_seq_length)

  tokenized_datasets = raw_datasets.map(
      tokenize_function,
      batched=True,
      # Dropping blank lines changes the batch length, so every original column
      # has to be removed; otherwise the leftover columns no longer line up with
      # the tokenizer output and the batched map fails.
      remove_columns=raw_datasets["corpus"].column_names,
      load_from_cache_file=True,
      desc="Running tokenizer on dataset line_by_line",
  )
  data_collator = DataCollatorForPermutationLanguageModeling(
      tokenizer=tokenizer,
      plm_probability=1/3,
      max_span_length=5,
  )

  # split 80:20
  prop = 0.8
  train_dataset, eval_dataset = split(tokenized_datasets["corpus"], prop)

  training_args = TrainingArguments(**{
      "per_device_train_batch_size": 64,
      "output_dir": CHECKPOINT_DIR,
      "do_train": True,
      "do_eval": True,
      "overwrite_output_dir": False,
      "full_determinism": False,
      # load_pretrained reads `pytorch_model.bin`, so keep the non-safetensors format.
      "save_safetensors": False,
      "seed": 1,
      "report_to": "wandb",
      "run_name": WANDB_RUN_FULLNAME,
      "logging_steps": 50
  })
  training_args = training_args.set_save(strategy="steps", steps=10000)

  # Initialize our Trainer
  trainer = Trainer_hf(
      model=model,
      args=training_args,
      train_dataset=train_dataset,
      eval_dataset=eval_dataset,
      tokenizer=tokenizer,
      data_collator=data_collator,
  )

  # Resume from the most recent checkpoint unless the output dir is to be overwritten.
  last_checkpoint = None
  if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(training_args.output_dir)

  # Training
  if training_args.do_train:
    train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
    trainer.save_model()

    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

  # Evaluation
  if training_args.do_eval:
    metrics = trainer.evaluate()
    metrics["eval_samples"] = len(eval_dataset)
    try:
      perplexity = math.exp(metrics["eval_loss"])
    except OverflowError:
      # exp() overflowed: report an infinite perplexity instead of failing.
      perplexity = float("inf")
    metrics["perplexity"] = perplexity

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from transformers import XLNetTokenizerFast, XLNetModel
from pytorch_lightning import LightningDataModule, LightningModule, Trainer

# Learning rate passed to NLPModel for both supervised pretraining and fine-tuning.
LR = 1e-5# @param
# Maps each alignment-type label (the `y_type` column read by NLPDataset) to its
# integer class index; its size is the default number of classes of NLPModel.
types_map = {
    'EQUI': 7,
    'OPPO': 6,
    'SPE1': 5,
    'SPE2': 4,
    'SIMI': 3,
    'REL': 2,
    'ALIC': 1,
    'NOALI': 0,
}

class NLPModel(LightningModule):
    """XLNet encoder with two linear heads over a pair of text chunks.

    Both chunks are encoded by the same XLNet, mean-pooled over the token axis
    and concatenated. A scoring head regresses a scalar similarity score and a
    class head predicts the alignment type.

    Args:
        pretrained: Hugging Face model id or path passed to `XLNetModel.from_pretrained`.
        num_classes: number of outputs of the class head.
        lr: learning rate of the AdamW optimizer.
    """

    def __init__(self, pretrained: str = BASE_MODEL, num_classes=len(types_map), lr: float = 0.001):
        super().__init__()
        self.xlnet = XLNetModel.from_pretrained(pretrained)
        # Two pooled chunk embeddings are concatenated, hence twice the encoder
        # width (768 for xlnet-base-cased); read it from the config so other
        # XLNet sizes work as well.
        pair_features = self.xlnet.config.d_model * 2
        self.scoring_head = torch.nn.Linear(in_features=pair_features, out_features=1)
        self.class_head = torch.nn.Linear(in_features=pair_features, out_features=num_classes)
        self.lr = lr
        self.save_hyperparameters()

    def _step(self, batch, batch_idx, id: str):
        """Shared train/val/test step: returns the combined loss for `batch`."""
        x, y = batch
        # The loss applies its own sigmoid, so it must receive raw logits rather
        # than the softmax probabilities returned by forward().
        y_hat = self._forward_logits(x)
        return self.loss(y, y_hat, id)

    def _forward_xlnet(self, *, input_ids, attention_mask, token_type_ids):
        """Encode both chunks and return their concatenated mean-pooled states.

        Each argument is indexed with 0 and 1 to select the first / second chunk.
        Pooling is a plain mean over dim 1 of `last_hidden_state`; the attention
        mask is not used in the average, so padded positions are included.
        """
        return torch.concat([
            torch.mean(self.xlnet(
                input_ids=input_ids[i],
                attention_mask=attention_mask[i],
                token_type_ids=token_type_ids[i]
            ).last_hidden_state, dim=1) for i in range(2)
        ], dim=1)

    def _forward_logits(self, x):
        """Return `(class_logits, score)` without any activation on the class head."""
        # x stacks (input_ids, attention_mask, token_type_ids) on dim 1 and the two
        # chunks on dim 2 (as built by the dataset classes in this notebook);
        # permuting lets the three feature tensors be unpacked directly.
        input_ids, attention_mask, token_type_ids = torch.permute(x, (1, 2, 0, 3))
        pooled = self._forward_xlnet(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        score = torch.reshape(self.scoring_head(pooled), (-1,))
        return self.class_head(pooled), score

    def forward(self, x):
        """Return `(class_probabilities, score)`; probabilities are a softmax over dim 1."""
        class_logits, score = self._forward_logits(x)
        cls = torch.nn.functional.softmax(class_logits, dim=1)
        return cls, score

    def loss(self, y, y_hat, id):
        """Sum of MSE on the score and BCE-with-logits on the class head.

        Args:
            y: `(class_targets, score_targets)`.
            y_hat: `(class_logits, score_predictions)`; the class part must be raw
                logits because `binary_cross_entropy_with_logits` applies the
                sigmoid internally.
            id: prefix ("train", "val", "test") for the logged metric names.
        """
        scoring_loss = torch.nn.functional.mse_loss(y_hat[1], y[1])
        class_loss = torch.nn.functional.binary_cross_entropy_with_logits(y_hat[0], y[0])
        wandb.log({f"{id}_scoring_loss": scoring_loss, f"{id}_class_loss": class_loss})
        return scoring_loss + class_loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "train")
        wandb.log({'train_loss': loss})
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "val")
        wandb.log({'val_loss': loss})
        return loss

    def test_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "test")
        wandb.log({'test_loss': loss})
        return loss

    def predict_step(self, batch, batch_idx):
        """Return `(predicted_class_index, integer_score)` for a batch.

        Scores are rounded to the nearest integer and clamped to the range 0..5.
        """
        x, y = batch
        types, scores = self.forward(x)

        return \
            torch.argmax(types, dim=1), \
            torch.clamp(torch.round(scores).int(), min=0, max=5)

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [7]:
class QuoraDataset(Dataset):
    """Torch dataset of Quora question pairs for duplicate detection.

    Takes the "train" split of the given Hugging Face dataset dict and tokenizes
    the first and second question of every pair separately. The `is_duplicate`
    flag is used both as a one-hot class target (2 classes) and as a float
    score target.
    """

    def __init__(self, quora_hf_dataset):
        self.hf_dataset = quora_hf_dataset["train"]
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

        def encode_question(position):
            # Tokenize the question at `position` (0 or 1) of every pair.
            def tokenize_function(examples):
                examples["text"] = [pair["text"][position] for pair in examples["questions"]]
                return tokenizer(examples["text"], truncation=TRUNCATION, padding='max_length', max_length=MAX_LENGTH)

            return self.hf_dataset.map(
                tokenize_function,
                batched=True,
                load_from_cache_file=True,
                desc="Running tokenizer on dataset line_by_line",
            )

        self.enc1 = encode_question(0)
        self.enc2 = encode_question(1)

        duplicate_flags = [int(flag) for flag in self.hf_dataset["is_duplicate"]]

        assert len(self.enc1) == len(duplicate_flags) == len(self.enc2)
        self.types = torch.nn.functional.one_hot(torch.tensor(duplicate_flags), num_classes=2).float()
        self.scores = torch.tensor(duplicate_flags).float()

    def __getitem__(self, index):
        """Return `(x, (class_target, score_target))` for one question pair.

        `x` stacks input ids, attention mask and token type ids (first axis) of
        the two questions (second axis).
        """
        first, second = self.enc1[index], self.enc2[index]
        feature_keys = ("input_ids", "attention_mask", "token_type_ids")
        x = torch.tensor([[first[key], second[key]] for key in feature_keys])
        return x, (self.types[index], self.scores[index])

    def __len__(self):
        return self.types.shape[0]

In [8]:
def types_to_int(types):
    """Translate alignment-type labels into their integer ids via `types_map`.

    Raises KeyError for a label that is not a key of `types_map`.
    """
    return [types_map[label] for label in types]

class NLPDataset(Dataset):
    """Torch dataset over a tab-separated file with columns x1, x2, y_type, y_score.

    `x1` and `x2` are tokenized separately to MAX_LENGTH; `y_type` becomes a
    one-hot class target over `types_map` and `y_score` a float score target.
    """

    def __init__(self, file_path):
        # quoting=3 is csv.QUOTE_NONE: quote characters in the text are kept literally.
        self.data = pd.read_csv(file_path, sep='\t', keep_default_na=False, quoting=3)
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
        type_ids = types_to_int(self.data['y_type'].tolist())

        def encode(column):
            # Tokenize one text column with the shared truncation/padding settings.
            return tokenizer(
                self.data[column].tolist(),
                truncation=TRUNCATION,
                padding='max_length',
                max_length=MAX_LENGTH
            )

        self.enc1 = encode('x1')
        self.enc2 = encode('x2')

        self.types = torch.nn.functional.one_hot(torch.tensor(type_ids), num_classes=len(types_map)).float()
        self.scores = torch.tensor(self.data['y_score']).float()

    def __getitem__(self, index):
        """Return `(x, (class_target, score_target))` for one row.

        `x` stacks ids, attention mask and type ids (first axis) of the two
        chunks (second axis).
        """
        first, second = self.enc1[index], self.enc2[index]
        attribute_names = ("ids", "attention_mask", "type_ids")
        x = torch.tensor([
            [getattr(first, name), getattr(second, name)] for name in attribute_names
        ])
        return x, (self.types[index], self.scores[index])

    def __len__(self):
        return self.types.shape[0]

In [9]:
class NLPDataModule(LightningDataModule):
    """Lightning data module wrapping already-built train/val/test datasets.

    Args:
        train_dataset, val_dataset, test_dataset: datasets served by the loaders.
        batch_size: batch size for validation, test and prediction.
        train_batch_size: batch size for training.
        num_workers: worker processes per DataLoader (0 loads in the main process).
    """

    def __init__(self, train_dataset, val_dataset, test_dataset, batch_size: int, train_batch_size: int, num_workers: int):
        super().__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.test_dataset = test_dataset
        self.batch_size = batch_size
        self.train_batch_size = train_batch_size
        self.num_workers = num_workers
        self.prepare_data_per_node = True

    def prepare_data(self):
        # Nothing to download or preprocess: datasets are passed in ready-made.
        pass

    def setup(self, stage):
        # Datasets are supplied through the constructor, so there is nothing to build.
        return

    def train_dataloader(self):
        # NOTE(review): no shuffle=True here, so batch order is identical every epoch — confirm this is intended.
        return DataLoader(self.train_dataset, batch_size=self.train_batch_size, num_workers=self.num_workers)

    def val_dataloader(self):
        # persistent_workers is only valid with at least one worker process;
        # DataLoader raises ValueError when it is requested with num_workers=0.
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            persistent_workers=self.num_workers > 0,
        )

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def predict_dataloader(self):
        # Predictions run over the test set, in order.
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

In [10]:
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 16 # @param {type: "slider", min:1, max:128}
# Batch size of the validation / test / predict DataLoaders.
BATCH_SIZE = 16 # @param {type: "slider", min:1, max:128}
# Worker processes per DataLoader.
NUM_WORKERS = 2 # @param {type: "slider", min:1, max:16}
# Epochs of supervised pretraining on Quora.
QUORA_EPOCHS = 3 # @param {type: "slider", min:1, max:128}
# Epochs of fine-tuning on each iSTS dataset.
ISTS_EPOCHS = 20 # @param {type: "slider", min:1, max:128}
# Accelerator passed to the Lightning Trainer.
ACCELERATOR = "auto" # @param ["auto", "gpu", "tpu", "cpu"]
def train_supervised(model, data, save_path, epochs):
  """Fit and test `model` on `data` with Lightning, logging to a new wandb run.

  After testing, only the XLNet encoder weights (`model.xlnet.state_dict()`)
  are saved to `save_path`; the heads are not persisted.

  Returns the Lightning Trainer that was used.
  """
  trainer = Trainer(accelerator=ACCELERATOR, max_epochs=epochs)

  # Build a run name at call time so each training gets its own timestamp.
  time_suffix = time.ctime() if WANDB_EXPERIMENT_RUN_NAME_APPEND_TIME else ''
  run_name = f"{WANDB_EXPERIMENT_RUN_NAME}{time_suffix}"
  wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, name=run_name)
  wandb.watch(model)

  trainer.fit(model, data)
  trainer.test(model, data)

  encoder_weights = model.xlnet.state_dict()
  torch.save(encoder_weights, save_path)
  return trainer

In [11]:
def hf2pl(state_dict):
  """Strip the leading "transformer." prefix from state-dict keys, in place.

  This renames the keys of an LM-head checkpoint (encoder stored under
  `transformer.`) so they match a bare encoder. Only a prefix at the very start
  of a key is removed; keys that merely contain "transformer." further inside
  are left untouched.

  Args:
    state_dict: mutable mapping of parameter names to values; it is modified.

  Returns:
    The same `state_dict` object, for call chaining.
  """
  prefix = "transformer."
  # Iterate over a snapshot of the keys because the dict is mutated in the loop.
  for key in list(state_dict.keys()):
    if key.startswith(prefix):
      state_dict[key[len(prefix):]] = state_dict.pop(key)
  return state_dict

# Encoder weights written by pretrain_supervised and read back by load_pretrained.
SUPERVISED_PRETRAINED_CHECKPOINT = "../supervised_pretrained.pth"
# Encoder weights saved by finetune when starting from the supervised pretraining.
SUPERVISED_FINETUNED_CHECKPOINT = "../supervised_finetuned.pth"
# Encoder weights saved by finetune when starting from the unsupervised pretraining.
UNSUPERVISED_FINETUNED_CHECKPOINT = "../unsupervised_finetuned.pth"

def load_pretrained(unsupervised, no_pretrain=False):
  """Build an NLPModel and initialise its XLNet encoder from a pretraining stage.

  Args:
    unsupervised: when True, load the permutation-LM checkpoint written to
      CHECKPOINT_DIR (its keys are converted with hf2pl); otherwise load the
      supervised checkpoint at SUPERVISED_PRETRAINED_CHECKPOINT.
    no_pretrain: when True, skip both checkpoints and keep the stock BASE_MODEL
      weights that NLPModel loads in its constructor.

  Returns:
    The initialised NLPModel.
  """
  model = NLPModel(lr=LR)

  if no_pretrain:
    # Decide before touching any checkpoint file: previously a checkpoint was
    # loaded (and had to exist) even though its weights were thrown away.
    return model

  # map_location="cpu" lets checkpoints saved on a GPU be read on any machine;
  # load_state_dict copies the values into the model's own parameters.
  if unsupervised:
    state_dict = torch.load(f"{CHECKPOINT_DIR}/pytorch_model.bin", map_location="cpu")
    # strict=False: the LM-head checkpoint carries keys the bare encoder lacks.
    model.xlnet.load_state_dict(hf2pl(state_dict), strict=False)
  else:
    state_dict = torch.load(SUPERVISED_PRETRAINED_CHECKPOINT, map_location="cpu")
    model.xlnet.load_state_dict(state_dict)

  return model

In [12]:
def finetune(unsupervised, data_dir):
  """Fine-tune a pretrained model on the train.tsv / test.tsv files in `data_dir`.

  `train.tsv` is split 80:20 into train and validation subsets; `test.tsv` is
  used for testing. `unsupervised` selects which pretraining checkpoint to start
  from and which path the fine-tuned encoder is saved to.

  Returns `(trainer, model, data_module)`.
  """
  full_train = NLPDataset(f'{data_dir}/train.tsv')
  train_dataset, val_dataset = split(full_train, 0.8)
  test_dataset = NLPDataset(f'{data_dir}/test.tsv')

  data = NLPDataModule(
      train_dataset,
      val_dataset,
      test_dataset,
      batch_size=BATCH_SIZE,
      train_batch_size=TRAIN_BATCH_SIZE,
      num_workers=NUM_WORKERS
  )
  model = load_pretrained(unsupervised=unsupervised)

  if unsupervised:
    save_path = UNSUPERVISED_FINETUNED_CHECKPOINT
  else:
    save_path = SUPERVISED_FINETUNED_CHECKPOINT

  trainer = train_supervised(model, data, save_path, epochs=ISTS_EPOCHS)
  return trainer, model, data

In [13]:
def pretrain_supervised():
  """Pretrain the encoder on Quora duplicate-question detection.

  The Quora pairs are split 80/10/10 into train/val/test, a 2-class NLPModel is
  trained for QUORA_EPOCHS and its encoder weights are saved to
  SUPERVISED_PRETRAINED_CHECKPOINT.

  Returns the Lightning Trainer that was used.
  """
  quora_dataset = QuoraDataset(load_dataset("quora", "default"))
  model = NLPModel(pretrained=BASE_MODEL, num_classes=2, lr=LR)

  # random_split requires the lengths to add up to the dataset size exactly.
  # Truncating all three shares independently can leave a remainder, so the
  # test split takes whatever is left after train and validation.
  n_total = len(quora_dataset)
  n_train = int(n_total * 0.8)
  n_val = int(n_total * 0.1)
  n_test = n_total - n_train - n_val
  train_dataset, val_dataset, test_dataset = random_split(quora_dataset, [n_train, n_val, n_test])

  data = NLPDataModule(
      train_dataset,
      val_dataset,
      test_dataset,
      batch_size=BATCH_SIZE,
      train_batch_size=TRAIN_BATCH_SIZE,
      num_workers=NUM_WORKERS
  )
  return train_supervised(model, data, SUPERVISED_PRETRAINED_CHECKPOINT, epochs=QUORA_EPOCHS)

In [14]:
# Separator between the fields of an alignment line in a .wa file.
fields_sep = ' // '

def preds_to_wa(wa_content: str, preds_lines):
    """Write predicted types and scores into the alignment lines of a .wa document.

    Every line containing '<==>' is treated as an alignment line. Its second and
    third ' // '-separated fields are replaced by the second and third
    whitespace-separated tokens of the next unused entry of `preds_lines`.
    All other lines are copied unchanged.

    Args:
        wa_content: full text of the .wa file.
        preds_lines: one prediction string per alignment line, in file order.

    Returns:
        The rewritten document, lines joined with '\\n'.
    """
    rewritten = []
    next_pred = 0

    for raw_line in wa_content.splitlines():
        if '<==>' not in raw_line:
            rewritten.append(raw_line)
            continue

        columns = raw_line.split(fields_sep)
        pred_tokens = preds_lines[next_pred].split()
        next_pred += 1

        columns[1], columns[2] = pred_tokens[1], pred_tokens[2]
        rewritten.append(fields_sep.join(columns))

    return '\n'.join(rewritten)

def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list (one level deep)."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Inverse of types_map: integer class index -> alignment-type label.
types_inv_map = {v: k for k, v in types_map.items()}

In [None]:
from subprocess import check_output

# iSTS dataset sub-directories (under ./data/) to fine-tune and evaluate on.
DATASETS = ["images", "headlines", "answers-students"]
# Perl evaluation scripts run on every prediction file.
EVAL_SCRIPTS = ["evalF1_penalty.pl", "evalF1_no_penalty.pl"]

pretrain_unsupervised()
pretrain_supervised()

# pretrain_type is the `unsupervised` flag of finetune():
# False -> start from the supervised checkpoint, True -> from the unsupervised one.
for pretrain_type in [False, True]:
    for dataset in DATASETS:
        data_dir = os.path.join('./data/', dataset)

        trainer_finetuned, model, data = finetune(pretrain_type, data_dir)
        predictions = trainer_finetuned.predict(model, data)

        # Each prediction batch is a (class_indices, scores) pair of tensors.
        types = [types_inv_map[t] for t in flatten([t.tolist() for t, s in predictions])]
        scores = flatten([s.tolist() for t, s in predictions])

        prediction_lines = [
            f"{index}\t{item[0]} {item[1]}\n" for index, item in enumerate(zip(types, scores))
        ]

        wa_file = os.path.join(data_dir, f"STSint.testinput.{dataset}.wa")
        wa_output_file = os.path.join(data_dir, f"STSint.testinput.{dataset}.unsupervised-{pretrain_type}-predictions.wa")

        with open(wa_file) as file:
            wa_test = file.read()

        wa_predictions = preds_to_wa(wa_test, prediction_lines)

        with open(wa_output_file, "w") as file:
            file.write(wa_predictions)

        print(f"{dataset}.unsupervised-{pretrain_type}")

        # Collect the output of every evaluator and write the report once.
        # Opening the report with "w" inside the loop made the second script's
        # output overwrite the first.
        report_sections = []
        for script in EVAL_SCRIPTS:
            # Build argv directly so paths containing spaces are not split apart.
            cmd = ["perl", script, wa_file, wa_output_file]
            printable_cmd = " ".join(cmd)
            print(f"Executing {printable_cmd}")
            output = check_output(cmd, cwd="./").decode()
            print(output)
            report_sections.append(f"$ {printable_cmd}\n{output}")

        with open(os.path.join(data_dir, f"STS.{dataset}.unsupervised-{pretrain_type}-output.txt"), "w") as f:
            f.write("\n".join(report_sections))

Downloading data:   0%|          | 0.00/25.3M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/522931 [00:00<?, ? examples/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Running tokenizer on dataset line_by_line:   0%|          | 0/522931 [00:00<?, ? examples/s]

Step,Training Loss
50,6.0984
100,5.2967
150,4.9437
200,4.893
250,4.8025
300,4.672
350,4.803
400,4.6034
450,4.6176
500,4.7668
