In [1]:
!git clone https://github.com/L0czek/Mazury.git /content/Mazury

fatal: destination path '/content/Mazury' already exists and is not an empty directory.


In [2]:
!pip install -qqq torch pandas numpy transformers pytorch_lightning wandb

In [3]:
import os
import time
import typing as t

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
from transformers import XLNetTokenizerFast, XLNetModel
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

In [4]:
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mbluealien99[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
DATASET = "headlines" # @param ["images", "headlines", "answers-students"]
DATA_DIR = os.path.join('/content/Mazury/data/', DATASET) 

In [6]:
types_map = {
    'EQUI': 7,
    'OPPO': 6,
    'SPE1': 5,
    'SPE2': 4,
    'SIMI': 3,
    'REL': 2,
    'ALIC': 1,
    'NOALI': 0,
}

def types_to_int(types):
    return list(map(lambda x: types_map[x], types))

In [7]:
TRUNCATION = True # @param {type: "boolean"}
MAX_LENGTH = 16 # @param {type: "slider", min: 1, max: 128}

class NLPDataset(Dataset):

    def __init__(self, file_path):
        # quoting=3, 3 is QUOTE_NONE
        self.data = pd.read_csv(file_path, sep='\t', keep_default_na=False, quoting=3)
        tokenizer = XLNetTokenizerFast.from_pretrained('xlnet-base-cased')
        types = types_to_int(self.data['y_type'].tolist())

        self.enc1 = tokenizer(
            self.data['x1'].tolist(), 
            truncation=TRUNCATION, 
            padding='max_length', 
            max_length=MAX_LENGTH
        )
        
        self.enc2 = tokenizer(
            self.data['x2'].tolist(), 
            truncation=TRUNCATION, 
            padding='max_length', 
            max_length=MAX_LENGTH
        )
        
        self.types = torch.nn.functional.one_hot(torch.tensor(types), num_classes=len(types_map)).float()
        self.scores = torch.tensor(self.data['y_score']).float()
            
    def __getitem__(self, index):
        a, b = self.enc1[index], self.enc2[index]
        x = torch.tensor([
            [a.ids, b.ids],
            [a.attention_mask, b.attention_mask],
            [a.type_ids, b.type_ids]
        ])
        y = (self.types[index], self.scores[index])
        return x, y  
        
    def __len__(self):
        return self.types.shape[0];


In [8]:
class NLPDataModule(LightningDataModule):
    def __init__(self, train_path: str, test_path: str, batch_size: int, train_batch_size: int, num_workers: int):
        super().__init__()
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        self.batch_size = batch_size
        self.train_batch_size = train_batch_size
        self.num_workers = num_workers
        self.train_path = train_path
        self.test_path = test_path
        self.prepare_data_per_node = True

    def _split(self, dataset, prop):
        a = int(len(dataset) * prop)
        b = len(dataset) - a
        return random_split(dataset, (a, b))

    def prepare_data(self):
        pass

    def setup(self, stage):
        if self.train_dataset is not None:
            return
      
        self.train_dataset, self.val_dataset = self._split(NLPDataset(self.train_path), 0.8)
        self.test_dataset = NLPDataset(self.test_path)
  
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.train_batch_size, num_workers=self.num_workers)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

In [9]:
class NLPModel(LightningModule):
    def __init__(self, pretrained: str = 'xlnet-base-cased', lr: float = 0.001):
        super().__init__()
        self.xlnet = XLNetModel.from_pretrained(pretrained)
        self.scoring_head = torch.nn.Linear(in_features=768 * 2, out_features=1)
        self.class_head = torch.nn.Linear(in_features=768 * 2, out_features=len(types_map))
        self.lr = lr
        self.save_hyperparameters()
  
    def _step(self, batch, batch_idx, id: str):
        x, y = batch
        y_hat = self.forward(x)
        return self.loss(y, y_hat, id)

    def _forward_xlnet(self, *, input_ids, attention_mask, token_type_ids):
        return torch.concat([ 
            torch.mean(self.xlnet(
                input_ids=input_ids[i], 
                attention_mask=attention_mask[i], 
                token_type_ids=token_type_ids[i]
            ).last_hidden_state, dim=1) for i in range(2)
        ], dim=1)

    def forward(self, x):
        input_ids, attention_mask, token_type_ids = torch.permute(x, (1, 2, 0, 3))
        last_hidden_state = self._forward_xlnet(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        score = torch.reshape(self.scoring_head(last_hidden_state), (-1,))
        cls = torch.nn.functional.softmax(self.class_head(last_hidden_state), dim=1)
        return cls, score

    def loss(self, y, y_hat, id):
        scoring_loss = torch.nn.functional.mse_loss(y_hat[1], y[1])
        class_loss = torch.nn.functional.binary_cross_entropy_with_logits(y_hat[0], y[0])
        wandb.log({f"{id}_scoring_loss": scoring_loss, f"{id}_class_loss": class_loss})
        return scoring_loss + class_loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "train")
        wandb.log({'train_loss': loss})
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "val")
        wandb.log({'val_loss': loss})
        return loss

    def test_step(self, batch, batch_idx):
        loss = self._step(batch, batch_idx, "test")
        wandb.log({'test_loss': loss})
        return loss

    def predict_step(self, batch, batch_idx):
        x, y = batch
        types, scores = self.forward(x)

        return \
            torch.argmax(types, dim=1), \
            torch.clamp(torch.round(scores).int(), min=0, max=5)
    
    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [10]:
LR = 1e-5# @param
BASE_MODEL = "xlnet-base-cased" # @param ["xlnet-base-cased"]

model = NLPModel(pretrained=BASE_MODEL, lr=LR)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
DO_LOAD_MODEL = False # @param {type: "boolean"}
SAVED_MODEL_FILE = "model" # @param {type: "string"}

if DO_LOAD_MODEL:
    if os.path.isfile(SAVED_MODEL_FILE):
        model.load_state_dict(torch.load("model"))
    else:
        raise RuntimeError("Model file not found")

In [12]:
TRAIN_BATCH_SIZE = 16 # @param {type: "slider", min:1, max:128}
BATCH_SIZE = 16 # @param {type: "slider", min:1, max:128}
NUM_WORKERS = 2 # @param {type: "slider", min:1, max:16}

data = NLPDataModule(
    f'{DATA_DIR}/train.tsv', 
    f'{DATA_DIR}/test.tsv', 
    batch_size=BATCH_SIZE, 
    train_batch_size=TRAIN_BATCH_SIZE,
    num_workers=NUM_WORKERS
)

In [13]:
EPOCHS = 10 # @param {type: "slider", min:1, max:128}
ACCELERATOR = "auto" # @param ["auto", "gpu", "tpu", "cpu"]

trainer = Trainer(accelerator=ACCELERATOR, max_epochs=EPOCHS)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
WANDB_PROJECT = "mazury" # @param {type: "string"}
WANDB_ENTITY = "bluealien99" # @param {type: "string"}
WANDB_EXPERIMENT_RUN_NAME = "run " # @param {type: "string"}
WANDB_EXPERIMENT_RUN_NAME_APPEND_TIME = True # @param {type: "boolean"}

name = f"{WANDB_EXPERIMENT_RUN_NAME}{time.ctime() if WANDB_EXPERIMENT_RUN_NAME_APPEND_TIME else ''}"

wandb.init(
    project=WANDB_PROJECT, 
    entity=WANDB_ENTITY, 
    name=name
)
wandb.watch(model)

trainer.fit(model, data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type       | Params
--------------------------------------------
0 | xlnet        | XLNetModel | 116 M 
1 | scoring_head | Linear     | 1.5 K 
2 | class_head   | Linear     | 12.3 K
--------------------------------------------
116 M     Trainable params
0         Non-trainable params
116 M     Total params
466.929   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [15]:
trainer.test(model, data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

[{}]

In [16]:
DO_SAVE_MODEL = True # @param {type: "boolean"}
DO_UPLOAD_MODEL = False # @param {type: "boolean"}

if DO_SAVE_MODEL:
    torch.save(model.state_dict(), "model")

if DO_UPLOAD_MODEL:
    wandb.save("model")

In [17]:
predictions = trainer.predict(model, data)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 199it [00:00, ?it/s]

In [18]:
fields_sep = ' // '

def preds_to_wa(wa_content: str, preds_lines: t.List[str]):
    wa_lines = wa_content.splitlines()

    idx = 0
    result = []

    for line in wa_lines:
        line_res = line

        if '<==>' in line:
            fields = line.split(fields_sep)
            preds_fields = preds_lines[idx].split()

            fields[1] = preds_fields[1]
            fields[2] = preds_fields[2]

            line_res = fields_sep.join(fields)
            idx += 1

        result.append(line_res)
    
    return '\n'.join(result)

In [19]:
def flatten(t):
    return [item for sublist in t for item in sublist]

types_inv_map = {v: k for k, v in types_map.items()}

types = list(map(lambda t: types_inv_map[t], flatten([t.tolist() for t, s in predictions])))
scores = flatten([s.tolist() for t, s in predictions])

predictions = [
    f"{index}\t{item[0]} {item[1]}\n" for index, item in enumerate(zip(types, scores))
]

In [20]:
wa_file = os.path.join(DATA_DIR, f"STSint.testinput.{DATASET}.wa")
wa_output_file = os.path.join(DATA_DIR, f"STSint.testinput.{DATASET}-predictions.wa")

with open(wa_file) as file:
    wa_test = file.read()

wa_predictions = preds_to_wa(wa_test, predictions)

with open(wa_output_file, "w") as file:
    file.write(wa_predictions)

In [21]:
from subprocess import check_output

cmds = [
    f"perl evalF1_penalty.pl {wa_file} {wa_output_file}",
    f"perl evalF1_no_penalty.pl {wa_file} {wa_output_file}",
]

for cmd in cmds:
    print(f"Executing {cmd}")
    print(check_output(cmd.split(), cwd="/content/Mazury").decode())

Executing perl evalF1_penalty.pl /content/Mazury/data/headlines/STSint.testinput.headlines.wa /content/Mazury/data/headlines/STSint.testinput.headlines-predictions.wa
 F1 Ali     1.0000
 F1 Type    0.5619
 F1 Score   0.8780
 F1 Typ+Sco 0.7119

Executing perl evalF1_no_penalty.pl /content/Mazury/data/headlines/STSint.testinput.headlines.wa /content/Mazury/data/headlines/STSint.testinput.headlines-predictions.wa
 F1 Ali     1.0000
 F1 Type    0.5619
 F1 Score   0.8780
 F1 Typ+Sco 0.5229

