In [1]:
!pip install datasets
!pip install loralib
!pip install tiktoken
!pip install rotary-embedding-torch

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m317.4/542.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-16.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none

In [2]:
from datasets import load_dataset
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from tiktoken import get_encoding
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from typing import Tuple
import torch
import torch.nn.functional as F
from torch import nn
import inspect
from rotary_embedding_torch import RotaryEmbedding
import loralib as lora

In [3]:
import tqdm
import gc
from tqdm import tqdm

In [4]:
# Tokenizer: GPT-2 byte-pair encoding provided by tiktoken.
enc = tiktoken.get_encoding('gpt2')
vocab_size = enc.n_vocab

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Notebook-level training hyperparameters.
batch_size = 32  # number of independent sequences processed in parallel (previously 16)
learning_rate = 1e-3
max_iters = 5000
eval_interval = 100
eval_iters = 200

# Notebook-level size settings. The model classes defined below take their
# sizes from ModelConfig, not from these globals.
n_embd = 1024
n_head = 8
n_layer = 8
dropout = 0.0
# ------------
class ModelConfig:
    """Plain container of class-level defaults consumed by the model classes below.

    This is not a dataclass: ``ModelConfig()`` takes no arguments and every
    instance simply exposes the class attributes declared here.
    """
    # Maximum sequence length accepted by GPT.forward; also sizes the causal
    # mask buffer used by the non-flash attention path.
    block_size: int = 1024
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    # NOTE(review): `dim` is not read by any class visible in this notebook
    # (the model width comes from `n_embd`) - confirm before relying on it.
    dim: int = 512
    # Number of transformer Blocks stacked in GPT; also scales the c_proj init.
    n_layers: int = 8
    # Number of attention heads; n_embd must be divisible by it.
    n_heads: int = 8
    # NOTE(review): `max_seq_len` is not read by any class visible here.
    max_seq_len: int = 512
    # Epsilon passed to every RMSNorm.
    layer_norm_eps: float = 1e-6
    # Dropout probability shared by embedding, attention and MLP dropout layers.
    dropout: float = 0.0
    # Feed-forward hidden width; None makes FeedForward derive it from n_embd.
    hidden_dim: int = None
    # Model (embedding) width.
    n_embd: int = 1024
    # FeedForward rounds a derived hidden width up to a multiple of this value.
    multiple_of: int = 32
    # Dimension argument handed to RotaryEmbedding.
    rope_dim: int = 64
    # Whether the attention projection layers carry a bias term.
    bias: bool = True
    # NOTE(review): weight_decay / betas / lora_rank are declared here, but the
    # optimizer call visible in this notebook passes literal values and no
    # visible code reads lora_rank.
    weight_decay = 1e-1
    betas = (0.9, 0.99)
    lora_rank: int = 4

# Root Mean Square Layer Normalization (https://arxiv.org/abs/1910.07467)
# borrowed from the official Llama implementation:
# https://github.com/facebookresearch/llama/blob/main/llama/model.py
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learnable per-feature gain."""

    def __init__(self, dim: int, eps: float = 1e-6):
        """
        Build the normalization layer.

        Args:
            dim (int): Size of the last (feature) dimension of the input.
            eps (float, optional): Constant added to the mean square before the
                reciprocal square root, for numerical stability. Default is 1e-6.

        Attributes:
            eps (float): The stability constant given above.
            weight (nn.Parameter): Learnable gain, initialised to ones.

        """
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """
        Scale ``x`` by the reciprocal RMS of its last dimension.

        Args:
            x (torch.Tensor): Tensor to normalize.

        Returns:
            torch.Tensor: ``x`` divided by ``sqrt(mean(x**2, -1) + eps)``.

        """
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x):
        """
        Normalize ``x`` and apply the learnable gain.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Normalized input multiplied by ``weight``.

        """
        # The statistics are computed in float32, then cast back to x's dtype
        # before the gain is applied.
        normalized = self._norm(x.float()).type_as(x)
        return normalized * self.weight

class FeedForward(nn.Module):
    """Gated (SwiGLU) feed-forward block: ``w2(silu(w1 x) * w3 x)`` followed by dropout."""

    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
        """
        Args:
            dim: Input and output feature size.
            hidden_dim: Inner width. When ``None`` it is derived as two thirds
                of ``4 * dim`` (truncated), rounded up to a multiple of
                ``multiple_of``.
            multiple_of: Rounding granularity used only when ``hidden_dim`` is ``None``.
            dropout: Dropout probability applied to the block output.
        """
        super().__init__()
        if hidden_dim is None:
            two_thirds = int(2 * (4 * dim) / 3)
            # Round up to the next multiple of `multiple_of`.
            hidden_dim = multiple_of * ((two_thirds + multiple_of - 1) // multiple_of)
        # All three projections are bias-free.
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)

    def SwiGLU(self, x: torch.Tensor) -> torch.Tensor:
        '''
        Compute the SwiGLU activation (see Section 2 in
        https://arxiv.org/abs/2204.02311): the SiLU of the ``w1`` projection
        gates the ``w3`` projection element-wise.
        '''
        gate = F.silu(self.w1(x))
        return gate * self.w3(x)

    def forward(self, x):
        """Project the gated activation back to ``dim`` with ``w2`` and apply dropout."""
        projected = self.w2(self.SwiGLU(x))
        return self.dropout(projected)


class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings on q and k."""

    def __init__(self, config: ModelConfig):
        """Build the projections, dropout layers and rotary embedding from ``config``."""
        super().__init__()
        assert config.n_embd % config.n_heads == 0
        self.n_head = config.n_heads
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # One fused linear layer produces query, key and value for every head.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # Output projection applied after the heads are merged again.
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # Regularization on the attention weights and on the block output.
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.rotary = RotaryEmbedding(config.rope_dim)
        # The fused attention kernel only exists in PyTorch >= 2.0.
        self.flash = hasattr(F, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # Lower-triangular mask so that a position only attends to itself
            # and to earlier positions.
            causal = torch.tril(torch.ones(config.block_size, config.block_size))
            self.register_buffer("bias", causal.view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, n_embd); the output has the same shape."""
        B, T, C = x.size()
        head_dim = C // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, nh, T, hs)
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k, q, v = to_heads(k), to_heads(q), to_heads(v)

        # Rotary position embedding (https://arxiv.org/abs/2104.09864) on keys and queries.
        k = self.rotary.rotate_queries_or_keys(k)
        q = self.rotary.rotate_queries_or_keys(q)

        if self.flash:
            # Fused kernel; attention dropout is only active in training mode.
            y = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.dropout if self.training else 0,
                is_causal=True,
            )
        else:
            # Manual path: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
            scale = 1.0 / math.sqrt(k.size(-1))
            att = (q @ k.transpose(-2, -1)) * scale
            att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            att = self.attn_dropout(F.softmax(att, dim=-1))
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # Put the heads back side by side: (B, nh, T, hs) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        return self.resid_dropout(self.c_proj(y))


class Block(nn.Module):
    """Pre-norm transformer block: RMSNorm + attention, then RMSNorm + feed-forward, each with a residual."""

    def __init__(self, config):
        """Create both normalization layers, the attention layer and the MLP from ``config``."""
        super().__init__()
        self.rn_1 = RMSNorm(config.n_embd, eps=config.layer_norm_eps)
        self.attn = CausalSelfAttention(config)
        self.rn_2 = RMSNorm(config.n_embd, eps=config.layer_norm_eps)
        self.mlp = FeedForward(config.n_embd, config.hidden_dim, config.multiple_of, config.dropout)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.attn(self.rn_1(x))
        x = x + attended
        transformed = self.mlp(self.rn_2(x))
        return x + transformed

class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token embeddings go through dropout, ``config.n_layers`` Blocks and a final
    RMSNorm, then are projected to vocabulary logits by ``lm_head``. The token
    embedding matrix and ``lm_head`` share one weight tensor. No positional
    embedding table is used here; the attention layers apply rotary embeddings.
    """

    def __init__(self, config: ModelConfig):
        """Build the model from ``config`` and initialise its weights."""
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            drop = nn.Dropout(config.dropout),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
            ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_eps)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # Weight tying: the embedding and the output projection share one tensor.
        self.transformer.wte.weight = self.lm_head.weight

        # init all weights
        self.apply(self._init_weights)
        # apply special scaled init to the residual projections, per GPT-2 paper
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layers))

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on token indices ``idx`` of shape (b, t).

        Args:
            idx: LongTensor of token ids, with ``t <= config.block_size``.
            targets: Optional LongTensor of target ids; positions equal to -1
                are ignored by the loss.

        Returns:
            ``(logits, loss)``. With ``targets``, ``logits`` covers every
            position (b, t, vocab_size) and ``loss`` is the cross-entropy.
            Without ``targets``, ``logits`` covers only the last position
            (b, 1, vocab_size) and ``loss`` is ``None``.
        """
        _, t = idx.size()
        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        x = self.transformer.drop(tok_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
        """Create an AdamW optimizer with weight decay applied only to tensors of rank >= 2.

        Args:
            weight_decay: Decay coefficient for the decayed parameter group.
            learning_rate: Learning rate for both groups.
            betas: AdamW beta coefficients.
            device_type: The fused AdamW implementation is requested only when
                this equals ``'cuda'`` and the installed PyTorch exposes it.

        Returns:
            The configured ``torch.optim.AdamW`` instance.
        """
        # start with all of the candidate parameters
        param_dict = {pn: p for pn, p in self.named_parameters()}
        # filter out those that do not require grad
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
        print(f"using fused AdamW: {use_fused}")

        return optimizer

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.

        Args:
            idx: Conditioning token ids, shape (b, t).
            max_new_tokens: Number of tokens to append.
            temperature: Divisor applied to the logits before sampling.
            top_k: When given, only the ``top_k`` highest logits stay eligible.

        Returns:
            LongTensor of shape (b, t + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

config = ModelConfig()
model = GPT(config)
m = model.to(device)

# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# Create the optimizer from the values already declared in ModelConfig and in
# the hyperparameter block, instead of repeating them as literals here.
# (Same numbers as before: weight_decay=1e-1, lr=1e-3, betas=(0.9, 0.99).)
optimizer = model.configure_optimizers(
    weight_decay=config.weight_decay,
    learning_rate=learning_rate,
    betas=config.betas,
    device_type=device,
)


152.749312 M parameters
num decayed parameter tensors: 41, with 152,698,880 parameters
num non-decayed parameter tensors: 33, with 50,176 parameters
using fused AdamW: True


In [7]:
# Restore the pretrained language-model weights.
# NOTE(review): the path is an absolute, hard-coded Colab location; the file
# must be present there before this cell runs.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
checkpoint = torch.load("/content/checkpoint_iter_3600_3_99_v3.pth", map_location=device)
# The checkpoint is a dict; the model weights are stored under 'model_state_dict'.
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

# Dataset
## Helper function that aligns encoding between dataset and pretrained model
The tokenizer we use (GPT-2 BPE) differs from the dataset's tokenizer: ours is more fine-grained. The helper below aligns the two:
If a dataset token is split into N tokens by the model tokenizer, all N tokens are assigned the NER tag of that dataset token. The helper also returns a mask in which only the first of the N tokens is set to 1 and all others are set to 0. The mask is used at test time, so that the classifier's predictions can be aligned back to the length of the original dataset token sequence before the F1 score is computed.

In [8]:
def ner_tokenizer(tokens, tags, enc=enc):
    """Re-tokenize one word-level NER example with the model tokenizer.

    Each dataset token is encoded on its own with ``enc.encode``. Every
    resulting sub-token inherits the tag of the dataset token it came from,
    and the mask marks the first sub-token of each dataset token with 1 and
    the remaining ones with 0.

    Args:
        tokens: Sequence of dataset tokens (strings).
        tags: Sequence of tags, one per entry of ``tokens``.
        enc: Tokenizer exposing ``encode(str) -> list``.

    Returns:
        ``(ner_tokens, ner_tags, ner_mask)``: three lists of equal length, one
        entry per model sub-token.

    Raises:
        ValueError: If ``tokens`` and ``tags`` differ in length, or if a token
            encodes to no sub-tokens (its tag could then not be aligned).
    """
    if len(tokens) != len(tags):
        raise ValueError(
            f"tokens and tags must have the same length, got {len(tokens)} and {len(tags)}"
        )

    ner_tokens, ner_tags, ner_mask = [], [], []
    for token, tag in zip(tokens, tags):
        encoded = enc.encode(token)
        if not encoded:
            # Without at least one sub-token there is no position to carry the
            # "first sub-token" mask bit, so the word could not be recovered.
            raise ValueError(f"token {token!r} produced no sub-tokens; cannot align its tag")
        ner_tokens.extend(encoded)
        ner_tags.extend([tag] * len(encoded))
        ner_mask.extend([1] + [0] * (len(encoded) - 1))
    return ner_tokens, ner_tags, ner_mask

## Load the dataset
https://huggingface.co/datasets/conll2003

In [9]:
# Load CoNLL-2003 from the Hugging Face hub; later cells index the result by
# split name ('train', 'validation', 'test').
dataset = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/312k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/283k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [10]:
class NERClassficationDataset(torch.utils.data.Dataset):
    """Token-classification dataset whose features are language-model logits.

    Each example is re-tokenized with ``ner_tokenizer`` at construction time.
    ``__getitem__`` runs the frozen model on the example and returns the
    per-position logits together with the per-position NER labels.
    """

    def __init__(self, dataset, encoder, model):
        """
        Args:
            dataset: Iterable of entries with ``'tokens'`` and ``'ner_tags'``.
            encoder: Tokenizer forwarded to ``ner_tokenizer``.
            model: Language model called as ``model(idx, targets)``; it is
                switched to eval mode here.
        """
        self.dataset = []
        self.labels = []
        self.encoder = encoder
        self.masks = []
        self.model = model
        self.model.eval()
        for entry in tqdm(dataset):
            ner_tokens, ner_tags, ner_mask = ner_tokenizer(entry['tokens'], entry['ner_tags'], encoder)
            self.dataset.append(ner_tokens)
            self.labels.append(ner_tags)
            self.masks.append(ner_mask)
        self.length = len(self.labels)

    def __len__(self):
        """Number of examples."""
        return self.length

    def __getitem__(self, ind):
        """Return ``(logits, labels)`` for example ``ind``.

        ``logits`` has shape (T, vocab) on the model's device and ``labels`` is
        a CPU LongTensor of shape (T,), where T is the number of sub-tokens.
        """
        # Follow the model's own device rather than a notebook-level global.
        model_device = next(self.model.parameters()).device
        text = torch.tensor(self.dataset[ind], dtype=torch.long, device=model_device)
        embed = text.unsqueeze(0)  # (1, T)
        with torch.no_grad():
            # Passing targets makes the model return logits for every position
            # instead of only the last one.
            # squeeze(0) removes only the batch dimension, so a single-token
            # sentence still yields a 2-D (1, vocab) tensor.
            output = self.model(embed, embed)[0].squeeze(0)
            if model_device.type == 'cuda':
                torch.cuda.empty_cache()
        label = torch.tensor(self.labels[ind], dtype=torch.long)

        return output, label

    def collate_fn(self, batch):
        """Concatenate the examples of a batch along the token dimension.

        Returns ``(inputs, labels)`` with shapes (sum of T, vocab) and
        (sum of T,). 1-D feature tensors are still accepted and treated as a
        single position.
        """
        features, labels = zip(*batch)
        rows = [feat.view(1, -1) if feat.dim() == 1 else feat for feat in features]
        inputs = torch.cat(rows)
        labels = torch.cat(labels)

        return inputs, labels

    def get_dataset(self):
        """Return the list of sub-token id lists."""
        return self.dataset

    def get_labels(self):
        """Return the list of per-sub-token tag lists."""
        return self.labels

    def get_masks(self):
        """Return the list of first-sub-token masks."""
        return self.masks

train_dataset = NERClassficationDataset(dataset['train'], enc, m)
val_dataset = NERClassficationDataset(dataset['validation'], enc, m)

# Sanity check: show the feature shape and labels of the first example of each split.
for header, split in (("Train data sample: ", train_dataset), ("Val data sample: ", val_dataset)):
    sample_sentence, sample_label = split[0]
    print(header)
    print("Encoded Sentence:", sample_sentence.shape)
    print("Label:", sample_label)

100%|██████████| 14041/14041 [00:02<00:00, 6461.70it/s]
100%|██████████| 3250/3250 [00:00<00:00, 6649.81it/s]


Train data sample: 
Encoded Sentence: torch.Size([13, 50304])
Label: tensor([3, 0, 0, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0])
Val data sample: 
Encoded Sentence: torch.Size([24, 50304])
Label: tensor([0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [11]:
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, collate_fn=train_dataset.collate_fn)
# Validation data is not shuffled: the validation metrics below are averaged
# per batch, so a fixed batch composition keeps them reproducible across runs.
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, collate_fn=val_dataset.collate_fn)

# Peek at a single training batch to confirm the collated shapes:
# inputs is (total sub-tokens in the batch, vocab), labels is (total sub-tokens,).
inputs, labels = next(iter(train_loader))
print(inputs.shape)
print(labels.shape)

  0%|          | 0/55 [00:03<?, ?it/s]

torch.Size([5413, 50304])
torch.Size([5413])





In [12]:
class SimpleClassifier(nn.Module):
    """Single linear layer mapping ``input_dim`` features to ``output_dim`` class scores."""

    def __init__(self, input_dim=50304, output_dim=9):
        """Create the linear layer (with bias) of shape ``input_dim -> output_dim``."""
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Return the unnormalized class scores for ``x``."""
        return self.linear(x)
# Classification head with the default sizes (50304 input features, 9 outputs).
ner_model = SimpleClassifier()
# Move the head to the selected device; as the last expression of the cell
# this also displays the module summary.
ner_model.to(device)

SimpleClassifier(
  (linear): Linear(in_features=50304, out_features=9, bias=True)
)

In [13]:
# Cross-entropy over the per-token class scores produced by ner_model.
criterion = nn.CrossEntropyLoss()
# NOTE(review): this rebinds the name `optimizer`, which previously held the
# AdamW optimizer created for the GPT model; from here on only the
# classifier's parameters are optimized.
optimizer = torch.optim.Adam(ner_model.parameters(), lr=5e-5)

In [None]:
# First training run of the classifier head: plain train/validate loop with no
# experiment logging. Each batch holds the concatenated sub-token features of
# the sentences in it (see the dataset's collate_fn), so the loss and accuracy
# below are computed per sub-token.
num_epochs = 10

for epoch in range(num_epochs):
    # ---- training pass ----
    ner_model.train()
    total_loss = 0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Training", leave=False)

    for inputs, labels in train_loader_tqdm:
        # inputs = inputs.squeeze(0)
        # labels = labels.squeeze(0)
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = ner_model(inputs)

        loss = criterion(outputs, labels)

        loss.backward()

        optimizer.step()

        total_loss += loss.item()
        train_loader_tqdm.set_postfix({'Train Loss': f'{loss.item():.4f}'})

    # Mean of the per-batch losses (batches are not weighted by their size).
    avg_train_loss = total_loss / len(train_loader)
    tqdm.write(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}')

    # ---- validation pass ----
    ner_model.eval()
    total_val_loss = 0
    correct = 0
    total = 0
    val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Validation", leave=True)

    with torch.no_grad():
        for inputs, labels in val_loader_tqdm:
            # inputs = inputs.squeeze(0)
            # labels = labels.squeeze(0)
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = ner_model(inputs)
            # if outputs.dim() == 1:
            #     outputs = outputs.view(1, -1)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()
            # Predicted class = index of the largest score for each position.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()


    avg_val_loss = total_val_loss / len(val_loader)
    # Accuracy over all sub-token positions seen during validation, in percent.
    accuracy = 100 * correct / total
    if epoch == num_epochs - 1:
        # Blank line before the last epoch's summary.
        print()
    tqdm.write(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}%')


In [16]:
# logging
import os

import wandb

# SECURITY: never hard-code the API key in the notebook. The key that used to
# be written here has been published with the notebook and should be revoked
# in the wandb account settings.
# The key is read from the WANDB_API_KEY environment variable; when it is not
# set, key=None lets wandb fall back to its own lookup / interactive prompt.
wandb.login(key=os.environ.get("WANDB_API_KEY"), relogin=True)
# wandb.init(project='HW5', name='NER-fineTune_v2')


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [17]:
# Start the wandb run that the logged training loop below reports to.
wandb.init(
    project="HW5",           # the project should already exist in your wandb account
    name="NER-fineTune_v2",  # wandb creates a random run name if this is omitted
    reinit=True,             # allows re-initializing the run when this cell is re-run
    # To resume a previous run instead, pass its id (e.g. id="y28t31uh") together
    # with resume="must", and remove reinit=True.
)

[34m[1mwandb[0m: Currently logged in as: [33mabasrith[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [18]:
from sklearn.metrics import f1_score

In [19]:
# Number of validation batches per epoch; the logged loop divides by this value.
print(len(val_loader))

13


In [None]:
# Second training run of the classifier head, with per-epoch logging to wandb.
# Loss, accuracy and F1 are computed per sub-token position.
num_epochs = 25

for epoch in range(num_epochs):
    # ---- training pass ----
    ner_model.train()
    total_loss = 0
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Training", leave=False)

    for inputs, labels in train_loader_tqdm:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = ner_model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        train_loader_tqdm.set_postfix({'Train Loss': f'{loss.item():.4f}'})

    avg_train_loss = total_loss / len(train_loader)
    tqdm.write(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}')

    # ---- validation pass ----
    ner_model.eval()
    total_val_loss = 0
    correct = 0
    total = 0
    # Labels and predictions of the whole epoch are collected so that F1 is
    # computed once over all positions. Averaging per-batch F1 values would
    # weight small and large batches equally.
    all_labels = []
    all_predicted = []
    val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} Validation", leave=True)

    with torch.no_grad():
        for inputs, labels in val_loader_tqdm:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = ner_model(inputs)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()
            # Predicted class = index of the largest score for each position.
            _, predicted = torch.max(outputs, 1)
            all_labels.extend(labels.tolist())
            all_predicted.extend(predicted.tolist())
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    f1_score_val_total = f1_score(all_labels, all_predicted, average='micro')
    avg_val_loss = total_val_loss / len(val_loader)
    accuracy = 100 * correct / total
    tqdm.write(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}%')
    tqdm.write(f'Validation F1 (micro): {f1_score_val_total:.4f}')

    # One log call per epoch keeps all metrics of an epoch on the same wandb step.
    wandb.log({
        'Epoch': (epoch + 1),
        'Train Loss': avg_train_loss,
        'F1 score per Epoch': f1_score_val_total,
        'val_acc': accuracy,
        'Val Loss': avg_val_loss,
    })

In [21]:
# Persist the trained classifier head; the weights are stored under the
# 'model_state_dict' key of the saved dict.
ner_checkpoint = {'model_state_dict': ner_model.state_dict()}
torch.save(ner_checkpoint, "ner_weights.pth")

In [22]:
# Build the test split only. The previous chained assignment also rebound
# `train_dataset` to the test data, so re-running a training cell afterwards
# would have trained on the test split.
test_dataset = NERClassficationDataset(dataset['test'], enc, m)
test_data = test_dataset.get_dataset()    # sub-token ids per sentence
test_labels = test_dataset.get_labels()   # tag per sub-token
test_mask = test_dataset.get_masks()      # 1 on the first sub-token of each dataset token

100%|██████████| 3453/3453 [00:00<00:00, 7180.94it/s]


# Generate Test output for evaluation
Here we use the mask to drop the extra sub-token positions that were added for alignment. If a dataset token is converted to n GPT-2 tokens, only the prediction for the first of the n is written to the result for evaluation. This ensures that the predicted sequence length matches the original length.

In [32]:
import math
import inspect
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F


@torch.no_grad()
def generate(idx, max_new_tokens, model=None, context_len=128):
    """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

    Args:
        idx: LongTensor of shape (B, T) holding the current context.
        max_new_tokens: Number of tokens to sample.
        model: Callable returning ``(logits, loss)`` for a (B, T) index tensor.
            Defaults to the notebook's language model ``m``. The classifier
            ``ner_model`` cannot be used here: it returns a single tensor of
            class scores, not next-token logits.
        context_len: Only the last ``context_len`` tokens are fed to the model
            at every step.

    Returns:
        LongTensor of shape (B, T + max_new_tokens).
    """
    if model is None:
        model = m
    for _ in range(max_new_tokens):
        # crop idx to the last context_len tokens
        idx_cond = idx[:, -context_len:]
        # get the predictions
        logits, _ = model(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx


In [34]:
# Re-create the GPT-2 tokenizer (same encoding name as the `enc` defined with
# the hyperparameters); it is used below to decode generated token ids.
enc = tiktoken.get_encoding("gpt2")

In [None]:
# Start from a single token with id 0 and sample 2000 further tokens.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = generate(context, max_new_tokens=2000)
generated_tokens = generated[0].tolist()

# Decode the sampled ids back to text with the GPT-2 tokenizer.
generated_text = enc.decode(generated_tokens)
print(f"Generated text: {generated_text}")

In [41]:
# Predict one tag per original dataset token for every test sentence.
ner_model.eval()
m.eval()
res = []
# The whole loop runs without autograd: neither the language model nor the
# classifier needs gradients at test time.
with torch.no_grad():
    for i in tqdm(range(len(test_data))):
        text = torch.tensor(test_data[i], dtype=torch.long, device=device)
        embed = text.unsqueeze(0)  # (1, T)
        # Passing targets makes the model return logits for every position.
        # squeeze(0) drops only the batch dimension, so a single-token
        # sentence still yields a 2-D (1, vocab) tensor.
        features = m(embed, embed)[0].squeeze(0)
        torch.cuda.empty_cache()
        _, predicted = torch.max(ner_model(features), 1)
        predicted = predicted.tolist()  # one transfer instead of one per element

        # Keep the prediction of the first sub-token of each dataset token only.
        res.append([tag for tag, keep in zip(predicted, test_mask[i]) if keep == 1])

100%|██████████| 3453/3453 [00:45<00:00, 75.66it/s]


In [42]:
# Each prediction list must have exactly one tag per original test token.
for i, sentence_pred in enumerate(res):
    assert len(sentence_pred) == len(dataset['test'][i]['tokens'])

In [43]:
import csv

# Write one CSV row per test sentence, one predicted tag per column.
filename = "output.csv"
with open(filename, mode='w', newline='') as file:
    csv.writer(file).writerows(res)

In [None]:
# Read the predictions back: one list of integer tags per line of output.csv.
preds = []
with open("output.csv", mode='r', newline='') as file:
    for line in file:
        preds.append([int(s) for s in line.split(',')])

# Flat list of all predicted tags, built once after the file has been read.
full_preds = [tag for pred in preds for tag in pred]

# `dataset` is indexed by split name, so the test split is selected
# explicitly; len(dataset) would count the splits, not the sentences.
test_split = dataset['test']
assert len(preds) == len(test_split)

total = 0
for entry, pred in zip(test_split, preds):
    total += f1_score(entry['ner_tags'], pred, average='micro')
# Mean of the per-sentence micro-F1 scores.
total / len(test_split)

In [None]:
# Flatten predictions and ground truth into two token-level lists.
full_preds = []
for p in preds:
    full_preds += p

# Iterate over the rows of the test split: iterating `dataset` itself yields
# the split names, not examples.
full_truth = []
for data in dataset['test']:
    full_truth += data['ner_tags']

assert len(full_preds) == len(full_truth)

In [None]:
# Debugging cell: repeats the validation pass and prints every input batch.
# NOTE(review): this cell only works with state left behind by a training
# cell - it reads val_loader_tqdm and keeps adding to total_val_loss,
# f1_score_val_total, total and correct. It also runs the classifier outside
# torch.no_grad(). Presumably leftover scratch work - confirm before keeping.
for inputs, labels in val_loader_tqdm:
    # inputs = inputs.squeeze(0)
    # labels = labels.squeeze(0)
    inputs, labels = inputs.to(device), labels.to(device)
    outputs = ner_model(inputs)
    # if outputs.dim() == 1:
    #     outputs = outputs.view(1, -1)
    print(inputs)
    # print("input",enc.decode(inputs))
    # print(labels)
    # print(outputs)
    loss = criterion(outputs, labels)
    total_val_loss += loss.item()
    # Predicted class = index of the largest score for each position.
    _, predicted = torch.max(outputs, 1)
    labels_list = labels.tolist()
    predicted_list = predicted.tolist()
    # Micro-averaged F1 of this batch only.
    f1_score_val = f1_score(labels_list, predicted_list, average='micro')
    f1_score_val_total += f1_score_val
    # print(f"F1 score:{f1_score_val}")
    total += labels.size(0)
    correct += (predicted == labels).sum().item()

In [37]:
# Test split of CoNLL-2003, loaded again by name; used below for scoring and
# for printing example predictions.
test_sample_dataset = load_dataset("conll2003")['test']

In [44]:
# Read the predictions back: one list of integer tags per line of output.csv.
with open("output.csv", mode='r', newline='') as file:
    preds = [[int(s) for s in line.split(',')] for line in file]

# Average the per-sentence micro-F1 over the test split.
total = 0
for i, entry in enumerate(test_sample_dataset):
    total += f1_score(entry['ner_tags'], preds[i], average='micro')
f1 = format(total / len(test_sample_dataset), ".4f")
print("Total f1 score:", f1)

Total f1 score: 0.8431


In [46]:
# idx = 3
# Show tokens, gold tags and predicted tags for the first 50 test sentences.
for idx in range(50):
    sample = test_sample_dataset[idx]
    input_tokens = sample['tokens']
    input_tags = sample['ner_tags']
    input_prediction = preds[idx]
    print("Input tokens:    ", input_tokens)
    print("Input sentence:  ", ' '.join(input_tokens))
    print("Actual tags:     ", input_tags)
    print("Predicted tags:  ", input_prediction)

Input tokens:     ['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',', 'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.']
Input sentence:   SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
Actual tags:      [0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Predicted tags:   [0, 0, 1, 0, 0, 0, 0, 0, 5, 0, 0, 0]
Input tokens:     ['Nadim', 'Ladki']
Input sentence:   Nadim Ladki
Actual tags:      [1, 2]
Predicted tags:   [5, 2]
Input tokens:     ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06']
Input sentence:   AL-AIN , United Arab Emirates 1996-12-06
Actual tags:      [5, 0, 5, 6, 6, 0]
Predicted tags:   [5, 0, 5, 7, 0, 0]
Input tokens:     ['Japan', 'began', 'the', 'defence', 'of', 'their', 'Asian', 'Cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'Syria', 'in', 'a', 'Group', 'C', 'championship', 'match', 'on', 'Friday', '.']
Input sentence:   Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .
Actua