In [1]:
!pip install transformers
!pip install datasets
!pip install torchdata



In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoModelForCausalLM, DataCollatorForTokenClassification, TrainingArguments, Trainer, GenerationConfig
# from datasets import load_dataset
import torch
from torchdata.datapipes.iter import IterableWrapper
import random

from torch.utils.data import Dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import json

# One seed shared by Python's `random` and torch so shuffles, weight init and
# the Abacus offset sampling are repeatable across runs.
random_state = 33
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.random.manual_seed(random_state)
random.seed(random_state)

2024-07-28 22:12:39.992859: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-28 22:12:39.992976: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-28 22:12:40.121839: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# Read the raw examples, one per line.
# - Iterate over the file instead of `readlines()` + a second list copy, so the
#   20M-line corpus is only materialised once.
# - Strip only the newline terminator: slicing with `[:-1]` would eat the last
#   real character of the final line when the file does not end with '\n'.
# - Drop blank lines, which cannot be parsed by the later split step.
with open('/kaggle/input/arithmetic/_n_20_m_20_examples_20000000.txt', encoding='utf-8') as f:
    lines = [text for text in (raw.rstrip('\n') for raw in f) if text]
# Shuffle in place (seeded earlier) so the train/test split below is not biased
# by the file's ordering.
random.shuffle(lines)

In [4]:
def get_op(x):
  """Split an example string on '+' and then on '='.

  The piece before the first '+' is returned first, followed by the pieces
  obtained by splitting the text between the first and second '+' (or the end
  of the string) on '='. For a line shaped like ``a+b=c`` the result is
  ``[a, b, c]``. Raises ``IndexError`` when the string contains no '+'.
  """
  pieces = x.split('+')
  left_operand, remainder = pieces[0], pieces[1]
  return [left_operand, *remainder.split('=')]

# Hold out a length-balanced test set: for every (len(first operand),
# len(second operand)) pair, the first `max_for_test` examples encountered go to
# the test split and everything after that goes to the training split.
max_for_test = 250
test_data, train_data = [], []
# counter[a][b] = number of test examples already taken with operand lengths
# (a, b). The 21x21 grid covers operand lengths 0..20.
counter = [[0] * 21 for _ in range(21)]

for example in lines:
  first, second = get_op(example)[:2]
  n_first, n_second = len(first), len(second)
  if counter[n_first][n_second] >= max_for_test:
    # Quota for this length pair is full -> training example.
    train_data.append(example)
    continue
  counter[n_first][n_second] += 1
  test_data.append(example)

# Every line must have landed in exactly one split.
assert len(lines) == len(test_data) + len(train_data)

random.shuffle(test_data)
random.shuffle(train_data)
# The combined list is no longer needed; release it to free memory.
del lines

In [None]:
# Load only the config and tokenizer of the reference checkpoint.
model_name = "numind/NuExtract-tiny"
model_config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Pad on the left so prompts end at the same position in a batch.
tokenizer.padding_side = "left"

# Force an explicit BOS id on both objects, then mirror the tokenizer's EOS/PAD
# ids into the model config so the two always agree.
tokenizer.bos_token_id = 151644
model_config.bos_token_id = 151644
for attr in ("eos_token_id", "pad_token_id"):
    setattr(model_config, attr, getattr(tokenizer, attr))

# Sanity check: config and tokenizer expose identical special-token ids.
for attr in ("eos_token_id", "bos_token_id", "pad_token_id"):
    assert getattr(model_config, attr) == getattr(tokenizer, attr)
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)

In [6]:
class ArithmeticDataset(Dataset):
    """In-memory dataset of character-tokenized arithmetic examples.

    Every record is a string containing an '=' sign. Each character is mapped
    to a token id individually (no sub-word merging). All records are converted
    eagerly in ``__init__`` and stored in ``self.records``.

    * ``train=True``: ``input_ids`` holds the whole example and ``labels`` is a
      same-length copy in which everything up to and including '=' (and the
      optional BOS) is replaced by ``labels_pad_token_id``.
    * ``train=False``: ``input_ids`` is the prompt up to and including '=' and
      ``labels`` is the remaining target (including the optional EOS).

    Args:
        original_records: list of example strings.
        tokenizer: object exposing ``convert_tokens_to_ids``, ``bos_token_id``
            and ``eos_token_id``.
        add_global_bos: prepend BOS when the sequence does not start with it.
        add_global_eos: append EOS when the sequence does not end with it.
        labels_pad_token_id: value written into label positions that are
            masked out.
        truncation_side: stored on the instance; not used by this class.
        train: selects the training or evaluation layout described above.
    """

    def __init__(
        self,
        original_records: list,
        tokenizer: "AutoTokenizer",
        add_global_bos: bool = True,
        add_global_eos: bool = True,
        labels_pad_token_id: int = -100,
        truncation_side: str = "left",
        train: bool = True
    ):
        self.original_records = original_records
        self.tokenizer = tokenizer
        self.labels_pad_token_id = labels_pad_token_id
        self.add_global_bos = add_global_bos
        self.add_global_eos = add_global_eos
        self.truncation_side = truncation_side
        self.is_printed = False
        self.train = train
        # Resolve the '=' id once instead of once (or twice) per record.
        self.equals_token_id = tokenizer.convert_tokens_to_ids(["="])[0]

        self.records = []
        for record in tqdm(original_records):
            tensors = self.convert_record(record)
            if tensors is None:
                # Unusable record (empty or without '='): skip it.
                continue
            self.records.append(tensors)

    def __len__(self):
        return len(self.records)

    def __getitem__(self, index):
        # `records` is a plain list, so a slice returns a list of record dicts.
        return self.records[index]

    def get_tokens(self, text):
        """Map every character of ``text`` to its token id."""
        return self.tokenizer.convert_tokens_to_ids(list(text))

    def convert_record(self, record):
        """Convert one example string into a dict of tensors.

        Returns ``None`` when the record is empty or contains no '=' token, so
        the constructor can skip it instead of failing on ``list.index``.
        """
        equals_id = self.equals_token_id
        input_ids = self.get_tokens(record)

        # Validate *before* searching for '=': the original order made the
        # emptiness check unreachable for the inputs it was meant to guard.
        if not input_ids or equals_id not in input_ids:
            return None

        eq_pos = input_ids.index(equals_id)
        # Only the tokens after '=' contribute to the loss.
        labels = [self.labels_pad_token_id] * (eq_pos + 1) + input_ids[eq_pos + 1:]

        if self.add_global_bos and input_ids[0] != self.tokenizer.bos_token_id:
            input_ids.insert(0, self.tokenizer.bos_token_id)
            labels.insert(0, self.labels_pad_token_id)
            eq_pos += 1  # '=' moved one slot to the right

        if self.add_global_eos and input_ids[-1] != self.tokenizer.eos_token_id:
            input_ids.append(self.tokenizer.eos_token_id)
            labels.append(self.tokenizer.eos_token_id)

        if not self.train:
            # Evaluation layout: prompt ends at '=', target is everything after.
            labels = input_ids[eq_pos + 1:]
            input_ids = input_ids[:eq_pos + 1]

        if not self.is_printed:
            # Show the first converted example once as a sanity check.
            print(input_ids)
            print(labels)
            self.is_printed = True

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)
        attention_mask = input_ids.new_ones(input_ids.size())
        if self.train:
            assert input_ids.size(0) == labels.size(0) == attention_mask.size(0)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask
        }

# Training set: the first five million shuffled training examples, in the
# default (train=True) layout with prompt tokens masked out of the labels.
train_dataset = ArithmeticDataset(original_records=train_data[:5_000_000], tokenizer=tokenizer)

# Evaluation set: prompt/target are split at '=' (train=False).
test_dataset = ArithmeticDataset(original_records=test_data, tokenizer=tokenizer, train=False)

  0%|          | 0/5000000 [00:00<?, ?it/s]

[151644, 18, 15, 20, 22, 20, 22, 20, 17, 22, 10, 18, 21, 20, 28, 21, 21, 15, 23, 20, 22, 20, 17, 22, 151646]
[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 21, 21, 15, 23, 20, 22, 20, 17, 22, 151646]


100%|██████████| 5000000/5000000 [06:34<00:00, 12666.89it/s]
  3%|▎         | 2845/100000 [00:00<00:06, 14329.99it/s]

[151644, 24, 16, 10, 23, 19, 17, 24, 18, 17, 22, 16, 18, 18, 19, 16, 24, 17, 28]
[22, 21, 17, 24, 18, 17, 22, 16, 18, 18, 19, 16, 24, 17, 151646]


100%|██████████| 100000/100000 [00:06<00:00, 14928.92it/s]


In [7]:
# Batch collator shared by the Trainer and the evaluation datapipe further
# down. Sequences are padded up to a multiple of 8 tokens.
# NOTE(review): presumably `labels` are padded with -100 (the collator's
# default) on the tokenizer's padding side -- confirm against the docs.
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)

In [8]:
# Build the architecture from the (patched) config only. Per the transformers
# docs `from_config` does not load pretrained weights, so training starts from
# a random initialisation.
model = AutoModelForCausalLM.from_config(model_config)

In [9]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024, padding_idx=151643)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_attention_layernorm): Qwen2RMSNorm()
      )
    )
    (norm): Qwen2RMSNorm()
 

In [10]:
class Abacus(torch.nn.Module):
    """Token-embedding wrapper that adds a learned digit-position embedding.

    Each digit token is assigned its 1-based position inside the run of
    consecutive digit tokens it belongs to; every non-digit token gets index 0.
    The embedding of that index is added to the wrapped token embedding.
    While in training mode a single random offset ``k`` in ``[0, max_k]`` is
    added to all digit positions of the batch.

    Args:
        digit_tokens: token ids that count as digits.
        embedding_dim: width of the positional embedding (must match the
            wrapped embedding's output width for the sum to work).
        embedding_layer: the original token embedding module.
        max_seq_length: number of rows in the positional embedding table.
        max_k: inclusive upper bound of the random training offset.
    """

    def __init__(self, digit_tokens, embedding_dim, embedding_layer, max_seq_length=1024, max_k=99):
        super().__init__()
        self.embedding_layer = embedding_layer
        self.embedding = torch.nn.Embedding(max_seq_length, embedding_dim)
        # Non-persistent buffer: follows the module across devices but is not
        # written to the state dict.
        self.register_buffer("digits", torch.tensor(digit_tokens), persistent=False)

        self.max_k = max_k

    def helper(self, mask, device):
        """Return per-token digit positions for a boolean ``mask``.

        ``mask`` is a 2-D boolean tensor (batch, sequence) that is True at
        digit tokens. The result has the same shape: 1, 2, 3, ... inside every
        run of True values and 0 elsewhere.
        """
        batch_size, seq_len = mask.shape

        # mask shifted right by one step; a run starts where the shifted value
        # is False and the current value is True.
        leading = torch.zeros((batch_size, 1), device=device, dtype=mask.dtype)
        shifted_mask = torch.cat([leading, mask], dim=1)[:, :-1]
        starts = (shifted_mask != mask) & mask

        # 1-based id of the run each token belongs to (0 before the first run).
        segment_ids = torch.cumsum(starts, dim=1)

        index = torch.arange(seq_len, device=device).expand(batch_size, seq_len)

        # reset_index[b, s] = column at which run s starts. Segment ids range
        # over 0..seq_len inclusive (a single digit token has id 1 == seq_len),
        # so the buffer needs seq_len + 1 slots; with only seq_len slots a
        # one-token input raised an out-of-bounds error.
        reset_index = torch.zeros((batch_size, seq_len + 1), dtype=torch.long, device=device)
        reset_index = reset_index.scatter_add(1, segment_ids, index * starts.long())

        positions = index - reset_index.gather(1, segment_ids) + 1

        # Zero out everything that is not a digit.
        return positions * mask

    def forward(self, input_ids):
        """Embed ``input_ids`` and add the digit-position embedding."""
        mask = torch.isin(input_ids, self.digits)
        output = self.helper(mask, input_ids.device)

        if self.training:
            # One random offset per forward pass, applied to digit tokens only.
            k = random.randint(0, self.max_k)
            output[output > 0] += k

        # Saturate at the last row of the table instead of failing the lookup
        # when a digit run (plus offset) is longer than the table.
        output = output.clamp(max=self.embedding.num_embeddings - 1)
        return self.embedding_layer(input_ids) + self.embedding(output)

# Swap the model's token embedding for the Abacus wrapper.
digit_token_ids = tokenizer.convert_tokens_to_ids(list('0123456789'))
token_embeddings = model.model.embed_tokens
# Guard against re-running this cell: wrapping an already wrapped layer would
# nest one Abacus inside another and add the positional term twice.
if not isinstance(token_embeddings, Abacus):
    new_embed_layer = Abacus(
        digit_token_ids,
        token_embeddings.embedding_dim,  # match the wrapped layer's width instead of hard-coding it
        token_embeddings,
        max_seq_length=50,
        max_k=19,
    )
    model.model.embed_tokens = new_embed_layer
model.to(device)
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Abacus(
      (embedding_layer): Embedding(151936, 1024, padding_idx=151643)
      (embedding): Embedding(50, 1024)
    )
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_a

In [11]:
!mkdir /kaggle/working/train_dir

  pid, fd = os.forkpty()


In [12]:
# Training configuration: a single epoch, no evaluation during training, cosine
# learning-rate schedule, a checkpoint every 1000 steps with only the most
# recent one kept on disk.
# NOTE(review): no warmup and no per-device batch size are set, so the library
# defaults apply -- confirm they are intended, given that the recorded loss
# plateaus around 2.16.
# NOTE(review): `auto_find_batch_size` presumably lowers the batch size
# automatically after an out-of-memory error -- confirm against the docs.
training_args = TrainingArguments(
  output_dir = '/kaggle/working/train_dir',
  evaluation_strategy = 'no',
  learning_rate=1e-4,
  weight_decay=0.001,
  num_train_epochs=1,
#   max_steps=12000,
  lr_scheduler_type='cosine',
  save_steps=1000,
  save_total_limit=1,
  seed=random_state,
  report_to='none',
  auto_find_batch_size = True,
  use_cpu = False
)



In [14]:
# Wire model, arguments, training data and collator together. No eval dataset
# is passed, which matches evaluation_strategy='no' in the training arguments.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator
)

In [15]:
# Start training. The recorded run below was stopped by hand
# (KeyboardInterrupt) before the epoch finished; checkpoints are written to the
# configured output directory every `save_steps` steps.
trainer.train()

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Step,Training Loss
500,2.4299
1000,2.1822
1500,2.1737
2000,2.1724
2500,2.1645
3000,2.1635
3500,2.1691
4000,2.1612
4500,2.1599
5000,2.1606


KeyboardInterrupt: 

In [24]:
from safetensors.torch import load_model, save_model

# Load the step-7000 checkpoint weights into the model object built above (the
# one whose embedding was replaced by Abacus).
# NOTE(review): the checkpoint path is hard-coded; it only exists if training
# reached step 7000 in this session.
load_model(model, "/kaggle/working/train_dir/checkpoint-7000/model.safetensors")
# eval() also switches off the random position offset in Abacus.forward,
# which is only applied while `self.training` is True.
model.eval()
model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Abacus(
      (embedding_layer): Embedding(151936, 1024, padding_idx=151643)
      (embedding): Embedding(50, 1024)
    )
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_a

In [38]:
# Evaluation pipeline over the first 10,000 test records. Slicing the dataset
# returns a plain list of record dicts (its __getitem__ indexes a list), which
# is then grouped into batches of 8 and collated/padded by `data_collator`.
dp = IterableWrapper(test_dataset[:10000])
dp = dp.batch(batch_size=8, wrapper_class=data_collator)
model.eval() 
model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Abacus(
      (embedding_layer): Embedding(151936, 1024, padding_idx=151643)
      (embedding): Embedding(50, 1024)
    )
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm()
        (post_a

In [39]:
# Build the generation settings directly instead of writing a JSON file by hand
# and reading it back through `from_pretrained` with a hub repo id plus an
# absolute local path.
generation_config = GenerationConfig(
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    max_new_tokens=100,
    # The Abacus embedding derives digit positions from the input_ids it is
    # given, so every decoding step must see the full sequence rather than
    # only the newest token.
    use_cache=False,
)

# Keep the on-disk artifact: writes /kaggle/working/generation_config.json.
generation_config.save_pretrained('/kaggle/working')
generation_config

GenerationConfig {
  "bos_token_id": 151644,
  "eos_token_id": 151646,
  "max_new_tokens": 100,
  "pad_token_id": 151643,
  "use_cache": false
}

In [41]:
def convert_to_list(x, from_model=True, tok=None):
    """Turn a padded 2-D id tensor into a list of trimmed id lists.

    For every row, leading pad / BOS / -100 entries and trailing pad entries
    are removed. When ``from_model`` is True the row is a full generation
    (prompt + continuation) and only the ids after the first '=' are kept; a
    row that contains no '=' yields an empty list.

    Args:
        x: 2-D integer tensor (one sequence per row).
        from_model: whether rows still contain the prompt that must be cut off.
        tok: tokenizer providing ``pad_token_id``, ``bos_token_id`` and
            ``convert_tokens_to_ids``; defaults to the notebook-level
            ``tokenizer``.

    Returns:
        list of lists of ints, one per row of ``x``.
    """
    tok = tokenizer if tok is None else tok
    pad_id = tok.pad_token_id
    leading_ignored = {pad_id, tok.bos_token_id, -100}
    # Looked up once rather than once per row.
    equals_id = tok.convert_tokens_to_ids(["="])[0]

    res = []
    for row in x.cpu().tolist():
        l = 0
        # Bounds check: a row made only of ignorable ids must not run off the end.
        while l < len(row) and row[l] in leading_ignored:
            l += 1
        r = len(row) - 1
        while r > l and row[r] == pad_id:
            r -= 1
        seq = row[l:r + 1]
        if from_model:
            seq = seq[seq.index(equals_id) + 1:] if equals_id in seq else []
        res.append(seq)
    return res
        
def compute_metric(x, labels):
    """Count exact matches between generated answers and reference answers.

    ``x`` holds full model generations and ``labels`` the padded reference
    targets; both are trimmed with ``convert_to_list``. A pair counts as a hit
    only when the two id lists are non-empty and identical.
    """
    assert len(x) == len(labels)
    predictions = convert_to_list(x)
    references = convert_to_list(labels, from_model=False)
    return sum(
        1
        for predicted, expected in zip(predictions, references)
        if len(predicted) > 0 and predicted == expected
    )

In [42]:
# Greedy-decode every evaluation batch and accumulate the number of exactly
# matching answers in `res`.
res = 0
with torch.no_grad():
    for batch in tqdm(dp):
        output = model.generate(
            input_ids=batch["input_ids"].to(device),
            # Batches are left-padded; hand over the collator's mask explicitly
            # instead of relying on generate() to infer it from the pad id.
            attention_mask=batch["attention_mask"].to(device),
            generation_config=generation_config,
        )
        res += compute_metric(output, batch["labels"])

100%|██████████| 1250/1250 [28:46<00:00,  1.38s/it]


In [50]:
# Exact-match accuracy. The divisor 1e4 must equal the number of records fed
# to the evaluation pipeline (test_dataset[:10000]); update both together.
print("Accuracy =", res / 1e4)

Accuracy = 0.0513
