In [None]:
# !zip -r phrase_tokenizer phrase_tokenizer

In [None]:
!pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
import torch
import time
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt


import torch.nn.functional as F
import torch.nn as nn

from datasets import Dataset, load_dataset, DatasetDict
from transformers import pipeline, set_seed
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import os
import string
from nltk import word_tokenize
from nltk.corpus import stopwords


import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Load and preprocess bigrams

In [None]:
import kagglehub

# Download latest version
# `path` is the local directory holding the downloaded dataset files
# (it is printed below and later joined with the CSV file name).
path = kagglehub.dataset_download("toddcook/bert-english-uncased-bigrams")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/toddcook/bert-english-uncased-bigrams/versions/3


In [None]:
# Name of the bigram file inside the downloaded dataset directory.
filename = 'bert.eng.uncased.bigrams.csv'

# Lookup sets used by is_valid_token to reject English stopwords and
# single punctuation characters.
stopwords_set = set(stopwords.words('english'))
punctuation_set = set(string.punctuation)

def is_valid_token(token):
    """Decide whether `token` may be part of a stored bigram.

    A token is accepted when its lowercase form is not an English stopword,
    it is not a single punctuation character, every character in it is
    alphabetic, and it is not a digit string.  An empty string contains no
    non-alphabetic characters and is therefore accepted.

    Returns:
        bool: True if the token passes every check.
    """
    if token.lower() in stopwords_set:
        return False
    if token in punctuation_set:
        return False
    non_alpha_count = len([ch for ch in token if not ch.isalpha()])
    return non_alpha_count == 0 and not token.isdigit()


input_file_path = os.path.join(path, filename)

output_file_path = 'valid_bigrams.csv'

# Bigrams that occur fewer times than this are not kept.
MIN_BIGRAM_COUNT = 1000

# Stream the raw whitespace-separated bigram file and write the rows whose two
# words both pass is_valid_token to a small comma-separated file.
with open(input_file_path, 'r', encoding='utf-8') as infile, \
        open(output_file_path, 'w', encoding='utf-8') as outfile:
    outfile.write("bigram_1,bigram_2,count\n")

    for idx, line in tqdm(enumerate(infile)):
        parts = line.strip().split()

        # Skip the header row, blank lines (an empty `parts` would otherwise
        # raise IndexError on parts[0]) and rows whose first word has an apostrophe.
        if idx == 0 or not parts or "'" in parts[0]:
            continue

        # Only well-formed "<word> <word> <count>" rows are considered.
        if len(parts) != 3:
            continue

        bigram_1, bigram_2, count = parts

        try:
            occurrences = int(count)
        except ValueError:
            # Malformed count: ignore this row instead of aborting the whole pass.
            continue

        # Stops at the first rare bigram; this assumes the file is sorted by
        # descending count -- TODO confirm against the dataset description.
        if occurrences < MIN_BIGRAM_COUNT:
            break

        if is_valid_token(bigram_1) and is_valid_token(bigram_2):
            outfile.write(f"{bigram_1},{bigram_2},{count}\n")

251195it [00:00, 391722.64it/s]


In [None]:
# keep_default_na=False: alphabetic tokens such as "null" or "nan" are legitimate
# words here; pandas' default NA parsing would turn them into float NaN and the
# string concatenation below would then fail.
df = pd.read_csv(output_file_path, on_bad_lines='skip', keep_default_na=False)
# Vectorised "word1 word2" construction instead of a Python-level row-wise apply.
bigram_set = set(df['bigram_1'] + ' ' + df['bigram_2'])
df

Unnamed: 0,bigram_1,bigram_2,count
0,united,states,1026737
1,new,york,791984
2,world,war,378418
3,first,time,298847
4,also,known,228084
...,...,...,...
39241,inflicted,upon,1000
39242,mind,telling,1000
39243,toured,throughout,1000
39244,la,fontaine,1000


In [None]:
# Spot check: show the row for the bigram "grow old", if it was kept.
df.query("bigram_1 == 'grow' and bigram_2 == 'old'")


Unnamed: 0,bigram_1,bigram_2,count
23639,grow,old,1498


## Create custom tokenizer

The tokenizer splits text into phrases (when the phrase is present in the vocabulary) or into single words.

In [None]:
from transformers import PreTrainedTokenizer
from typing import Dict, List, Optional, Union
import torch
import json
import os


class PhraseTokenizer(PreTrainedTokenizer):
    """Word-level tokenizer that keeps known multi-word phrases together.

    Text is split into words with NLTK's ``word_tokenize``.  At every position
    the longest run of up to ``max_phrase_length`` words whose lowercase form
    is in the vocabulary is emitted as one token; otherwise the single word is
    emitted.  The vocabulary holds only the five special tokens and the
    phrases, so any token that is not a known phrase maps to ``[UNK]``.
    """

    # Lengths at or above this threshold are treated as "no limit".  The
    # inherited ``model_max_length`` is presumably a very large sentinel when
    # no limit was configured -- TODO confirm for the installed transformers.
    _UNBOUNDED_LENGTH = int(1e20)

    def __init__(self, phrase_set, **kwargs):
        """Build the vocabulary from ``phrase_set`` and initialise the base class.

        Args:
            phrase_set: Iterable of lowercase, space-separated phrases.  A
                ``set`` is sorted so ids are reproducible; any other iterable
                is used in the order given.
            **kwargs: Forwarded to ``PreTrainedTokenizer.__init__``.
        """
        # These attributes are assigned before super().__init__(), as in the
        # original implementation; presumably the base constructor consults the
        # vocabulary while registering special tokens -- TODO confirm.
        self.split_special_tokens = False
        self.verbose = False

        self.unk_token="[UNK]"
        self.sep_token="[SEP]"
        self.pad_token="[PAD]"
        self.cls_token="[CLS]"
        self.mask_token="[MASK]"

        self.max_phrase_length = 2    #  !!!!!may change later!!!!!!
        self.phrase_set = phrase_set
        self.vocab = self._build_vocab()
        self.id_to_token = {v: k for k, v in self.vocab.items()}

        super().__init__(
            unk_token="[UNK]",
            sep_token="[SEP]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            mask_token="[MASK]",
            **kwargs
        )


    def _build_vocab(self) -> Dict[str, int]:
        """Return the token -> id mapping: special tokens first, then phrases."""
        special_tokens = [
            self.unk_token, self.sep_token, self.pad_token, self.cls_token, self.mask_token
        ]

        # A set iterates in hash order, which is not stable between interpreter
        # runs, so ids would silently change.  Sort sets; keep the order of
        # ordered inputs (e.g. the list restored by from_pretrained).
        if isinstance(self.phrase_set, (set, frozenset)):
            phrases = sorted(self.phrase_set)
        else:
            phrases = list(self.phrase_set)

        vocab: Dict[str, int] = {}
        for token in [*special_tokens, *phrases]:
            if token not in vocab:
                vocab[token] = len(vocab)

        return vocab

    def _tokenize(self, text):
        """Split ``text`` into tokens, greedily merging the longest known phrase.

        Returned tokens keep the original casing of ``text``.
        """
        # Tokenize once and lowercase per word, so the lowercase and original
        # word lists always have the same length and alignment.
        original_words = word_tokenize(text)
        lowercase_words = [word.lower() for word in original_words]

        tokens = []
        i = 0

        while i < len(lowercase_words):
            # match the longest possible phrase
            max_phrase_length = min(self.max_phrase_length, len(lowercase_words) - i)
            for j in range(max_phrase_length, 0, -1):
                phrase = " ".join(lowercase_words[i:i+j])
                if phrase in self.vocab:
                    tokens.append(" ".join(original_words[i:i+j]))
                    i += j
                    break
            else:
                # add as a single token
                tokens.append(original_words[i])
                i += 1

        return tokens

    def _convert_token_to_id(self, token):
        """Map a token to its id, falling back to the ``[UNK]`` id.

        Phrases are stored lowercase while ``_tokenize`` preserves the original
        casing, so an exact match is tried first (this keeps the upper-case
        special tokens working) and then a lowercase match.
        """
        if token in self.vocab:
            return self.vocab[token]
        lowered = token.lower() if isinstance(token, str) else token
        return self.vocab.get(lowered, self.vocab["[UNK]"])

    def _convert_id_to_token(self, index):
        """Map an id back to its token, or ``[UNK]`` for unknown ids."""
        return self.id_to_token.get(index, "[UNK]")

    def get_vocab(self):
        """Return the token -> id dictionary."""
        return self.vocab

    def save_pretrained(self, save_directory: str, **kwargs):
        """Write ``vocab.txt`` (one token per line, in id order) and
        ``tokenizer_config.json`` into ``save_directory``.

        The directory, including missing parents, is created when absent.
        """
        os.makedirs(save_directory, exist_ok=True)
        vocab_file = os.path.join(save_directory, "vocab.txt")
        with open(vocab_file, "w", encoding="utf-8") as f:
            for token in self.vocab:
                f.write(token + "\n")

        config = {
            "unk_token": self.unk_token,
            "sep_token": self.sep_token,
            "pad_token": self.pad_token,
            "cls_token": self.cls_token,
            "mask_token": self.mask_token,
            "max_phrase_length": self.max_phrase_length,
            "phrase_set": list(self.phrase_set)
        }
        config_file = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)

    @classmethod
    def from_pretrained(cls, save_directory: str, **kwargs):
        """Rebuild a tokenizer from the files written by ``save_pretrained``.

        Phrases are read from ``vocab.txt`` in file order and passed on as a
        list, so every phrase gets the same id it had when it was saved.
        """
        config_file = os.path.join(save_directory, "tokenizer_config.json")
        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)

        special_tokens = {
            config["unk_token"], config["sep_token"],
            config["pad_token"], config["cls_token"],
            config["mask_token"]
        }

        # Load vocabulary (preserve order!)
        vocab_file = os.path.join(save_directory, "vocab.txt")
        with open(vocab_file, "r", encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            # Skip blank lines and the special tokens (re-created by __init__).
            ordered_phrases = [
                entry for entry in stripped_lines
                if entry and entry not in special_tokens
            ]

        tokenizer = cls(
            phrase_set=ordered_phrases,
            **kwargs
        )
        # Restore the saved phrase window instead of silently using the default.
        tokenizer.max_phrase_length = config.get(
            "max_phrase_length", tokenizer.max_phrase_length
        )
        return tokenizer


    def __call__(
        self,
        text: Union[str, List[str]],
        text_pair: Optional[Union[str, List[str]]] = None,
        add_special_tokens: bool = False,
        padding: Union[bool, str] = False,
        truncation: Union[bool, str] = False,
        max_length: Optional[int] = None,
        return_tensors: Optional[str] = None,
        **kwargs
    ) -> Dict[str, List[int]]:
        """Encode a single string into ``input_ids`` and ``attention_mask``.

        Only a single ``str`` is handled; ``text_pair`` and extra keyword
        arguments are accepted for signature compatibility but ignored.

        Args:
            text: The string to encode.
            add_special_tokens: Wrap the tokens in ``[CLS]`` ... ``[SEP]``.
            padding: When truthy, right-pad with the pad id up to the target length.
            truncation: When truthy, cut the sequence to the target length.
            max_length: Target length; falls back to ``model_max_length``.
            return_tensors: ``"pt"`` returns tensors with a leading batch
                dimension of 1; otherwise plain lists are returned.
        """
        tokens = self._tokenize(text)

        if add_special_tokens:
            tokens = [self.cls_token] + tokens + [self.sep_token]

        input_ids = [self._convert_token_to_id(token) for token in tokens]

        attention_mask = [1] * len(input_ids)

        max_len = max_length if max_length else self.model_max_length
        # Without a real limit there is nothing to truncate or pad to; this
        # also avoids trying to build an astronomically long padding list.
        bounded = max_len < self._UNBOUNDED_LENGTH

        # Truncation is applied independently of padding.
        if truncation and bounded and len(input_ids) > max_len:
            input_ids = input_ids[:max_len]
            attention_mask = attention_mask[:max_len]

        if padding and bounded and len(input_ids) < max_len:
            pad_length = max_len - len(input_ids)
            input_ids += [self.pad_token_id] * pad_length
            attention_mask += [0] * pad_length

        if return_tensors == "pt":
            input_ids = torch.tensor([input_ids])
            attention_mask = torch.tensor([attention_mask])

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

    @property
    def pad_token_id(self) -> int:
        """Id of the padding token in this tokenizer's vocabulary."""
        return self.vocab[self.pad_token]

    @property
    def cls_token_id(self) -> int:
        """Id of the ``[CLS]`` token in this tokenizer's vocabulary."""
        return self.vocab[self.cls_token]

    @property
    def sep_token_id(self) -> int:
        """Id of the ``[SEP]`` token in this tokenizer's vocabulary."""
        return self.vocab[self.sep_token]



Check how the tokenizer works

In [None]:
# Sample sentence containing several bigrams expected to be in the phrase set.
text = "New York is a big city. Machine learning is a part of artificial intelligence."

# Build a tokenizer from the bigram set, encode the sample, and save it to disk.
custom_tokenizer = PhraseTokenizer(bigram_set)
tokenized_text = custom_tokenizer(text)
custom_tokenizer.save_pretrained("phrase_tokenizer")

# Reload from disk and encode the same text to check the save/load round trip.
phrase_tok = PhraseTokenizer.from_pretrained("phrase_tokenizer")
tokenized_text_1 = phrase_tok(text)

# The two encodings should be identical; the last line shows the token strings.
print(tokenized_text)
print(tokenized_text_1)
print(custom_tokenizer.tokenize(text))

{'input_ids': [0, 0, 0, 14622, 0, 0, 0, 0, 0, 0, 9751, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [0, 0, 0, 14622, 0, 0, 0, 0, 0, 0, 9751, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['New York', 'is', 'a', 'big city', '.', 'Machine learning', 'is', 'a', 'part', 'of', 'artificial intelligence', '.']


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Subword tokenizer and causal LM of the base checkpoint; both are reused
# below to derive embeddings for phrase tokens.
base_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
QwenModel = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# Input embedding module of the base model (maps token ids to vectors).
embeddings = QwenModel.get_input_embeddings()

In [None]:
# Show how the base subword tokenizer splits `text`, for comparison with the phrase tokenizer.
print(base_tokenizer.tokenize(text))

['New', 'ĠYork', 'Ġis', 'Ġa', 'Ġbig', 'Ġcity', '.', 'ĠMachine', 'Ġlearning', 'Ġis', 'Ġa', 'Ġpart', 'Ġof', 'Ġartificial', 'Ġintelligence', '.']


In [None]:
def get_embedding_for_token(token, base_tokenizer, embeddings):
    """Return one embedding vector for ``token`` (a word or a multi-word phrase).

    The token is split into subwords by ``base_tokenizer``; the result is the
    mean of the subword embeddings.  When the tokenizer yields no subwords, a
    zero vector is returned.

    Args:
        token: Text of the word or phrase.
        base_tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        embeddings: ``torch.nn.Embedding`` used for the lookup.

    Returns:
        1-D tensor of length ``embeddings.embedding_dim`` with the same dtype
        and device as the embedding table.
    """
    table = embeddings.weight
    subwords = base_tokenizer.tokenize(token)

    if not subwords:
        # Match the table's dtype/device so the result can be stacked with
        # real embeddings.  TODO (from the original author): use a random vector.
        return torch.zeros(embeddings.embedding_dim, dtype=table.dtype, device=table.device)

    subword_ids = base_tokenizer.convert_tokens_to_ids(subwords)
    # Build the index tensor on the table's device so the lookup also works
    # when the model has been moved off the CPU.
    id_tensor = torch.tensor(subword_ids, dtype=torch.long, device=table.device)
    subword_embeddings = embeddings(id_tensor)

    return torch.mean(subword_embeddings, dim=0)

In [None]:
def get_embeddings_for_sequence(text, custom_tokenizer, base_tokenizer, embeddings):
    """Return one full-width embedding per phrase-level token of ``text``.

    ``text`` is split by ``custom_tokenizer.tokenize`` and each resulting token
    is embedded with ``get_embedding_for_token``.

    Returns:
        Tensor of shape ``(num_tokens, embeddings.embedding_dim)``.  The earlier
        debug slice that kept only the first 3 components is removed, so rows
        have the width expected by layers sized from the model's hidden
        dimension.  Text that yields no tokens gives a ``(0, embedding_dim)``
        tensor rather than an error from stacking an empty list.
    """
    phrase_tokens = custom_tokenizer.tokenize(text)

    if not phrase_tokens:
        table = embeddings.weight
        return torch.zeros(
            (0, embeddings.embedding_dim), dtype=table.dtype, device=table.device
        )

    return torch.stack([
        get_embedding_for_token(token, base_tokenizer, embeddings)
        for token in phrase_tokens
    ])

In [None]:
# Manual reference: subwords, their ids and their embeddings for a short
# string, used to sanity-check the phrase-level embeddings.
sw = base_tokenizer.tokenize("New York is")
sw_i = base_tokenizer.convert_tokens_to_ids(sw)
sw_e = embeddings(torch.tensor(sw_i))

In [None]:
# Subword pieces produced by the base tokenizer for "New York is".
print(sw)

['New', 'ĠYork', 'Ġis']


In [None]:
# First three components of each of the three subword embeddings.
print(sw_e[0][:3])
print(sw_e[1][:3])
print(sw_e[2][:3])


tensor([-0.0176,  0.0114,  0.0320], grad_fn=<SliceBackward0>)
tensor([-0.0087, -0.0189,  0.0021], grad_fn=<SliceBackward0>)
tensor([ 0.0156, -0.0188,  0.0114], grad_fn=<SliceBackward0>)


In [None]:
# Phrase-level embeddings for the same string: 'New York' is one phrase token,
# so the result has two rows (phrase + 'is') instead of three subword rows.
get_embeddings_for_sequence('New York is', custom_tokenizer, base_tokenizer, embeddings)

tensor([[-0.0131, -0.0038,  0.0170],
        [-0.0221, -0.0322,  0.0106]], grad_fn=<StackBackward0>)

### Load dataset for summarization fine-tuning

In [None]:
# CNN/DailyMail (config "3.0.0"): each record has an 'article', its 'highlights' and an 'id'.
dataset = load_dataset("cnn_dailymail", "3.0.0")
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

## Initialize the model

In [None]:
class QwenWithPhrases(QwenModel.__class__):
    """Causal LM whose input embeddings blend subword and phrase embeddings.

    The subword embeddings of the wrapped model are mixed with linearly
    projected phrase embeddings, and the mix is fed to the wrapped model via
    ``inputs_embeds``.
    """

    # Mixing weights of the fused input embedding; override on a subclass or
    # instance to experiment with other ratios.
    subword_weight = 0.7
    phrase_weight = 0.3

    def __init__(self, config):
        super().__init__(config)
        # NOTE(review): the parent constructor presumably already builds a full
        # network from `config`; this second model is created from the config
        # only, so it presumably starts from random weights -- confirm intended.
        self.model = AutoModelForCausalLM.from_config(config)
        self.subword_embed = self.model.get_input_embeddings()
        self.phrase_projector = nn.Linear(config.hidden_size, config.hidden_size)

    def forward(
        self,
        subword_ids: Optional[torch.Tensor] = None,
        phrase_embeddings: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.Tensor] = None,
    ):
        """Run the wrapped model on the fused embeddings.

        Args:
            subword_ids: Token ids for the subword embedding lookup.
            phrase_embeddings: Phrase embeddings whose last dimension is
                ``config.hidden_size``; must be broadcast-compatible with the
                subword embeddings after projection.
            attention_mask: Forwarded unchanged to the wrapped model.
            labels: Optional targets forwarded to the wrapped model so it can
                return a loss.  Presumably they must match the input sequence
                length -- TODO confirm.
            input_ids: Alias for ``subword_ids``, so callers that use the
                conventional column name can drive the model.

        Raises:
            ValueError: If no token ids or no phrase embeddings are given.
        """
        if subword_ids is None:
            subword_ids = input_ids
        if subword_ids is None or phrase_embeddings is None:
            raise ValueError(
                "subword_ids (or input_ids) and phrase_embeddings are required"
            )

        subword_embeds = self.subword_embed(subword_ids)
        phrase_embeds = self.phrase_projector(phrase_embeddings)

        fused_embeds = (
            self.subword_weight * subword_embeds + self.phrase_weight * phrase_embeds
        )

        outputs = self.model(
            inputs_embeds=fused_embeds,
            attention_mask=attention_mask,
            labels=labels,
        )
        return outputs

In [None]:
###### TEMP  ######
# Temporary shortcut for fast iteration: keep only the first 10 rows of every
# split so the preprocessing below runs quickly.

short_dataset = DatasetDict()
for s in dataset:
  short_dataset[s] = dataset[s].select(range(10))

In [None]:
# Index the row first and the column second: dataset['train']['article'][0]
# loads the entire 'article' column just to read its first element.
text = dataset['train'][0]['article']
get_embeddings_for_sequence(text=text, custom_tokenizer=custom_tokenizer, base_tokenizer=base_tokenizer, embeddings=embeddings)

tensor([[ 0.0096, -0.0073,  0.0090],
        [-0.0325, -0.0471,  0.0238],
        [ 0.0037,  0.0154, -0.0240],
        ...,
        [ 0.0177, -0.0269, -0.0025],
        [ 0.0075,  0.0049, -0.0067],
        [-0.0371, -0.0261,  0.0200]], grad_fn=<StackBackward0>)

In [None]:
def _fit_phrase_embeddings(phrase_matrix, target_length):
    """Zero-pad or truncate ``phrase_matrix`` along dim 0 to ``target_length`` rows."""
    current_length = phrase_matrix.shape[0]
    if current_length < target_length:
        # (0, 0, 0, n): leave the feature dimension alone, append n zero rows.
        return F.pad(phrase_matrix, (0, 0, 0, target_length - current_length))
    return phrase_matrix[:target_length, :]


def _embed_phrases_batch(documents, target_length):
    """Return stacked, length-normalised phrase embeddings for every document."""
    return torch.stack([
        _fit_phrase_embeddings(
            get_embeddings_for_sequence(
                text=document,
                custom_tokenizer=custom_tokenizer,
                base_tokenizer=base_tokenizer,
                embeddings=embeddings
            ),
            target_length,
        )
        for document in documents
    ])


def preprocess_function(examples):
    """Turn a batch of articles/highlights into model inputs.

    Args:
        examples: Batch mapping with "article" and "highlights" lists of strings.

    Returns:
        The base tokenizer's encoding of the articles (padded/truncated to 512)
        extended with:
          * "phrase_embeddings": phrase embeddings of the articles, 512 rows each;
          * "labels": token ids of the highlights (padded/truncated to 128);
          * "phrase_labels": phrase embeddings of the highlights, 128 rows each.
    """
    ### embed articles ###
    model_inputs = base_tokenizer(
        examples["article"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    ### phrase embed articles ###
    model_inputs["phrase_embeddings"] = _embed_phrases_batch(examples["article"], 512)

    ### embed highlights ###
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = base_tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )
    model_inputs["labels"] = labels["input_ids"]

    ### phrase embed highlights ###
    # Previously this key was assigned the *article* embeddings by mistake.
    model_inputs["phrase_labels"] = _embed_phrases_batch(examples["highlights"], 128)
    return model_inputs

# Run the preprocessing batch-wise over every split of the shortened dataset.
tokenized_dataset = short_dataset.map(preprocess_function, batched=True)



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# Inspect the columns added by preprocessing.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'phrase_embeddings', 'labels', 'phrase_labels'],
        num_rows: 10
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'phrase_embeddings', 'labels', 'phrase_labels'],
        num_rows: 10
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'phrase_embeddings', 'labels', 'phrase_labels'],
        num_rows: 10
    })
})

In [None]:
from transformers import AutoConfig

# Configuration of the base checkpoint (no weights are loaded by this call).
config = AutoConfig.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# NOTE(review): the model is constructed from the config only, so it presumably
# starts from randomly initialised weights rather than the pretrained ones --
# confirm this is intended before fine-tuning.
model = QwenWithPhrases(config)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration: 3 epochs, batch size 4, evaluation after every epoch.
# NOTE(review): `evaluation_strategy` is presumably renamed in newer
# transformers releases, and `fp16=True` presumably needs a GPU -- confirm both
# against the runtime this notebook targets.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,  # keep only the two most recent checkpoints
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
)

In [None]:
from datasets import load_metric

# NOTE(review): loading the ROUGE metric requires the `rouge_score` package
# (`pip install rouge_score`); without it this line raises an ImportError.
# `load_metric` is presumably superseded by the `evaluate` library -- confirm
# against the installed `datasets` version.
rouge = load_metric("rouge")

def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        dict mapping each ROUGE key to a float rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs; the first element holds the ids.
        predictions = predictions[0]

    # Negative ids (assumed to be the -100 "ignore" marker used for padded
    # positions -- TODO confirm) cannot be decoded, so map them to a real id.
    pad_id = base_tokenizer.pad_token_id
    if pad_id is None:
        pad_id = base_tokenizer.eos_token_id
    predictions = np.where(np.asarray(predictions) < 0, pad_id, predictions)
    labels = np.where(np.asarray(labels) < 0, pad_id, labels)

    # No `tokenizer` variable is defined in the visible notebook; the
    # summarisation inputs and labels are encoded with `base_tokenizer`.
    decoded_preds = base_tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = base_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    def _as_float(score):
        # Some metric back-ends return aggregate objects exposing
        # `.mid.fmeasure`, others plain floats; support both.
        return score.mid.fmeasure if hasattr(score, "mid") else score

    return {k: round(float(_as_float(v)), 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ImportError: To be able to use evaluate-metric/rouge, you need to install the following dependencies['rouge_score'] using 'pip install rouge_score' for instance'

In [None]:
from transformers import Seq2SeqTrainer

# Wire model, data and metrics together.  No `tokenizer` variable is defined
# in the visible notebook; the inputs and labels were encoded with
# `base_tokenizer`, so that is the tokenizer handed to the trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=base_tokenizer,
    compute_metrics=compute_metrics,
)