In [97]:
import sentence_transformers
from beir import util, LoggingHandler
from beir.retrieval import models as beir_models
from beir.datasets.data_loader import GenericDataLoader
from beir.retrieval.evaluation import EvaluateRetrieval
from beir.retrieval.search.dense import DenseRetrievalExactSearch as DRES
from sentence_transformers import models, losses, datasets
from torch.utils.data import Dataset
from typing import List
from sentence_transformers.readers import InputExample
import numpy as np
from transformers.utils.import_utils import is_nltk_available, NLTK_IMPORT_ERROR
from transformers import AutoTokenizer
from nltk import word_tokenize, TreebankWordDetokenizer
import nltk
nltk.download('punkt')
from torch import nn, Tensor
from typing import Iterable, Dict
from sentence_transformers import SentenceTransformer
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, PreTrainedModel,AutoModel
import logging
import os
import random
logger = logging.getLogger(__name__)



[nltk_data] Downloading package punkt to /home/cgrdj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [88]:

model_name = "huawei-noah/TinyBERT_General_4L_312D"
tokenizer = AutoTokenizer.from_pretrained(model_name,do_lower_case=False,clean_up_tokenization_spaces=False,clean_text=False)


In [91]:

class MaskedAutoEncoderDataset(Dataset):
    """
    The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
    It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
    sentence without noise.

    :param sentences: A list of sentences
    :param noise_fn: A noise function: Given a string, it returns a string with noise, e.g. deleted words
    """

    def __init__(self, sentences: List[str],tokenizer ):
        if not is_nltk_available():
            raise ImportError(NLTK_IMPORT_ERROR.format(self.__class__.__name__))

        self.sentences = sentences
        self.tokenizer=tokenizer

    def __getitem__(self, item):
        sent = self.sentences[item]
        return InputExample(texts=[self.noisen(sent,MASK_ratio=0.225), self.noisen(sent,MASK_ratio=0.5),sent])

    def __len__(self):
        return len(self.sentences)

    # Masking noise.
    def noisen(self,text, MASK_ratio=0.15):
        mask_id=self.tokenizer.mask_token_id
        words= text.split()#word_tokenize(text)
        # Apply the masking logic to each word and rejoin the sentence
        splitted_tokens = self.tokenizer.batch_encode_plus(words,return_attention_mask=False,return_token_type_ids=False,add_special_tokens=False)['input_ids']#encode each tokens in each
        masked_tokens =[[ mask_id if np.random.rand() < MASK_ratio else tok_id for tok_id in token]  for token in splitted_tokens]
        masked_sentence=' '.join([self.tokenizer.decode(masked_token).replace(" ",'') for masked_token in masked_tokens])
        return masked_sentence

In [92]:

word_embedding_model = models.Transformer(model_name, max_seq_length=512)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), "cls")
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [None]:

class MaskedAutoEncoderLoss(nn.Module):
    def __init__(self, model: SentenceTransformer, decoder_name_or_path: str = None):
        """
        This loss expects as input a pairs of damaged sentences and the corresponding original ones.
        During training, the decoder reconstructs the original sentences from the encoded sentence embeddings.
        Here the argument 'decoder_name_or_path' indicates the pretrained model (supported by Hugging Face) to be used as the decoder.
        Since decoding process is included, here the decoder should have a class called XXXLMHead (in the context of Hugging Face's Transformers).
        The 'tie_encoder_decoder' flag indicates whether to tie the trainable parameters of encoder and decoder,
        which is shown beneficial to model performance while limiting the amount of required memory.
        Only when the encoder and decoder are from the same architecture, can the flag 'tie_encoder_decoder' work.

        The data generation process (i.e. the 'damaging' process) has already been implemented in ``DenoisingAutoEncoderDataset``,
        allowing you to only provide regular sentences.

        :param model: SentenceTransformer model
        :param decoder_name_or_path: Model name or path for initializing a decoder (compatible with Huggingface's Transformers)
        :param tie_encoder_decoder: whether to tie the trainable parameters of encoder and decoder

        References:
            * TSDAE paper: https://arxiv.org/pdf/2104.06979.pdf
            * `Unsupervised Learning > TSDAE <../../examples/unsupervised_learning/TSDAE/README.html>`_

        Requirements:
            1. The decoder should have a class called XXXLMHead (in the context of Hugging Face's Transformers)
            2. Should use a large corpus

        Inputs:
            +------------------------------------------------------+--------+
            | Texts                                                | Labels |
            +======================================================+========+
            | (damaged\_sentence, original\_sentence) pairs        | none   |
            +------------------------------------------------------+--------+
            | sentence fed through ``DenoisingAutoEncoderDataset`` | none   |
            +------------------------------------------------------+--------+

        Example:
            ::

                from sentence_transformers import SentenceTransformer, losses
                from sentence_transformers.datasets import DenoisingAutoEncoderDataset
                from torch.utils.data import DataLoader

                model_name = "bert-base-cased"
                model = SentenceTransformer(model_name)
                train_sentences = [
                    "First training sentence", "Second training sentence", "Third training sentence", "Fourth training sentence",
                ]
                batch_size = 2
                train_dataset = DenoisingAutoEncoderDataset(train_sentences)
                train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
                train_loss = losses.DenoisingAutoEncoderLoss(
                    model, decoder_name_or_path=model_name, tie_encoder_decoder=True
                )
                model.fit(
                    train_objectives=[(train_dataloader, train_loss)],
                    epochs=10,
                )
        """
        super(MaskedAutoEncoderLoss, self).__init__()
        self.encoder = model  # This will be the final model used during the inference time.
        self.tokenizer_encoder = model.tokenizer

        name_or_path = model[0].auto_model.config._name_or_path

        self.tokenizer_decoder = AutoTokenizer.from_pretrained(name_or_path)

        decoder_config = AutoConfig.from_pretrained(name_or_path)
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True
        kwargs_decoder = {"config": decoder_config}
        try:
            self.decoder = AutoModelForCausalLM.from_pretrained(name_or_path, **kwargs_decoder)
        except ValueError as e:
            logger.error(
                f'Model name or path "{name_or_path}" does not support being as a decoder. Please make sure the decoder model has an "XXXLMHead" class.'
            )
            raise e
        if self.tokenizer_decoder.pad_token is None:
            # Needed by GPT-2, etc.
            self.tokenizer_decoder.pad_token = self.tokenizer_decoder.eos_token
            self.decoder.config.pad_token_id = self.decoder.config.eos_token_id

        if len(AutoTokenizer.from_pretrained(name_or_path)) != len(self.tokenizer_encoder):
            logger.warning(
                "WARNING: The vocabulary of the encoder has been changed. One might need to change the decoder vocabulary, too."
            )

    def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        source_features, target_features = tuple(sentence_features)
        if self.need_retokenization:
            # since the sentence_features here are all tokenized by encoder's tokenizer,
            # retokenization by the decoder's one is needed if different tokenizers used
            target_features = self.retokenize(target_features)
        reps = self.encoder(source_features)["sentence_embedding"]  # (bsz, hdim)

        # Prepare input and output
        target_length = target_features["input_ids"].shape[1]
        decoder_input_ids = target_features["input_ids"].clone()[:, : target_length - 1]
        label_ids = target_features["input_ids"][:, 1:]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            inputs_embeds=None,
            attention_mask=None,
            encoder_hidden_states=reps[:, None],  # (bsz, hdim) -> (bsz, 1, hdim)
            encoder_attention_mask=source_features["attention_mask"][:, 0:1],
            labels=None,
            return_dict=None,
            use_cache=False,
        )

        # Calculate loss
        lm_logits = decoder_outputs[0]
        ce_loss_fct = nn.CrossEntropyLoss(ignore_index=self.tokenizer_decoder.pad_token_id)
        loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), label_ids.reshape(-1))
        return loss


    def forward_(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):
        encoder_inputs,decoder_inputs , target_features = tuple(sentence_features)
        reps = self.encoder(encoder_inputs)["sentence_embedding"]  # (bsz, hdim)

        # Prepare input and output
        target_length = target_features["input_ids"].shape[1]
        decoder_input_ids = target_features["input_ids"].clone()[:, : target_length - 1]
        label_ids = target_features["input_ids"][:, 1:]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            inputs_embeds=None,
            attention_mask=None,
            encoder_hidden_states=reps[:, None],  # (bsz, hdim) -> (bsz, 1, hdim)
            encoder_attention_mask=encoder_inputs["attention_mask"][:, 0:1],
            labels=None,
            return_dict=None,
            use_cache=False,
        )

        # Calculate loss
        lm_logits = decoder_outputs[0]
        ce_loss_fct = nn.CrossEntropyLoss(ignore_index=self.tokenizer_decoder.pad_token_id)
        loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), label_ids.reshape(-1))
        return loss


In [98]:
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(os.getcwd(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
print("Dataset downloaded here: {}".format(data_path))
corpus, queries, qrels = GenericDataLoader(data_path).load(split="train") # or split = "train" or "dev"
unsupervised_train_data =  list(queries.values())#+[data['title']+' \n '+data['text'] for data in list(corpus.values())]
random.Random(0).shuffle(unsupervised_train_data)


/home/cgrdj/Documents/code/repos/sentence-transformers/datasets/scifact.zip: 100%|██████████| 2.69M/2.69M [00:00<00:00, 4.08MiB/s]


Dataset downloaded here: /home/cgrdj/Documents/code/repos/sentence-transformers/datasets/scifact


100%|██████████| 5183/5183 [00:00<00:00, 150651.96it/s]


In [100]:
MaskedAutoEncoderDataset(unsupervised_train_data,tokenizer)

<__main__.MaskedAutoEncoderDataset at 0x751648e6f8d0>

In [94]:
next(iter(MaskedAutoEncoderDataset(["asdasdf adsf sda fads 'asd gasdg aasdfasdf /dsaf sadfsd,dasfadsf. fadsf.fads,fadsfa"],tokenizer))).texts

['[MASK]dasdf ads[MASK] sda [MASK]ds [MASK]asd gasdg aasdfa[MASK]f /dsaf sadfsd,dasfa[MASK]f. [MASK]ds[MASK].fads,fadsfa',
 '[MASK]dasdf [MASK][MASK] sda fads [MASK]asd gas[MASK]g [MASK]sd[MASK][MASK][MASK] [MASK][MASK]af [MASK][MASK][MASK][MASK]dasfadsf[MASK] [MASK][MASK][MASK].fads[MASK]fads[MASK]',
 "asdasdf adsf sda fads 'asd gasdg aasdfasdf /dsaf sadfsd,dasfadsf. fadsf.fads,fadsfa"]

In [77]:
sentence="asdasdf adsf sda fads 'asd gasdg aasdfasdf /dsaf sadfsd,dasfadsf. fadsf.fads,fadsfa"
tokenizer.decode(tokenizer.encode_plus(sentence,return_attention_mask=False,return_token_type_ids=False,add_special_tokens=False)['input_ids'])



"asdasdf adsf sda fads ' asd gasdg aasdfasdf / dsaf sadfsd, dasfadsf. fadsf. fads, fadsfa"

In [5]:
tokenizer.decode(tokenizer.encode('as [MASK]df',add_special_tokens=False))



'as [MASK] df'

In [6]:

sentences=["asdasdf adsf sda fads 'asd gasdg aasdfasdf /dsaf sadfsd,dasfadsf. fadsf.fads,fadsfa"]*10000
masked_sentences=[]
for sentence in sentences:
    words= word_tokenize(sentence)
    # Apply the masking logic to each word and rejoin the sentence
    splitted_tokens = tokenizer.batch_encode_plus(words,return_attention_mask=False,return_token_type_ids=False,add_special_tokens=False)['input_ids']
    masked_sentence=' '.join([tokenizer.decode([ mask_id if np.random.rand() < mask_probability else tok_id for tok_id in word]).replace(" ",'') for word in splitted_tokens])
    masked_sentences.append(masked_sentence)



# masked_sentence = ' '.join([mask_token if np.random.rand() < mask_probability else word for word in words])
        

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/home/cgrdj/nltk_data'
    - '/home/cgrdj/Documents/code/repos/sentence-transformers/.conda/nltk_data'
    - '/home/cgrdj/Documents/code/repos/sentence-transformers/.conda/share/nltk_data'
    - '/home/cgrdj/Documents/code/repos/sentence-transformers/.conda/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [369]:
masked_sentence

[6904, 5104, 2546, 1012, 6904, 5104]

In [416]:
remove_spaces_table=str.maketrans('', '', ' ')


In [431]:
%%timeit 
tokenizer.decode([ mask_id if np.random.rand() < mask_probability else tok_id for tok_id in splitted_tokens[12]]).replace(" ",'')


53.5 µs ± 2.3 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [429]:
%%timeit -n 100000
tokenizer.decode([ mask_id if np.random.rand() < mask_probability else tok_id for tok_id in splitted_tokens[12]]).translate(remove_spaces_table)


59.8 µs ± 4.79 µs per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [385]:
tokenizer('f')

{'input_ids': [101, 1042, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}

In [227]:
tokenizer.convert_tokens_to_string(tokenizer(sentence)['input_ids'])



TypeError: argument 'tokens': 'int' object cannot be converted to 'PyString'

In [212]:
mask_probability=0.15
mask_token = tokenizer.mask_token  # Get the mask token
tokens = tokenizer.tokenize(sentence)
# Decide randomly which tokens to mask
masked_indices = np.random.rand(len(tokens)) < mask_probability
# Replace selected tokens with the mask token
masked_tokens = [mask_token if mask else token for token, mask in zip(tokens, masked_indices)]
# Convert the list of tokens back to a string
masked_sentence = tokenizer.convert_tokens_to_string(masked_tokens)
# Add the masked sentence to the list
# masked_sentences.append(masked_sentence)
masked_sentence

"asdas [MASK] adsf sda fads ' asd gas [MASK]g aasdfasdf / dsaf sad [MASK]d, [MASK]fadsf. fadsf. fa [MASK], fadsfa"

In [208]:
masked_sentence

"as [MASK]df adsf sd [MASK] fads ' asd gas [MASK]g aa [MASK]fasdf [MASK] [MASK] [MASK] sadfsd, dasfadsf. [MASK] [MASK] [MASK]. fads [MASK] fa [MASK]fa"

In [203]:

word_tokenize(sentence)




['asdasdf',
 'adsf',
 'sda',
 'fads',
 "'asd",
 'gasdg',
 'aasdfasdf',
 '/dsaf',
 'sadfsd',
 ',',
 'dasfadsf',
 '.',
 'fadsf.fads',
 ',',
 'fadsfa']

# Data

In [192]:
dataset = "scifact"
url = "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{}.zip".format(dataset)
out_dir = os.path.join(os.getcwd(), "datasets")
data_path = util.download_and_unzip(url, out_dir)
print("Dataset downloaded here: {}".format(data_path))

Dataset downloaded here: /Users/cgrdj/Documents/Code/sentence-transformers/datasets/scifact


In [193]:
corpus, queries, qrels = GenericDataLoader(data_path).load(split="train") # or split = "train" or "dev"
unsupervised_train_data =  list(queries.values())#+[data['title']+' \n '+data['text'] for data in list(corpus.values())]
random.Random(0).shuffle(unsupervised_train_data)


2024-03-16 00:49:24 - Loading Corpus...


100%|██████████| 5183/5183 [00:00<00:00, 11485.08it/s]

2024-03-16 00:49:24 - Loaded 5183 TRAIN Documents.
2024-03-16 00:49:24 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 ver




In [194]:
data_path = "datasets/scifact"
test_corpus, test_queries, test_qrels = GenericDataLoader(data_path).load(split="test") # or split = "train" or "dev"

2024-03-16 00:49:25 - Loading Corpus...


  0%|          | 0/5183 [00:00<?, ?it/s]

100%|██████████| 5183/5183 [00:00<00:00, 16216.28it/s]


2024-03-16 00:49:26 - Loaded 5183 TEST Documents.
2024-03-16 00:49:26 - Doc Example: {'text': 'Alterations of the architecture of cerebral white matter in the developing human brain can affect cortical development and result in functional disabilities. A line scan diffusion-weighted magnetic resonance imaging (MRI) sequence with diffusion tensor analysis was applied to measure the apparent diffusion coefficient, to calculate relative anisotropy, and to delineate three-dimensional fiber architecture in cerebral white matter in preterm (n = 17) and full-term infants (n = 7). To assess effects of prematurity on cerebral white matter development, early gestation preterm infants (n = 10) were studied a second time at term. In the central white matter the mean apparent diffusion coefficient at 28 wk was high, 1.8 microm2/ms, and decreased toward term to 1.2 microm2/ms. In the posterior limb of the internal capsule, the mean apparent diffusion coefficients at both times were similar (1.2 vers