In [1]:
#!pip install -U transformers --quiet

In [2]:
import transformers
# Show which transformers version this kernel is running.
transformers.__version__

'4.37.2'

In [26]:
from pprint import pprint

import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

# Encoder

In [5]:
class EncoderModel:
    """Encoder that maps documents and queries to L2-normalised vectors.

    Inputs are tokenized, run through the model without gradient tracking,
    mean-pooled over the positions selected by the attention mask and then
    L2-normalised.

    Args:
        model_id: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        device: Device the model and every tokenized batch are moved to.
    """

    def __init__(self, model_id, device):
        self.model_id = model_id
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = AutoModel.from_pretrained(self.model_id).to(self.device)

    def mean_pooling(self, model_output, attention_mask):
        """Average the token embeddings, weighting each position by its mask.

        Args:
            model_output: Model output whose first element holds the token
                embeddings (assumed ``(batch, tokens, hidden)``).
            attention_mask: Mask with one entry per token; positions with a
                zero contribute nothing to the average.

        Returns:
            Tensor with dimension 1 (the token axis) reduced away.
        """
        token_embeddings = model_output[0]  # first element contains all token embeddings
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp the divisor so a fully masked row cannot divide by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    def tokenize(self, document):
        """Tokenize ``document`` (padded, truncated) and move it to ``self.device``."""
        encoded = self.tokenizer(
            document, padding=True, truncation=True, return_tensors="pt"
        )
        # Use the instance's device, not a notebook-level global, so inputs
        # always land on the same device as the model.
        return encoded.to(self.device)

    def model_inference(self, tokenized_document):
        """Run the model on a tokenized batch with gradients disabled."""
        with torch.no_grad():
            outputs = self.model(**tokenized_document)
        return outputs

    def __call__(self, document):
        """Encode ``document`` and return the embeddings as a CPU NumPy array.

        Each row of the result is L2-normalised.
        """
        tokenized_document = self.tokenize(document)
        outputs = self.model_inference(tokenized_document)
        embeddings = self.mean_pooling(outputs, tokenized_document["attention_mask"])
        return F.normalize(embeddings, p=2, dim=1).cpu().numpy()

In [6]:
# Checkpoint used to embed sentences.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
encoder = EncoderModel(model_id=model_id, device=device)

  return self.fget.__get__(instance, owner)()


In [7]:
# Embed three short sample sentences in a single batch.
vectors = encoder(
    [
        "Hello World",
        "Hello Mars",
        "Transformers are awesome",
    ]
)

In [8]:
# Pairwise similarity between the sample embeddings (the encoder already
# L2-normalises each row).
cosine_similarity(vectors)

array([[1.0000002 , 0.5999794 , 0.1535384 ],
       [0.5999794 , 1.0000002 , 0.17221901],
       [0.1535384 , 0.17221901, 1.0000001 ]], dtype=float32)

# Decoder

In [23]:
class DecoderModel:
    """Causal language model to generate text based on a prompt.

    Args:
        model_id: Identifier passed to ``AutoModelForCausalLM.from_pretrained``
            and ``AutoTokenizer.from_pretrained``.
        generation_config: Optional generation settings. A ``dict`` is
            forwarded to ``model.generate`` as keyword arguments, layered over
            the default ``max_new_tokens``; any other non-empty object is
            passed through as ``generation_config=``.
        device: Value forwarded as ``device_map``; ``"auto"`` when omitted.
        **kwargs: Extra keyword arguments forwarded to
            ``AutoModelForCausalLM.from_pretrained``.
    """

    # Generation length used when ``generation_config`` does not override it.
    DEFAULT_MAX_NEW_TOKENS = 50

    def __init__(self, model_id, generation_config=None, device=None, **kwargs):
        self.model_id = model_id
        self.device = device if device else "auto"
        # Pass the resolved placement (not the raw argument) so that the
        # "auto" default actually reaches from_pretrained.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id, device_map=self.device, **kwargs
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.generation_config = generation_config if generation_config else {}

    def tokenize(self, prompt):
        """Tokenize ``prompt`` and move the tensors to the model's device."""
        # Ask the model where it lives instead of assuming "cuda", so this
        # also works when the model ended up on the CPU.
        tokenized_prompt = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        return tokenized_prompt

    def __call__(self, prompt):
        """Generate a continuation for ``prompt`` and return only the new text."""
        tokenized_prompt = self.tokenize(prompt)
        if isinstance(self.generation_config, dict):
            generate_kwargs = {
                "max_new_tokens": self.DEFAULT_MAX_NEW_TOKENS,
                **self.generation_config,
            }
        else:
            generate_kwargs = {"generation_config": self.generation_config}
        outputs = self.model.generate(**tokenized_prompt, **generate_kwargs)
        # Drop the echoed prompt tokens so only the generated part is decoded.
        prompt_length = len(tokenized_prompt["input_ids"][0])
        return self.tokenizer.decode(outputs[0][prompt_length:])

In [24]:
# Checkpoint used for text generation; "cuda:0" and bfloat16 are handed to DecoderModel.
model_id = "HuggingFaceH4/zephyr-7b-beta"
causal_lm = DecoderModel(
    model_id=model_id,
    device="cuda:0",
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [28]:
# Generate a continuation for a sample prompt and pretty-print the returned string.
pprint(causal_lm("What are you doing?"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


('\n'
 '\n'
 'I’m trying to figure out how to get to the top of this mountain.\n'
 '\n'
 'Why are you trying to climb that mountain?\n'
 '\n'
 'Because it’s there.\n'
 '\n'
 'But why do you want to climb it? What')
