In [1]:
import subprocess

def get_used_vram():
    """Return the memory currently in use on each GPU, as reported by nvidia-smi.

    Returns:
        list[int]: one entry per GPU listed by ``nvidia-smi``, in the unit
        nvidia-smi uses for ``memory.used`` (MiB).

    Raises:
        FileNotFoundError: if ``nvidia-smi`` is not on PATH.
        subprocess.CalledProcessError: if ``nvidia-smi`` exits with a non-zero status.
    """
    # `noheader,nounits` makes nvidia-smi print one bare number per GPU, so there
    # is no header row to slice off and no unit suffix to strip.
    command = ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"]
    output = subprocess.check_output(command).decode('ascii')
    # splitlines() handles "\n" and "\r\n" alike and does not depend on a trailing
    # newline, so the last GPU is never dropped.
    return [int(line.split()[0]) for line in output.splitlines() if line.strip()]

# Smoke test: display the per-GPU list returned on this machine.
get_used_vram()

[570]

In [2]:
import psutil

base_ws = None  # reserved for a baseline measurement; nothing below assigns it


def print_mem():
    """Print this process's resident set size and the first GPU's used memory.

    Both figures are printed in GiB. The GPU figure is entry 0 of
    ``get_used_vram()`` divided by 1024.
    """
    global base_ws  # declared only; no baseline is subtracted from the reading

    rss_bytes = psutil.Process().memory_info().rss
    used_vram = get_used_vram()

    rss_gib = rss_bytes / (1 << 30)
    gpu0_gib = used_vram[0] / 1024
    print(f'Used ws and vram: {rss_gib:.2f} GiB | {gpu0_gib:.2f} GiB')


print_mem()

Used ws and vram: 0.08 GiB | 0.79 GiB


In [3]:
import torch

# Report whether PyTorch can see a CUDA device, then record memory after importing torch.
print(torch.cuda.is_available())
print_mem()

True
Used ws and vram: 0.42 GiB | 0.81 GiB


In [3]:
import data

# Load the chunks that the timing cells later slice (chunks[:64], chunks[:256])
# and feed to the encoders, then record memory.
chunks = data.chunks()
print_mem()

148574
298
Used ws and vram: 0.08 GiB | 0.78 GiB


## Transformers

In [10]:
from transformers import AutoModel, AutoTokenizer
from numpy.linalg import norm
# Record memory right after the imports, before any model is loaded in this cell.
print_mem()

Used ws and vram: 0.52 GiB | 0.98 GiB


In [6]:
def cos_sim(a, b):
    """Cosine similarity: ``a @ b.T`` divided by the product of the norms of ``a`` and ``b``."""
    scale = norm(a) * norm(b)
    return (a @ b.T) / scale
# Load the Jina embedding model through plain transformers.
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-zh', trust_remote_code=True) # trust_remote_code is needed to use the encode method
# embeddings = model.encode(['How is the weather today a?', '今天天气怎么样?'])
# print(cos_sim(embeddings[0], embeddings[1]))
# Record memory after the model has been loaded.
print_mem()

Used ws and vram: 4.58 GiB | 0.94 GiB


In [11]:
import torch.nn.functional as F

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# all-MiniLM-L6-v2 is loaded as a bare transformer: tokenization, pooling and
# normalization are done by the helpers defined in this cell, not by a model-side
# `encode` method. The checkpoint uses a stock architecture, so no remote code
# needs to be trusted to load it.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def mean_pooling(model_output, attention_mask):
    """Average each sequence's token embeddings over its unmasked positions.

    Args:
        model_output: indexable whose first element holds the token embeddings.
        attention_mask: mask tensor; it is given a trailing axis and expanded to the
            shape of the token embeddings, so positions with 0 add nothing to the sum.

    Returns:
        The masked sum along dimension 1 divided by the mask count, where the
        count is clamped to at least 1e-9 so a fully masked row does not divide by zero.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts

def encode(sentences):
    """Embed sentences using the module-level ``tokenizer`` and ``model``.

    Args:
        sentences: input passed straight to ``tokenizer`` (padded, truncated,
            returned as PyTorch tensors).

    Returns:
        The mean-pooled token embeddings, L2-normalized along dimension 1.
    """
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings and hand them back; without the return the result
    # was computed and then discarded, so callers received None.
    return F.normalize(sentence_embeddings, p=2, dim=1)

# Disabled check from the earlier model.encode-based cell:
# embeddings = model.encode(['How is the weather today a?', '今天天气怎么样?'])
# print(cos_sim(embeddings[0], embeddings[1]))
# Record memory after the tokenizer and model in this cell have been loaded.
print_mem()

Used ws and vram: 0.52 GiB | 0.98 GiB


In [12]:
# Run the encode() helper on a two-sentence batch, then record memory.
embeddings = encode(['How is the weather today a?', '今天天气怎么样?'])
print_mem()

Used ws and vram: 0.57 GiB | 0.98 GiB


In [8]:
# Time the model object's own encode() method on a two-sentence batch.
# NOTE(review): requires `model` to expose `.encode`; presumably this ran against the
# Jina AutoModel loaded with trust_remote_code, not the MiniLM AutoModel — confirm.
%timeit model.encode(['How is the weather today a?', '今天天气怎么样?'])

89.4 ms ± 9.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Time the encode() helper on the first 256 chunks.
samples = chunks[:256]
%timeit encode(samples)

6.48 s ± 481 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
import gc

# Drop the model reference and force a collection before measuring memory again.
del model
gc.collect()
print_mem()

Used memory and vram: 0.97 GiB | 0.92 GiB


## sentence-transformers

In [4]:
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm
# Record memory after importing sentence-transformers.
print_mem()

  from tqdm.autonotebook import tqdm, trange


Used ws and vram: 0.58 GiB | 0.78 GiB


In [5]:
import gc
# Force a collection, then record memory.
gc.collect()
print_mem()

Used ws and vram: 0.58 GiB | 1.09 GiB


In [5]:
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# Load the Jina model on the first CUDA device, then record memory.
model = SentenceTransformer('jinaai/jina-embeddings-v2-base-zh', device='cuda:0', trust_remote_code=True)
print_mem()

Used ws and vram: 1.20 GiB | 4.69 GiB


In [None]:
# Encode a two-sentence batch, print the cosine similarity of the two embeddings,
# then record memory.
embeddings = model.encode(['How is the weather today?', '今天天气怎么样?'])
print(cos_sim(embeddings[0], embeddings[1]))
print_mem()

In [7]:
import gc
# Collect garbage and record memory before loading.
gc.collect()
print_mem()

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# Load the Jina model on CPU with the ONNX backend, selecting the onnx/model.onnx file.
model = SentenceTransformer('jinaai/jina-embeddings-v2-base-zh', device='cpu', backend='onnx', trust_remote_code=True, model_kwargs={
    'file_name': 'onnx/model.onnx'
})
# Record memory after loading.
print_mem()

Used ws and vram: 0.64 GiB | 1.16 GiB
Used ws and vram: 1.72 GiB | 1.00 GiB


In [7]:
import gc
# Collect garbage and record memory before loading.
gc.collect()
print_mem()

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# Same load as the onnx/model.onnx cell, but selecting onnx/model_quantized.onnx.
model = SentenceTransformer('jinaai/jina-embeddings-v2-base-zh', device='cpu', backend='onnx', trust_remote_code=True, model_kwargs={
    'file_name': 'onnx/model_quantized.onnx'
})
# Record memory after loading.
print_mem()

Used ws and vram: 0.59 GiB | 0.86 GiB
Used ws and vram: 0.90 GiB | 1.08 GiB


In [9]:
# Time encoding of the first 64 chunks in batches of 32 with the currently
# loaded `model`, then record memory.
samples = chunks[:64]
%timeit model.encode(samples, batch_size=32)
print_mem()

8.46 s ± 88 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 1.84 GiB | 1.01 GiB


In [14]:
import gc
# Collect garbage and record memory before loading.
gc.collect()
print_mem()

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# Load the Jina model on CPU with the ONNX backend (no explicit file_name this time).
model = SentenceTransformer('jinaai/jina-embeddings-v2-base-zh', device='cpu', backend='onnx', trust_remote_code=True)
print_mem()

from sentence_transformers import export_dynamic_quantized_onnx_model

# Export a dynamically quantized ONNX variant using the "avx512_vnni" configuration
# into private/jina-embeddings-v2-base-zh; a later cell loads
# onnx/model_qint8_avx512_vnni.onnx from that directory.
export_dynamic_quantized_onnx_model(model, "avx512_vnni", "private/jina-embeddings-v2-base-zh")
print_mem()

Used ws and vram: 2.20 GiB | 0.89 GiB




Used ws and vram: 3.27 GiB | 0.89 GiB
Used ws and vram: 3.28 GiB | 0.88 GiB


In [5]:
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# Benchmark the quantized ONNX files stored under private/jina-embeddings-v2-base-zh.
# (This cell previously carried a pasted sentence-transformers warning about multiple ONNX
# files in 'sentence-transformers/all-MiniLM-L6-v2'; that repo is not the one loaded here.)
for m in ['onnx/model_qint8_avx512.onnx', 'onnx/model_qint8_avx512_vnni.onnx']:
    print(m)
    import gc
    # Collect before loading so the previous iteration's model does not inflate the reading.
    gc.collect()

    model = SentenceTransformer('private/jina-embeddings-v2-base-zh', device='cpu', backend='onnx', model_kwargs={
        'file_name': m
    })
    # Memory after load.
    print_mem()

    samples = chunks[:64]
    %timeit model.encode(samples, batch_size=32)
    # Memory after encoding.
    print_mem()
    print()
    # `model` is deleted every iteration, so it is undefined once the loop finishes.
    del model

onnx/model_qint8_avx512.onnx


The ONNX file model_qint8_avx512.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.92 GiB | 0.78 GiB
4.84 s ± 340 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 1.49 GiB | 0.78 GiB

onnx/model_qint8_avx512_vnni.onnx


The ONNX file model_qint8_avx512_vnni.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.98 GiB | 0.78 GiB
5.12 s ± 428 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 1.50 GiB | 0.78 GiB



In [6]:
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# sentence-transformers warned that 'sentence-transformers/all-MiniLM-L6-v2' ships multiple ONNX
# files and defaults to 'onnx/model.onnx' unless a name is passed via model_kwargs "file_name".
# The files it listed are the ones in the loop below plus 'onnx/model_O4.onnx', which is not
# benchmarked here.
for m in ['onnx/model.onnx', 'onnx/model_O1.onnx', 'onnx/model_O2.onnx', 'onnx/model_O3.onnx', 'onnx/model_qint8_arm64.onnx', 'onnx/model_qint8_avx512.onnx', 'onnx/model_qint8_avx512_vnni.onnx', 'onnx/model_quint8_avx2.onnx']:
    print(m)
    import gc
    # Collect before loading so the previous iteration's model does not inflate the reading.
    gc.collect()

    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu', backend='onnx', model_kwargs={
        'file_name': m
    })
    # Memory after load.
    print_mem()

    samples = chunks[:64]
    %timeit model.encode(samples, batch_size=32)
    # Memory after encoding.
    print_mem()
    print()
    # `model` is deleted every iteration, so it is undefined once the loop finishes.
    del model

onnx/model.onnx
Used ws and vram: 0.75 GiB | 0.80 GiB
1.18 s ± 170 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.94 GiB | 0.80 GiB

onnx/model_O1.onnx


The ONNX file model_O1.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.79 GiB | 0.80 GiB
1.32 s ± 246 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.93 GiB | 0.80 GiB

onnx/model_O2.onnx


The ONNX file model_O2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.79 GiB | 0.80 GiB
1.08 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.96 GiB | 0.80 GiB

onnx/model_O3.onnx


The ONNX file model_O3.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.79 GiB | 0.80 GiB
1.14 s ± 53.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.95 GiB | 0.80 GiB

onnx/model_qint8_arm64.onnx


The ONNX file model_qint8_arm64.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.70 GiB | 0.80 GiB
867 ms ± 63.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.91 GiB | 0.80 GiB

onnx/model_qint8_avx512.onnx


The ONNX file model_qint8_avx512.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.70 GiB | 0.80 GiB
929 ms ± 104 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.90 GiB | 0.80 GiB

onnx/model_qint8_avx512_vnni.onnx


The ONNX file model_qint8_avx512_vnni.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.70 GiB | 0.80 GiB
792 ms ± 68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.90 GiB | 0.80 GiB

onnx/model_quint8_avx2.onnx


The ONNX file model_quint8_avx2.onnx is not a regular name used in optimum.onnxruntime, the ORTModel might not behave as expected.


Used ws and vram: 0.70 GiB | 0.80 GiB
865 ms ± 72.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Used ws and vram: 0.91 GiB | 0.80 GiB



In [7]:
# Time encoding of a two-sentence batch.
# NOTE(review): `model` must be defined by an earlier cell; the benchmark-loop cells
# end with `del model`, so this fails if run directly after one of them.
samples = ['How is the weather today?', '今天天气怎么样?']
%timeit model.encode(samples)

18.5 ms ± 2.46 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# Time encoding of the first 64 chunks as a single batch of 64.
samples = chunks[:64]
%timeit model.encode(samples, batch_size=64)

1.34 s ± 76.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
import gc

# Drop the model reference and force a collection before measuring memory again.
del model
gc.collect()
print_mem()

Used ws and vram: 0.49 GiB | 0.71 GiB


### Static embeddings

In [12]:
# note: `pip install model2vec` is needed, but not for inference
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

# Initialize a Sentence Transformer model with a static embedding by distilling via model2vec
# NOTE(review): the run recorded under this cell warned that many BertModel weights were
# "newly initialized" when loading this checkpoint, so the distilled vectors may come from a
# partly random model — verify the distillation source before relying on the scores.
static_embedding = StaticEmbedding.from_distillation(
    "jinaai/jina-embeddings-v2-base-zh",
    device="cuda",
    pca_dims=256,
    apply_zipf=True,
)
model = SentenceTransformer(modules=[static_embedding], backend='onnx', device='cpu', trust_remote_code=True)

# Encode some texts
queries = ["What is the capital of France?", "How many people live in the Netherlands?"]
documents = ["Paris is the capital of France", "The Netherlands has 17 million inhabitants"]
query_embeddings = model.encode(queries)
document_embeddings = model.encode(documents)

# Compute similarities
scores = model.similarity(query_embeddings, document_embeddings)
print(scores)
# Sample output that came with this snippet (the recorded run printed different values).
# Kept as comments: a trailing string literal is the cell's last expression and gets
# echoed as an extra cell output.
# tensor([[0.8430, 0.3271],
#         [0.3213, 0.5861]])

Some weights of BertModel were not initialized from the model checkpoint at jinaai/jina-embeddings-v2-base-zh and are newly initialized: ['embeddings.position_embeddings.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encoder.layer.0.output.dense.weight', 'encoder.layer.1.intermediate.dense.bias', 'encoder.layer.1.intermediate.dense.weight', 'encoder.layer.1.output.LayerNorm.bias', 'encoder.layer.1.output.LayerNorm.weight', 'encoder.layer.1.output.dense.bias', 'encoder.layer.1.output.dense.weight', 'encoder.layer.10.intermediate.dense.bias', 'encoder.layer.10.intermediate.dense.weight', 'encoder.layer.10.output.LayerNorm.bias', 'encoder.layer.10.output.LayerNorm.weight', 'encoder.layer.10.output.dense.bias', 'encoder.layer.10.output.dense.weight', 'encoder.layer.11.intermediate.dense.bias', 'encoder.layer.11.intermedi

tensor([[0.9283, 0.0462],
        [0.4067, 0.2280]])


'\ntensor([[0.8430, 0.3271],\n        [0.3213, 0.5861]])\n'

In [14]:
# Persist the current model; a later cell loads it back from this same path.
model.save('private/onnx/model.onnx')

In [3]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [None]:
# note: `pip install model2vec` is needed, but not for inference
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import StaticEmbedding

# Load the model previously saved to 'private/onnx/model.onnx' instead of distilling it again.
# Disabled alternative that would build the static embedding directly:
# static_embedding = StaticEmbedding.from_model2vec(
#     'onnx/model.onnx'
# )
model = SentenceTransformer('private/onnx/model.onnx', backend='onnx', device='cpu', trust_remote_code=True)

# Encode some texts
queries = ["What is the capital of France?", "How many people live in the Netherlands?"]
documents = ["Paris is the capital of France", "The Netherlands has 17 million inhabitants"]
query_embeddings = model.encode(queries)
document_embeddings = model.encode(documents)

# Compute similarities
scores = model.similarity(query_embeddings, document_embeddings)
print(scores)

In [7]:
# Time encoding of the first 64 chunks in batches of 32 with the currently
# loaded `model`, then record memory.
samples = chunks[:64]
%timeit model.encode(samples, batch_size=32)
print_mem()

8.9 ms ± 362 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Used ws and vram: 0.67 GiB | 0.42 GiB


In [8]:
import gc

# Force a collection, then record memory.
gc.collect()
print_mem()

Used ws and vram: 0.67 GiB | 0.42 GiB
