In [1]:
from transformers import AutoModel
from transformers import AutoTokenizer

In [2]:
# load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('deepvk/USER-bge-m3', trust_remote_code=True)
model = AutoModel.from_pretrained('deepvk/USER-bge-m3', trust_remote_code=True)

In [3]:
import torch

In [4]:
torch.cuda.is_available()

True

In [5]:
# tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
# model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)

In [29]:
def chunk_by_sentences(input_text: str, tokenizer: callable):
    """
    Split the input text into sentences using the tokenizer
    :param input_text: The text snippet to split into sentences
    :param tokenizer: The tokenizer to use
    :return: A tuple containing the list of text chunks and their corresponding token spans
    """
    inputs = tokenizer(input_text, return_tensors='pt', return_offsets_mapping=True)
    punctuation_mark_id = tokenizer.convert_tokens_to_ids('.')
    sep_id = tokenizer.convert_tokens_to_ids('[SEP]')
    token_offsets = inputs['offset_mapping'][0]
    token_ids = inputs['input_ids'][0]
    chunk_positions = [
        (i, int(start + 1))
        for i, (token_id, (start, end)) in enumerate(zip(token_ids, token_offsets))
        if token_id == punctuation_mark_id
    ]
    chunks = [
        input_text[x[1] : y[1]]
        for x, y in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    span_annotations = [
        (x[0], y[0]) for (x, y) in zip([(1, 0)] + chunk_positions[:-1], chunk_positions)
    ]
    return chunks, span_annotations

In [30]:
import requests

def chunk_by_tokenizer_api(input_text: str, tokenizer: callable):
    # Define the API endpoint and payload
    url = 'https://tokenize.jina.ai/'
    payload = {
        "content": input_text,
        "return_chunks": "true",
        "max_chunk_length": "512"
    }

    # Make the API request
    response = requests.post(url, json=payload)
    response_data = response.json()

    # Extract chunks and positions from the response
    chunks = response_data.get("chunks", [])
    chunk_positions = response_data.get("chunk_positions", [])

    # Adjust chunk positions to match the input format
    span_annotations = [(start, end) for start, end in chunk_positions]

    return chunks, span_annotations

In [32]:
input_text = "Berlin is the capital and largest city of Germany, both by area and by population. Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits. The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."

# determine chunks
chunks, span_annotations = chunk_by_sentences(input_text, tokenizer)
print('Chunks:\n- "' + '"\n- "'.join(chunks) + '"')

Chunks:
- "Berlin is the capital and largest city of Germany, both by area and by population."
- " Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits."
- " The city is also one of the states of Germany, and is the third smallest state in the country in terms of area."


In [33]:
def late_chunking(
    model_output: 'BatchEncoding', span_annotation: list, max_length=None
):
    token_embeddings = model_output[0]
    outputs = []
    for embeddings, annotations in zip(token_embeddings, span_annotation):
        if (
            max_length is not None
        ):  # remove annotations which go bejond the max-length of the model
            annotations = [
                (start, min(end, max_length - 1))
                for (start, end) in annotations
                if start < (max_length - 1)
            ]
        pooled_embeddings = [
            embeddings[start:end].sum(dim=0) / (end - start)
            for start, end in annotations
            if (end - start) >= 1
        ]
        pooled_embeddings = [
            embedding.detach().cpu().numpy() for embedding in pooled_embeddings
        ]
        outputs.append(pooled_embeddings)

    return outputs

In [44]:
chunks

['Berlin is the capital and largest city of Germany, both by area and by population.',
 " Its more than 3.85 million inhabitants make it the European Union's most populous city, as measured by population within city limits.",
 ' The city is also one of the states of Germany, and is the third smallest state in the country in terms of area.']

In [43]:
embeddings_traditional_chunking = model.encoder(torch.tensor(chunks))

# chunk afterwards (context-sensitive chunked pooling)
inputs = tokenizer(input_text, return_tensors='pt')
model_output = model(**inputs)
embeddings = late_chunking(model_output, [span_annotations])[0]

ValueError: too many dimensions 'str'

In [39]:
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_assisted_decoding',
 '_auto_class',
 '_autoset_attn_implementation',
 '_backward_compatibility_gradient_checkpointing',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_beam_search',
 '_buffers',
 '_call_impl',
 '_check_and_enable_flash_attn_2',
 '_check_and_enable_sdpa',
 '_compiled_call_impl',
 '_constrained_beam_search',
 '_contrastive_search',
 '_convert_head_mask_to_5d',
 '_copy_lm_head_original_to_resized',
 '_create_repo',
 '_dispatch_accelerate_model',
 '_dola_decoding',
 '_expand_inputs_for_generation',

In [40]:
import numpy as np

cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

berlin_embedding = model.encode('Berlin')

for chunk, new_embedding, trad_embeddings in zip(chunks, embeddings, embeddings_traditional_chunking):
    print(f'similarity_new("Berlin", "{chunk}"):', cos_sim(berlin_embedding, new_embedding))
    print(f'similarity_trad("Berlin", "{chunk}"):', cos_sim(berlin_embedding, trad_embeddings))

AttributeError: 'XLMRobertaModel' object has no attribute 'encode'

In [45]:
from datetime import datetime, timedelta

In [49]:
datetime(2024, 10, 5, 17, 4) + timedelta(minutes=295)

datetime.datetime(2024, 10, 5, 21, 59)

In [47]:
(60 * 6 + 34) * 0.75

295.5

In [50]:
import psycopg2

In [91]:
def connect_db():
    return psycopg2.connect( # use the credentials of your postgresql database 
        host = '127.0.0.1',
        database = 'jiraiya',
        user = 'jiraiya',
        password = '*v^TVZk7u6h3m2',
        port = '9012'
    )

In [92]:
conn = connect_db()

In [93]:
conn

<connection object at 0x0000023F879B4E10; dsn: 'user=jiraiya password=xxx dbname=jiraiya host=127.0.0.1 port=9012', closed: 0>

In [94]:
conn = connect_db()
cur = conn.cursor()
cur.execute("""
        CREATE TABLE IF NOT EXISTS documents (
            id SERIAL PRIMARY KEY,
            title TEXT,
            content TEXT,
            embedding VECTOR(768)
        );
    """)
conn.commit()
cur.close()
conn.close()

In [95]:
dummy_data = [
    {"title": "Seoul Tower", "content": "Seoul Tower is a communication and observation tower located on Namsan Mountain in central Seoul, South Korea."},
    {"title": "Gwanghwamun Gate", "content": "Gwanghwamun is the main and largest gate of Gyeongbokgung Palace, in Jongno-gu, Seoul, South Korea."},
    {"title": "Bukchon Hanok Village", "content": "Bukchon Hanok Village is a Korean traditional village in Seoul with a long history."},
    {"title": "Myeong-dong Shopping Street", "content": "Myeong-dong is one of the primary shopping districts in Seoul, South Korea."},
    {"title": "Dongdaemun Design Plaza", "content": "The Dongdaemun Design Plaza is a major urban development landmark in Seoul, South Korea."}
]

In [104]:
conn = connect_db()
cur = conn.cursor()

# use the port at which your ollama service is running.
a = cur.execute("""
   create table quotes
( id int not null primary key generated by default as identity
, quote text
, person text
, embedding vector(4096) -- the vector data type is from the pgvector extension
);
""")
conn.commit()
cur.close()
conn.close()

In [105]:
conn = connect_db()
cur = conn.cursor()

# use the port at which your ollama service is running.
a = cur.execute("""
insert into quotes (quote, person) values
  ('What one programmer can do in one month, two programmers can do in two months.', 'Frederick P. Brooks')
, ('The only way to learn a new programming language is by writing programs in it.', 'Dennis Ritchie')
, ('Talk is cheap. Show me the code.', 'Linus Torvalds')
;
""")
conn.commit()
cur.close()
conn.close()

In [117]:
conn = connect_db()
cur = conn.cursor()

# use the port at which your ollama service is running.
cur.execute("""
update quotes set embedding = ai.ollama_embed('llama3'::text, format('%s - %s', person, quote)::text);
""")

conn.commit()
cur.close()
conn.close()

ExternalRoutineException: httpx.ConnectError: [Errno 111] Connection refused
CONTEXT:  Traceback (most recent call last):
  PL/Python function "ollama_embed", line 21, in <module>
    resp = client.embeddings(model, input_text, options=embedding_options_1, keep_alive=keep_alive)
  PL/Python function "ollama_embed", line 200, in embeddings
  PL/Python function "ollama_embed", line 68, in _request
  PL/Python function "ollama_embed", line 836, in request
  PL/Python function "ollama_embed", line 925, in send
  PL/Python function "ollama_embed", line 953, in _send_handling_auth
  PL/Python function "ollama_embed", line 990, in _send_handling_redirects
  PL/Python function "ollama_embed", line 1026, in _send_single_request
  PL/Python function "ollama_embed", line 234, in handle_request
  PL/Python function "ollama_embed", line 152, in __exit__
  PL/Python function "ollama_embed", line 88, in map_httpcore_exceptions
PL/Python function "ollama_embed"


In [114]:
conn = connect_db()
cur = conn.cursor()

# use the port at which your ollama service is running.
a = cur.execute("""
SELECT proname, proargtypes
FROM pg_proc
WHERE proname = 'ollama_embed';
""")
cur.fetchall()


[('ollama_embed', '25 25 25 701 3802')]

In [100]:
a

In [118]:
conn = connect_db()
cur = conn.cursor()

# use the port at which your ollama service is running.
for doc in dummy_data:
    cur.execute("""
        INSERT INTO documents (title, content, embedding)
        VALUES (
            %(title)s,
            %(content)s,
            ai.ollama_embed('nomic-embed-text', concat(%(title)s, ' - ', %(content)s), _host=>'http://ollama:11434')
        )
    """, doc)

conn.commit()
cur.close()
conn.close()

UndefinedFunction: function ai.ollama_embed(unknown, text, _host => unknown) does not exist
LINE 6:             ai.ollama_embed('nomic-embed-text', concat('Seou...
                    ^
HINT:  No function matches the given name and argument types. You might need to add explicit type casts.


In [119]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [None]:
import chromadb

persistent_client = chromadb.PersistentClient()
collection = persistent_client.get_or_create_collection("collection_name")
collection.add(ids=["1", "2", "3"], documents=["a", "b", "c"])

vector_store_from_client = Chroma(
    client=persistent_client,
    collection_name="collection_name",
    embedding_function=embeddings,
)



In [120]:
import chromadb

chroma_client = chromadb.Client()
collection = chroma_client.get_or_create_collection(name="my_collection")

In [122]:
import chromadb
chroma_client = chromadb.HttpClient(host='localhost', port=8027)


In [123]:
chroma_client

<chromadb.api.client.Client at 0x23f8a3c2f10>