In [1]:
from transformers import AutoTokenizer, AutoModel
from langchain_text_splitters import TokenTextSplitter

import os
import html2text
from bs4 import BeautifulSoup
import requests

import torch

import numpy as np

In [4]:
# Инициализируем токенизатор Hugging Face
tokenizer = AutoTokenizer.from_pretrained("deepvk/USER-bge-m3")
model = AutoModel.from_pretrained("deepvk/USER-bge-m3")
model.eval()

XLMRobertaModel(
  (embeddings): XLMRobertaEmbeddings(
    (word_embeddings): Embedding(46166, 1024, padding_idx=1)
    (position_embeddings): Embedding(8194, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): XLMRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x XLMRobertaLayer(
        (attention): XLMRobertaAttention(
          (self): XLMRobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): XLMRobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, eleme

In [27]:
# Инициализируем TokenTextSplitter с использованием токенизатора Hugging Face
text_splitter = TokenTextSplitter.from_huggingface_tokenizer(
    tokenizer=tokenizer,
    chunk_size=512,
    chunk_overlap=64
)

In [2]:
url = "https://www.xtremepush.com/blog/customer-analytics-software-for-igaming"
html_content = requests.get(url).text

In [3]:
soup = BeautifulSoup(html_content, 'html.parser')

# Remove header and footer elements
for header in soup.find_all(['header', 'footer']):
    header.decompose()

# Convert remaining HTML to Markdown
h = html2text.HTML2Text()
h.ignore_images = True  # Ignore images
h.ignore_links = True   # Ignore external links
markdown_text = h.handle(str(soup))
print(markdown_text)

customer experience,  Blog post

#  Customer Analytics Software for iGaming: Get into your Players’ Minds

by Kinga

Mar 11 2024

11 min

Numbers need to match insights, insights lead to action, and action drives
success. In the fast-paced world of iGaming, understanding the player is key
to staying ahead. Enter Customer Analytics Software—a game-changer in how you
connect with, engage, and retain your players.

## **What is Customer Analytics Software?**

A customer analytics tool dives deep into your customer data to offer insights
on behavior, preferences, and trends, guiding better business decisions.

## **Benefits of Using Customer Analytics Software in iGaming**

### **Optimize Customer Experience with Data-Driven Insights**

Leveraging a customer analytics tool allows iGaming companies to fine-tune the
user experience. By analyzing vast amounts of customer data, these tools
pinpoint what keeps players coming back. For example, identifying the most
popular games or features enab

In [30]:
# Разделение текста на куски
text_chunks = text_splitter.split_text(markdown_text)

# Вывод первого куска текста
print("Первый кусок текста:")
print(text_chunks[0])

# Вывод общего количества кусков
print(f"\nОбщее количество кусков: {len(text_chunks)}")

Первый кусок текста:
customer experience,  Blog post

#  Customer Analytics Software for iGaming: Get into your Players’ Minds

by Kinga

Mar 11 2024

11 min

Numbers need to match insights, insights lead to action, and action drives
success. In the fast-paced world of iGaming, understanding the player is key
to staying ahead. Enter Customer Analytics Software—a game-changer in how you
connect with, engage, and retain your players.

## **What is Customer Analytics Software?**

A customer analytics tool dives deep into your customer data to offer insights
on behavior, preferences, and trends, guiding better business decisions.

## **Benefits of Using Customer Analytics Software in iGaming**

### **Optimize Customer Experience with Data-Driven Insights**

Leveraging a customer analytics tool allows iGaming companies to fine-tune the
user experience. By analyzing vast amounts of customer data, these tools
pinpoint what keeps players coming back. For example, identifying the most
popular g

In [32]:
def late_chunking(model_output: torch.Tensor, span_annotations: list, max_length=None):
    """
    Агрегирует эмбеддинги токенов на основе аннотаций о чанках.

    :param model_output: Эмбеддинги токенов, полученные от модели (shape: [batch_size, seq_length, hidden_size])
    :param span_annotations: Список кортежей с позициями чанков [(start1, end1), (start2, end2), ...]
    :param max_length: Максимальная длина для ограничения чанков
    :return: Список агрегированных эмбеддингов для каждого чанка
    """
    token_embeddings = model_output.last_hidden_state  # shape: [batch_size, seq_length, hidden_size]
    outputs = []
    for batch in token_embeddings:
        pooled_embeddings = []
        for start, end in span_annotations:
            if max_length is not None:
                end = min(end, max_length - 1)
            if start < end:
                # Среднее по токенам в чанке
                pooled = batch[start:end].mean(dim=0)
                pooled_embeddings.append(pooled.detach().cpu().numpy())
        outputs.append(pooled_embeddings)
    return outputs


In [37]:
# Определение функции для расчета косинусного сходства
cos_sim = lambda x, y: np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

# Пример входного текста для сравнения
input_text = "Segmentation"

# Получение эмбеддинга для входного текста
with torch.no_grad():
    inputs = tokenizer(input_text, return_tensors='pt')
    model_output = model(**inputs)
    berlin_embedding = model_output.last_hidden_state.mean(dim=1).squeeze().detach().cpu().numpy()

# Подготовка span_annotations для late_chunking
# Предполагаем, что каждый chunk соответствует одному span
span_annotations = []
current_token = 0
for chunk in text_chunks:
    encoded = tokenizer(chunk, return_tensors='pt')
    num_tokens = encoded.input_ids.shape[1]
    span_annotations.append((current_token, current_token + num_tokens))
    current_token += num_tokens

# Объединение всех кусков в один текст для батчевой обработки
batch_text = " ".join(text_chunks)
with torch.no_grad():
    inputs = tokenizer(batch_text, return_tensors='pt')
    model_output = model(**inputs)

# Применение late_chunking
embeddings = late_chunking(model_output, span_annotations, max_length=8000)

In [38]:
# Расчет сходства для каждого куска
for i, (chunk, new_embedding) in enumerate(zip(text_chunks, embeddings)):
    similarity = cos_sim(berlin_embedding, new_embedding)
    print(f'Сходство с "Segmentation" для куска {i+1}: {similarity:.4f}')


Сходство с "Segmentation" для куска 1: 0.4354
Сходство с "Segmentation" для куска 2: 0.3763
Сходство с "Segmentation" для куска 3: 0.3714
Сходство с "Segmentation" для куска 4: 0.3798
Сходство с "Segmentation" для куска 5: 0.3812
Сходство с "Segmentation" для куска 6: 0.3856
Сходство с "Segmentation" для куска 7: 0.4122
Сходство с "Segmentation" для куска 8: 0.4273


In [42]:
embeddings

[array([-0.44218436, -0.3437931 , -0.92952126, ..., -0.63875145,
        -0.5923686 , -0.26145285], dtype=float32),
 array([-0.29643717, -0.29503995, -0.99722856, ..., -0.74367833,
        -0.49510983, -0.29981682], dtype=float32),
 array([-0.29278722, -0.2991885 , -1.0189453 , ..., -0.7594174 ,
        -0.47296345, -0.31044397], dtype=float32),
 array([-0.3068815 , -0.30437914, -0.9925017 , ..., -0.7442693 ,
        -0.4946214 , -0.30506536], dtype=float32),
 array([-0.31434178, -0.3186733 , -1.023912  , ..., -0.7444819 ,
        -0.4683653 , -0.31423634], dtype=float32),
 array([-0.30990943, -0.30388755, -1.0043858 , ..., -0.7239653 ,
        -0.49466226, -0.29954377], dtype=float32),
 array([-0.36696744, -0.32530916, -0.95352155, ..., -0.6793049 ,
        -0.55620176, -0.3017941 ], dtype=float32),
 array([-0.41542438, -0.35679534, -0.8776944 , ..., -0.65853804,
        -0.5845441 , -0.31110787], dtype=float32)]

In [43]:
len(text_chunks), len(embeddings)

(8, 8)

In [50]:
type(embeddings[0])

numpy.ndarray

In [22]:
from late_chunkning import process_large_text, model, tokenizer

In [None]:
url = "https://www.xtremepush.com/blog/customer-analytics-software-for-igaming"

In [5]:
all_chunks, all_embeddings = process_large_text(markdown_text)

In [6]:
all_chunks

['customer experience,  Blog post\n\n#  Customer Analytics Software for iGaming: Get into your Players’ Minds\n\nby Kinga\n\nMar 11 2024\n\n11 min\n\nNumbers need to match insights, insights lead to action, and action drives\nsuccess. In the fast-paced world of iGaming, understanding the player is key\nto staying ahead. Enter Customer Analytics Software—a game-changer in how you\nconnect with, engage, and retain your players.\n\n## **What is Customer Analytics Software?**\n\nA customer analytics tool dives deep into your customer data to offer insights\non behavior, preferences, and trends, guiding better business decisions.\n\n## **Benefits of Using Customer Analytics Software in iGaming**\n\n### **Optimize Customer Experience with Data-Driven Insights**\n\nLeveraging a customer analytics tool allows iGaming companies to fine-tune the\nuser experience. By analyzing vast amounts of customer data, these tools\npinpoint what keeps players coming back. For example, identifying the most\np

In [7]:
len(all_chunks)

8

In [8]:
len(all_embeddings)

8

In [37]:
all_embeddings[0].shape

(1024,)

In [10]:
import chromadb

chroma_client = chromadb.HttpClient(host='localhost', port=8027)

In [11]:
collection = chroma_client.get_or_create_collection(name="test")

In [17]:
start = collection.count()

collection.add(
    documents=all_chunks,
    embeddings=all_embeddings,
    metadatas=[{"url": url} for _ in all_chunks],
    ids=[str(i) for i in range(start, start + len(all_chunks))]
)

In [18]:
dir(collection)

['__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_client',
 '_data_loader',
 '_embed',
 '_embed_record_set',
 '_embedding_function',
 '_is_protocol',
 '_model',
 '_transform_get_response',
 '_transform_peek_response',
 '_transform_query_response',
 '_update_model_after_modify_success',
 '_validate_and_prepare_add_request',
 '_validate_and_prepare_delete_request',
 '_validate_and_prepare_get_request',
 '_validate_and_prepare_query_request',
 '_validate_and_prepare_update_request',
 '_validate_and_prepare_upsert_request',
 '_validate_modify_request',
 'add',
 'configuration_json

In [19]:
collection.count()

16

In [20]:
query_results = collection.query(
    query_texts=["Find me some delicious food!"],
    n_results=4
)

C:\Users\user\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:07<00:00, 10.9MiB/s]


InvalidDimensionException: Embedding dimension 384 does not match collection dimensionality 1024

In [None]:
from late_chain

In [30]:
batch_text = "customer analytics tools"
embeds = model(**tokenizer("customer analytics tools", return_tensors='pt'))

In [38]:
embeds.pooler_output.detach().cpu().numpy().shape

(1, 1024)

In [35]:
embeds.pooler_output.shape

torch.Size([1, 1024])

In [47]:
results = collection.query(
    query_embeddings=embeds.pooler_output.detach().cpu().numpy(),  # List of query vectors
    n_results=1,  # Number of results to retur
)
results

{'ids': [['8']],
 'distances': [[715.3309970243297]],
 'embeddings': None,
 'metadatas': [[{'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'}]],
 'documents': [['customer experience,  Blog post\n\n#  Customer Analytics Software for iGaming: Get into your Players’ Minds\n\nby Kinga\n\nMar 11 2024\n\n11 min\n\nNumbers need to match insights, insights lead to action, and action drives\nsuccess. In the fast-paced world of iGaming, understanding the player is key\nto staying ahead. Enter Customer Analytics Software—a game-changer in how you\nconnect with, engage, and retain your players.\n\n## **What is Customer Analytics Software?**\n\nA customer analytics tool dives deep into your customer data to offer insights\non behavior, preferences, and trends, guiding better business decisions.\n\n## **Benefits of Using Customer Analytics Software in iGaming**\n\n### **Optimize Customer Experience with Data-Driven Insights**\n\nLeveraging a customer analytics tool al

In [50]:
results = collection.query(
    query_embeddings=embeds.pooler_output.detach().cpu().numpy(),  # List of query vectors
    n_results=5,  # Number of results to retur
)
results

{'ids': [['8', '0', '7', '15', '6']],
 'distances': [[715.3309970243297,
   715.3309970243297,
   719.3276542282105,
   719.3276542282105,
   737.2245160491082]],
 'embeddings': None,
 'metadatas': [[{'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'}]],
 'documents': [['customer experience,  Blog post\n\n#  Customer Analytics Software for iGaming: Get into your Players’ Minds\n\nby Kinga\n\nMar 11 2024\n\n11 min\n\nNumbers need to match insights, insights lead to action, and action drives\nsuccess. In the fast-paced world of iGaming, understanding the player is key\nto staying ahead. Enter Customer Analytics Soft

In [45]:
type(results)

dict

In [46]:
results

{'ids': [['8', '0', '7', '15', '6']],
 'distances': [[715.3309970243297,
   715.3309970243297,
   719.3276542282105,
   719.3276542282105,
   737.2245160491082]],
 'embeddings': None,
 'metadatas': [[{'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'},
   {'url': 'https://www.xtremepush.com/blog/customer-analytics-software-for-igaming'}]],
 'documents': [['customer experience,  Blog post\n\n#  Customer Analytics Software for iGaming: Get into your Players’ Minds\n\nby Kinga\n\nMar 11 2024\n\n11 min\n\nNumbers need to match insights, insights lead to action, and action drives\nsuccess. In the fast-paced world of iGaming, understanding the player is key\nto staying ahead. Enter Customer Analytics Soft

In [44]:
results[]

KeyError: 4