In [1]:
import docker
from vespa.io import VespaResponse, VespaQueryResponse
from vespa.package import (
    ApplicationPackage,
    Field,
    Schema,
    Document,
    HNSW,
    RankProfile,
    Component,
    Parameter,
    FieldSet,
    GlobalPhaseRanking,
    Function,
    FirstPhaseRanking, SecondPhaseRanking
)
from vespa.deployment import VespaDocker
import pandas as pd
import numpy as np

In [None]:
package = ApplicationPackage(
    name="hybridsearchbv",
    schema=[
        Schema(
            name="doc",
            document=Document(
                fields=[
                    Field(name="id", type="string", indexing=["summary"]),
                    Field(
                        name="title",
                        type="string",
                        indexing=["index", "summary"]
                    ),
                    Field(
                        name="authors",
                        type="string",
                        indexing=["index", "summary"],
                        bolding=False,
                    ),
                    Field(
                        name="description",
                        type="string",
                        indexing=["index", "summary"],
                        bolding=False,
                    ),
                    Field(
                        name="categories",
                        type="string",
                        indexing=["index", "summary"],
                        bolding=False,
                    ),
                    Field(
                        name="embedding",
                        type="tensor<bfloat16>(description{}, x[384])",
                        indexing=[
                            'input description . " " . input categories',
                            "embed",
                            "index",
                            "attribute",
                        ],
                        ann=(distance_metric="angular"),
                        is_document_field=False,
                    ),
                    Field(
                        name="colbert",
                        type="tensor<int8>(description{}, token{}, v[16])",
                        indexing=["input description . " " . input categories", "embed colbert description", "attribute"],
                        is_document_field=False,
                    ),
                ],
            ),
            fieldsets=[FieldSet(name="default", fields=["title", "contexts"])],
)

In [31]:
package = ApplicationPackage(
    name="hybridsearchbv",
    schema=[
        Schema(
            name="doc",
            document=Document(
                fields=[
                    Field(name="id", type="string", indexing=["summary"]),
                    Field(
                        name="title",
                        type="string",
                        indexing=["index", "summary"]
                    ),
                    Field(
                        name="authors",
                        type="string",
                        indexing=["index", "summary"],
                        bolding=False,
                    ),
                    Field(
                        name="description",
                        type="string",
                        indexing=["index", "summary"],
                        bolding=False,
                    ),
                    Field(
                        name="categories",
                        type="string",
                        indexing=["index", "summary"],
                        bolding=False,
                    ),
                    Field(
                        name="embedding",
                        type="tensor<bfloat16>(description{}, x[384])",
                        indexing=[
                            'input description . " " . input categories',
                            "embed",
                            "index",
                            "attribute",
                        ],
                        ann=("distance_metric: angular"),
                        is_document_field=False,
                    ),
                    Field(
                        name="colbert",
                        type="tensor<int8>(description{}, token{}, v[16])",
                        indexing=["input description . " " . input categories", "embed colbert description", "attribute"],
                        is_document_field=False,
                    ),
                ],
            ),
            fieldsets=[FieldSet(name="default", fields=["title", "authors", "description", "categories"])],
            rank_profiles=[
                RankProfile(
                    name="colbert",
                    inputs=[
                        ("query(q)", "tensor<float>(x[384])"),
                        ("query(qt)", "tensor<float>(querytoken{}, v[128])"),
                    ],
                    functions=[
                        Function(name="cos_sim", expression="closeness(field, embedding)"),
                        Function(
                            name="max_sim_per_description",
                            expression="""
                                sum(
                                    reduce(
                                        sum(
                                            query(qt) * unpack_bits(attribute(colbert)) , v
                                        ),
                                        max, token
                                    ),
                                    querytoken
                                )
                            """,
                        ),
                        Function(
                            name="max_sim", expression="reduce(max_sim_per_description, max, description)"
                        ),
                    ],
                    first_phase=FirstPhaseRanking(expression="cos_sim"),
                    second_phase=SecondPhaseRanking(expression="max_sim"),
                    match_features=["cos_sim", "max_sim", "max_sim_per_description"],
                ),
            ]
        )
    ],
    components=[
        Component(
            id="e5",
            type="hugging-face-embedder",
            parameters=[
                Parameter(
                    name="transformer-model",
                    args={
                        "url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"
                    },
                ),
                Parameter(
                    name="tokenizer-model",
                    args={
                        "url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
        Component(
            id="colbert",
            type="colbert-embedder",
            parameters=[
                Parameter(
                    name="transformer-model",
                    args={
                        "url": "https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"
                    },
                ),
                Parameter(
                    name="tokenizer-model",
                    args={
                        "url": "https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"
                    },
                ),
            ],
        ),
    ]
)


In [32]:
vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...


RuntimeError: Deployment failed, code: 400, message: {'error-code': 'INVALID_APPLICATION_PACKAGE', 'message': 'Invalid application: Failed parsing schema from \'doc.sd\': Encountered " "\\n" "\\n"" at line 22, column 30.\n\nWas expecting one of:\n\n"annotation" ...\n    "annotationreference" ...\n    "schema" ...\n    "search" ...\n    "diversity" ...\n    "min-groups" ...\n    "cutoff-factor" ...\n    "cutoff-strategy" ...\n    "loose" ...\n    "strict" ...\n    "document" ...\n    "operation" ...\n    "on-match" ...\n    "on-first-phase" ...\n    "on-second-phase" ...\n    "on-summary" ...\n    "struct" ...\n    "inherits" ...\n    "field" ...\n    "fields" ...\n    "fieldset" ...\n    "struct-field" ...\n    "import" ...\n    "as" ...\n    "indexing" ...\n    "summary-to" ...\n    "document-summary" ...\n    "rank-type" ...\n    "weight" ...\n    "type" ...\n    "index" ...\n    "inputs" ...\n    "token" ...\n    "text" ...\n    "word" ...\n    "gram" ...\n    "gram-size" ...\n    "max-length" ...\n    "prefix" ...\n    "substring" ...\n    "suffix" ...\n    "constant" ...\n    "onnx-model" ...\n    "significance" ...\n    "use-model" ...\n    "intraop-threads" ...\n    "interop-threads" ...\n    "gpu-device" ...\n    "execution-mode" ...\n    "parallel" ...\n    "sequential" ...\n    "model" ...\n    "mutate" ...\n    "query" ...\n    "rank-profile" ...\n    "raw-as-base64-in-summary" ...\n    "summary" ...\n    "full" ...\n    "static" ...\n    "dynamic" ...\n    "tokens" ...\n    "matched-elements-only" ...\n    "contextual" ...\n    "override" ...\n    "title" ...\n    "url" ...\n    "properties" ...\n    "attribute" ...\n    "sorting" ...\n    "dictionary" ...\n    "ascending" ...\n    "descending" ...\n    "uca" ...\n    "raw" ...\n    "lowercase" ...\n    "function" ...\n    "locale" ...\n    "strength" ...\n    "primary" ...\n    "secondary" ...\n    "tertiary" ...\n    "quaternary" ...\n    "identical" ...\n    "stemming" ...\n    "normalizing" ...\n    "hash" ...\n    "btree" ...\n    "cased" ...\n    "uncased" ...\n    "bolding" ...\n    "none" ...\n    "on" ...\n    "off" ...\n    "true" ...\n    "false" ...\n    "symmetric" ...\n    "query-command" ...\n    "alias" ...\n    "match" ...\n    "rank" ...\n    "literal" ...\n    "exact" ...\n    "filter" ...\n    "normal" ...\n    "exact-terminator" ...\n    "ignore-default-rank-features" ...\n    "id" ...\n    "source" ...\n    "to" ...\n    "direct" ...\n    "from-disk" ...\n    "omit-summary-features" ...\n    "always" ...\n    "on-demand" ...\n    "never" ...\n    "enable-bit-vectors" ...\n    "enable-only-bit-vector" ...\n    "fast-access" ...\n    "mutable" ...\n    "paged" ...\n    "fast-rank" ...\n    "fast-search" ...\n    "array" ...\n    "weightedset" ...\n    "map" ...\n    "reference" ...\n    "create-if-nonexistent" ...\n    "remove-if-zero" ...\n    "match-phase" ...\n    "evaluation-point" ...\n    "pre-post-filter-tipping-point" ...\n    "order" ...\n    "max-filter-coverage" ...\n    "max-hits" ...\n    "first-phase" ...\n    "second-phase" ...\n    "global-phase" ...\n    "macro" ...\n    "inline" ...\n    "arity" ...\n    "lower-bound" ...\n    "upper-bound" ...\n    "dense-posting-list-threshold" ...\n    "enable-bm25" ...\n    "hnsw" ...\n    "max-links-per-node" ...\n    "double" ...\n    "float" ...\n    "long" ...\n    "string" ...\n    "distance-metric" ...\n    "neighbors-to-explore-at-insert" ...\n    "multi-threaded-indexing" ...\n    "rank-properties" ...\n    "rerank-count" ...\n    "num-threads-per-search" ...\n    "min-hits-per-thread" ...\n    "num-search-partitions" ...\n    "termwise-limit" ...\n    "post-filter-threshold" ...\n    "approximate-threshold" ...\n    "target-hits-max-adjustment-factor" ...\n    "keep-rank-count" ...\n    "rank-score-drop-limit" ...\n    "constants" ...\n    "file" ...\n    "uri" ...\n    <IDENTIFIER> ...\n    <IDENTIFIER_WITH_DASH> ...\n    <CONTEXT> ...\n    <INTEGER> ...\n    <LONG> ...\n    <STRING> ...\n    <VARIABLE> ...\n    \nAt position:\n            distance-metric: \n                             ^'}

In [8]:
def transform_row(row):
    return {
        "id": row["id"],
        "fields": {"title": row["title"], "authors": row["authors"], "description": row["description"], "categories": row["categories"], "id": row["id"]},
    }

In [9]:
def callback(response:VespaResponse, id:str):
    if not response.is_successful():
        print(f"Error when feeding document {id}: {response.get_json()}")

In [10]:
df = pd.read_csv("https://raw.githubusercontent.com/bernardovma/dados_livros/main/data.csv")
df['id'] = range(1, len(df) + 1)
df = df.fillna("")
vespa_feed = df.apply(transform_row, axis=1).tolist()

app.feed_iterable(vespa_feed, schema="doc", namespace="bookrec", callback=callback)

Error when feeding document 1: {'Exception': '[UNKNOWN(252001) @ tcp/hybridsearchbv:19101/chain.indexing]: Processing failed. Error message: java.lang.IllegalArgumentException: Invalid colbert embedder tensor target destination. Wanted a mixed 2-d mapped-indexed tensor, got tensor<int8>(steps{},token{},v[16]) -- See Vespa log for details. ', 'id': 1, 'message': 'Exception during feed_data_point'}
Error when feeding document 3: {'Exception': '[UNKNOWN(252001) @ tcp/hybridsearchbv:19101/chain.indexing]: Processing failed. Error message: java.lang.IllegalArgumentException: Invalid colbert embedder tensor target destination. Wanted a mixed 2-d mapped-indexed tensor, got tensor<int8>(steps{},token{},v[16]) -- See Vespa log for details. ', 'id': 3, 'message': 'Exception during feed_data_point'}
Error when feeding document 5: {'Exception': '[UNKNOWN(252001) @ tcp/hybridsearchbv:19101/chain.indexing]: Processing failed. Error message: java.lang.IllegalArgumentException: Invalid colbert embedde

Exception in thread Thread-5 (_consumer):
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "C:\Users\Bernardo Vargas\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Bernardo Vargas\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\vespa\application.py", line 377, in _consumer
    future:Future = executor.submit(_submit, doc, sync_session)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Error when feeding document 14: {'Exception': '[UNKNOWN(252001) @ tcp/hybridsearchbv:19101/chain.indexing]: Processing failed. Error message: java.lang.IllegalArgumentException: Invalid colbert embedder tensor target destination. Wanted a mixed 2-d mapped-indexed tensor, got tensor<int8>(steps{},token{},v[16]) -- See Vespa log for details. ', 'id': 14, 'message': 'Exception during feed_data_point'}
Error when feeding document 10: {'Exception': '[UNKNOWN(252001) @ tcp/hybridsearchbv:19101/chain.indexing]: Processing failed. Error message: java.lang.IllegalArgumentException: Invalid colbert embedder tensor target destination. Wanted a mixed 2-d mapped-indexed tensor, got tensor<int8>(steps{},token{},v[16]) -- See Vespa log for details. ', 'id': 10, 'message': 'Exception during feed_data_point'}
Error when feeding document 15: {'Exception': '[UNKNOWN(252001) @ tcp/hybridsearchbv:19101/chain.indexing]: Processing failed. Error message: java.lang.IllegalArgumentException: Invalid colbert em

In [None]:
def hits_as_df(response, fields):
    records = []
    for hit in response.hits:
        record = {}
        for field in fields:
            record[field] = hit['fields'].get(field, None) 
        records.append(record)
    return pd.DataFrame(records)

In [None]:
def query_colbert(input_query):
    with app.syncio(connections=25) as session:
        query = input_query
        response: VespaQueryResponse = session.query(
            yql="select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q)) limit 10",
            query=query,
            ranking="colbert_local",
            body={
                "input.query(q)": f"embed({query})",
                "input.query(qt)": f"tokenize({query})"
            },
        )
        assert response.is_successful()

    return hits_as_df(response, ['id', 'title', 'authors', 'description', 'categories'])

In [None]:
query_colbert('books about space travel')