# Adding a ranking profile to Vespa 

### Adding fields
Here we add the fields that are necessary for the ranking function. We add "body_length" because we want to rank the documents based on the length of their body. Not very smart, but it is just an illustration.

In [1]:
from vespa.package import Document, Field

# Document definition for the MS MARCO example. "body_length" is an extra
# integer attribute that the "body_length" rank profile reads later on.
document = Document(
    fields=[
        # Kept as an attribute and returned in the summary.
        Field(name="id", type="string", indexing=["attribute", "summary"]),
        # Text fields: indexed, returned in the summary, "enable-bm25" set on
        # the index (a later rank profile uses bm25(body)).
        Field(
            name="title",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        ),
        Field(
            name="body",
            type="string",
            indexing=["index", "summary"],
            index="enable-bm25",
        ),
        # Length of the body text, supplied at feed time.
        Field(name="body_length", type="int", indexing=["attribute", "summary"]),
    ]
)


In [2]:
from vespa.package import Schema, FieldSet, RankProfile

# Schema tying the document to a "default" field set (title + body) and a
# baseline "default" rank profile that ranks with nativeRank(title, body).
msmarco_schema = Schema(
    name="msmarco",
    document=document,
    fieldsets=[
        FieldSet(name="default", fields=["title", "body"]),
    ],
    rank_profiles=[
        RankProfile(name="default", first_phase="nativeRank(title, body)"),
    ],
)

In [3]:
from vespa.package import ApplicationPackage

# Wrap the schema in an application package named "msmarco".
app_package = ApplicationPackage(
    name="msmarco",
    schema=msmarco_schema,
)


In [4]:
from vespa.package import VespaCloud

            #C:\Users\User\OneDrive - NTNU\NTNU\Prosjekt oppgave NLP
# Directory holding the Vespa Cloud API key, and the key file's name.
# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"
file = "andre.olaisen.tmartins-ntnu.pem"

# Application name in Vespa Cloud.
app_name = "andre-test-loud"

# Vespa Cloud handle for this tenant/application. The key location is the
# directory string immediately followed by the file name.
vespa_cloud = VespaCloud(
    tenant="tmartins-ntnu",
    application=app_name,
    key_location=f"{path_key}{file}",
    application_package=app_package,
)

In [5]:
# NOTE(review): `name` is not referenced anywhere in the visible notebook.
name = "sample_application"

# Folder passed to deploy() as `disk_folder`.
path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

# Deploy to the "andre-olaisen" instance; `app` is the handle the later cells
# use for feeding and querying.
app = vespa_cloud.deploy(instance="andre-olaisen", disk_folder=path_key)


Deployment started in run 57 of dev-aws-us-east-1c for tmartins-ntnu.andre-test-loud.andre-olaisen. This may take about 15 minutes the first time.
INFO    [13:06:58]  Deploying platform version 7.318.21 and application version unknown ...
INFO    [13:07:01]  Deployment successful.
INFO    [13:07:01]  Session 5132 for tenant 'tmartins-ntnu' prepared and activated.
INFO    [13:07:04]  ######## Details for all nodes ########
INFO    [13:07:04]  h5250a.dev.aws-us-east-1c.vespa-external.aws.oath.cloud: expected to be UP
INFO    [13:07:04]  --- platform vespa/centos-tenant:7.318.21 <-- :
INFO    [13:07:04]  --- container on port 4080 has not started 
INFO    [13:07:04]  h5251b.dev.aws-us-east-1c.vespa-external.aws.oath.cloud: expected to be UP
INFO    [13:07:04]  --- platform vespa/centos-tenant:7.318.21 <-- :
INFO    [13:07:04]  --- storagenode on port 19102 has not started 
INFO    [13:07:04]  --- searchnode on port 19107 has not started 
INFO    [13:07:04]  --- distributor on port 19111 h

RuntimeError: Deployment was aborted, probably by a newer deployment

In [None]:
from pandas import read_csv

# Download the MS MARCO sample documents (tab-separated file).
docs = read_csv("https://thigm85.github.io/data/msmarco/docs.tsv", sep="\t")
# Print explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so the shape would otherwise never be shown.
print(docs.shape)

# Keep a small positional slice (rows 1..99) for this test.
# NOTE(review): the slice starts at 1, so the first row is skipped —
# confirm whether docs.iloc[:100] was intended.
doc = docs.iloc[1:100]
doc.shape

### Feeding data

Here we feed the data to the Vespa app. We need to include the length of the body at this point, since "body_length" is a separate field in the document.

In [None]:

# Feed a small sample of documents to the Vespa app; we do not need much data
# for this test. head(99) keeps the same cap as the previous hand-rolled
# counter, which stopped before feeding a 100th row.
for _, row in doc.head(99).iterrows():
    # Convert once so that body_length always describes exactly the text that
    # is fed as "body" (and len() never sees a non-string value).
    body = str(row["body"])
    response = app.feed_data_point(
        schema="msmarco",
        data_id=str(row["id"]),
        fields={
            "id": str(row["id"]),
            "title": str(row["title"]),
            "body": body,
            "body_length": len(body),
        },
    )

In [None]:
from vespa.query import Query, OR, AND, WeakAnd, ANN, RankProfile as Ranking


# Free-text query using the OR() match phase and the "default" rank profile,
# asking for 10 hits.
results = app.query(
    query="Where is my app",
    query_model=Query(match_phase=OR(), rank_profile=Ranking(name="default")),
    hits=10,
)

In [None]:
# Counters reported by the query response.
print(results.number_documents_retrieved)
print(results.number_documents_indexed)

print("\n")

print("Results: or , deault")
# Title, relevance and stored body length for every returned hit.
for hit in results.hits:
    hit_fields = hit["fields"]
    print(hit_fields["title"])
    print(hit["relevance"])
    print(hit_fields["body_length"])
    

### Adding Ranking profile
Here we add the ranking profile. In this case we have to use attribute(body_length), or else Vespa does not know what body_length is. We can then apply different arithmetic operations, such as sqrt. The supported operations are listed here:

https://docs.vespa.ai/documentation/reference/ranking-expressions.html

We can also use the built-in rank features listed here:

https://docs.vespa.ai/documentation/reference/rank-features.html

In [None]:
# The field has to be referenced as attribute(body_length); otherwise Vespa
# does not know what body_length is.
app_package.schema.add_rank_profile(
    RankProfile(
        name="body_length",
        inherits="default",
        first_phase="sqrt(attribute(body_length))",
    )
)

In [None]:
# Rank profile mixing a weighted nativeRank over title and body with the
# bm25 score of the body.
app_package.schema.add_rank_profile(
    RankProfile(
        name="nativerank_bm25_combo",
        inherits="default",
        first_phase="10 * nativeRank(title,body) + bm25(body)",
    )
)

In [None]:
# Folder passed to deploy() as `disk_folder`.
path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

# Deploy again after the rank profiles were added to the application package.
app = vespa_cloud.deploy(instance="andre-olaisen", disk_folder=path_key)

### Rank_profile in action 
Here we can see the ranking in action. 

In [None]:
# Query with the OR() match phase, ranked by the "body_length" profile.
query_text = "What is an apple?"

results_or_body_length = app.query(
    query=query_text,
    query_model=Query(
        match_phase=OR(),
        rank_profile=Ranking(name="body_length"),
    ),
    hits=10,
)

print("\n")
print("Results: OR , Body_length")
# enumerate(start=1) numbers the hits 1, 2, ... in returned order, replacing
# the manually incremented counter.
for rank, result in enumerate(results_or_body_length.hits, start=1):
    print("Ranking:", rank)
    print("Title: \t", result["fields"]["title"])
    print("Relevance: \t", result["relevance"])
    # Recompute sqrt(body_length) locally so it can be compared with the
    # relevance value printed above.
    print("Sqrt(body_length): \t", result["fields"]["body_length"] ** (1 / 2), "\n")

In [None]:
# Rank profile under test: "nativerank_bm25_combo"
query_text = "What is an apple?"

# NOTE(review): the variable name is carried over from the body_length cell
# and overwrites those results; it is kept unchanged here in case other cells
# read it.
results_or_body_length = app.query(
    query=query_text,
    query_model=Query(
        match_phase=OR(),
        rank_profile=Ranking(name="nativerank_bm25_combo"),
    ),
    hits=10,
)

print("\n")
print("Results: OR , nativerank_bm25_combo")
# enumerate(start=1) numbers the hits 1, 2, ... in returned order, replacing
# the manually incremented counter.
for rank, result in enumerate(results_or_body_length.hits, start=1):
    print("Ranking:", rank)
    print("Title: \t", result["fields"]["title"])
    print("Relevance: \t", result["relevance"])
