# Collect training data from Vespa applications

> Collect training data to analyse and/or improve ranking functions

## Example setup

Connect to the application and define a query model.

In [1]:
from vespa.application import Vespa
from vespa.query import Query, RankProfile, OR

# Handle to the running Vespa application that will be queried below.
app = Vespa(url="https://api.cord19.vespa.ai")

# Query model used when collecting data: an OR match phase combined with
# the "bm25" rank profile.
# NOTE(review): list_features=True presumably asks Vespa to return the rank
# features that end up as columns in the collected data -- confirm in the
# pyvespa documentation.
query_model = Query(
    match_phase=OR(),
    rank_profile=RankProfile(name="bm25", list_features=True),
)

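Define some labeled data. Each entry holds a query id, the query text, and the list of documents (id and score) that are relevant to that query.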

In [2]:
# Labeled data: one dict per query.
#   "query_id"      -- identifier of the query
#   "query"         -- the query text
#   "relevant_docs" -- documents judged relevant, each given as an id and a score
labeled_data = [
    {
        "query_id": 0,
        "query": "Intrauterine virus infections and congenital heart disease",
        "relevant_docs": [
            {"id": 0, "score": 1},
            {"id": 3, "score": 1},
        ],
    },
    {
        "query_id": 1,
        "query": "Clinical and immunologic studies in identical twins discordant for systemic lupus erythematosus",
        "relevant_docs": [
            {"id": 1, "score": 1},
            {"id": 5, "score": 1},
        ],
    },
]

## Collect training data in batch

In [3]:
# Collect training data for every labeled query in a single call.
# NOTE(review): number_additional_docs=2 presumably adds two extra documents
# per relevant document, marked relevant=0 in the result -- confirm in the
# pyvespa documentation.
training_data_batch = app.collect_training_data(
    labeled_data=labeled_data,
    id_field="id",
    query_model=query_model,
    number_additional_docs=2,
)

# Bare expression so the notebook renders the result.
training_data_batch

Unnamed: 0,attributeMatch(authors.first),attributeMatch(authors.first).averageWeight,attributeMatch(authors.first).completeness,attributeMatch(authors.first).fieldCompleteness,attributeMatch(authors.first).importance,attributeMatch(authors.first).matches,attributeMatch(authors.first).maxWeight,attributeMatch(authors.first).normalizedWeight,attributeMatch(authors.first).normalizedWeightedWeight,attributeMatch(authors.first).queryCompleteness,...,textSimilarity(results).queryCoverage,textSimilarity(results).score,textSimilarity(title).fieldCoverage,textSimilarity(title).order,textSimilarity(title).proximity,textSimilarity(title).queryCoverage,textSimilarity(title).score,document_id,query_id,relevant
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0625,0.0,0.0,0.142857,0.055357,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,213690,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.285714,0.666667,0.739583,0.571429,0.587426,225739,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.142857,0.0,0.4375,0.142857,0.224554,3,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,213690,0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.285714,0.666667,0.739583,0.571429,0.587426,225739,0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.111111,0.0,0.0,0.083333,0.047222,1,1,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,176163,1,0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.1875,1.0,1.0,0.25,0.6125,13597,1,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.083333,0.0,0.0,0.083333,0.041667,5,1,1


## Collect training data point

> For finer control, use the `collect_training_data_point` method, which collects the training data for one query and one relevant document at a time.

In [4]:
from pandas import concat, DataFrame


# Accumulate the records returned for each (query, relevant document) pair,
# then turn the accumulated records into a single DataFrame.
training_data = []
for labeled_query in labeled_data:
    for relevant_doc in labeled_query["relevant_docs"]:
        # One call per relevant document of the current query.
        point_records = app.collect_training_data_point(
            query=labeled_query["query"],
            query_id=labeled_query["query_id"],
            relevant_id=relevant_doc["id"],
            id_field="id",
            query_model=query_model,
            number_additional_docs=2,
        )
        training_data += point_records
training_data = DataFrame.from_records(training_data)

# Bare expression so the notebook renders the result.
training_data

Unnamed: 0,attributeMatch(authors.first),attributeMatch(authors.first).averageWeight,attributeMatch(authors.first).completeness,attributeMatch(authors.first).fieldCompleteness,attributeMatch(authors.first).importance,attributeMatch(authors.first).matches,attributeMatch(authors.first).maxWeight,attributeMatch(authors.first).normalizedWeight,attributeMatch(authors.first).normalizedWeightedWeight,attributeMatch(authors.first).queryCompleteness,...,textSimilarity(results).queryCoverage,textSimilarity(results).score,textSimilarity(title).fieldCoverage,textSimilarity(title).order,textSimilarity(title).proximity,textSimilarity(title).queryCoverage,textSimilarity(title).score,document_id,query_id,relevant
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0625,0.0,0.0,0.142857,0.055357,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,213690,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.285714,0.666667,0.739583,0.571429,0.587426,225739,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.142857,0.0,0.4375,0.142857,0.224554,3,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,213690,0,0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.285714,0.666667,0.739583,0.571429,0.587426,225739,0,0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.111111,0.0,0.0,0.083333,0.047222,1,1,1
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,176163,1,0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.1875,1.0,1.0,0.25,0.6125,13597,1,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.083333,0.0,0.0,0.083333,0.041667,5,1,1
