In [75]:
import docker
from vespa.io import VespaResponse, VespaQueryResponse
from vespa.package import (
    ApplicationPackage,
    Field,
    Schema,
    Document,
    HNSW,
    RankProfile,
    Component,
    Parameter,
    FieldSet,
    GlobalPhaseRanking,
    Function,
    FirstPhaseRanking, SecondPhaseRanking
)
from vespa.deployment import VespaDocker
import pandas as pd
import numpy as np
import openai

In [94]:
# Vespa application package for the book-search experiment: a single `doc`
# schema, six rank profiles (lexical, dense, fused, two-phase and two ColBERT
# variants) and the two embedder components the schema refers to.
package = ApplicationPackage(
            name="testecolbert",
            schema=[
                Schema(
                    name="doc",
                    document=Document(
                        fields=[
                            # Fields fed directly from the book catalogue.
                            Field(name="id", type="string", indexing=["summary"]),
                            Field(name="title", type="string", indexing=["index", "summary"]),
                            Field(
                                name="authors",
                                type="string",
                                indexing=["index", "summary"],
                                bolding=False,
                            ),
                            Field(
                                name="categories",
                                type="string",
                                indexing=["index", "summary"],
                                bolding=False,
                            ),
                            # Description is an array of strings; the two derived
                            # fields below produce one entry per array element.
                            Field(name="description", type="array<string>", indexing=["summary", "index"]),
                            # Derived field (is_document_field=False): each description
                            # element is prefixed with the title and passed to the
                            # `e5` embedder component.
                            Field(
                                name="embedding",
                                type="tensor<bfloat16>(steps{}, x[384])",
                                indexing=[
                                    "input description",
                                    'for_each { (input title || "") . " " . ( _ || "") }',
                                    "embed e5",
                                    "attribute",
                                ],
                                attribute=["distance-metric: angular"],
                                is_document_field=False,
                            ),
                            # Derived field: per-description, per-token output of the
                            # `colbert` embedder stored as int8 with v[16]. The rank
                            # profiles apply unpack_bits() to it before multiplying
                            # with the v[128] query tensor.
                            # NOTE(review): presumably a binarized representation
                            # (16 int8 = 128 bits) - confirm against the embedder docs.
                            Field(
                                name="colbert",
                                type="tensor<int8>(description{}, token{}, v[16])",
                                indexing=["input description", "embed colbert description", "attribute"],
                                is_document_field=False,
                            ),
                        ]
                    ),            
                    # Fields searched by userQuery() when no field is named explicitly.
                    fieldsets=[FieldSet(name="default", fields=["title", "authors", "description", "categories"])],
                    rank_profiles=[
                        # Lexical only: BM25 over description and categories.
                        RankProfile(
                            name="bm25",
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            functions=[
                                Function(name="bm25sum", expression="bm25(description) + bm25(categories)")
                            ],
                            first_phase="bm25sum",
                        ),
                        # Dense only: closeness between query(q) and the e5 embedding.
                        RankProfile(
                            name="semantic",
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase="closeness(field, embedding)",
                        ),
                        # Hybrid: reuses bm25sum from `bm25` and fuses it with closeness
                        # via reciprocal rank fusion in the global phase.
                        RankProfile(
                            name="fusion",
                            inherits="bm25",
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            first_phase="closeness(field, embedding)",
                            global_phase=GlobalPhaseRanking(
                                expression="reciprocal_rank_fusion(bm25sum, closeness(field, embedding))",
                                rerank_count=1000,
                            ),
                        ),
                        # Two-phase: BM25 first, then re-rank the top 1000 by closeness.
                        # NOTE(review): the `closeness` function shares its name with
                        # the built-in closeness feature it wraps - confirm Vespa
                        # resolves this as intended.
                        RankProfile(
                            name="twofase",
                            inputs=[("query(q)", "tensor<float>(x[384])")],
                            functions=[
                                Function(name="bm25sum", expression="bm25(description) + bm25(categories)"),
                                Function(name="closeness", expression="closeness(field, embedding)"),
                            ],
                            first_phase=FirstPhaseRanking(expression = "bm25sum"),
                            second_phase=SecondPhaseRanking(expression = "closeness", rerank_count=1000),
                            match_features=["bm25sum", "closeness"],
                        ),
                        # ColBERT "local": MaxSim is computed separately for every
                        # description element, and the best element's score is used.
                        RankProfile(
                            name="colbert_local",
                            inputs=[
                                ("query(q)", "tensor<float>(x[384])"),
                                ("query(qt)", "tensor<float>(querytoken{}, v[128])"),
                            ],
                            functions=[
                                Function(name="cos_sim", expression="closeness(field, embedding)"),
                                # Per description element: for each query token take the
                                # max dot product over document tokens, then sum over
                                # query tokens. Result keeps the `description` dimension.
                                Function(
                                    name="max_sim_per_steps",
                                    expression="""
                                        sum(
                                            reduce(
                                                sum(
                                                    query(qt) * unpack_bits(attribute(colbert)) , v
                                                ),
                                                max, token
                                            ),
                                            querytoken
                                        )
                                    """,
                                ),
                                Function(
                                    name="max_sim_local", expression="reduce(max_sim_per_steps, max, description)"
                                ),
                            ],
                            first_phase=FirstPhaseRanking(expression="cos_sim"),
                            # No rerank_count given here, so the library/engine default applies.
                            second_phase=SecondPhaseRanking(expression="max_sim_local"),
                            match_features=["cos_sim", "max_sim_local", "max_sim_per_steps"],
                        ),
                        # ColBERT "global": the max for each query token is taken over
                        # all tokens of all description elements at once.
                        RankProfile(
                            name="colbert_global",
                            inputs=[
                                ("query(q)", "tensor<float>(x[384])"),
                                ("query(qt)", "tensor<float>(querytoken{}, v[128])"),
                            ],
                            functions=[
                                Function(name="cos_sim", expression="closeness(field, embedding)"),
                                # Same as max_sim_per_steps above, except the max also
                                # reduces the `description` dimension.
                                Function(
                                    name="max_sim_cross_steps",
                                    expression="""
                                        sum(
                                            reduce(
                                                sum(
                                                    query(qt) *  unpack_bits(attribute(colbert)) , v
                                                ),
                                                max, token, description
                                            ),
                                            querytoken
                                        )
                                        """
                                ),
                                Function(
                                    name="max_sim_global", expression="reduce(max_sim_cross_steps, max)"
                                ),
                            ],
                            first_phase=FirstPhaseRanking(expression="cos_sim"),
                            # NOTE(review): only 5 hits are re-ranked here, while
                            # colbert_local sets no rerank_count and the notebook's
                            # queries request `limit 10` - confirm this is intentional,
                            # since it makes the two profiles hard to compare.
                            second_phase=SecondPhaseRanking(expression="max_sim_global", rerank_count=5),
                            match_features=[
                            "cos_sim",
                            "max_sim_global",
                            "max_sim_cross_steps",
                            ],
                        )
                    ]
                )
            ],
            components=[
                # Embedder referenced as `e5` by the `embedding` field and by queries.
                Component(
                    id="e5",
                    type="hugging-face-embedder",
                    parameters=[
                        Parameter(
                            name="transformer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"
                            },
                        ),
                        Parameter(
                            name="tokenizer-model",
                            args={
                                "url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"
                            },
                        ),
                    ],
                ),
                # Embedder referenced as `colbert` by the `colbert` field and by queries.
                Component(
                    id="colbert",
                    type="colbert-embedder",
                    parameters=[
                        Parameter(
                            name="transformer-model",
                            args={
                                "url": "https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"
                            },
                        ),
                        Parameter(
                            name="tokenizer-model",
                            args={
                                "url": "https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"
                            },
                        ),
                    ],
                ),
            ]
        )

In [95]:
# Deploy the application package through VespaDocker. `app` is the handle
# the feeding and query cells below use to talk to the deployed application.
vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=package)

Waiting for configuration server, 0/60 seconds...
Waiting for configuration server, 5/60 seconds...
Waiting for configuration server, 10/60 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 10/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 15/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 20/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 25/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 3

In [62]:
def transform_row(row):
    """Build the feed entry for one catalogue row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the keys
            "id", "title", "authors", "description" and "categories".

    Returns:
        A dict with the row's "id" and a "fields" dict holding the title,
        authors, description, categories and id values.
    """
    document_id = row["id"]
    field_values = {
        column: row[column]
        for column in ("title", "authors", "description", "categories")
    }
    # The id is stored as a field as well so that it can be read back from hits.
    field_values["id"] = row["id"]
    return {"id": document_id, "fields": field_values}

In [63]:
def callback(response:VespaResponse, id:str):
    """Feed callback: report documents whose feed operation did not succeed.

    Args:
        response: response object for one feed operation.
        id: id of the document the response belongs to.
    """
    if response.is_successful():
        return
    # Only failures are printed; successful operations stay silent.
    print(f"Error when feeding document {id}: {response.get_json()}")

In [65]:
# Load the book catalogue, give every row a 1-based id and replace missing
# values with empty strings.
df = pd.read_csv(
    "https://raw.githubusercontent.com/bernardovma/dados_livros/main/data.csv"
)
df["id"] = range(1, len(df) + 1)
df = df.fillna("")

# The schema declares `description` as array<string>, so each description is
# wrapped in a one-element list before feeding.
df["description"] = df["description"].map(lambda text: [text])
vespa_feed = [transform_row(book_row) for _, book_row in df.iterrows()]

app.feed_iterable(
    vespa_feed,
    schema="doc",
    namespace="bookrec",
    callback=callback,
)

In [66]:
def hits_as_df(response, fields):
    """Collect selected fields of every hit in a query response into a DataFrame.

    Args:
        response: object exposing ``hits``, an iterable of dicts that each
            carry a ``"fields"`` dict.
        fields: names of the hit fields to extract, in the desired column order.

    Returns:
        A DataFrame with one row per hit and one column per requested field.
        Fields missing from a hit become ``None``. The columns are present
        even when the response has no hits, so callers can always select a
        column such as ``result["title"]``.

    Raises:
        KeyError: if a hit has no ``"fields"`` entry.
    """
    # De-duplicate while keeping order: this is the column set the per-hit
    # dicts produce, and it is also used when there are no hits at all.
    columns = list(dict.fromkeys(fields))
    records = [
        {column: hit['fields'].get(column, None) for column in columns}
        for hit in response.hits
    ]
    # Passing `columns` keeps the frame's shape stable for an empty result;
    # without it an empty hit list yields a frame with no columns.
    return pd.DataFrame(records, columns=columns)

In [72]:
def query_colbert(input_query, ranking="colbert_local"):
    """Run a hybrid query against the deployed app and return the top hits.

    The YQL matches documents through ``userQuery()`` OR a nearest-neighbor
    search on ``embedding`` (targetHits 1000) and returns at most 10 hits.
    The query text is embedded twice on the Vespa side: with ``e5`` for
    ``query(q)`` and with ``colbert`` for ``query(qt)``.

    Args:
        input_query: free-text query.
        ranking: name of the rank profile to use. Defaults to
            ``"colbert_local"``, the value that used to be hard-coded, so
            existing calls behave as before; other profiles taking the same
            query inputs (e.g. ``"colbert_global"``) can be passed instead.

    Returns:
        DataFrame with the id, title, authors, description and categories of
        the returned hits.

    Raises:
        AssertionError: if Vespa reports the query as unsuccessful.
    """
    # NOTE(review): the text is placed inside double quotes in the embed()
    # expressions - confirm how Vespa treats queries that themselves contain
    # a double quote.
    with app.syncio(connections=25) as session:
        response: VespaQueryResponse = session.query(
            yql="select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q)) limit 10",
            query=input_query,
            ranking=ranking,
            body={
                "input.query(q)": f'embed(e5, "{input_query}")',
                "input.query(qt)": f'embed(colbert, "{input_query}")'
            },
        )
        # Include the response payload so a failing query is diagnosable.
        assert response.is_successful(), f"Vespa query failed: {response.get_json()}"

    return hits_as_df(response, ['id', 'title', 'authors', 'description', 'categories'])

In [73]:
# Smoke test: run one free-text query through query_colbert and display the hits.
query_colbert('books about space travel')

Unnamed: 0,id,title,authors,description,categories
0,3263,Two Complete Novels,Douglas Adams,"[Following themes of zany space exploration, t...","Detective and mystery stories, English."
1,6459,The Algebraist,Iain M. Banks,[It is 4034 AD. Humanity has made it to the st...,Fiction
2,878,The Fabric of the Cosmos,Brian Greene,[From the bestselling author of The Elegant Un...,Science
3,3254,The Complete Science Fiction Treasury of H.G. ...,H. G. Wells,[Includes fantasies of travel in time and spac...,"Science fiction, English."
4,5628,A Short History of Nearly Everything,Bill Bryson,[In this book Bill Bryson explores the most in...,Science
5,1180,Gulliver's Travels,Jonathan Swift;Claude Rawson;Ian Higgins,"[""Gulliver's travels purports to be a travel b...",Fiction
6,3142,Bad Astronomy,Philip C. Plait,[Advance praise for Philip Plait s Bad Astrono...,Science
7,3688,A Brief History of Time,Stephen Hawking,[Stephen Hawking's A Brief History of Time has...,Cosmology
8,5619,Pyramids of Montauk,Preston B. Nichols;Peter Moon,[During WWII there was an attempt to achieve i...,Fiction
9,3470,A Brief History of Time,Stephen Hawking,[An anniversary edition of a now-classic surve...,Science


In [74]:
def generate_generic_questions_twofase(book_description, book_category):
    """Ask the OpenAI chat API for three generic questions about a book.

    Args:
        book_description: description text. A list or tuple of strings is
            also accepted (the notebook wraps each description in a
            one-element list before sampling); its parts are joined with
            spaces so the prompt contains plain text rather than a Python
            list repr.
        book_category: category text; may be empty.

    Returns:
        The non-blank lines of the model's reply, each stripped of
        surrounding whitespace. Empty list if the reply has no content.
    """
    if isinstance(book_description, (list, tuple)):
        book_description = " ".join(str(part) for part in book_description)

    # Prompt fixes relative to the earlier wording: the quote around the
    # description is now closed, "ans" -> "and", "the are" -> "there are".
    prompt = (
        f"Generate three generic questions about a book, given the following description: '{book_description}' "
        f"and its category (it can be multiple categories, and in some cases there are no categories shown): '{book_category}'. "
        "The questions should be broad and not specific, not giving away the book title, being more general "
        "and applicable to other books, but at the same time giving elements to discuss the book."
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=150,
        temperature=0.7,
    )
    # The message content can be None; treat that as an empty reply.
    content = response.choices[0].message.content or ""
    # Drop blank and whitespace-only lines so they are not counted as questions.
    return [line.strip() for line in content.splitlines() if line.strip()]

In [77]:
# Evaluation set: up to 100 randomly chosen books, reduced to the columns the
# evaluation cells read. The seed is fixed so that re-running the notebook
# evaluates the same books, which keeps results from different rank profiles
# and different runs comparable.
teste_df = df.sample(n=min(100, len(df)), random_state=42)
teste_df = teste_df[['title', 'description', 'categories']]

In [93]:
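# Provide your OpenAI API key before running the cells below, e.g. through the
# OPENAI_API_KEY environment variable. Do not hard-code the key in the notebook.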

In [80]:
def dcg(relevance_scores, p):
    """Discounted cumulative gain of the first ``p`` relevance scores.

    Args:
        relevance_scores: sequence of numeric relevance values in ranked order.
        p: cut-off; only the first ``p`` scores are used.

    Returns:
        ``sum(score_i / log2(i + 1))`` for positions ``i = 1..p`` as a float,
        or ``0.0`` when there are no scores to consider.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the
    # equivalent that works on both old and new NumPy versions.
    scores = np.asarray(relevance_scores, dtype=float)[:p]
    if scores.size:
        # Position i (1-based) is discounted by log2(i + 1), hence arange(2, n + 2).
        discounts = np.log2(np.arange(2, scores.size + 2))
        return np.sum(scores / discounts)
    return 0.0

def ndcg(relevance_scores, p):
    """Normalized DCG at cut-off ``p``.

    Divides the DCG of the scores in their given order by the DCG of the
    same scores sorted from highest to lowest.

    Args:
        relevance_scores: sequence of numeric relevance values in ranked order.
        p: cut-off passed on to ``dcg``.

    Returns:
        The ratio described above, or ``0.0`` when the ideal DCG is zero.
    """
    actual = dcg(relevance_scores, p)
    best_ordering = sorted(relevance_scores, reverse=True)
    ideal = dcg(best_ordering, p)
    # A zero ideal DCG means there is nothing relevant to rank; avoid dividing by it.
    return actual / ideal if ideal != 0 else 0.0

In [78]:
# Fresh accumulators for an evaluation run: per-book results keyed by title,
# plus running totals of hits and of generated questions.
results_colbert_local = dict()
total_appearance_count_colbert_local = total_questions_colbert_local = 0

In [79]:
# Evaluate query_colbert: for every sampled book, generate questions from its
# description/categories and record, per question, whether the book's title
# shows up among the returned hits.
#
# Resume support: a book is written to results_colbert_local only after all of
# its questions have been queried, so an interrupted run leaves only complete
# entries behind and nothing has to be rolled back. (Rolling back the last
# entry used to subtract counts that had never been added to the totals when
# the interruption happened inside the question loop.) The totals are
# re-derived from the stored entries so they always agree with them.
# NOTE(review): results are keyed by title and the resume offset is the number
# of stored entries - this assumes titles in teste_df are unique; confirm.
total_appearance_count_colbert_local = sum(
    book_data["appearance_count"] for book_data in results_colbert_local.values()
)
total_questions_colbert_local = sum(
    book_data["total_questions"] for book_data in results_colbert_local.values()
)

start_index = len(results_colbert_local)
books_to_process = teste_df.iloc[start_index:]

for index, row in books_to_process.iterrows():
    title = row['title']
    print(f"Processing book: {title}")
    description = row['description']
    categories = row['categories']
    questions = generate_generic_questions_twofase(description, categories)

    # Collected locally first; committed to the shared state in one step below.
    book_relevance_scores = []
    for question in questions:
        search_results_colbert = query_colbert(question)
        # .get() tolerates a result frame without a 'title' column (e.g. a
        # query that returned no hits).
        search_results_titles_colbert = list(search_results_colbert.get('title', []))
        book_appears_colbert = title in search_results_titles_colbert
        book_relevance_scores.append(int(book_appears_colbert))

    book_appearance_count = sum(book_relevance_scores)
    results_colbert_local[title] = {
        "questions": questions,
        "appearance_count": book_appearance_count,
        "total_questions": len(questions),
        "relevance_scores": book_relevance_scores
    }
    total_appearance_count_colbert_local += book_appearance_count
    total_questions_colbert_local += len(questions)

Processing book: Guardians of Ga'hoole
Processing book: Starshield Sentinels
Processing book: The Lord God Made Them All
Processing book: NYC Ballet Workout
Processing book: Under The Influence
Processing book: A Farewell to Arms
Processing book: Jitterbug Perfume
Processing book: The Deep End of the Ocean
Processing book: From Far Away
Processing book: Shadow Game
Processing book: Garden State
Processing book: River of Shadows
Processing book: Bad Astronomy
Processing book: The Science Book
Processing book: The World, the Text, and the Critic
Processing book: Last Wish
Processing book: The Classic Treasury of Hans Christian Andersen
Processing book: Kiss
Processing book: The Picture of Dorian Gray
Processing book: Magic Bites
Processing book: The Problem of Pain
Processing book: On Becoming a Novelist
Processing book: Autobiography of Thomas Jefferson
Processing book: The Unadulterated Cat
Processing book: The White Man's Burden
Processing book: Fell
Processing book: Good Omens
Proces

In [82]:
# Share of generated questions for which the source book appeared among the
# returned hits; reported as 0 when no question has been evaluated.
overall_percentage_colbert = (
    (total_appearance_count_colbert_local / total_questions_colbert_local) * 100
    if total_questions_colbert_local > 0
    else 0
)

print(f"Overall percentage: {overall_percentage_colbert:.2f}%")

Overall percentage: 48.44%


In [83]:
# NDCG per book over its recorded relevance list, then the mean across books.
# NOTE(review): each relevance list holds one 0/1 entry per generated question
# (not per ranked hit), so this measures the ordering of questions - confirm
# that this is the intended metric.
p = 10
ndcg_scores_colbert = {
    book_title: ndcg(book_result["relevance_scores"], p)
    for book_title, book_result in results_colbert_local.items()
}

mean_ndcg_colbert = np.mean(list(ndcg_scores_colbert.values()))
print(f"Mean NDCG@{p} across all books: {mean_ndcg_colbert:.4f}")

Mean NDCG@10 across all books: 0.6417


In [84]:
def query_colbert_global(input_query):
    """Run a hybrid query ranked with the ``colbert_global`` profile.

    The YQL matches documents through ``userQuery()`` OR a nearest-neighbor
    search on ``embedding`` (targetHits 1000) and returns at most 10 hits.
    The query text is embedded with ``e5`` for ``query(q)`` and with
    ``colbert`` for ``query(qt)``.

    Args:
        input_query: free-text query.

    Returns:
        DataFrame with the id, title, authors, description and categories of
        the returned hits.

    Raises:
        AssertionError: if Vespa reports the query as unsuccessful.
    """
    summary_fields = ['id', 'title', 'authors', 'description', 'categories']
    query_inputs = {
        "input.query(q)": f'embed(e5, "{input_query}")',
        "input.query(qt)": f'embed(colbert, "{input_query}")',
    }
    with app.syncio(connections=25) as session:
        response = session.query(
            yql="select * from sources * where userQuery() or ({targetHits:1000}nearestNeighbor(embedding,q)) limit 10",
            query=input_query,
            ranking="colbert_global",
            body=query_inputs,
        )
        assert response.is_successful()

    return hits_as_df(response, summary_fields)

In [87]:
# Reset the accumulators before the next evaluation run. The same variable
# names are reused, so any results stored in them so far are discarded here.
results_colbert_local = dict()
total_appearance_count_colbert_local = total_questions_colbert_local = 0

In [88]:
# Evaluate query_colbert_global: for every sampled book, generate questions
# from its description/categories and record, per question, whether the book's
# title shows up among the returned hits. The accumulators keep their
# "*_colbert_local" names although this cell fills them with colbert_global
# results.
#
# Resume support: a book is written to results_colbert_local only after all of
# its questions have been queried, so an interrupted run leaves only complete
# entries behind and nothing has to be rolled back. (Rolling back the last
# entry used to subtract counts that had never been added to the totals when
# the interruption happened inside the question loop.) The totals are
# re-derived from the stored entries so they always agree with them.
# NOTE(review): results are keyed by title and the resume offset is the number
# of stored entries - this assumes titles in teste_df are unique; confirm.
# NOTE(review): questions are generated anew here, so this run is not scored
# on the same questions as any earlier run - confirm that is acceptable.
total_appearance_count_colbert_local = sum(
    book_data["appearance_count"] for book_data in results_colbert_local.values()
)
total_questions_colbert_local = sum(
    book_data["total_questions"] for book_data in results_colbert_local.values()
)

start_index = len(results_colbert_local)
books_to_process = teste_df.iloc[start_index:]

for index, row in books_to_process.iterrows():
    title = row['title']
    print(f"Processing book: {title}")
    description = row['description']
    categories = row['categories']
    questions = generate_generic_questions_twofase(description, categories)

    # Collected locally first; committed to the shared state in one step below.
    book_relevance_scores = []
    for question in questions:
        search_results_colbert = query_colbert_global(question)
        # .get() tolerates a result frame without a 'title' column (e.g. a
        # query that returned no hits).
        search_results_titles_colbert = list(search_results_colbert.get('title', []))
        book_appears_colbert = title in search_results_titles_colbert
        book_relevance_scores.append(int(book_appears_colbert))

    book_appearance_count = sum(book_relevance_scores)
    results_colbert_local[title] = {
        "questions": questions,
        "appearance_count": book_appearance_count,
        "total_questions": len(questions),
        "relevance_scores": book_relevance_scores
    }
    total_appearance_count_colbert_local += book_appearance_count
    total_questions_colbert_local += len(questions)

Processing book: Guardians of Ga'hoole
Processing book: Starshield Sentinels
Processing book: The Lord God Made Them All
Processing book: NYC Ballet Workout
Processing book: Under The Influence
Processing book: A Farewell to Arms
Processing book: Jitterbug Perfume
Processing book: The Deep End of the Ocean
Processing book: From Far Away
Processing book: Shadow Game
Processing book: Garden State
Processing book: River of Shadows
Processing book: Bad Astronomy
Processing book: The Science Book
Processing book: The World, the Text, and the Critic
Processing book: Last Wish
Processing book: The Classic Treasury of Hans Christian Andersen
Processing book: Kiss
Processing book: The Picture of Dorian Gray
Processing book: Magic Bites
Processing book: The Problem of Pain
Processing book: On Becoming a Novelist
Processing book: Autobiography of Thomas Jefferson
Processing book: The Unadulterated Cat
Processing book: The White Man's Burden
Processing book: Fell
Processing book: Good Omens
Proces

In [90]:
# Share of generated questions for which the source book appeared among the
# returned hits, based on the totals of the most recent evaluation run;
# reported as 0 when no question has been evaluated.
overall_percentage_colbert = (
    (total_appearance_count_colbert_local / total_questions_colbert_local) * 100
    if total_questions_colbert_local > 0
    else 0
)

print(f"Overall percentage: {overall_percentage_colbert:.2f}%")

Overall percentage: 43.67%


In [92]:
# NDCG per book over its recorded relevance list (most recent evaluation run),
# then the mean across books.
# NOTE(review): each relevance list holds one 0/1 entry per generated question
# (not per ranked hit), so this measures the ordering of questions - confirm
# that this is the intended metric.
p = 10
ndcg_scores_colbert = {
    book_title: ndcg(book_result["relevance_scores"], p)
    for book_title, book_result in results_colbert_local.items()
}

mean_ndcg_colbert = np.mean(list(ndcg_scores_colbert.values()))
print(f"Mean NDCG@{p} across all books: {mean_ndcg_colbert:.4f}")

Mean NDCG@10 across all books: 0.5757
