In [1]:
import matplotlib.pyplot as plt
import numpy as np
from random import random
import time
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
from vespa.package import Document, Field

document = Document(
    fields=[
        Field(name = "id", type = "string", indexing = ["attribute", "summary"]),
        Field(name = "title", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "body", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "body_length", type = "int", indexing = ["attribute", "summary"]),
        Field(name = "title_embedding", type = "tensor<float>(x[768]) ", indexing = ["attribute", "summary"])
    ]
)



In [None]:
document

In [3]:
from vespa.package import Schema, FieldSet, RankProfile

msmarco_schema = Schema(
    name = "msmarco",
    document = document,
    fieldsets = [FieldSet(name = "default", fields = ["title", "body"])],
    rank_profiles = [RankProfile(name = "default", first_phase = "nativeRank(title, body)"),
                    RankProfile(name = "bm25", first_phase = "bm25(title) + bm25(body)")]
)

In [4]:
from vespa.package import ApplicationPackage

app_package = ApplicationPackage(name = "msmarco", schema=msmarco_schema)



In [5]:
from vespa.package import VespaCloud

            #C:\Users\User\OneDrive - NTNU\NTNU\Prosjekt oppgave NLP
path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"
file = "andre.olaisen.tmartins-ntnu.pem"


# App name in Cloud
app_name = "andre-msmarco"
vespa_cloud = VespaCloud(
    tenant="tmartins-ntnu",
    application=app_name,
    key_location=path_key + file,
    application_package=app_package
)

In [6]:
name = "sample_application_MSMARCO"
path = path_key + name
print(path)

C:\Users\User\OneDrive - NTNU\NTNU\Prosjekt oppgave NLP\Cloud_test\sample_application_MSMARCO


In [7]:
name = "sample_application_MSMARCO"

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"


app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)



Deployment started in run 2 of dev-aws-us-east-1c for tmartins-ntnu.andre-msmarco.andre-olaisen. This may take about 15 minutes the first time.
INFO    [14:11:13]  Deploying platform version 7.314.13 and application version unknown ...
INFO    [14:11:15]  Deployment successful.
INFO    [14:11:15]  Session 3931 for tenant 'tmartins-ntnu' prepared and activated.
INFO    [14:11:15]  ######## Details for all nodes ########
INFO    [14:11:15]  h5252d.dev.aws-us-east-1c.vespa-external.aws.oath.cloud: expected to be UP
INFO    [14:11:15]  --- platform vespa/centos-tenant:7.314.13
INFO    [14:11:15]  --- container on port 4080 has config generation 3931, wanted is 3931
INFO    [14:11:15]  h5250c.dev.aws-us-east-1c.vespa-external.aws.oath.cloud: expected to be UP
INFO    [14:11:15]  --- platform vespa/centos-tenant:7.314.13
INFO    [14:11:15]  --- distributor on port 19111 has config generation 3928, wanted is 3931
INFO    [14:11:15]  --- storagenode on port 19102 has config generation 3931, wa

In [8]:
from pandas import read_csv

docs = read_csv("https://thigm85.github.io/data/msmarco/docs.tsv", sep = "\t")
docs.shape

(996, 3)

In [None]:
# Importing sentence encoding models 
# link https://github.com/UKPLab/sentence-transformers#getting-started
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [None]:
docs.head(2)

In [None]:
len(docs["body"][1])

In [9]:
def feed_datapoint(row,i):
    #if type(row["title"]) != float:
    #    embedding =  model.encode(row["title"]).tolist()
    #else:
    #    embedding = [0 for _ in range(768)]
    #print(len(row["title"]), end = "  ")
    response = app.feed_data_point(
        schema = "msmarco",
        data_id = str(row["id"]),
        fields = {
            "id": str(row["id"]),
            "title": str(row["title"]),
            "body": str(row["body"]),
            "body_length": len(row["body"])#,
            #"title_embedding": embedding
        })
    
    return(i)


In [None]:
feed_datapoint(docs.iloc[1,:],1)

In [None]:
n = len(docs["id"])
print(n)
Parallel(n_jobs=num_cores)(delayed(app.feed_data_point)(
        schema = "msmarco",
        data_id = str(row["id"]),
        fields = {
            "id": str(row["id"]),
            "title": str(row["title"]),
            "body": str(row["body"]),
            "body_length": len(row["body"])#,
            #"title_embedding": embedding
        }) for idx, row in docs.iterrows())

In [None]:
import os
os.environ["DATABRICKS_HOST"] = "<YOUR DATABRICKS HOST>"
os.environ["DATABRICKS_TOKEN"] = "<YOUR DATABRICKS TOKEN>"

In [None]:
from joblibspark import register_spark
register_spark()

In [None]:
pip install dill

In [14]:
import dill as pickle

In [15]:
n = len(docs["id"])
print(n)
Parallel(n_jobs=2)(delayed(feed_datapoint)(docs.iloc[i,:],i) for i in range(20))

996


PicklingError: Could not pickle the task to send it to the workers.

In [None]:
start = time.time()
Parallel(n_jobs=-1)(delayed(feed_datapoint)(row) for idx, row in docs.iterrows())
print(time.time() - start)

In [None]:

start = time.time()
m = start
i = 1
for idx, row in docs.iterrows():
    i += 1
    if (i % 100 == 0):
        print(i)
        print("Time:", round(time.time() - m,1))
        m = time.time()
    if type(row["title"]) != float:
        embedding =  model.encode(row["title"]).tolist()
    else:
        embedding = [0 for _ in range(768)]
    #print(len(row["title"]), end = "  ")
    response = app.feed_data_point(
        schema = "msmarco",
        data_id = str(row["id"]),
        fields = {
            "id": str(row["id"]),
            "title": str(row["title"]),
            "body": str(row["body"]),
            "body_length": len(row["body"]),
            "title_embedding": embedding
        }
    )

In [None]:


num_cores = multiprocessing.cpu_count()
print(num_cores)
inputs = myList

processed_list = Parallel(n_jobs=num_cores)(delayed(my_function(i,parameters) 
                                                        for i in inputs)

In [None]:
num_cores = multiprocessing.cpu_count()
print(num_cores)

In [None]:
def numpy_random():
    """Generate a random number using NumPy"""
    np.random.random(10**7)
    return 1

In [None]:
n = 8*50

In [None]:
s = time.time()
a = Parallel(n_jobs=-1)(delayed(numpy_random)() for i in range(n))
time.time() - s

In [None]:
s = time.time()
b = [numpy_random() for i in range(n)]
time.time() - s     

In [None]:
print(a, b)

In [None]:
from vespa.query import Query, OR, AND, WeakAnd, ANN, RankProfile as Ranking


results = app.query(
    query="Where is my app",
    query_model = Query(
        match_phase=AND(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

In [None]:
print(results.request_body)

In [None]:
len(results.hits)
sum = 0
for result in results.hits:
    sum += result["fields"]["body_length"]
    print(result["id"], "  ", result["fields"]["body_length"])
print(sum)

In [None]:
### Testing different matching phases

query_text = "What is food?"

results1 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=WeakAnd(hits = 1),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

results2 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

results3 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=AND(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

# Very different number of ducuments retrieved.
# 
print(results1.number_documents_retrieved)
print(results2.number_documents_retrieved)
print(results3.number_documents_retrieved)

print("\n")

# Size of the corpus?
print(results1.number_documents_indexed)
print(results2.number_documents_indexed)
print(results3.number_documents_indexed)

#


In [None]:
#Testing WeakAnd
# Can be read about here: https://docs.vespa.ai/documentation/using-wand-with-vespa.html
# How does hits affect the search?
# Is this the target amount of retrived documents?
# Retrived documents seam to increase linearly with hits
query_text = "How too kill the warm black friday mood???"

n = 100

results = []
retrived = np.zeros(n)


for i in range(n):
    results_temp = app.query(
        query=query_text,
        query_model = Query(
            match_phase=WeakAnd(hits = i),
            rank_profile=Ranking(name="default")
        ),
        hits = 1
    )
    results.append(results)
    retrived[i] = results_temp.number_documents_retrieved



In [None]:
plt.plot(retrived)
plt.plot(np.arange(100) + retrived[5])
plt.ylabel('Numbre of documents retrived')
plt.xlabel("WeakAnd(hits = x)")
plt.title("Query:" + query_text)
plt.show()


In [None]:
query_model = Query(
            match_phase=WeakAnd(hits = 10, ),
            rank_profile=Ranking(name="default"))
    
query_model.body




In [None]:
app_package.schema.add_rank_profile(
    RankProfile(name = "body_length", inherits = "default", first_phase = "body_length")
)

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance = 'andre-olaisen',
    disk_folder = path_key 
)




In [None]:
query_text = "Was jesus a socialist"

results_or_default = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="default")
    ),
    hits = 5
)

results_or_bm25 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25")
    ),
    hits = 5
)

print(results_or_default.number_documents_retrieved)
print(results_or_bm25.number_documents_retrieved)

print(query_text)
print("\n")

print("Results: or , deault")
for result in results_or_default.hits:
    print(result['fields']['title'])
    print(result["relevance"])
    
print("\n")
    
print("Results: OR , bm25(title) + bm25(body)")
for result in results_or_bm25.hits:
    print(result['fields']['title'])
    print(result["relevance"])



In [None]:
# add_rank_profile: What does inherits mean? Why is this needed?
# Not able to make bm25 work
app_package.schema.add_rank_profile(
    RankProfile(name = "bm25", inherits = "default", first_phase = "bm25(body)+bm25(title)")
)
app_package.schema.add_rank_profile(
    RankProfile(name = "bm25_title", inherits = "default", first_phase = "bm25(title)")
)
app_package.schema.add_rank_profile(
    RankProfile(name = "bm25_body", inherits = "default", first_phase = "bm25(body)")
)

app_package



In [None]:
# After adding a new RankingProfile the app has to be redeployed

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)

In [None]:
query_text = "Could muhammad take a selfie?"

results_or_bm25 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25")
    ),
    hits = 10
)


results_or_bm25_title = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25_body")
    ),
    hits = 10
)

results_or_bm25_body = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25_title")
    ),
    hits = 10
)


print("Results: OR , bm25(title)+bm25(body)")
for result in results_or_bm25.hits:
    print(result['fields']['title'])
    print(result["relevance"])
    
print("\n")

print("Results: OR , bm25(title)")
for result in results_or_bm25_body.hits:
    print(result['fields']['title'])
    print(result["relevance"])

print("\n")
print("Results: OR , bm25(body)")
for result in results_or_bm25_title.hits:
    print(result['fields']['title'])
    print(result["relevance"])



In [None]:
app_package.schema.add_rank_profile(
    RankProfile(name = "nativerank_bm25_combo", inherits = "default",
                first_phase = "nativeRank(title,body) + bm25(body)")
)

# After adding a new RankingProfile the app has to be redeployed

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)

In [None]:
query_text = "Could Muhammad take a selfie?"

results_or_native_bm_combo = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="nativerank_bm25_combo")
    ),
    hits = 10
)

print("\n")
print("Results: OR , bm25(body)")
for result in results_or_native_bm_combo.hits:
    print(result['fields']['title'])
    print(result["relevance"])

In [None]:
# Testing ANN 
from vespa.query import Union, WeakAnd, ANN

In [None]:


match_phase = Union(
    WeakAnd(hits = 10),
    ANN(
        doc_vector="title_embedding",
        query_vector=query_text,
        embedding_model=model.encode().tolist(),
        hits = 10,
        label="title"
    )
)

m

rank_profile = Ranking(name="default" ,list_features=True)

query_model = Query(match_phase=match_phase, rank_profile=rank_profile)

results_ANN_bm25 = app.query(
    query=query_text,
    query_model = query_model )

print(results_ANN_bm25.number_documents_retrieved) # = 0
print(results_ANN_bm25.number_documents_indexed)   # = 0



In [None]:
??results_ANN_bm25

In [None]:
match_phase.get_query_properties("dsfsdf")

In [None]:
rank_profile = Ranking(name="default" ,list_features=True)

query_model = Query(match_phase=match_phase, rank_profile=rank_profile)

results_ANN_bm25 = app.query(
    query=query_text,
    query_model = query_model)

print(results_ANN_bm25.number_documents_retrieved)
print(results_ANN_bm25.number_documents_indexed)
print("\n")
print("Results: ANN , bm25")
for result in results_ANN_bm25.hits:
    print(result['fields']['title'])
    print(result["relevance"])

   

In [None]:
import requests, json

labelled_data = json.loads(
    requests.get("https://thigm85.github.io/data/msmarco/query-labels.json").text
)

In [None]:
print(len(labelled_data))

labelled_data[0:4]

In [None]:
default_ranking = Query(
    match_phase=OR(),
    rank_profile=Ranking(name="default")
)

In [None]:
bm25_ranking = Query(
    match_phase=OR(),
    rank_profile=Ranking(name="bm25")
)



In [None]:
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank

eval_metrics = [MatchRatio(), Recall(at = 10), ReciprocalRank(at = 10)]



In [None]:
default_evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=default_ranking,
    id_field="id",
    timeout=5,
    hits=10
)

In [None]:
bm25_evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=bm25_ranking,
    id_field="id",
    timeout=5,
    hits=10
)



In [None]:
from pandas import merge

eval_comparison = merge(
    left=default_evaluation,
    right=bm25_evaluation,
    on="query_id",
    suffixes=('_default', '_bm25')
)
eval_comparison[0:10]



In [None]:
eval_comparison[["match_ratio_value_default", "match_ratio_value_bm25"]].describe().loc[["mean", "std"]]

In [None]:
eval_comparison[["recall_10_value_default", "recall_10_value_bm25"]].describe().loc[["mean", "std"]]

In [None]:
eval_comparison[["reciprocal_rank_10_value_default", "reciprocal_rank_10_value_bm25"]].describe().loc[["mean", "std"]]