In [1]:
import matplotlib.pyplot as plt
import numpy as np
from random import random
import time
import multiprocessing
from joblib import Parallel, delayed
from tqdm import tqdm

In [2]:
from vespa.package import Document, Field

document = Document(
    fields=[
        Field(name = "id", type = "string", indexing = ["attribute", "summary"]),
        Field(name = "title", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "body", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "title_embedding", type = "tensor<float>(x[768])", indexing = ["attribute", "summary"]),
        Field(name = "body_embedding", type = "tensor<float>(x[768])", indexing = ["attribute", "summary"])
    ]
)

In [3]:
document

Document([Field('id', 'string', ['attribute', 'summary'], None), Field('title', 'string', ['index', 'summary'], 'enable-bm25'), Field('body', 'string', ['index', 'summary'], 'enable-bm25'), Field('title_embedding', 'tensor<float>(x[768])', ['attribute', 'summary'], None), Field('body_embedding', 'tensor<float>(x[768])', ['attribute', 'summary'], None)])

In [4]:
from vespa.package import Schema, FieldSet, RankProfile

msmarco_schema = Schema(
    name = "msmarco",
    document = document,
    fieldsets = [FieldSet(name = "default", fields = ["title", "body"])],
    rank_profiles = [RankProfile(name = "default", first_phase = "nativeRank(title, body)"),
                    RankProfile(name = "bm25", first_phase = "bm25(title) + bm25(body)")]
)

In [5]:
from vespa.package import ApplicationPackage

app_package = ApplicationPackage(name = "msmarco", schema=msmarco_schema)



In [6]:
from vespa.package import VespaCloud

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"
file = "andre.olaisen.tmartins-ntnu.pem"


# App name in Cloud
app_name = "andre-test-loud"

vespa_cloud = VespaCloud(
    tenant="tmartins-ntnu",
    application=app_name,
    key_location=path_key + file,
    application_package=app_package
)

In [7]:
name = "sample_application_MSMARCO"
path = path_key + name
print(path)

C:\Users\User\OneDrive - NTNU\NTNU\Prosjekt oppgave NLP\Cloud_test\sample_application_MSMARCO


In [8]:
name = "sample_application_MSMARCO"

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"


app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key + name
)



Deployment started in run 63 of dev-aws-us-east-1c for tmartins-ntnu.andre-test-loud.andre-olaisen. This may take about 15 minutes the first time.
INFO    [11:59:05]  Deploying platform version 7.319.17 and application version unknown ...
INFO    [11:59:07]  Deployment successful.
INFO    [11:59:07]  Session 5372 for tenant 'tmartins-ntnu' prepared and activated.
INFO    [11:59:07]  ######## Details for all nodes ########
INFO    [11:59:07]  h5250a.dev.aws-us-east-1c.vespa-external.aws.oath.cloud: expected to be UP
INFO    [11:59:07]  --- platform vespa/centos-tenant:7.319.17
INFO    [11:59:07]  --- container on port 4080 has config generation 5371, wanted is 5372
INFO    [11:59:07]  h5251b.dev.aws-us-east-1c.vespa-external.aws.oath.cloud: expected to be UP
INFO    [11:59:07]  --- platform vespa/centos-tenant:7.319.17
INFO    [11:59:07]  --- storagenode on port 19102 has config generation 5372, wanted is 5372
INFO    [11:59:07]  --- searchnode on port 19107 has config generation 5372, 

In [9]:
from pandas import read_csv

docs = read_csv("https://thigm85.github.io/data/msmarco/docs.tsv", sep = "\t")
docs.shape

(996, 3)

In [10]:
# Importing sentence encoding models 
# link https://github.com/UKPLab/sentence-transformers#getting-started
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
docs.head(2)

Unnamed: 0,id,title,body
0,D2185715,What Is an Appropriate Gift for a Bris,Hub Pages Religion and Philosophy Judaism...
1,D2819479,lunge,1lungenoun ˈlənj Popularity Bottom 40 of...


In [12]:
len(docs["body"][1])

4937

In [14]:
def feed_datapoint(row,i):
    embedding_title = np.zeros(768)
    embedding_body = np.zeros(768)
    
    if type(row["title"]) != float:
        embedding_title =  model.encode(row["title"].lower()).tolist()
    else:
        embedding_title = [0 for _ in range(768)]
    
    if type(row["body"]) != float:
        embedding_body =  model.encode(row["body"].lower()).tolist()
    else:
        embedding_body = [0 for _ in range(768)]
    #print(len(row["title"]), end = "  ")
    print(len(embedding_title), len(embedding_body))
    response = app.feed_data_point(
        schema = "msmarco",
        data_id = str(row["id"]),
        fields = {
            "id": str(row["id"]),
            "title": str(row["title"]),
            "body": str(row["body"]),
            "title_embedding": {"x": embedding_title},
            "body_embedding": {"x": embedding_body}
        }
    )
    return(i)

In [15]:

start = time.time()
m = start
i = 0
for idx, row in docs.iterrows():
    if (i >100):
        break
    if (i % 100 == 0):
        print(i)
        print("Time:", round(time.time() - m,1))
        m = time.time()
    if (i % 10 == 0):
        print("=",end="")
    i += 1
    
    feed_datapoint(row,idx)
    
        
    
print("Total_time:", time.time()- start)

0
Time: 0.0
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
=768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
768 768
100
Time: 93.3
=768 768
Total_time: 95.04311347007751


## Feeding data parallel 

In [None]:
# Does not work
"""
s = time.time()
a = Parallel(n_jobs=-1)(delayed(feed_datapoint)(row, idx) for row, idx in docs.iterrows())
time.time() - s
"""

In [16]:
from vespa.query import Query, OR, AND, WeakAnd, ANN, RankProfile as Ranking


results = app.query(
    query="Are the any documents here",
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25")
    ),
    hits = 10
)

In [17]:
print(len(results.hits))
for result in results.hits:
    print(result["fields"]["id"], "  ", result["fields"])


10
D2742696    {'sddocname': 'msmarco', 'documentid': 'id:msmarco:msmarco::D2742696', 'id': 'D2742696', 'title': 'Building Rectangular Prisms Part 1', 'body': ' Like It  1026 likes This is the first part of a two part volume lesson  In this lesson  students will build foundational concepts for volume and and count cubes to find volume  In the second part lesson Building Rectangular Prisms Part 2  attached   students will discover the volume formulas length x width x height and base x height as they build rectangular prisms  They will use the formulas to find volume in real world situations  Subject  s   Mathematics Grade Level  s   5Intended Audience  Educators Suggested Technology  Document Camera  Computers for Students  Internet Connection  LCD Projector Instructional Time  1 Hour  s Resource supports reading in content area  Yes Freely Available  Yes Keywords  volume  cubic units  rectangular prisms Instructional Component Type  s   Lesson Plan  Virtual Manipulative   Formative Ass

In [None]:
### Testing different matching phases

query_text = "What is food?"

results1 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=WeakAnd(hits = 1),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

results2 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

results3 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=AND(),
        rank_profile=Ranking(name="bm25")
    ),
    hits = 10
)

# Very different number of ducuments retrieved.
# 
print(results1.number_documents_retrieved)
print(results2.number_documents_retrieved)
print(results3.number_documents_retrieved)

print("\n")

# Size of the corpus?
print(results1.number_documents_indexed)
print(results2.number_documents_indexed)
print(results3.number_documents_indexed)

#


In [None]:
#Testing WeakAnd
# Can be read about here: https://docs.vespa.ai/documentation/using-wand-with-vespa.html
# How does hits affect the search?
# Is this the target amount of retrived documents?
# Retrived documents seam to increase linearly with hits
query_text = "How too kill the warm black friday mood???"

n = 100

results = []
retrived = np.zeros(n)


for i in range(n):
    results_temp = app.query(
        query=query_text,
        query_model = Query(
            match_phase=WeakAnd(hits = i),
            rank_profile=Ranking(name="default")
        ),
        hits = 1
    )
    results.append(results)
    retrived[i] = results_temp.number_documents_retrieved



In [None]:
plt.plot(retrived)
plt.plot(np.arange(100) + retrived[5])
plt.ylabel('Numbre of documents retrived')
plt.xlabel("WeakAnd(hits = x)")
plt.title("Query:" + query_text)
plt.show()


In [None]:
query_model = Query(
            match_phase=WeakAnd(hits = 10, ),
            rank_profile=Ranking(name="default"))
    




In [None]:
app_package.schema.add_rank_profile(
    RankProfile(name = "body_length", inherits = "default", first_phase = "body_length")
)

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance = 'andre-olaisen',
    disk_folder = path_key 
)




In [None]:
query_text = "Was jesus a socialist"

results_or_default = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="default")
    ),
    hits = 5
)

results_or_bm25 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25")
    ),
    hits = 5
)

print(results_or_default.number_documents_retrieved)
print(results_or_bm25.number_documents_retrieved)

print(query_text)
print("\n")

print("Results: or , deault")
for result in results_or_default.hits:
    print(result['fields']['title'])
    print(result["relevance"])
    
print("\n")
    
print("Results: OR , bm25(title) + bm25(body)")
for result in results_or_bm25.hits:
    print(result['fields']['title'])
    print(result["relevance"])



In [None]:
# add_rank_profile: What does inherits mean? Why is this needed?
# Not able to make bm25 work
app_package.schema.add_rank_profile(
    RankProfile(name = "bm25", inherits = "default", first_phase = "bm25(body)+bm25(title)")
)
app_package.schema.add_rank_profile(
    RankProfile(name = "bm25_title", inherits = "default", first_phase = "bm25(title)")
)
app_package.schema.add_rank_profile(
    RankProfile(name = "bm25_body", inherits = "default", first_phase = "bm25(body)")
)

app_package



In [None]:
# After adding a new RankingProfile the app has to be redeployed

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)

In [None]:
query_text = "Could muhammad take a selfie?"

results_or_bm25 = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25")
    ),
    hits = 10
)


results_or_bm25_title = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25_body")
    ),
    hits = 10
)

results_or_bm25_body = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="bm25_title")
    ),
    hits = 10
)


print("Results: OR , bm25(title)+bm25(body)")
for result in results_or_bm25.hits:
    print(result['fields']['title'])
    print(result["relevance"])
    
print("\n")

print("Results: OR , bm25(title)")
for result in results_or_bm25_body.hits:
    print(result['fields']['title'])
    print(result["relevance"])

print("\n")
print("Results: OR , bm25(body)")
for result in results_or_bm25_title.hits:
    print(result['fields']['title'])
    print(result["relevance"])



In [None]:
app_package.schema.add_rank_profile(
    RankProfile(name = "nativerank_bm25_combo", inherits = "default",
                first_phase = "nativeRank(title,body) + bm25(body)")
)

# After adding a new RankingProfile the app has to be redeployed

path_key = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\Cloud_test\\"

app = vespa_cloud.deploy(
    instance='andre-olaisen',
    disk_folder=path_key
)

In [None]:
query_text = "Could Muhammad take a selfie?"

results_or_native_bm_combo = app.query(
    query=query_text,
    query_model = Query(
        match_phase=OR(),
        rank_profile=Ranking(name="nativerank_bm25_combo")
    ),
    hits = 10
)

print("\n")
print("Results: OR , bm25(body)")
for result in results_or_native_bm_combo.hits:
    print(result['fields']['title'])
    print(result["relevance"])

In [None]:
# Testing ANN 
from vespa.query import Union, WeakAnd, ANN

In [None]:


match_phase = Union(
    WeakAnd(hits = 10),
    ANN(
        doc_vector="title_embedding",
        query_vector=query_text,
        embedding_model=model.encode().tolist(),
        hits = 10,
        label="title"
    )
)

m

rank_profile = Ranking(name="default" ,list_features=True)

query_model = Query(match_phase=match_phase, rank_profile=rank_profile)

results_ANN_bm25 = app.query(
    query=query_text,
    query_model = query_model )

print(results_ANN_bm25.number_documents_retrieved) # = 0
print(results_ANN_bm25.number_documents_indexed)   # = 0



In [None]:
??results_ANN_bm25

In [None]:
match_phase.get_query_properties("dsfsdf")

In [None]:
rank_profile = Ranking(name="default" ,list_features=True)

query_model = Query(match_phase=match_phase, rank_profile=rank_profile)

results_ANN_bm25 = app.query(
    query=query_text,
    query_model = query_model)

print(results_ANN_bm25.number_documents_retrieved)
print(results_ANN_bm25.number_documents_indexed)
print("\n")
print("Results: ANN , bm25")
for result in results_ANN_bm25.hits:
    print(result['fields']['title'])
    print(result["relevance"])

   

In [None]:
import requests, json

labelled_data = json.loads(
    requests.get("https://thigm85.github.io/data/msmarco/query-labels.json").text
)

In [None]:
print(len(labelled_data))

labelled_data[0:4]

In [None]:
default_ranking = Query(
    match_phase=OR(),
    rank_profile=Ranking(name="default")
)

In [None]:
bm25_ranking = Query(
    match_phase=OR(),
    rank_profile=Ranking(name="bm25")
)



In [None]:
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank

eval_metrics = [MatchRatio(), Recall(at = 10), ReciprocalRank(at = 10)]



In [None]:
default_evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=default_ranking,
    id_field="id",
    timeout=5,
    hits=10
)

In [None]:
bm25_evaluation = app.evaluate(
    labelled_data=labelled_data,
    eval_metrics=eval_metrics,
    query_model=bm25_ranking,
    id_field="id",
    timeout=5,
    hits=10
)



In [None]:
from pandas import merge

eval_comparison = merge(
    left=default_evaluation,
    right=bm25_evaluation,
    on="query_id",
    suffixes=('_default', '_bm25')
)
eval_comparison[0:10]



In [None]:
eval_comparison[["match_ratio_value_default", "match_ratio_value_bm25"]].describe().loc[["mean", "std"]]

In [None]:
eval_comparison[["recall_10_value_default", "recall_10_value_bm25"]].describe().loc[["mean", "std"]]

In [None]:
eval_comparison[["reciprocal_rank_10_value_default", "reciprocal_rank_10_value_bm25"]].describe().loc[["mean", "std"]]