In [1]:
import matplotlib.pyplot as plt
import numpy as np
from random import random
import time
import pickle

In [2]:
from vespa.package import Document, Field

document = Document(
    fields=[
        Field(name = "id", type = "string", indexing = ["attribute", "summary"]),
        Field(name = "title", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "abstract", type = "string", indexing = ["index", "summary"], index = "enable-bm25"),
        Field(name = "body", type = "string", indexing = ["index", "summary"], index = "enable-bm25")
    ]
)

In [3]:
from vespa.package import Schema, FieldSet, RankProfile

cord_19_schema = Schema(
    name = "cord_19_simple",
    document = document,
    fieldsets = [FieldSet(name = "default", fields = ["title", "body"])],
    rank_profiles = [RankProfile(name = "default", first_phase = "nativeRank(title, body)"),
                    RankProfile(name = "bm25", first_phase = "bm25(title) + bm25(body)")]
)

In [4]:
from vespa.package import ApplicationPackage

app_package = ApplicationPackage(name = "cord_19_simple", schema=cord_19_schema)



In [5]:
from vespa.package import VespaDocker

vespa_docker = VespaDocker()

In [6]:
name = "cord_19_application_simple"

path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\docker_apps\\"


app = vespa_docker.deploy(application_package=app_package, 
                          disk_folder=path + name,
                          container_memory = '8G')

In [7]:
from pandas import read_csv

path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\"
docs = read_csv(path + "crod_19_only_rel.csv")
docs.shape

  interactivity=interactivity, compiler=compiler, result=result)


(37279, 21)

In [8]:
docs.head()

Unnamed: 0.1,Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,...,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id,body_text
0,3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871.0,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,...,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,,from xenopus laevis [16] . eta receptors in no...
1,5,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998.0,green-oa,Nidovirus subgenomic mRNAs contain a leader se...,...,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,document_parses/pdf_json/b2897e1277f56641193a6...,document_parses/pmc_json/PMC125340.xml.json,http://europepmc.org/articles/pmc125340?pdf=re...,,the genetic information of rna viruses is orga...
2,7,8zchiykl,5806726a24dc91de3954001effbdffd7a82d54e2,PMC,The 21st International Symposium on Intensive ...,10.1186/cc1013,PMC137274,11353930.0,no-cc,The 21st International Symposium on Intensive ...,...,"Ball, Jonathan; Venn, Richard",Crit Care,,,,document_parses/pdf_json/5806726a24dc91de39540...,document_parses/pmc_json/PMC137274.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,,this year's symposium was dominated by the res...
3,10,5tkvsudh,9d4e3e8eb092d5ed282d0aa4aadcaa8b7165b5e9,PMC,Conservation of polyamine regulation by transl...,10.1093/emboj/19.8.1907,PMC302018,10775274.0,no-cc,Regulation of ornithine decarboxylase in verte...,...,"Ivanov, Ivaylo P.; Matsufuji, Senya; Murakami,...",EMBO J,,,,document_parses/pdf_json/9d4e3e8eb092d5ed282d0...,document_parses/pmc_json/PMC302018.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,the ef®ciency of +1 ribosomal frameshifting at...
4,12,tvxpckxo,d09b79026117ec9faebba46a8d13aa9b23ec751e,PMC,A Method to Identify p62's UBA Domain Interact...,10.1251/bpo66,PMC302190,14702098.0,no-cc,The UBA domain is a conserved sequence motif a...,...,"Pridgeon, Julia W.; Geetha, Thangiah; Wooten, ...",Biol Proced Online,,,,document_parses/pdf_json/d09b79026117ec9faebba...,document_parses/pmc_json/PMC302190.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,,p62 is a novel cellular protein which was init...


In [9]:
start = time.time()
m = start
i = 0
for idx, row in docs.iterrows():
    if i < 10000:
        if (i % 1000 == 0):
            # Each loop about 10 min or 600 s
            print(i," : " ,idx)
            print("Time:", round(time.time() - m,1))
            m = time.time()
        i += 1
        #print(len(row["title"]), end = "  ")
        response = app.feed_data_point(
            schema = "cord_19_simple",
            data_id = str(row["cord_uid"]),
            fields = {
                "id": str(row["cord_uid"]),
                "title": str(row["title"]),
                "abstract": str(row["abstract"]),
                "body": str(row["body_text"])
            }
        )

0  :  0
Time: 0.1
1000  :  1000
Time: 138.9
2000  :  2000
Time: 168.3
3000  :  3000
Time: 111.8
4000  :  4000
Time: 90.4
5000  :  5000
Time: 25.6
6000  :  6000
Time: 24.8
7000  :  7000
Time: 42.0
8000  :  8000
Time: 29.5
9000  :  9000
Time: 35.7
10000  :  10000
Time: 37.8


KeyboardInterrupt: 

In [None]:
def feed_datapoint(row,i):
    #embedding_title = np.zeros(768)
    #embedding_body = np.zeros(768)
    
    #if type(row["title"]) != float:
        #embedding_title =  model.encode(row["title"].lower()).tolist()
    #else:
    #    embedding_title = [0 for _ in range(768)]
    
    #if type(row["body"]) != float:
    #    embedding_body =  model.encode(row["body"].lower()).tolist()
    #else:
    #    embedding_body = [0 for _ in range(768)]
    #print(len(row["title"]), end = "  ")
    
    print(i)
    response = app.feed_data_point(
        schema = "cord_19_simple",
        data_id = str(row["cord_uid"]),
        fields = {
            "id": str(row["cord_uid"]),
            "title": str(row["title"]),
            "abstract": str(row["abstract"]),
            "body": str(row["body_text"])
        }
    )
    return(i)

In [None]:
labeled_data = [
    {
        'query_id': 1,
        'query': 'coronavirus origin',
        'relevant_docs': [{'id': '005b2j4b', 'score': 2}, {'id': '00fmeepz', 'score': 1}]
    },
    {
        'query_id': 2,
        'query': 'coronavirus response to weather changes',
        'relevant_docs': [{'id': '01goni72', 'score': 2}, {'id': '03h85lvy', 'score': 2}]
    }
]


### Loading labeled data

In [10]:
path = "C:\\Users\\User\\OneDrive - NTNU\\NTNU\\Prosjekt oppgave NLP\\dataset\\CORD-19\\"

labeled_data = []
with open(path + "labeled_data_cord_19.txt", "rb") as fp:   # Unpickling
    labeled_data = pickle.load(fp)


In [11]:
from vespa.query import Query, OR, AND, WeakAnd, ANN, RankProfile as Ranking


results = app.query(
    query="Where is my app",
    query_model = Query(
        match_phase=AND(),
        rank_profile=Ranking(name="default")
    ),
    hits = 10
)

In [12]:
len(results.hits)
for result in results.hits:
    print(result["id"], "  ", result["fields"]["title"])

id:cord_19_simple:cord_19_simple::vy0qgtll    Proteases
id:cord_19_simple:cord_19_simple::gkgb20ed    Analysing the combined health, social and economic impacts of the corovanvirus pandemic using agent-based social simulation
id:cord_19_simple:cord_19_simple::nmiqin3a    COVID-19 Contact Tracing and Privacy: Studying Opinion and Preferences
id:cord_19_simple:cord_19_simple::tpqsjjet    Section II: Poster Sessions
id:cord_19_simple:cord_19_simple::kb9fnbgy    Oral presentations
id:cord_19_simple:cord_19_simple::aw63xinw    GeoWeb and crisis management: issues and perspectives of volunteered geographic information
id:cord_19_simple:cord_19_simple::dwfb81aj    ECR 2012 Book of Abstracts - A - Postergraduate Educational Programme
id:cord_19_simple:cord_19_simple::gu1elx07    Conservation Magazine, Summer 2014
id:cord_19_simple:cord_19_simple::pnqw7g5p    Abstracts of FIS/HIS 2016 – Poster Presentations
id:cord_19_simple:cord_19_simple::bbg11u3w    Aerodrome Security


## Evaluating app

In [13]:
from vespa.evaluation import MatchRatio, Recall, ReciprocalRank, NormalizedDiscountedCumulativeGain

eval_metrics = [
    MatchRatio(),
    Recall(at=10),
    ReciprocalRank(at=10),
    NormalizedDiscountedCumulativeGain(at=10)
]

In [14]:
model = Query(match_phase=OR(),
      rank_profile=Ranking(name="default")
    )

In [15]:

bm25_evaluation = app.evaluate(
        labeled_data = labeled_data,
        eval_metrics = eval_metrics,
        query_model = model,
        id_field = "id",
        hits = 10
    )

In [16]:
bm25_evaluation

Unnamed: 0,query_id,match_ratio_retrieved_docs,match_ratio_docs_available,match_ratio_value,recall_10_value,reciprocal_rank_10_value,ndcg_10_ideal_dcg,ndcg_10_dcg,ndcg_10_value
0,1,3083,9973,0.309135,0.001821,1.0,2.0,0.63093,0.315465
1,2,5612,9973,0.562719,0.006216,1.0,0.0,0.0,0.0
2,3,3718,9973,0.372807,0.001777,1.0,1.63093,1.30103,0.797723
3,4,6985,9973,0.700391,0.001082,0.2,1.0,0.30103,0.30103
4,5,8381,9973,0.840369,0.000589,0.333333,0.0,0.0,0.0
5,6,4620,9973,0.463251,0.003111,1.0,3.13093,2.63093,0.840303
6,7,5898,9973,0.591397,0.003618,1.0,2.0,2.0,1.0
7,8,4654,9973,0.46666,0.00214,1.0,2.63093,2.26186,0.859719
8,9,6868,9973,0.688659,0.005409,1.0,0.0,0.0,0.0
9,10,4236,9973,0.424747,0.007011,1.0,4.26186,3.839989,0.901013
