In [1]:
import faiss
import numpy as np
import pandas as pd
from scipy import spatial  
from scipy.spatial import distance
import ast  
from sentence_transformers import SentenceTransformer, util
import torch
import pickle



In [2]:
# Sentence-embedding model shared by the search helpers in this notebook;
# they call `model.encode` on the query text before ranking the corpus.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

  return self.fget.__get__(instance, owner)()


## Cosine Similarity

In [3]:
def strings_ranked_by_relatedness(
    query: str,
    stored_sentences,
    stored_embeddings,
    top_k: int = 5,
):
    """Print the stored sentences most similar to ``query`` by cosine similarity.

    Args:
        query: Free-text query; it is embedded with the module-level ``model``.
        stored_sentences: Sequence of corpus texts, looked up by position.
        stored_embeddings: Embeddings of ``stored_sentences`` in the same order
            (assumed to be a tensor/array accepted by ``util.cos_sim`` -- TODO confirm).
        top_k: Maximum number of matches to print. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        None. Each match is printed as ``<sentence> (Score: <cosine>)``.
    """
    # Never request more hits than the corpus holds. With an empty corpus or a
    # non-positive top_k there is nothing to rank, so skip encoding entirely.
    k = min(top_k, len(stored_sentences))
    if k <= 0:
        return

    query_embedding = model.encode(query, convert_to_tensor=True)

    # cos_sim yields one row of scores per query; we have a single query,
    # so row 0 holds the similarity against every stored embedding.
    cos_scores = util.cos_sim(query_embedding, stored_embeddings)[0]
    top_results = torch.topk(cos_scores, k=k)

    # Convert to plain Python floats/ints so the lookup does not depend on
    # the container accepting a 0-d tensor as an index.
    scores = top_results.values.tolist()
    indices = top_results.indices.tolist()
    for score, idx in zip(scores, indices):
        print(stored_sentences[idx], "(Score: {:.4f})".format(score))
    


# Load the cached corpus: a pickled dict with "sentences" and "embeddings" entries.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only open embedding files produced by a trusted pipeline.
with open("../dataset/embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)
    stored_sentences = stored_data["sentences"]
    stored_embeddings = stored_data["embeddings"]


query = "What is the level of agreement between the fully differential calculation in perturbative quantum chromodynamics for the production of massive photon pairs and data from the Fermilab Tevatron, and what predictions are made for more detailed tests with CDF and DO data"

# The cosine-similarity helper prints its own ranking and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None".
strings_ranked_by_relatedness(query, stored_sentences, stored_embeddings)

  A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
more detailed tests with CDF and DO data. Predictions are shown for
distributions of diphoton pairs produced at the energy of the Large Hadron
Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
boson are contrasted with those produced from QCD processes at the LHC, showing
that enhanced sensitivity to the signal can be obtained with judicious
selection of events.
 (Score: 0.4476)


## Semantic Search

In [4]:
def strings_ranked_by_relatedness(
    query: str,
    stored_sentences,
    stored_embeddings,
    top_k: int = 3,
):
    """Return the stored sentences most related to ``query`` via semantic search.

    Args:
        query: Free-text query; it is embedded with the module-level ``model``.
        stored_sentences: Sequence of corpus texts, looked up by ``corpus_id``.
        stored_embeddings: Embeddings of ``stored_sentences`` in the same order
            (assumed to be accepted by ``util.semantic_search`` -- TODO confirm).
        top_k: Maximum number of hits to return. Defaults to 3, the value that
            used to be hard-coded in the ``semantic_search`` call.

    Returns:
        A list of ``{'text': <sentence>, 'score': <similarity>}`` dicts, in the
        order produced by ``util.semantic_search``. Empty when the corpus is
        empty or ``top_k`` is not positive.
    """
    # Clamp to the corpus size (the original computed such a limit but never
    # used it) and skip the model call when nothing can be returned.
    k = min(top_k, len(stored_sentences))
    if k <= 0:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic_search returns one hit list per query; take the one for our
    # single query. Each hit is a dict with 'corpus_id' and 'score'.
    hits = util.semantic_search(query_embedding, stored_embeddings, top_k=k)[0]
    # Kept from the original: echoes the raw hits (ids + scores) to stdout.
    print(hits)

    return [
        {'text': stored_sentences[hit['corpus_id']], 'score': hit['score']}
        for hit in hits
    ]



# Read the pickled corpus from disk; only the deserialization needs the open
# file handle, so the dict is unpacked after the handle is closed.
with open("../dataset/embeddings.pkl", "rb") as fIn:
    stored_data = pickle.load(fIn)

stored_sentences, stored_embeddings = (
    stored_data["sentences"],
    stored_data["embeddings"],
)

query = "What is the level of agreement between the fully differential calculation in perturbative quantum chromodynamics for the production of massive photon pairs and data from the Fermilab Tevatron, and what predictions are made for more detailed tests with CDF and DO data"

# Run the search and show the returned list of {'text', 'score'} records.
print(
    strings_ranked_by_relatedness(
        query=query,
        stored_sentences=stored_sentences,
        stored_embeddings=stored_embeddings,
    )
)

[{'corpus_id': 0, 'score': 0.44760435819625854}, {'corpus_id': 59, 'score': 0.3470006585121155}, {'corpus_id': 24, 'score': 0.33712178468704224}]
[{'text': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted 