# Sentence Similarity

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

import scipy.spatial as ss
from sentence_transformers import SentenceTransformer

# Fall back to the parent directory when the package cannot be found on the
# current import path, so the project imports that follow can resolve.
# NOTE(review): presumably the notebook sits one level below the repo root — confirm.
from importlib.util import find_spec
if find_spec("similarity_abstract_search") is None:
    import sys
    sys.path.append('..')
    
from similarity_abstract_search.datasets.dataset import SemanticCorpusDataset  
from similarity_abstract_search import utils



In [2]:
# Build the dataset wrapper and load a single batch. The call is the cell's
# last expression, so its return value is what the notebook renders below.
data = SemanticCorpusDataset()
data.load_one_batch()

Unnamed: 0_level_0,id,paperAbstract,title,citeEmbeddingsID
EmbeddingID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10291659,3b9b9509bb4eea4711515a40a7f210d8561487aa,Anterior segment dysgeneses are developmental ...,The 6p25 deletion syndrome: An update on a rar...,"[12279879, 11377917, 10611622, 4885766, 671395..."
13584557,c4c1d7a8051a5ee1562dc8b5722c3353b8d7e226,"The first synthesis of dolabelide C (1), a cyt...",Total synthesis of dolabelide C: a phosphate-m...,[5007102]
9811857,e300444162aa59fe05c18f99e36db8a1125275c3,BACKGROUND & AIMS: Helicobacter pylori eradica...,Effects of Community Screening for Helicobacte...,"[9936039, 1851207, 446836, 11518128, 9864285, ..."
764869,57c98b205d48d605f17d884bd6abe9a66c846989,BACKGROUND\nMinorities are more prevalent than...,Racial and ethnic disparities in physical abus...,"[6924879, 457180]"
10295489,e316c2902c421370001baf099e439cf68bef62fa,"Today, many tetraplegics benefit from surgical...",New concepts on treatment of the upper limb in...,"[7564871, 10920400, 13471692, 3174646, 10921211]"
...,...,...,...,...
13139466,10e10bb1a46166152458e1959eadab88ab604d3d,'Metadata' has received a fraction of the atte...,Metadata accounts: Achieving data and evidence...,[2442410]
707527,e0ae60948a24dfa0e6296effb7750c00391f7d46,OBJECTIVES\nTo describe average levels of free...,Human energy expenditure in affluent societies...,"[1997359, 11775545, 7111644, 86053, 3490113, 1..."
9324370,99d8a0df8507fc78e3a8cf4cc68bb3134a4d4b20,I N VIVO confocal laser scanning microscopy (C...,The vascular features of psoriatic skin: imagi...,"[11362903, 12617902, 8475976, 12960323, 257405..."
9802191,7983389420ea39ec8dc8440b921319e1d154601e,INTRODUCTION\nCongenital tuberculosis is a rar...,Analysis of 170 cases of congenital TB reporte...,"[3540689, 12109320, 10107597, 983278, 9470101,..."


## Semantic Search

In [2]:
# Sentence-embedding model used to encode both the corpus and the queries.
# The recorded output shows a ~405 MB download when the weights are fetched.
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:37<00:00, 10.8MB/s]


In [3]:
# Toy corpus of short sentences that the queries are ranked against.
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

In [4]:
# Encode every corpus sentence once; the search loop compares each query
# embedding against these.
corpus_embeddings = embedder.encode(corpus)

In [5]:
# Query sentences to look up in the corpus, encoded with the same embedder so
# that query and corpus embeddings are comparable.
queries = [
    "A man is eating pasta.",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah chases prey on across a field.",
]
query_embeddings = embedder.encode(queries)

In [7]:
# Number of best-matching corpus sentences to report per query.
closest_n = 5

for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this query to every corpus sentence (1-D array,
    # one entry per corpus sentence).
    distances = ss.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Corpus indices ordered by ascending distance. The stable sort keeps the
    # original corpus order for tied distances.
    ranked = np.argsort(distances, kind="stable")[:closest_n]

    print("\n\n======================\n\n")
    print("Query:", query)
    # Derive the header from closest_n so it cannot drift from the slice above.
    print("\nTop %d most similar sentences in corpus:" % closest_n)

    for idx in ranked:
        # Report cosine similarity (1 - cosine distance) as the score.
        print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distances[idx]))





Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating a piece of bread. (Score: 0.8480)
A man is eating food. (Score: 0.7759)
Two men pushed carts through the woods. (Score: 0.2095)
A monkey is playing drums. (Score: 0.1945)
A man is riding a white horse on an enclosed ground. (Score: 0.1586)




Query: Someone in a gorilla costume is playing a set of drums.

Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.7985)
A cheetah is running behind its prey. (Score: 0.2860)
The girl is carrying a baby. (Score: 0.2351)
A man is riding a horse. (Score: 0.2023)
A man is riding a white horse on an enclosed ground. (Score: 0.1963)




Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.9007)
Two men pushed carts through the woods. (Score: 0.3662)
A monkey is playing drums. (Score: 0.3061)
A man is riding a horse. (Score: 0.2930)
A man is riding a whit