# Sentence Similarity

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

import scipy.spatial as ss
from sentence_transformers import SentenceTransformer

from importlib.util import find_spec
# If the project package cannot be found on the current import path, add the
# parent directory to sys.path so the project imports below can resolve
# against a source checkout.
if find_spec("similarity_abstract_search") is None:
    import sys
    sys.path.append('..')
    
from similarity_abstract_search.datasets.dataset import SemanticCorpusDataset
from similarity_abstract_search.models.tfidf_model import TfidfModel  
from similarity_abstract_search import utils



In [41]:
# Build the dataset wrapper.
# NOTE(review): 5000 is presumably the number of papers per batch — confirm against SemanticCorpusDataset.
data = SemanticCorpusDataset(5000)
# Load a single batch as a table; later cells read its `title` and `paperAbstract` columns.
df = data.load_one_batch()
df

Unnamed: 0_level_0,id,paperAbstract,title,citeEmbeddingsID
EmbeddingID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10291659,3b9b9509bb4eea4711515a40a7f210d8561487aa,Anterior segment dysgeneses are developmental ...,The 6p25 deletion syndrome: An update on a rar...,"[12279879, 11377917, 10611622, 4885766, 671395..."
13584557,c4c1d7a8051a5ee1562dc8b5722c3353b8d7e226,"The first synthesis of dolabelide C (1), a cyt...",Total synthesis of dolabelide C: a phosphate-m...,[5007102]
9811857,e300444162aa59fe05c18f99e36db8a1125275c3,BACKGROUND & AIMS: Helicobacter pylori eradica...,Effects of Community Screening for Helicobacte...,"[9936039, 1851207, 446836, 11518128, 9864285, ..."
764869,57c98b205d48d605f17d884bd6abe9a66c846989,BACKGROUND\nMinorities are more prevalent than...,Racial and ethnic disparities in physical abus...,"[6924879, 457180]"
10295489,e316c2902c421370001baf099e439cf68bef62fa,"Today, many tetraplegics benefit from surgical...",New concepts on treatment of the upper limb in...,"[7564871, 10920400, 13471692, 3174646, 10921211]"
...,...,...,...,...
10982771,85b472872422a7aa871c5bb2c07ebbf42c204c30,"In the current study, laser scanning confocal ...",Embryonic corneal epithelial actin alters dist...,"[2037344, 1821756, 9279551, 4924471, 3857404, ..."
8097853,11079171db9679d3dbcd5959fea7bb321a1bfe58,"A newly isolated endo-β-1,4-xylanase (Xyn10E) ...",The family 22 carbohydrate-binding module of b...,"[971168, 924179]"
1837898,9e7b5d40efc6c2762ae45fa294e3efc150a02d9d,Nonobese diabetic (NOD) mice develop spontaneo...,Deviation of pancreas-infiltrating cells to Th...,"[9962269, 8742671, 1200954, 4260073, 10496015,..."
4281586,2dba9b6e4bfa3609d36fda76e780ea47f9779c3d,Abstract This report describes the preparation...,Fluorometric assay of neuraminidase with a sod...,"[8936234, 10497471, 295886, 6024591, 1970914, ..."


In [42]:
# Persist the batch as a JSON list of records one level above the data directory.
# reset_index() moves the index into an ordinary column so it is kept in every record.
# The file can be read back with utils.load_json(str(paper_json_path)).
paper_json_path = data.data_dirname().parent / 'paper_data.json'
papers_as_records = df.reset_index()
papers_as_records.to_json(str(paper_json_path), orient='records')

## Semantic Search

In [2]:
# Load the pretrained 'bert-base-nli-mean-tokens' sentence-embedding model
# (the progress bar below is the model download recorded on this run).
embedder = SentenceTransformer('bert-base-nli-mean-tokens')

100%|██████████| 405M/405M [00:37<00:00, 10.8MB/s]


In [3]:
# Toy corpus of short sentences used to sanity-check the sentence embedder.
corpus = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "The girl is carrying a baby.",
    "A man is riding a horse.",
    "A woman is playing violin.",
    "Two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

In [4]:
# Embed every corpus sentence. The search cell below looks sentences up in
# `corpus` by their position in this result, so the two must stay in the same order.
corpus_embeddings = embedder.encode(corpus)

In [5]:
# Example queries, embedded with the same model as the corpus so the vectors are comparable.
queries = [
    "A man is eating pasta.",
    "Someone in a gorilla costume is playing a set of drums.",
    "A cheetah chases prey on across a field.",
]
query_embeddings = embedder.encode(queries)

In [7]:
# Number of nearest corpus sentences to report for each query.
closest_n = 5

for query, query_embedding in zip(queries, query_embeddings):
    # Cosine distance from this single query to every corpus sentence.
    # cdist returns a 1 x len(corpus) matrix, so take row 0.
    distances = ss.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0]

    # Indices of the closest sentences, nearest first. A stable sort keeps
    # equal distances in corpus order, as sorted() on (index, distance) pairs did.
    nearest = np.argsort(distances, kind="stable")[:closest_n]

    print("\n\n======================\n\n")
    print("Query:", query)
    # Report the configured count rather than a hard-coded "5", so the heading
    # stays correct when closest_n is changed.
    print(f"\nTop {closest_n} most similar sentences in corpus:")

    for idx in nearest:
        # Cosine similarity = 1 - cosine distance.
        print(corpus[idx].strip(), "(Score: %.4f)" % (1 - distances[idx]))





Query: A man is eating pasta.

Top 5 most similar sentences in corpus:
A man is eating a piece of bread. (Score: 0.8480)
A man is eating food. (Score: 0.7759)
Two men pushed carts through the woods. (Score: 0.2095)
A monkey is playing drums. (Score: 0.1945)
A man is riding a white horse on an enclosed ground. (Score: 0.1586)




Query: Someone in a gorilla costume is playing a set of drums.

Top 5 most similar sentences in corpus:
A monkey is playing drums. (Score: 0.7985)
A cheetah is running behind its prey. (Score: 0.2860)
The girl is carrying a baby. (Score: 0.2351)
A man is riding a horse. (Score: 0.2023)
A man is riding a white horse on an enclosed ground. (Score: 0.1963)




Query: A cheetah chases prey on across a field.

Top 5 most similar sentences in corpus:
A cheetah is running behind its prey. (Score: 0.9007)
Two men pushed carts through the woods. (Score: 0.3662)
A monkey is playing drums. (Score: 0.3061)
A man is riding a horse. (Score: 0.2930)
A man is riding a whit

# TF-IDF

In [30]:
# Rebind `corpus` to the paper texts: one string per row of `df`, formatted as
# "<title> . <abstract> ". This replaces the toy sentence list defined earlier.
corpus = [f'{t} . {a} ' for t, a in zip(df.title, df.paperAbstract)]

In [31]:
# Instantiate the project's TF-IDF model with its default constructor arguments.
model = TfidfModel()

In [32]:
%%time
# Fit the model on the paper texts.
# NOTE(review): X is presumably the document-term matrix (one row per text) and
# V the fitted vocabulary/vectorizer — confirm against TfidfModel.fit.
X, V = model.fit(corpus)
X.shape

CPU times: user 22.1 ms, sys: 511 µs, total: 22.6 ms
Wall time: 22 ms


(64, 417)

In [33]:
# NOTE(review): presumably ranks, for each row of X, the other rows by an
# SVM-based similarity — confirm against TfidfModel.svmSimilarity.
IX=model.svmSimilarity(X)

100%|██████████| 64/64 [00:00<00:00, 528.33it/s]


## Search

In [43]:
# All three lookup tables live one level above the dataset's data directory.
json_dir = data.data_dirname().parent

# paper_data.json is written by the export cell earlier in this notebook.
# NOTE(review): search.json and sim_vecs.json are not created in the visible
# cells — presumably produced by a separate step; confirm before a fresh run.
paper_dict = utils.load_json(str(json_dir / 'paper_data.json'))
search_dict = utils.load_json(str(json_dir / 'search.json'))
sim_dict = utils.load_json(str(json_dir / 'sim_vecs.json'))


In [37]:
# Spot-check one record: entries are addressed by position and carry a 'title' key.
paper_dict[3]['title']

'Racial and ethnic disparities in physical abuse reporting and child protective services interventions in the United States.'

In [44]:

# Split the free-text query into whitespace-separated terms.
qparts = 'breast cancer screening'.split()
n = len(paper_dict)

# For each paper position, sum the per-term weights stored in search_dict
# (terms missing from a paper's entry contribute 0).
term_hits = (
    (i, sum(sd.get(q, 0) for q in qparts))
    for i, sd in enumerate(search_dict)
)

# Keep only papers that matched at least one term, and add a positional bonus
# in (0, 1] that favours entries appearing earlier in the list.
scores = [
    (hit + 1.0 * (n - i) / n, paper_dict[i])
    for i, hit in term_hits
    if hit != 0
]

# Highest score first; sorted() is stable, so ties keep their original order.
scores = sorted(scores, key=lambda pair: pair[0], reverse=True)

# Return at most 40 positively-scored papers.
papers = [paper for total, paper in scores if total > 0][:40]

In [46]:
# Inspect the top-ranked paper for the query above.
papers[0]

{'EmbeddingID': 4986424,
 'id': '3ae086a71aba70379989bc7804562517e29343ec',
 'paperAbstract': 'Cancer screening programmes differ throughout the European Union with regard to their content as well as their acceptance by the population. In Germany, mammography is not yet part of the recommended screening programme, although its routine use is recommended by several national and international institutions. We were interested in the present methods of breast cancer detection and the correlation to tumour stage, histology and prognosis. Patients with breast cancer, presenting in our department between January 1990 and December 1994 (1,050 cases), were asked whether the suspicious finding was first detected by themselves, their physician, or in routine mammography. Seventy-two per cent of tumours were detected by patients themselves, 12% by the physician at routine cancer screening or for other reasons, and 16% were found in mammography performed without clinical suspicion of cancer. Tumour

In [53]:
# Spot-check the 'id' field of the first record.
paper_dict[0]['id']

'3b9b9509bb4eea4711515a40a7f210d8561487aa'

In [54]:
# Number of entries loaded from sim_vecs.json.
len(sim_dict)

5000