#https://www.kaggle.com/vj1494/covid-19-biobert-semantic-search-engine

In [None]:
# Install the PubMed access libraries imported in the next cell:
# Biopython (Bio.Entrez / Bio.Medline) and metapub.
# The second command upgrades the Biopython that the first one installed.
!pip install biopython
!pip install --upgrade biopython
!pip install metapub

In [None]:
import Bio
import metapub
from metapub import PubMedFetcher
from Bio import Entrez
from Bio import SeqIO
from Bio.Entrez import efetch

In [None]:
from Bio import Medline

In [None]:
# Contact address Biopython attaches to the Entrez requests issued below.
Entrez.email = "a.sakapetis@students.uu.nl"

In [None]:
def load_pmids_list(pmids_file):
    """
    Return a list of pmids from a text file.

    The file is opened at the absolute path ``/{pmids_file}.txt``, so
    ``pmids_file`` is the path from the filesystem root without the leading
    slash and without the ``.txt`` extension (e.g. ``'content/251'`` reads
    ``/content/251.txt``). Surrounding whitespace is stripped from every line
    and blank lines are skipped, so the result never contains empty pmids.
    Args:
        pmids_file: path stem (str) of a utf8 text file with one pmid in each line.
    Returns:
        A list of pmid strings, in file order.
    """
    assert isinstance(pmids_file, str)
    with open(f'/{pmids_file}.txt', 'r', encoding='utf8') as f:
        # A blank or whitespace-only line would otherwise end up as an empty
        # id in the list that is later handed to the PubMed query.
        return [pmid for pmid in (line.strip() for line in f) if pmid]

In [None]:
def get_pubmed_tiabs(pmids_list):
    """
    Return a dictionary of title and abstract of papers by pmid
    in the pmids_list in one query of the pubmed database.

    Records that come back without a 'PMID' field are skipped; a missing
    title ('TI') or abstract ('AB') is stored as an empty string.
    Args:
        pmids_list: a list of at most 100000 pmids to query for the title
            and abstract of papers.
    Returns:
        A dict mapping pmid -> {'title': str, 'abstract': str}. An empty
        pmids_list yields an empty dict without contacting the database.
    """
    assert isinstance(pmids_list, list) and len(pmids_list) <= 100000
    if not pmids_list:
        # Nothing to look up; do not send a request that carries no ids.
        return {}
    handle = Entrez.efetch(
        db="pubmed", id=pmids_list, rettype='medline', retmode='text'
    )
    pubmed_tiabs = {}
    try:
        # Medline.parse reads lazily from the handle, so the handle has to
        # stay open until the loop has consumed every record.
        for record in Medline.parse(handle):
            if 'PMID' not in record:
                continue
            pubmed_tiabs[record['PMID']] = {'title': record.get('TI', ''),
                                            'abstract': record.get('AB', '')}
    finally:
        # Release the network handle even if parsing fails part-way.
        handle.close()
    return pubmed_tiabs

In [None]:
def get_pubmed_tiabs_all(pmids_list, batch_size=10000):
    """
    Acquire title and abstract of papers in a list of pmids by querying the
    pubmed database in batches and return them as a pandas DataFrame.
    Args:
        pmids_list: a list of pmids to query for the title and abstract of papers.
        batch_size: maximum number of pmids sent to get_pubmed_tiabs per
            query (default 10000, the value that used to be hard-coded).
    Returns:
        A DataFrame with the columns 'pmid', 'title' and 'abstract', one row
        per record returned by get_pubmed_tiabs, in batch order.
    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    assert isinstance(pmids_list, list)
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    df_tiabs = {'pmid': [], 'title': [], 'abstract': []}
    for start in range(0, len(pmids_list), batch_size):
        # Slicing past the end of a list is safe, so the final (shorter)
        # batch needs no special case.
        batch = pmids_list[start:start + batch_size]
        for pmid, tiab in get_pubmed_tiabs(batch).items():
            df_tiabs['pmid'].append(pmid)
            df_tiabs['title'].append(tiab['title'])
            df_tiabs['abstract'].append(tiab['abstract'])
    return pd.DataFrame(data=df_tiabs)


In [None]:
import pandas as pd

In [None]:
# Reads /content/251.txt: load_pmids_list prepends '/' and appends '.txt'.
a = load_pmids_list('content/251')

In [None]:
# Single-request fetch of every pmid in a; b maps pmid -> {'title', 'abstract'}.
# NOTE(review): b is not referenced again in the cells visible here.
b = get_pubmed_tiabs(a)

In [None]:
# Batched fetch of the same pmids as a DataFrame (pmid / title / abstract);
# its 'title' column is what gets embedded further down.
data = get_pubmed_tiabs_all(a)

In [None]:
from scipy.spatial.distance import cdist
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
from ipywidgets import interact, widgets # this is what makes the dataframe interactive

In [None]:
# Replace TensorFlow 2.1.0 with the pinned 1.13.1 release and install the
# bert-serving server and client packages.
# NOTE(review): presumably pinned because bert-serving-server needs TF 1.x — confirm.
!pip uninstall tensorflow==2.1.0 --yes
!pip install bert-serving-server
!pip install bert-serving-client
!pip install --upgrade ipykernel
!pip install tensorflow==1.13.1

In [None]:
# Download the BioBERT v1.1 (PubMed) pretrained model archive.
!wget https://github.com/naver/biobert-pretrained/releases/download/v1.1-pubmed/biobert_v1.1_pubmed.tar.gz

In [None]:
# Unpack the archive, then make the extracted directory the working directory
# so the rename commands in the next cell act on the checkpoint files.
!tar xvzf biobert_v1.1_pubmed.tar.gz
%cd biobert_v1.1_pubmed


In [None]:
# Rename the three model.ckpt-1000000.* checkpoint files to bert_model.ckpt.*
# NOTE(review): presumably the file names bert-serving-server looks for — confirm.
!rename 's/model.ckpt-1000000.data-00000-of-00001/bert_model.ckpt.data-00000-of-00001/' *
!rename 's/model.ckpt-1000000.meta/bert_model.ckpt.meta/' *
!rename 's/model.ckpt-1000000.index/bert_model.ckpt.index/' *
# List the directory to check the result of the renames.
!ls #/kaggle/working/biobert_v1.1_pubmed
#!port_num=5555

In [None]:
# Show which TensorFlow version this kernel actually imports, to check it
# against the 1.13.1 pin installed earlier.
import tensorflow as tf
print(tf.__version__)

In [None]:
import pandas as pd
from bert_serving.client import BertClient
import numpy as np
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer

In [None]:
# Build the bert-serving argument namespace and start the server on the
# BioBERT checkpoint directory. The ports here must match the ones the
# BertClient connects to.
# The namespace gets its own name instead of reusing `a`, which holds the
# pmid list loaded earlier; overwriting it would break re-running the
# PubMed fetch cells after this one.
bert_args = get_args_parser().parse_args([
    '-model_dir', '/content/biobert_v1.1_pubmed',
    '-port', '5555',
    '-port_out', '5556',
    '-max_seq_len', 'NONE',
    '-mask_cls_sep',
    '-cpu',
    '-num_worker', '4',
])
server = BertServer(bert_args)
server.start()

In [None]:
# Client for the BertServer cell; the ports match its -port / -port_out values.
bc = BertClient(port=5555, port_out=5556)

In [None]:
#biorx_df.head()

In [None]:
# Use the fetched PubMed DataFrame as the search corpus.
biorx_df = data
# Titles as plain Python strings; this list is what gets embedded, and
# find_similar_articles indexes into it to print the hits.
biorx_lst = biorx_df['title'].astype(str).to_list()
print(biorx_lst)

In [None]:
# Embed every title through the BERT server. find_similar_articles treats
# row i of doc_vecs as the vector for biorx_lst[i].
doc_vecs = bc.encode(biorx_lst)
# Expected to be (number of titles, embedding size) — TODO confirm.
print(doc_vecs.shape)

In [None]:
def find_similar_articles(query, topk):
    """
    Print the topk article titles most similar to a free-text query.

    The query is embedded with the module-level BertClient `bc` and scored
    against every row of the module-level `doc_vecs` by cosine similarity;
    the matching titles are read from `biorx_lst` by row index.
    Args:
        query: text to search for.
        topk: number of best-scoring titles to print.
    Returns:
        None. One line per hit is printed, best first, as '> ' followed by
        the score and the title separated by a tab.
    """
    query_vec = bc.encode([query])[0]
    dots = np.sum(query_vec * doc_vecs, axis=1)
    # Dividing by both norms makes the printed score a true cosine
    # similarity; the query norm is constant across rows, so the ranking is
    # the same as when only the document norm is used.
    norms = np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec)
    # A zero-norm vector would give NaN/inf, which argsort()[::-1] would
    # place at the top of the ranking; score those rows as 0 instead.
    score = np.divide(dots, norms,
                      out=np.zeros(dots.shape, dtype=float),
                      where=norms > 0)
    topk_idx = np.argsort(score)[::-1][:topk]
    for idx in topk_idx:
        print('> %s\t%s' % (score[idx], biorx_lst[idx]))

In [None]:
# Example search: print the 10 titles closest to the query term.
find_similar_articles("chlorophenoxyacetic",10)