In [2]:
import requests
from xml.etree import ElementTree

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import UDPOS

import numpy as np

import time
import random
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader


In [26]:
# Initialize the dataset
pub_med = []  # one dict per article; filled later by the EFetch loop
num_articles = 10000

# Use ESearch to get PMIDs (PMIDs are the identifiers of the individual articles)
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
search_params = {
    'db': 'pubmed',
    'term': 'intelligence[Abstract] AND ("2013/01/01"[Date - Publication] : "2023/12/31"[Date - Publication])',
    'retmax': num_articles,  # the API returns only 20 ids by default
    'retmode': 'json',
}

# Send the request to ESearch. The timeout keeps the kernel from hanging forever
# on a dead connection, and raise_for_status() turns an HTTP error into a clear
# exception instead of a confusing JSON/KeyError further down.
search_response = requests.get(esearch_url, params=search_params, timeout=30)
search_response.raise_for_status()

# Parse the body once and reuse it for both the metadata and the id list.
search_data = search_response.json()['esearchresult']
pmids = search_data['idlist']


ConnectionError: HTTPSConnectionPool(host='eutils.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /entrez/eutils/esearch.fcgi?db=pubmed&term=intelligence%5BAbstract%5D+AND+%28%222013%2F01%2F01%22%5BDate+-+Publication%5D+%3A+%222023%2F12%2F31%22%5BDate+-+Publication%5D%29&retmax=10000&retmode=json (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001E57D1A4640>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))

In [27]:
# Report how many articles matched the search, then peek at the first few PMIDs.
total_count = search_data['count']
count_message = f"Total number of articles with 'intelligence' in the abstract published between 2013 to 2023: {total_count}"
print(count_message)

first_pmids = pmids[:10]
print(first_pmids)

Total number of articles with 'intelligence' in the abstract published between 2013 to 2023: 196515
['38197338', '38197310', '38197102', '38197095', '38197072', '38196964', '38196848', '38196835', '38196833', '38196820']


In [29]:
# ESearch only finds identifiers; EFetch returns the article records themselves.
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
batch_size = 100  # PMIDs requested per EFetch call


def _joined_text(elements):
    """Return the stripped text of every element in `elements`, joined by spaces.

    itertext() is used instead of .text so that text nested inside child
    elements is kept. Returns "" when there are no elements or no text.
    """
    parts = ("".join(element.itertext()).strip() for element in elements)
    return " ".join(part for part in parts if part)


def _parse_article(article):
    """Convert one <PubmedArticle> element into a row dict for the DataFrame.

    Missing fields fall back to a placeholder (None for the PMID) instead of
    raising AttributeError. All AbstractText sections are concatenated, so
    multi-section abstracts are not cut off after the first section.
    """
    return {
        'PMID': _joined_text(article.findall(".//PMID")[:1]) or None,
        'Title': _joined_text(article.findall(".//ArticleTitle")[:1]) or "No title available",
        'Abstract': _joined_text(article.findall(".//Abstract/AbstractText")) or "No abstract available",
        'Publication Year': _joined_text(article.findall(".//PubDate/Year")[:1]) or "No publication year",
    }


# Start from an empty list so re-running this cell does not append duplicates.
pub_med.clear()

# Process in batches due to API limits
for i in range(0, len(pmids), batch_size):
    batch_pmids = pmids[i:i + batch_size]
    fetch_params = {
        'db': 'pubmed',
        'id': ','.join(batch_pmids),
        'retmode': 'xml'
    }

    fetch_response = requests.get(efetch_url, params=fetch_params, timeout=60)

    # NCBI asks clients without an API key to stay at or below 3 requests per second.
    time.sleep(0.34)

    if fetch_response.status_code != 200:
        # Keep going (best effort), but make the gap visible instead of silently dropping it.
        print(f"Skipping PMIDs at positions {i}-{i + len(batch_pmids) - 1}: HTTP {fetch_response.status_code}")
        continue

    root = ElementTree.fromstring(fetch_response.content)
    pub_med.extend(_parse_article(article) for article in root.findall(".//PubmedArticle"))

# Each row dict includes the PMID, so it ends up as a column in the DataFrame.
pub_med_df = pd.DataFrame(pub_med)

ConnectionError: HTTPSConnectionPool(host='eutils.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /entrez/eutils/efetch.fcgi?db=pubmed&id=38197338%2C38197310%2C38197102%2C38197095%2C38197072%2C38196964%2C38196848%2C38196835%2C38196833%2C38196820%2C38196818%2C38196774%2C38196696%2C38196681%2C38196661%2C38196652%2C38196551%2C38196510%2C38196504%2C38195235%2C38195113%2C38194819%2C38194762%2C38193887%2C38193037%2C38193015%2C38193004%2C38192969%2C38192845%2C38192839%2C38192792%2C38192785%2C38192775%2C38192752%2C38192743%2C38192695%2C38192693%2C38192682%2C38192604%2C38192596%2C38192576%2C38192556%2C38192545%2C38192523%2C38192509%2C38192486%2C38192482%2C38192469%2C38192466%2C38192464%2C38192462%2C38192459%2C38192454%2C38192453%2C38192451%2C38192447%2C38192379%2C38192376%2C38192329%2C38192191%2C38189746%2C38189729%2C38189543%2C38189204%2C38189200%2C38189042%2C38188937%2C38188914%2C38188871%2C38188835%2C38188729%2C38188728%2C38188638%2C38188637%2C38188635%2C38188614%2C38188591%2C38188353%2C38188296%2C38188204%2C38188203%2C38188107%2C38188058%2C38188047%2C38188038%2C38188035%2C38188029%2C38188024%2C38187958%2C38187912%2C38187847%2C38187658%2C38187648%2C38187591%2C38187430%2C38187396%2C38187334%2C38187332%2C38187331%2C38187262&retmode=xml (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001E57D3989A0>: Failed to establish a new connection: [Errno 11002] getaddrinfo failed'))

In [30]:
# Show the first rows of the dataset, followed by its total size.
preview_rows = pub_med_df.head()
print(preview_rows)

article_count = len(pub_med_df)
print(f"Total articles in the dataset: {article_count}")

                                               Title  \
0  Evaluation of efficacy of artificial intellige...   
1  Network pharmacology: towards the artificial i...   
2  Identifying immunodeficiency status in childre...   
3  Artificial Intelligence in Medicine: A Double-...   
4  Advantages of IMRT optimization with MCO compa...   

                                            Abstract Publication Year  
0  The objective of our study was to build a conv...             2023  
1  Network pharmacology (NP) provides a new metho...             2023  
2  Children with primary immunodeficiency disease...             2023  
3                              No abstract available             2023  
4  Traditional intensity-modulated radiation ther...             2023  
Total articles in the dataset: 9998


## Data processing

## Pipeline
A prompt goes into the UI (Streamlit?). The UI's search function queries Elasticsearch, which returns the relevant results, and these are then displayed in the UI.

In [7]:
# !pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
     ---------------------------------------- 0.0/86.0 kB ? eta -:--:--
     ---------------------------------------- 86.0/86.0 kB 2.4 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torchvision (from sentence-transformers)
  Downloading torchvision-0.16.2-cp38-cp38-win_amd64.whl.metadata (6.6 kB)
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp38-cp38-win_amd64.whl (977 kB)
     ---------------------------------------- 0.0/977.6 kB ? eta -:--:--
     ------------------------------ ------ 809.0/977.6 kB 12.7 MB/s eta 0:00:01
     -------------------------------------- 977.6/977.6 kB 8.9 MB/s eta 0:00:00
Downloading torchvision-0.16.2-cp38-cp38-win_amd64.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ------------------------ -------------

In [8]:
# ! pip install transformers



In [30]:
# ! pip install elasticsearch

In [31]:
# ! pip install opensearch

In [32]:
# !pip install python-dotenv

In [9]:
# Elasticsearch client (to be reachable from the notebook) and the
# sentence-embedding library used to vectorise text.
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer, util

# Name of the pretrained sentence-embedding model to load.
embedding_model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_model_name)



Downloading .gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Get the data from PubMed and process it in pandas -> convert the abstracts into vectors (using BERT?); only the abstracts are converted, not every field -> insert everything into Elasticsearch so it can process queries. Build the UI in Streamlit?

Write a search function in the Streamlit UI:
a keyword or prompt goes into the Streamlit UI and is passed to the search function, which talks to Elasticsearch; Elasticsearch returns the relevant results, and they are shown in the UI.


In [12]:
# Elasticsearch is not working yet, so this is a simple in-memory approach that works without it.
# First build the abstract embeddings: vector representations of the abstracts produced by the model.
# Missing values are replaced with "" so that encode() only ever receives strings,
# while the list stays aligned with the DataFrame rows.
abstracts = pub_med_df['Abstract'].fillna("").astype(str).tolist()

# Encoding the full dataset is slow; the progress bar shows how far the run has got.
abstract_embeddings = model.encode(abstracts, convert_to_tensor=True, show_progress_bar=True)


KeyboardInterrupt: 

In [18]:
# To estimate how long a full run will take, time the encoding of a small sample first.
num_samples = 20
current_num_abstracts = len(pub_med_df)

sample_abstracts = abstracts[:num_samples]

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

sample_embeddings = model.encode(sample_abstracts, convert_to_tensor=True)

end_time = time.perf_counter()

time_taken = end_time - start_time
seconds_per_abstract = time_taken / num_samples

# "all" means every PubMed article matching the search (total_count from ESearch),
# not just the articles currently in the DataFrame.
estimated_time_for_all = seconds_per_abstract * int(total_count)
estimated_time_for_current_amount_abstracts = seconds_per_abstract * current_num_abstracts


It took 6.265285968780518 seconds to complete 20 embeddings.
The estimated time for all abstracts is then 17.100314891040327 hours and for current length of df: 0.8700045710537168 hours


In [22]:

# num_samples is an integer count, so it is printed as-is (":.2f" rendered it as "20.00").
print(f'It took {time_taken:.2f} seconds to complete {num_samples} embeddings.')

# The estimates are held in seconds; dividing by 60 * 60 reports them in hours.
print(f'The estimated time for all abstracts is then {estimated_time_for_all / (60 * 60):.2f} hours and for current length of df: {estimated_time_for_current_amount_abstracts / (60 * 60):.2f} hours')
print("This is assuming we have a good representative sample on article length")

It took 6.27 seconds to complete 20.00 embeddings.
The estimated time for all abstracts is then 17.10 hours and for current length of df: 0.87 hours
This is assuming we have a good representative sample on article length


In [33]:
# Making a rudimentary search function
def find_relevant_articles(question, top_k=5, embeddings=None):
    """Print the articles whose abstract embeddings are most similar to `question`.

    Args:
        question: Free-text query; it is embedded with the same `model` as the abstracts.
        top_k: Maximum number of articles to print. If fewer embeddings are
            available, all of them are printed instead of raising an error.
        embeddings: Tensor of abstract embeddings to search, where row i must
            correspond to row i of `pub_med_df`. Defaults to the global
            `sample_embeddings` (only the first few abstracts).

    Returns:
        None. The ranked results are printed.
    """
    if embeddings is None:
        embeddings = sample_embeddings  # currently only the small timing sample

    # The question is turned into a vector in the same space as the abstracts.
    question_embedding = model.encode(question, convert_to_tensor=True)

    # Cosine similarity lies in [-1, 1]; values closer to 1 mean more similar.
    cos_scores = util.pytorch_cos_sim(question_embedding, embeddings)[0]

    # torch.topk raises if k exceeds the number of scores, so clamp it.
    k = min(top_k, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=k)

    print("Question:", question)
    print("\nTop relevant articles:")
    for score, idx in zip(top_results.values, top_results.indices):
        idx = idx.item()
        row = pub_med_df.iloc[idx]
        print(f"Article Index: {idx}, Title: {row['Title']}, Abstract: {row['Abstract']}, (Score: {score.item():.4f})")


In [34]:

# Try the search function with an example question.
find_relevant_articles("How is covid related to intelligence?")

Question: How is covid related to intelligence?

Top relevant articles:
Article Index: 11, Title: Global research on artificial intelligence in thyroid-associated ophthalmopathy: A bibliometric analysis., Abstract: To provide an overview of global publications on artificial intelligence (AI) in thyroid-associated ophthalmopathy (TAO) through bibliometric analysis., (Score: 0.3076)
Article Index: 13, Title: Impact of Epidemic Intelligence Service Training in Occupational Respiratory Epidemiology., Abstract: The Centers for Disease Control and Prevention's Epidemic Intelligence Service (EIS) is a fellowship in applied epidemiology for physicians, veterinarians, nurses, scientists, and other health professionals. Each EIS fellow is assigned to a position at a federal, state, or local site for 2 years of on-the-job training in outbreak investigation, epidemiologic research, surveillance system evaluation, and scientific communication. Although the original focus of the program on the contr

__Comment:__ As we can see, the cosine similarity is low (the largest score is about 0.3), but this rudimentary version does work. The next objective is to expand the model and the search, implement better search functions, and hopefully get an Elasticsearch server to work.