In [None]:
import requests
from xml.etree import ElementTree

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import UDPOS

import numpy as np
import math

import time
import random
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader


In [None]:
# Initialize the dataset
pub_med = []
# Upper bound on how many PMIDs to request in one ESearch call (the API default is 20).
# NOTE(review): the recorded run of this notebook returned 9999 IDs for this value —
# confirm the server-side cap against the E-utilities documentation.
num_articles = 10000

# Use ESearch to get PMIDs
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
search_params = {
    'db': 'pubmed',
    'term': 'intelligence[Abstract] AND ("2013/01/01"[Date - Publication] : "2023/12/31"[Date - Publication])',
    'retmax': num_articles,
    'retmode': 'json',
}

# Send the request to ESearch. A timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces HTTP errors here instead of
# as a confusing JSON/KeyError further down.
search_response = requests.get(esearch_url, params=search_params, timeout=30)
search_response.raise_for_status()

# Parse the body once and reuse it for both the summary data and the ID list.
search_data = search_response.json()['esearchresult']
pmids = search_data['idlist']  # PMIDs are the identifiers of the individual articles


In [None]:
# Show how many articles have "intelligence" in the abstract according to
# ESearch, followed by how many PMIDs this response actually contained.
total_count = search_data['count']
print(
    f"Total number of articles with 'intelligence' in the abstract published between 2013 to 2023: {total_count}",
    len(pmids),
    sep="\n",
)

Total number of articles with 'intelligence' in the abstract published between 2013 to 2023: 197402
9999


In [None]:
# eSearch is to find identifiers, eFetch is to find the abstracts
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def _element_text(element, default):
    """Return all text inside ``element``, or ``default`` if it is missing/empty.

    ``element.text`` only holds the text before the first child tag, so it is
    ``None`` or truncated when the content contains inline markup. Joining
    ``itertext()`` collects the text of the element and all its descendants.
    """
    if element is None:
        return default
    text = "".join(element.itertext()).strip()
    return text or default


# Start from an empty list so re-running this cell does not append duplicates.
pub_med = []

# Process in batches due to API limits
for i in range(0, len(pmids), 100):
    batch_pmids = pmids[i:i+100]
    fetch_params = {
        'db': 'pubmed',
        'id': ','.join(batch_pmids),
        'retmode': 'xml'
    }

    fetch_response = requests.get(efetch_url, params=fetch_params, timeout=60)

    # Stay below NCBI's request-rate guideline for clients without an API key.
    # NOTE(review): confirm the current limit in the E-utilities usage guidelines.
    time.sleep(0.34)

    if fetch_response.status_code != 200:
        # Keep the best-effort behaviour (skip the batch) but make the loss visible.
        print(f"Skipping batch starting at index {i}: HTTP {fetch_response.status_code}")
        continue

    root = ElementTree.fromstring(fetch_response.content)
    for article in root.findall(".//PubmedArticle"):
        pmid = article.find(".//PMID").text
        article_title = _element_text(article.find(".//ArticleTitle"), "No title available")

        # An abstract may be split into several AbstractText elements;
        # keep every section instead of only the first one.
        abstract_sections = [
            _element_text(section, "")
            for section in article.findall(".//Abstract/AbstractText")
        ]
        abstract_text = " ".join(part for part in abstract_sections if part) or "No abstract available"

        pub_date = _element_text(article.find(".//PubDate/Year"), "No publication year")
        pub_med.append({'PMID': pmid, 'Title': article_title, 'Abstract': abstract_text, 'Publication Year': pub_date})

# The PMID is stored in every record, so it becomes a column of the DataFrame.
# Convert to DataFrame
pub_med_df = pd.DataFrame(pub_med)

In [None]:
# Print the first rows of the dataset, then the total number of rows collected.
print(
    pub_med_df.head(),
    f"Total articles in the dataset: {len(pub_med_df)}",
    sep="\n",
)

       PMID                                              Title  \
0  38357574  Flavonoids from mulberry leaves inhibit fat pr...   
1  38357464  Association between obesity and age-related ca...   
2  38356684  Advances and Applications of Metal-Organic Fra...   
3  38356654  Using an innovative family-centered evidence t...   
4  38356624  Manipulation of interfacial charge dynamics fo...   

                                            Abstract Publication Year  
0  This study evaluated the effects of flavonoids...             2024  
1  There are inconsistent findings on the associa...             2023  
2  Metal-organic frameworks (MOFs) that are the w...             2024  
3  Most of the disability-related scholarly liter...             2023  
4  Compared to other known materials, metal-organ...             2024  
Total articles in the dataset: 9994


In [None]:
# Persist the collected articles to CSV; the DataFrame index is not written.
pub_med_df.to_csv(path_or_buf='pubmed_articles_first_9999.csv', index=False)

## Data processing

## Pipeline:
A prompt goes into the UI (Streamlit?), whose search function interacts with Elasticsearch; Elasticsearch returns the relevant results, which are then displayed in the UI.

In [None]:
 # Install (or upgrade to the latest) sentence-transformers, which provides the
 # SentenceTransformer class and util module imported in a later cell.
 !pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [None]:
# ! pip install transformers

In [None]:
 # Install the Elasticsearch Python client (imported later as
 # `from elasticsearch import Elasticsearch`).
 ! pip install elasticsearch

Collecting elasticsearch
  Downloading elasticsearch-8.12.0-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.9/431.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting elastic-transport<9,>=8 (from elasticsearch)
  Downloading elastic_transport-8.12.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.12.0 elasticsearch-8.12.0


In [None]:
 # NOTE(review): nothing in the visible cells imports `opensearch`. If the intent
 # was a client for an OpenSearch server, the PyPI package for that is presumably
 # `opensearch-py`, not `opensearch` — confirm before relying on this install.
 ! pip install opensearch

Collecting opensearch
  Downloading opensearch-0.9.2.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: opensearch
  Building wheel for opensearch (setup.py) ... [?25l[?25hdone
  Created wheel for opensearch: filename=opensearch-0.9.2-py3-none-any.whl size=39842 sha256=3534034697fe0cf84763a3396ca41cd7f17e4749be6f975ef262d5102af0dd6e
  Stored in directory: /root/.cache/pip/wheels/83/d7/57/c1c8e01cdae22d9c55b7d0b494de94c668c3cc4cdd10aa1425
Successfully built opensearch
Installing collected packages: opensearch
Successfully installed opensearch-0.9.2


In [None]:
 # Install python-dotenv.
 # NOTE(review): it is not imported in the visible cells; presumably intended for
 # loading connection settings from a .env file — confirm it is still needed.
 !pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
# Elasticsearch client, for talking to a search server from the notebook, plus
# the sentence-transformers pieces used to embed text and compare embeddings.
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer, util

# Load the pretrained sentence-embedding model used by every later cell.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Get data from PubMed and process it in pandas -> then convert it into vectors (using BERT?); do not convert everything, just the abstracts -> insert everything into Elasticsearch to process queries. Make the UI in Streamlit?

Write a search function in the Streamlit UI:
a keyword or prompt goes into the Streamlit UI and on to the search function, which talks to Elasticsearch; Elasticsearch returns the relevant results, which are then shown in the UI.


In [None]:
# Since Elasticsearch is not working yet, build a simple in-memory model that works without it.
# First make abstract embeddings: vector representations of the abstracts, produced by the model.
#
# The 'Abstract' column can contain missing values (for example when the XML
# element had no direct text), and model.encode() expects strings. Replace
# missing values with the same placeholder used when no abstract exists and
# force everything to str before encoding.
abstracts = (
    pub_med_df['Abstract']
    .fillna('No abstract available')
    .astype(str)
    .tolist()
)
abstract_embeddings = model.encode(abstracts, convert_to_tensor=True)


TypeError: 'NoneType' object is not subscriptable

In [None]:
# To estimate how long the full embedding run will take, time a small sample first.
num_samples = 20
current_num_abstracts = len(pub_med_df)

# Take the sample straight from the DataFrame so this cell does not depend on
# the `abstracts` list from another cell (which may be undefined or rebound).
# Missing abstracts are replaced with a placeholder because the encoder expects strings.
sample_abstracts = (
    pub_med_df['Abstract']
    .head(num_samples)
    .fillna('No abstract available')
    .astype(str)
    .tolist()
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring durations.
start_time = time.perf_counter()

sample_embeddings = model.encode(sample_abstracts, convert_to_tensor=True)

end_time = time.perf_counter()

time_taken = end_time - start_time

# Divide by the number of abstracts actually encoded; the DataFrame may hold
# fewer than num_samples rows. max(..., 1) avoids dividing by zero when it is empty.
seconds_per_abstract = time_taken / max(len(sample_abstracts), 1)

# "all" means every PubMed article matching the search, not just those in our DataFrame.
# total_count comes from the ESearch JSON, hence the int() conversion.
estimated_time_for_all = seconds_per_abstract * int(total_count)
estimated_time_for_current_amount_abstracts = seconds_per_abstract * current_num_abstracts


NameError: name 'abstracts' is not defined

In [None]:

# Report the measured sample timing and the extrapolated totals.
# num_samples is an integer count, so it is printed without decimal places.
print(f'It took {time_taken:.2f} seconds to complete {num_samples} embeddings.')


# The estimates are in seconds; dividing by 60 * 60 converts them to hours.
print(f'The estimated time for all abstracts is then {estimated_time_for_all / (60 * 60):.2f} hours and for current length of df: {estimated_time_for_current_amount_abstracts / (60 * 60):.2f} hours')
print("This is assuming we have a good representative sample on article length")

It took 6.27 seconds to complete 20.00 embeddings.
The estimated time for all abstracts is then 17.10 hours and for current length of df: 0.87 hours
This is assuming we have a good representative sample on article length


In [None]:
# Making a rudimentary search function
def find_relevant_articles(question, top_k=5):
    """Print the sample articles whose abstracts are most similar to ``question``.

    The question is embedded with the module-level ``model`` and compared by
    cosine similarity against ``sample_embeddings`` (only the small timing
    sample, so very few articles). Row ``i`` of ``sample_embeddings`` is
    looked up as row ``i`` of ``pub_med_df``.

    Args:
        question: Free-text query.
        top_k: Maximum number of articles to print. It is clamped to the
            number of available embeddings, and values below zero are treated
            as zero, so the call no longer fails when fewer than ``top_k``
            embeddings exist.

    Returns:
        None. The results are printed.
    """
    # The question is turned into a vector in the same space as the abstracts.
    question_embedding = model.encode(question, convert_to_tensor=True)

    # Cosine similarity lies in [-1, 1]; values closer to 1 mean more similar.
    cos_scores = util.pytorch_cos_sim(question_embedding, sample_embeddings)[0]

    # torch.topk raises if k exceeds the number of scores, so clamp it first.
    k = max(0, min(top_k, cos_scores.shape[0]))
    top_results = torch.topk(cos_scores, k=k)

    print("Question:", question)
    print("\nTop relevant articles:")
    for score, idx in zip(top_results.values, top_results.indices):
        position = idx.item()
        row = pub_med_df.iloc[position]
        print(f"Article Index: {position}, Title: {row['Title']}, Abstract: {row['Abstract']}, (Score: {score:.4f})")


In [None]:

# Try the sample-based search with an example question, using the default top_k.
find_relevant_articles("How is covid related to intelligence?")

Question: How is covid related to intelligence?

Top relevant articles:
Article Index: 11, Title: Global research on artificial intelligence in thyroid-associated ophthalmopathy: A bibliometric analysis., Abstract: To provide an overview of global publications on artificial intelligence (AI) in thyroid-associated ophthalmopathy (TAO) through bibliometric analysis., (Score: 0.3076)
Article Index: 13, Title: Impact of Epidemic Intelligence Service Training in Occupational Respiratory Epidemiology., Abstract: The Centers for Disease Control and Prevention's Epidemic Intelligence Service (EIS) is a fellowship in applied epidemiology for physicians, veterinarians, nurses, scientists, and other health professionals. Each EIS fellow is assigned to a position at a federal, state, or local site for 2 years of on-the-job training in outbreak investigation, epidemiologic research, surveillance system evaluation, and scientific communication. Although the original focus of the program on the contr

__Comment:__ As we can see, the cosine similarity is low (the largest score is about 0.3), but this rudimentary version actually works. The next objective is to expand the model and the search, implement better search functions, and hopefully get an Elasticsearch server to work.

## Search Function

In [None]:
# Making a comprehensive search function for the entire dataset
def find_relevant_articles_full(question, top_k=5):
    """Print the articles in the full dataset most similar to ``question``.

    The question is embedded with the module-level ``model`` and compared by
    cosine similarity against ``abstract_embeddings``. Row ``i`` of
    ``abstract_embeddings`` is looked up as row ``i`` of ``pub_med_df``.

    Args:
        question: Free-text query.
        top_k: Maximum number of articles to print. It is clamped to the
            number of available embeddings, and values below zero are treated
            as zero, so the call no longer fails when fewer than ``top_k``
            embeddings exist.

    Returns:
        None. The results are printed.
    """
    # Encode the query to a vector
    question_embedding = model.encode(question, convert_to_tensor=True)

    # Cosine similarity between the query and every abstract embedding.
    cos_scores = util.pytorch_cos_sim(question_embedding, abstract_embeddings)[0]

    # torch.topk raises if k exceeds the number of scores, so clamp it first.
    k = max(0, min(top_k, cos_scores.shape[0]))
    top_results = torch.topk(cos_scores, k=k)

    print("Question:", question)
    print("\nTop relevant articles:")

    for score, idx in zip(top_results.values, top_results.indices):
        position = idx.item()
        row = pub_med_df.iloc[position]
        print(f"Article Index: {position}, Title: {row['Title']}, Abstract: {row['Abstract']}, (Score: {score:.4f})")

# Example query against the embeddings of the full dataset, using the default top_k.
find_relevant_articles_full("What are the latest advances in Alzheimer's research?")


## Embedding

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# A tiny hand-written corpus standing in for real abstracts.
# NOTE: this rebinds `abstracts` and `abstract_embeddings`, which earlier cells
# use for the real PubMed data.
abstracts = [
    "Study on the effect of flavonoids in mulberry leaves on fat processing.",
    "Research on the association between obesity and age-related cataract development.",
    "Advancements in Metal-Organic Frameworks for drug delivery.",
    "Evidence-based techniques in family-centered disability support.",
    "Interfacial charge dynamics in metal-organic frameworks for energy storage solutions.",
]

# Stand-in embeddings: one random 5-dimensional row per abstract, drawn from the
# legacy global NumPy generator after seeding it so the values are repeatable.
np.random.seed(42)
abstract_embeddings = np.random.random_sample(size=(len(abstracts), 5))

# Define a function to simulate encoding a query into an embedding
def encode_query(query):
    """Return a simulated 1x5 query embedding.

    The ``query`` text is not used: the result is a fresh draw from NumPy's
    global random generator, shaped like a single 5-dimensional embedding.
    """
    simulated_shape = (1, 5)
    return np.random.random_sample(size=simulated_shape)

# Define a function to find relevant articles based on cosine similarity
def find_relevant_articles(query, top_k=5):
    """Print the simulated abstracts ranked most similar to ``query``.

    The query is embedded with ``encode_query`` and scored by cosine
    similarity against the module-level ``abstract_embeddings``; the text for
    each hit is read from the module-level ``abstracts`` list.

    Args:
        query: Free-text query.
        top_k: Maximum number of abstracts to print. Values larger than the
            number of abstracts print all of them; zero or negative values
            print none. (A plain ``[-top_k:]`` slice would return every
            abstract for ``top_k=0``.)

    Returns:
        None. The results are printed.
    """
    query_embedding = encode_query(query)
    # cosine_similarity returns a (1, n_abstracts) matrix; keep its only row.
    scores = cosine_similarity(query_embedding, abstract_embeddings)[0]

    # Clamp explicitly instead of relying on negative slicing.
    k = max(0, min(top_k, len(scores)))

    # argsort is ascending, so reverse it and keep the first k (best first).
    top_k_indices = scores.argsort()[::-1][:k]

    print("Question:", query)
    print("\nTop relevant articles:")
    for idx in top_k_indices:
        print(f"Article Index: {idx}, Abstract: {abstracts[idx]}, (Score: {scores[idx]:.4f})")

# Let's test the search function with a sample query.
# NOTE: encode_query returns random vectors and ignores the query text, so the
# ranking printed here does not reflect the meaning of the question.
query = "What are the latest advances in energy storage solutions?"
find_relevant_articles(query, top_k=3)