In [4]:
import requests
from xml.etree import ElementTree

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import UDPOS

import numpy as np

import time
import random
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

In [11]:
# Initialize the dataset
pub_med = []
num_articles = 10000

# Use ESearch to get PMIDs
esearch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
search_params = {
    'db': 'pubmed',
    'term': 'intelligence[Abstract] AND ("2013/01/01"[Date - Publication] : "2023/12/31"[Date - Publication])',
    'retmax': num_articles, # is by default 20
    'retmode': 'json',

}

# send request to eSearch, and process response
search_response = requests.get(esearch_url, params=search_params)
search_data = search_response.json()['esearchresult']
pmids = search_response.json()['esearchresult']['idlist'] # pmid are the identifyers of the different articles


In [19]:
# how many articles with "intelligence in the abstract
total_count = search_data['count']  # Fetch the total count
print(f"Total number of articles with 'intelligence' in the abstract published between 2013 to 2023: {total_count}")
print(pmids[0:10])

Total number of articles with 'intelligence' in the abstract published between 2013 to 2023: 196486
['38193887', '38193037', '38193015', '38193004', '38192969', '38192845', '38192839', '38192792', '38192785', '38192775']


In [22]:
# eSearch is to find identifiers, eFetch is to find the abstracts
efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# Process in batches due to API limits
for i in range(0, len(pmids), 100):
    batch_pmids = pmids[i:i+100]
    fetch_params = {
        'db': 'pubmed',
        'id': ','.join(batch_pmids),
        'retmode': 'xml'
    }

    fetch_response = requests.get(efetch_url, params=fetch_params)

    if fetch_response.status_code == 200:
        root = ElementTree.fromstring(fetch_response.content)
        for article in root.findall(".//PubmedArticle"):
            article_title = article.find(".//ArticleTitle").text
            abstract_text = article.find(".//Abstract/AbstractText").text if article.find(".//Abstract/AbstractText") is not None else "No abstract available"
            pub_date = article.find(".//PubDate/Year").text if article.find(".//PubDate/Year") is not None else "No publication year"
            pub_med.append({'Title': article_title, 'Abstract': abstract_text, 'Publication Year': pub_date})

# Convert to DataFrame
pub_med_df = pd.DataFrame(pub_med)

In [23]:
# Display the first few rows of the DataFrame
print(pub_med_df.head())
print(f"Total articles in the dataset: {len(pub_med_df)}")

                                               Title  \
0  Using machine learning in the prediction of sy...   
1  Assessing the impact of transfusion thresholds...   
2  Mutual mate guarding with limited sexual confl...   
3  Effect of neoadjuvant endocrine therapy on the...   
4  Revolutionizing Breast Healthcare: Harnessing ...   

                                            Abstract Publication Year  
0  Venous thromboembolism (VTE) is a major cause ...             2023  
1  Sepsis is a severe condition that often leads ...             2023  
2  Mate guarding is typically considered a male s...             2024  
3  Neoadjuvant endocrine therapy (NET) of prostat...             2023  
4  Breast cancer has the highest incidence and se...             2023  
Total articles in the dataset: 9998


## Data processing

##Pipeline:
Promt goes into UI (streamlit?), which has a search function which interacts with ElasticSearch, which then gives relevant results, which are then displayed in the UI

In [None]:
# !pip install -U sentence-transformers

In [None]:
# ! pip install transformers

In [30]:
# ! pip install elasticsearch

In [31]:
# ! pip install opensearch

In [32]:
# !pip install python-dotenv

In [34]:
# Want to access elastic search from notebook:
from elasticsearch import Elasticsearch

In [35]:
es = Elasticsearch(

)

ValueError: Either 'hosts' or 'cloud_id' must be specified

Get data from Pub_Med, process it in pandas -> Then convert into vectors (using BERT?), not convert everything, just abstract -> insert everything into elasticSearch to process queries. Make UI in streamlit?

Write a search function in streamlit UI,
So for a key word or prompt, it goes into streamlit UI, into search function which talks to elasticsearch, which gives relevant results and shows them in the UI.
