In [1]:
!pip install python-dotenv
!pip install transformers pymilvus
!pip install -U langchain-community



In [2]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import os
from dotenv import load_dotenv


def get_pubmed_data():
    # Load environment variables from the .env file
    load_dotenv()

    # Access the environment variables
    API_KEY = "781f1d6e9a1ddd33b37d1ef4facf505a7209"
    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    SEARCH_STRATEGY = "brain disease AND 2014:2024[dp] AND Observational Study[pt] AND English[la]"

    # Step 1: Search for articles using ESearch
    search_params = {
        "db": "pubmed",
        "term": SEARCH_STRATEGY,
        "retmax": 10,  # Number of results to retrieve
        "retmode": "json",
        "api_key": API_KEY
    }
    esearch_url = f"{BASE_URL}esearch.fcgi"
    search_response = requests.get(esearch_url, params=search_params)

    # Parse the article IDs
    article_ids = search_response.json()["esearchresult"]["idlist"]
    print("Article IDs:", article_ids)

    # Step 2: Fetch detailed information using EFetch
    efetch_params = {
        "db": "pubmed",
        "id": ",".join(article_ids),
        "retmode": "xml",
        "api_key": API_KEY
    }
    efetch_url = f"{BASE_URL}efetch.fcgi"
    fetch_response = requests.get(efetch_url, params=efetch_params)

    # Parse the XML response
    root = ET.fromstring(fetch_response.content)

    # Prepare a list to store extracted data
    data = []

    # Extract Title, Abstract, DOI, Keywords, and Publication Date
    for article in root.findall(".//PubmedArticle"):
        title = article.find(".//ArticleTitle").text
        abstract = article.find(".//AbstractText")
        doi = None
        for id_elem in article.findall(".//ArticleId"):
            if id_elem.get("IdType") == "doi":
                doi = id_elem.text

        # Extract keywords
        keywords = [kw.text for kw in article.findall(".//Keyword") if kw.text is not None]

        # Extract publication date
        date_elem = article.find(".//DateCompleted")  # Prefer DateCompleted if available
        if date_elem is None:
            date_elem = article.find(".//ArticleDate")  # Use ArticleDate as a fallback

        if date_elem is not None:
            pub_date = "-".join([
                date_elem.find("Year").text,
                date_elem.find("Month").text.zfill(2),  # Ensure two-digit month
                date_elem.find("Day").text.zfill(2)  # Ensure two-digit day
            ])
        else:
            pub_date = None

        # Append the extracted data to the list
        data.append({
            "Title": title,
            "Abstract": abstract.text if abstract is not None else None,
            "DOI": doi,
            "Keywords": ", ".join(keywords) if keywords else None,
            "Publication Date": pub_date
        })

    # Create a DataFrame from the collected data
    df = pd.DataFrame(data)

    return df

In [3]:
df=get_pubmed_data()

Article IDs: ['39599649', '39599625', '39599035', '39596918', '39593101', '39585693', '39584766', '39581701', '39577922', '39574302']


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Title             10 non-null     object
 1   Abstract          9 non-null      object
 2   DOI               10 non-null     object
 3   Keywords          8 non-null      object
 4   Publication Date  10 non-null     object
dtypes: object(5)
memory usage: 528.0+ bytes


In [5]:
df.head(5)

Unnamed: 0,Title,Abstract,DOI,Keywords,Publication Date
0,Anchor-Based and Distributional Responsiveness...,Patients with dementia present with feeding di...,10.3390/nu16223863,"dementia, feeding behaviour, longitudinal stud...",2024-11-27
1,Clinical Phenotypes Associated with the Gut Mi...,Frailty increases the risk of needing nursing ...,10.3390/nu16223839,"clinical phenotype, frailty, microbiome, nursi...",2024-11-27
2,Application of Isokinetic Dynamometry Data in ...,"Three-dimensional gait analysis, supported by ...",10.3390/s24227258,"gait deviation index, isokinetic dynamometry, ...",2024-11-27
3,Characteristics of Inherited Metabolic Disorde...,,10.3390/medicina60111733,"clinical outcomes, inherited metabolic disorde...",2024-11-27
4,Telerehabilitation using a 2-D planar arm reha...,"We evaluated the feasibility, safety, and effi...",10.1177/019791839903300105,"Cost-effectiveness, End effector robot, Roboti...",2024-11-27


In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

def chunking(df=get_pubmed_data(), chunk_size=100, chunk_overlap=20):
    """Splits the abstracts into chunks. The function will return a list of chunks inside a list of abstracts."""

     # Fill missing items with an empty string
    df['Abstract'] = df['Abstract'].fillna('')
    df['Keywords'] = df['Keywords'].fillna('')

    abstracts_list = df['Abstract'].to_list()
    abstracts_chunks = []

    # Iterate through each abstract and corresponding metadata
    for i, abstract in enumerate(abstracts_list):
        # Get metadata from the DataFrame
        title = df.iloc[i]['Title']
        doi = df.iloc[i]['DOI']
        keywords = df.iloc[i]['Keywords']
        publication_date = df.iloc[i]['Publication Date']

        # Create metadata dictionary
        metadata = {
            'Title': title,
            'DOI': doi,
            'Keywords': keywords,
            'Publication Date': publication_date
        }

        # Wrap the abstract in a Document object with the metadata
        document = Document(page_content=abstract, metadata=metadata)

        # Create a RecursiveCharacterTextSplitter instance
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

        # Split the document into chunks
        chunks = text_splitter.split_documents([document])

        # Append the chunks to the list
        abstracts_chunks.append(chunks)

    return abstracts_chunks

# Test
abstracts_chunks=chunking(df=get_pubmed_data(), chunk_size=100, chunk_overlap=20)
abstracts_chunks

Article IDs: ['39599649', '39599625', '39599035', '39596918', '39593101', '39585693', '39584766', '39581701', '39577922', '39574302']
Article IDs: ['39599649', '39599625', '39599035', '39596918', '39593101', '39585693', '39584766', '39581701', '39577922', '39574302']


[[Document(metadata={'Title': 'Anchor-Based and Distributional Responsiveness of the Spanish Version of the Edinburgh Feeding Evaluation in Dementia Scale in Older People with Dementia: A Longitudinal Study.', 'DOI': '10.3390/nu16223863', 'Keywords': 'dementia, feeding behaviour, longitudinal studies, malnutrition, predictive value of tests', 'Publication Date': '2024-11-27'}, page_content='Patients with dementia present with feeding difficulties (FDs) since diagnosis, conditioning their'),
  Document(metadata={'Title': 'Anchor-Based and Distributional Responsiveness of the Spanish Version of the Edinburgh Feeding Evaluation in Dementia Scale in Older People with Dementia: A Longitudinal Study.', 'DOI': '10.3390/nu16223863', 'Keywords': 'dementia, feeding behaviour, longitudinal studies, malnutrition, predictive value of tests', 'Publication Date': '2024-11-27'}, page_content='conditioning their progression. Early identification is vital for preventing deterioration due to'),
  Documen

In [7]:
abstracts_chunks[0][1]

Document(metadata={'Title': 'Anchor-Based and Distributional Responsiveness of the Spanish Version of the Edinburgh Feeding Evaluation in Dementia Scale in Older People with Dementia: A Longitudinal Study.', 'DOI': '10.3390/nu16223863', 'Keywords': 'dementia, feeding behaviour, longitudinal studies, malnutrition, predictive value of tests', 'Publication Date': '2024-11-27'}, page_content='conditioning their progression. Early identification is vital for preventing deterioration due to')

In [8]:
abstracts_chunks[0][2]

Document(metadata={'Title': 'Anchor-Based and Distributional Responsiveness of the Spanish Version of the Edinburgh Feeding Evaluation in Dementia Scale in Older People with Dementia: A Longitudinal Study.', 'DOI': '10.3390/nu16223863', 'Keywords': 'dementia, feeding behaviour, longitudinal studies, malnutrition, predictive value of tests', 'Publication Date': '2024-11-27'}, page_content='due to nutritional problems. The Edinburgh Feeding Evaluation in Dementia Scale (EdFED) identifies')

In [9]:
len(abstracts_chunks)

10

In [10]:
from transformers import AutoTokenizer, AutoModel
from langchain.embeddings import HuggingFaceEmbeddings
import numpy as np

def embed_with_scibert(abstracts_chunks):
    """
    Embeds the `page_content` of abstracts_chunks using SciBERT and returns the embeddings.

    Parameters:
    - abstracts_chunks: List[List[Document]] - A list of lists containing Document objects.

    Returns:
    - embeddings_data: List[dict] - A list of dictionaries containing page_content and their embeddings.
    """
    # Initialize HuggingFaceEmbeddings with SciBERT
    hf_embeddings = HuggingFaceEmbeddings(model_name="allenai/scibert_scivocab_uncased")

    # Prepare a list to store results
    embeddings_data = []

    # Iterate over abstracts_chunks and compute embeddings
    for chunks in abstracts_chunks:
        for chunk in chunks:
            if chunk.page_content:  # Ensure content exists
                # Compute embeddings for the page_content
                embedding = hf_embeddings.embed_query(chunk.page_content)

                # Append to results
                embeddings_data.append({
                    "page_content": chunk.page_content,
                    "embedding": np.array(embedding).tolist(),  # Convert to list for easier handling
                    "metadata": chunk.metadata
                })

    return embeddings_data
embeddings_data=embed_with_scibert(abstracts_chunks)

  hf_embeddings = HuggingFaceEmbeddings(model_name="allenai/scibert_scivocab_uncased")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
len(embeddings_data)

53

In [12]:
embeddings_data[0]['page_content']

'Patients with dementia present with feeding difficulties (FDs) since diagnosis, conditioning their'

In [13]:
max_len = 0  # Initialize max_len to 0

# Iterate through the embeddings_data list
for item in embeddings_data:
    content_len = len(item['page_content'])  # Get the length of the current 'page_content'
    if content_len > max_len:  # Check if the current length is greater than max_len
        max_len = content_len  # Update max_len if the current length is greater

# Print the maximum length
print("Maximum length of 'page_content':", max_len)

Maximum length of 'page_content': 100


In [14]:
type(embeddings_data[0]['embedding'])

list

In [15]:
max_len = 0  # Initialize max_len to 0

# Iterate through the embeddings_data list
for item in embeddings_data:
    content_len = len(item['embedding'])  # Get the length of the current 'page_content'
    if content_len > max_len:  # Check if the current length is greater than max_len
        max_len = content_len  # Update max_len if the current length is greater

# Print the maximum length
print("Maximum length of 'embedding':", max_len)

Maximum length of 'embedding': 768


In [16]:
embeddings_data[0]['metadata']

{'Title': 'Anchor-Based and Distributional Responsiveness of the Spanish Version of the Edinburgh Feeding Evaluation in Dementia Scale in Older People with Dementia: A Longitudinal Study.',
 'DOI': '10.3390/nu16223863',
 'Keywords': 'dementia, feeding behaviour, longitudinal studies, malnutrition, predictive value of tests',
 'Publication Date': '2024-11-27'}

In [17]:
max_len = 0  # Initialize max_len to 0

# Iterate through the embeddings_data list
for item in embeddings_data:
    content_len = len(item['metadata'])  # Get the length of the current 'page_content'
    if content_len > max_len:  # Check if the current length is greater than max_len
        max_len = content_len  # Update max_len if the current length is greater

# Print the maximum length
print("Maximum length of 'metadata':", max_len)

Maximum length of 'metadata': 4


In [18]:
type(embeddings_data[0]['metadata']['Title'])

str

In [19]:
max_len_title = 0  # Initialize max_len_title to 0

# Iterate through the embeddings_data list
for item in embeddings_data:
    title_len = len(item['metadata']['Title'])  # Get the length of the current 'Title'
    if title_len > max_len_title:  # Check if the current length is greater than max_len_title
        max_len_title = title_len  # Update max_len_title if the current length is greater

# Print the maximum length
print("Maximum length of 'Title':", max_len_title)

Maximum length of 'Title': 191


In [20]:
embeddings_data[0]['metadata']['DOI']

'10.3390/nu16223863'

In [21]:
max_len = 0  # Initialize max_len to 0

# Iterate through the embeddings_data list
for item in embeddings_data:
    title_len = len(item['metadata']['DOI'])  # Get the length of the current 'Title'
    if title_len > max_len:  # Check if the current length is greater than max_len_title
        max_len = title_len  # Update max_len_title if the current length is greater

# Print the maximum length
print("Maximum length of 'DOI':", max_len)

Maximum length of 'DOI': 29


In [22]:
embeddings_data[0]['metadata']['Keywords']

'dementia, feeding behaviour, longitudinal studies, malnutrition, predictive value of tests'

In [23]:
max_len = 0  # Initialize max_len to 0

# Iterate through the embeddings_data list
for item in embeddings_data:
    # Check if 'Keywords' exists and is not None before getting its length
    if item['metadata']['Keywords'] is not None:
        title_len = len(item['metadata']['Keywords'])  # Get the length of the current 'Title'
        if title_len > max_len:  # Check if the current length is greater than max_len_title
            max_len = title_len  # Update max_len_title if the current length is greater

# Print the maximum length
print("Maximum length of 'Keywords':", max_len)

Maximum length of 'Keywords': 122


In [24]:
embeddings_data[0]['metadata']['Publication Date']

'2024-11-27'

In [25]:
max_len = 0  # Initialize max_len to 0

# Iterate through the embeddings_data list
for item in embeddings_data:
    # Check if 'Keywords' exists and is not None before getting its length
    if item['metadata']['Publication Date'] is not None:
        title_len = len(item['metadata']['Publication Date'])  # Get the length of the current 'Title'
        if title_len > max_len:  # Check if the current length is greater than max_len_title
            max_len = title_len  # Update max_len_title if the current length is greater

# Print the maximum length
print("Maximum length of 'Publication Date':", max_len)

Maximum length of 'Publication Date': 10


In [26]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
from pymilvus import MilvusClient # Import MilvusClient from pymilvus


def create_milvus():
    client = MilvusClient(uri="./MethodMIND.db")
    # Define schema
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="doi", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="keywords", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="publication_date", dtype=DataType.VARCHAR, max_length=512)
    ]
    schema = CollectionSchema(fields=fields, auto_id=True)

    # Create collection
    collection_name = "MethodVectors"

    client.drop_collection(collection_name=collection_name)

    if collection_name in client.list_collections():
        print(f"Collection {collection_name} already exists...")

    client.create_collection(collection_name=collection_name, schema=schema)

    print(f"Collection {collection_name} created successfully.")

    # Creting the index
    # Set up the index parameters
    index_params = client.prepare_index_params()

    # Add an index on the vector field.
    index_params.add_index(
        field_name="embedding",
        metric_type="COSINE",
        index_type="HNSW",
        index_name="vector_index",
        params={ "nlist": 128 })

    # Create an index file
    client.create_index(
        collection_name=collection_name,
        index_params=index_params,
        sync=True
        # Whether to wait for index creation to complete before returning. Defaults to True.
        )

    print("Index vector created successfully.")
# Call the function
create_milvus()

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 47e8c0030d534875a6c2c42970097e73
DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: MethodVectors


Collection MethodVectors created successfully.


DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: MethodVectors


Index vector created successfully.


In [55]:
def insert_data_milvus(embeddings_data):
    from pymilvus import Collection  # Import Collection
    database_name="MethodMIND"
    client = MilvusClient(uri=f"./{database_name}.db")  # Initialize MilvusClient
    collection_name = "MethodVectors"

    # In pymilvus 2.5.0, you load the collection using the client directly
    collection = client.load_collection(collection_name)  # Load the collection

    # Prepare data for insertion
    embeddings = []
    titles = ''
    dois = ''
    keywords = ''
    publication_dates = ''
    data = {}

    for i in range(len(embeddings_data)):
        embeddings = (embeddings_data[i]["embedding"])
        titles = embeddings_data[i]["metadata"]["Title"] if embeddings_data[i]["metadata"]["Title"] is not None else ''
        dois = embeddings_data[i]["metadata"]["DOI"] if embeddings_data[i]["metadata"]["DOI"] is not None else ''
        keywords = embeddings_data[i]["metadata"]["Keywords"] if embeddings_data[i]["metadata"]["Keywords"] is not None else ''
        publication_dates = embeddings_data[i]["metadata"]["Publication Date"] if embeddings_data[i]["metadata"]["Publication Date"] is not None else ''

        data = {
            "embedding": embeddings,
            "title": titles,
            "doi": dois,
            "keywords": keywords,
            "publication_date": publication_dates}

        client.insert(collection_name=collection_name,
        data=data)

        print(f"Inserted entity {i} into collection {collection_name}")


    # Check if data is successfully inserted.

    row_count = client.get_collection_stats(collection_name=collection_name)['row_count']

    print(f"\n {database_name} database as {row_count} in collection {collection_name}")

    client.flush(collection_name) # Pass collection_name directly as a string
    client.close()
    print("Milvus client connection closed.")

insert_data_milvus(embeddings_data)

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: bf044ecdd7a04f2e9aa86af4718ae604


Inserted entity 0 into collection MethodVectors
Inserted entity 1 into collection MethodVectors
Inserted entity 2 into collection MethodVectors
Inserted entity 3 into collection MethodVectors
Inserted entity 4 into collection MethodVectors
Inserted entity 5 into collection MethodVectors
Inserted entity 6 into collection MethodVectors
Inserted entity 7 into collection MethodVectors
Inserted entity 8 into collection MethodVectors
Inserted entity 9 into collection MethodVectors
Inserted entity 10 into collection MethodVectors
Inserted entity 11 into collection MethodVectors
Inserted entity 12 into collection MethodVectors
Inserted entity 13 into collection MethodVectors
Inserted entity 14 into collection MethodVectors
Inserted entity 15 into collection MethodVectors
Inserted entity 16 into collection MethodVectors
Inserted entity 17 into collection MethodVectors
Inserted entity 18 into collection MethodVectors
Inserted entity 19 into collection MethodVectors
Inserted entity 20 into collec

In [56]:
!ls -lah

total 2.6M
drwxr-xr-x 1 root root 4.0K Nov 28 05:42 .
drwxr-xr-x 1 root root 4.0K Nov 28 03:07 ..
drwxr-xr-x 4 root root 4.0K Nov 25 19:13 .config
-rw-r--r-- 1 root root 2.6M Nov 28 05:42 MethodMIND.db
drwxr-xr-x 1 root root 4.0K Nov 25 19:13 sample_data


In [54]:
!rm .MethodMIND.db.lock