# Entrez


## 1. Installing required libraries

In [1]:
!pip install sentencepiece transformers torch biopython cohere annoy pinecone-client -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.6/166.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.5/215.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m5.6 MB

In [None]:
import numpy as np
import pandas as pd
import random
from google.colab import userdata
from pinecone import Pinecone, ServerlessSpec
import cohere
from annoy import AnnoyIndex
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from Bio import Entrez, Medline
from concurrent.futures import ThreadPoolExecutor

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pinecone_api_key = "" #Add your pinecone API key here
cohere_api_key = "" #Add your cohere API key here

## 2. Entrez

I. Loading credentials

II. Searching for articles / research papers based on keywords and publication date

III. Fetching and saving articles from the PubMed database based on the following:

      

* PMID
* Title
* Abstract
* Authors
* Author Affiliations
* Author Keywords
* Publication Title
* Publication Year

IV. Creating a Pandas dataframe to save the results

V. Saving it to a csv file


In [None]:
Entrez.email =  userdata.get('enrez_email')
Entrez.api_key = userdata.get('enrez_key')

In [None]:
#ask user to select earliest publication date
publication_date = '2000/01/01'
retmax = 1000

unique_pmids = set()
keyword_dictionary_pmids = {}

keywords_list = ["diabetes", "cancer"]

for kw in keywords_list:
    search_term = kw
    handle = Entrez.esearch(db="pubmed", term=search_term, mindate=publication_date, retmax=retmax)
    record = Entrez.read(handle)
    handle.close()

    keyword_dictionary_pmids[kw] = record["IdList"]

    # Filter out PMIDs that have been already collected
    new_pmids = set(record["IdList"]) - unique_pmids

    print(f"{len(new_pmids)} new results found for Search '{kw}'")

    # Update the set of unique PMIDs
    unique_pmids.update(new_pmids)

1000 new results found for Search 'diabetes'
972 new results found for Search 'cancer'


*Ask user to "add to basket" which of the result queries they wish to keep*

*Transparent Search*

In [None]:
for keyword, pmid_list in keyword_dictionary_pmids.items():
    print(f"Search: {keyword}, \n PMIDs: {len(pmid_list)}")
    print()

Search: diabetes, 
 PMIDs: 1000

Search: cancer, 
 PMIDs: 1000



*Or they can simply select all*

In [None]:
print(len(unique_pmids))

1972


### For Semantic Analysis we will use all

In [None]:
# Define a function to fetch article data
def fetch_article_data(pmid):
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
        record = Entrez.read(handle)
        handle.close()

        # Extract data as needed
        citation = record['PubmedArticle'][0]['MedlineCitation']
        keyword_list = citation.get('KeywordList', [])
        article = citation['Article']
        title = article.get('ArticleTitle', 'Title not available')

        # Check if the abstract is available
        abstract = None
        if 'Abstract' in article:
            abstract = article['Abstract']['AbstractText']
            abstract = ' '.join(abstract)

        # Extract author names & author affiliations
        author_list = article['AuthorList']
        affiliations = []
        authors = []
        for author in author_list:
            if 'LastName' in author and 'Initials' in author:
                full_name = f"{author['LastName']} {author['Initials']}"
                authors.append(full_name)
            if 'AffiliationInfo' in author:
                affiliation_info = author['AffiliationInfo']
                if len(affiliation_info):
                  affiliation_name_first = affiliation_info[0]
                  affiliations.append(affiliation_name_first.get('Affiliation', "Not available"))

        # Extract author keywords
        keywords = []
        if len(keyword_list):
          for keyword in keyword_list[0]:
            keywords.append(keyword)

        citation = record['PubmedArticle'][0]['MedlineCitation']
        journal_info = article['Journal']
        journal = journal_info.get('Title', 'Journal not available')
        issue = journal_info.get('JournalIssue', 'Journal issue not available')
        pub_date = issue.get('PubDate', 'PubDate not available')
        pub_year = pub_date.get('Year', 'Year not available')

        return {
            "PMID": pmid,
            "Title": title,
            "Abstract": abstract,
            "Authors": ['; '.join(authors)],
            "Author Affiliations": ['; '.join(affiliations)],
            "Author Keywords": [';'.join(keywords)],
            "Publication Title": journal,
            "Publication Year": pub_year,
        }

    except Exception as e:
        print(f"Error retrieving data for PMID {pmid}: {str(e)}")
        return None

# Specify the number of concurrent threads
num_threads = 5

# Use ThreadPoolExecutor for concurrent fetching
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Fetch article data concurrently
    results = list(executor.map(fetch_article_data, unique_pmids))

# Remove None values (articles with errors)
results = [result for result in results if result is not None]

# Create the DataFrame
df = pd.DataFrame(results)

# Print the resulting DataFrame
df.head()

Error retrieving data for PMID 38779496: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38763052: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38776926: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38774394: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38775477: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38757094: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38766432: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38758819: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38758687: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38777442: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38776803: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38764060: HTTP Error 429: Too Many Requests
Error retrieving data for PMID 38757245: HTTP Error 429: Too Many Requests
Error retrieving data for

Unnamed: 0,PMID,Title,Abstract,Authors,Author Affiliations,Author Keywords,Publication Title,Publication Year
0,38780889,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",[Schwieger L; Postlewait LM; Liu Y; Jou S; Yi ...,"[Division of Surgical Oncology, Department of ...",[Breast cancer;Breast pathology;Hormone recept...,Breast cancer research and treatment,2024
1,38776443,Recombinant ADAMTS13: an effective Rescue Ther...,,[Dadoun SE; Adam K; Hensch L; Boyd TK; Ibrahim...,"[Baylor College of Medicine, Houston, Texas, U...",[],Blood advances,2024
2,38778679,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",[Wang XF; Sun YH],"[Department of General Surgery, Zhongshan Hosp...",[],Zhonghua wei chang wai ke za zhi = Chinese jou...,2024
3,38779178,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,[Lin X; Chen Z; Zhao Q; Zhou X],"[School of Nursing, Guangdong Pharmaceutical U...",[Benefit finding;Cancer care quality;Cross-sec...,Asia-Pacific journal of oncology nursing,2024
4,38776638,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,[Dashevsky BZ; Muneer MS; Hao M; Liang T; Wapn...,"[Department of Radiology, Stanford University ...",[breast biopsy;breast localization;breast ultr...,Journal of breast imaging,2024


In [None]:
len(df)

1590

In [None]:
df.head()

Unnamed: 0,PMID,Title,Abstract,Authors,Author Affiliations,Author Keywords,Publication Title,Publication Year
0,38780889,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",[Schwieger L; Postlewait LM; Liu Y; Jou S; Yi ...,"[Division of Surgical Oncology, Department of ...",[Breast cancer;Breast pathology;Hormone recept...,Breast cancer research and treatment,2024
1,38776443,Recombinant ADAMTS13: an effective Rescue Ther...,,[Dadoun SE; Adam K; Hensch L; Boyd TK; Ibrahim...,"[Baylor College of Medicine, Houston, Texas, U...",[],Blood advances,2024
2,38778679,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",[Wang XF; Sun YH],"[Department of General Surgery, Zhongshan Hosp...",[],Zhonghua wei chang wai ke za zhi = Chinese jou...,2024
3,38779178,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,[Lin X; Chen Z; Zhao Q; Zhou X],"[School of Nursing, Guangdong Pharmaceutical U...",[Benefit finding;Cancer care quality;Cross-sec...,Asia-Pacific journal of oncology nursing,2024
4,38776638,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,[Dashevsky BZ; Muneer MS; Hao M; Liang T; Wapn...,"[Department of Radiology, Stanford University ...",[breast biopsy;breast localization;breast ultr...,Journal of breast imaging,2024


In [None]:
df.to_csv('entrez_data.csv')

## 3. Restructuring and cleaning our data

In [None]:
df = pd.read_csv("entrez_data.csv")

In [None]:
df

Unnamed: 0.1,Unnamed: 0,PMID,Title,Abstract,Authors,Author Affiliations,Author Keywords,Publication Title,Publication Year
0,0,38780889,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",['Schwieger L; Postlewait LM; Liu Y; Jou S; Yi...,"['Division of Surgical Oncology, Department of...",['Breast cancer;Breast pathology;Hormone recep...,Breast cancer research and treatment,2024
1,1,38776443,Recombinant ADAMTS13: an effective Rescue Ther...,,['Dadoun SE; Adam K; Hensch L; Boyd TK; Ibrahi...,"[""Baylor College of Medicine, Houston, Texas, ...",[''],Blood advances,2024
2,2,38778679,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",['Wang XF; Sun YH'],"['Department of General Surgery, Zhongshan Hos...",[''],Zhonghua wei chang wai ke za zhi = Chinese jou...,2024
3,3,38779178,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,['Lin X; Chen Z; Zhao Q; Zhou X'],"['School of Nursing, Guangdong Pharmaceutical ...",['Benefit finding;Cancer care quality;Cross-se...,Asia-Pacific journal of oncology nursing,2024
4,4,38776638,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,['Dashevsky BZ; Muneer MS; Hao M; Liang T; Wap...,"['Department of Radiology, Stanford University...",['breast biopsy;breast localization;breast ult...,Journal of breast imaging,2024
...,...,...,...,...,...,...,...,...,...
1585,1585,38778887,Complex Regional Pain Syndrome in Cancer Cases...,Complex regional pain syndrome (CRPS) is a dis...,['Thanaboriboon C; Matos Macêdo MC; Perez J'],"['Cancer Pain Clinic, Departments of Anesthesi...",['CRPS;cancer;causalgia'],International medical case reports journal,2024
1586,1586,38775859,Mapping the Single-cell Differentiation Landsc...,The genetic intratumoral heterogeneity observe...,['Truong DD; Weistuch C; Murgas KA; Admane P; ...,['The University of Texas MD Anderson Cancer C...,[''],Clinical cancer research : an official journal...,2024
1587,1587,38774165,Poor Glycemic Control Affecting Screening of P...,Introduction Diabetes and cancer are commonly ...,['Bharti A; Shekhar R; Prakash P; Kumari S; Ku...,"['Biochemistry, Indira Gandhi Institute of Med...",['carcinoma prostrate;diabetes mellitus;glycat...,Cureus,2024
1588,1588,38776225,The C-terminal disordered loop domain of Apc8 ...,The anaphase-promoting complex/cyclosome (APC/...,['Darling S; Fujimitsu K; Chia KH; Zou J; Rapp...,"['Cell Cycle Control Group, University College...",['APC/C;CDK;CP: Molecular biology;Cdc20;Cks;ce...,Cell reports,2024


In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,PMID,Title,Abstract,Authors,Author Affiliations,Author Keywords,Publication Title,Publication Year
0,0,38780889,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",['Schwieger L; Postlewait LM; Liu Y; Jou S; Yi...,"['Division of Surgical Oncology, Department of...",['Breast cancer;Breast pathology;Hormone recep...,Breast cancer research and treatment,2024
1,1,38776443,Recombinant ADAMTS13: an effective Rescue Ther...,,['Dadoun SE; Adam K; Hensch L; Boyd TK; Ibrahi...,"[""Baylor College of Medicine, Houston, Texas, ...",[''],Blood advances,2024
2,2,38778679,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",['Wang XF; Sun YH'],"['Department of General Surgery, Zhongshan Hos...",[''],Zhonghua wei chang wai ke za zhi = Chinese jou...,2024
3,3,38779178,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,['Lin X; Chen Z; Zhao Q; Zhou X'],"['School of Nursing, Guangdong Pharmaceutical ...",['Benefit finding;Cancer care quality;Cross-se...,Asia-Pacific journal of oncology nursing,2024
4,4,38776638,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,['Dashevsky BZ; Muneer MS; Hao M; Liang T; Wap...,"['Department of Radiology, Stanford University...",['breast biopsy;breast localization;breast ult...,Journal of breast imaging,2024


In [None]:
df.tail()

Unnamed: 0.1,Unnamed: 0,PMID,Title,Abstract,Authors,Author Affiliations,Author Keywords,Publication Title,Publication Year
1585,1585,38778887,Complex Regional Pain Syndrome in Cancer Cases...,Complex regional pain syndrome (CRPS) is a dis...,['Thanaboriboon C; Matos Macêdo MC; Perez J'],"['Cancer Pain Clinic, Departments of Anesthesi...",['CRPS;cancer;causalgia'],International medical case reports journal,2024
1586,1586,38775859,Mapping the Single-cell Differentiation Landsc...,The genetic intratumoral heterogeneity observe...,['Truong DD; Weistuch C; Murgas KA; Admane P; ...,['The University of Texas MD Anderson Cancer C...,[''],Clinical cancer research : an official journal...,2024
1587,1587,38774165,Poor Glycemic Control Affecting Screening of P...,Introduction Diabetes and cancer are commonly ...,['Bharti A; Shekhar R; Prakash P; Kumari S; Ku...,"['Biochemistry, Indira Gandhi Institute of Med...",['carcinoma prostrate;diabetes mellitus;glycat...,Cureus,2024
1588,1588,38776225,The C-terminal disordered loop domain of Apc8 ...,The anaphase-promoting complex/cyclosome (APC/...,['Darling S; Fujimitsu K; Chia KH; Zou J; Rapp...,"['Cell Cycle Control Group, University College...",['APC/C;CDK;CP: Molecular biology;Cdc20;Cks;ce...,Cell reports,2024
1589,1589,38772586,Breaking down barriers to bariatric care: a qu...,Telemedicine is becoming an increasingly feasi...,['Poljo A; Tynes DM; Timper K; Süsstrunk J; Kr...,"['Department of Visceral Surgery, Clarunis, Un...",['bariatric surgery;health informatics;qualita...,BMJ open,2024


In [None]:
df["Publication Year"].min()

'2023'

In [None]:
df["Publication Year"] = df["Publication Year"].str.replace('Year not available', '2024')

In [None]:
df["Publication Year"].max()

'2024'

In [None]:
column_list = [column for column in df.columns]
print(column_list)

['Unnamed: 0', 'PMID', 'Title', 'Abstract', 'Authors', 'Author Affiliations', 'Author Keywords', 'Publication Title', 'Publication Year']


In [None]:
for column in df.columns:
    if column in ["PMID", "Author Affiliations", "Publication Title", "Author Keywords"]:
        df.pop(column)

In [None]:
df.dropna(subset=['Abstract'])

Unnamed: 0.1,Unnamed: 0,Title,Abstract,Authors,Publication Year
0,0,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",['Schwieger L; Postlewait LM; Liu Y; Jou S; Yi...,2024
2,2,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",['Wang XF; Sun YH'],2024
3,3,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,['Lin X; Chen Z; Zhao Q; Zhou X'],2024
4,4,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,['Dashevsky BZ; Muneer MS; Hao M; Liang T; Wap...,2024
5,5,Factors Associated with Patient Education in P...,Patient education in chronic obstructive pulmo...,['Lindh A; Giezeman M; Theander K; Zakrisson A...,2024
...,...,...,...,...,...
1585,1585,Complex Regional Pain Syndrome in Cancer Cases...,Complex regional pain syndrome (CRPS) is a dis...,['Thanaboriboon C; Matos Macêdo MC; Perez J'],2024
1586,1586,Mapping the Single-cell Differentiation Landsc...,The genetic intratumoral heterogeneity observe...,['Truong DD; Weistuch C; Murgas KA; Admane P; ...,2024
1587,1587,Poor Glycemic Control Affecting Screening of P...,Introduction Diabetes and cancer are commonly ...,['Bharti A; Shekhar R; Prakash P; Kumari S; Ku...,2024
1588,1588,The C-terminal disordered loop domain of Apc8 ...,The anaphase-promoting complex/cyclosome (APC/...,['Darling S; Fujimitsu K; Chia KH; Zou J; Rapp...,2024


In [None]:
df.to_csv("cleaned_entrez.csv", index = False)

## 4. Summarization (based on Transformers and Tokenizers)


In [None]:
df = pd.read_csv("cleaned_entrez.csv")

In [None]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cpu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
for abstract in df['Abstract']:
    print("-----------")
    text = str(abstract)
    preprocessed_text = text.strip().replace('\n','')
    t5_input_text = 'summarize: ' + preprocessed_text
    tokenized_text = tokenizer.encode(t5_input_text, return_tensors='pt', max_length=300).to(device)
    summary_ids = model.generate(tokenized_text, min_length=30, max_length=300)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print("Abstract: ",abstract,"\n", "Summary: ", summary)

-----------
Abstract:  In metastatic breast cancer, differences in expression patterns of estrogen receptor (ER), progesterone receptor (PR), and human epidermal growth factor receptor-2 (HER2) between the primary tumor (PT) and metastatic site (MET) have been reported. However, there is limited understanding of the relationship of tumor subtype discordance and overall survival (OS). We evaluated patterns of ER/PR/HER2 in PTs and corresponding METs and assessed the relationship between these patterns and OS. Patients diagnosed at our center with metastatic breast cancer (2011-2020) were included. ER/PR were stratified as < 1%/1-10%/ > 10% by immunohistochemistry and HER2 as positive/negative by immunohistochemistry/FISH. Tumor subtypes were classified as ER or PR + /HER2-, HER2+ , or triple-negative. Biomarker discordance data from PTs to METs were analyzed for expression patterns. OS was assessed. Of 254 patients, 41 (16.1%) had synchronous and 213 (83.9%) had metachronous METs. Categ

KeyboardInterrupt: 

In [None]:
count = 0
# Define the summarization function
def summarize_text(text):
    global count
    count +=1
    if count == 2:
      print("Its working")
    if count%100 == 0:
      print(f"{count} summarized")

    # Check if the text is NaN (float) or None
    if pd.isna(text) or text is None:
        return ""  # Return an empty string for missing or None values
    preprocessed_text = str(text).strip().replace('\n', '')  # Convert to string and preprocess
    t5_input_text = 'summarize: ' + preprocessed_text
    tokenized_text = tokenizer.encode(
        t5_input_text,
        return_tensors='pt',
        max_length=350,
        truncation=True  # Explicitly enable truncation
    ).to(device)
    summary_ids = model.generate(tokenized_text, min_length=30, max_length=350)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

# Apply the summarization function to the "Abstract" column
df['Summarized'] = df['Abstract'].apply(summarize_text)

df.head()

Its working
100 summarized
200 summarized
300 summarized
400 summarized
500 summarized
600 summarized
700 summarized
800 summarized
900 summarized
1000 summarized
1100 summarized
1200 summarized
1300 summarized
1400 summarized
1500 summarized


Unnamed: 0,Title,Abstract,Authors,Publication Year,Summarized
0,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",['Schwieger L; Postlewait LM; Liu Y; Jou S; Yi...,2024,differences in expression patterns between pri...
1,Recombinant ADAMTS13: an effective Rescue Ther...,,['Dadoun SE; Adam K; Hensch L; Boyd TK; Ibrahi...,2024,
2,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",['Wang XF; Sun YH'],2024,EOGC has seen a gradually increasing incidence...
3,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,['Lin X; Chen Z; Zhao Q; Zhou X'],2024,study aimed to explore benefit finding profile...
4,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,['Dashevsky BZ; Muneer MS; Hao M; Liang T; Wap...,2024,"biopsy examinations reported the term ""SCOUT"" ..."


In [None]:
df = df.replace(to_replace="", value=np.nan).dropna()

In [None]:
df = pd.read_csv("summarized_entrez.csv")
df.pop('Unnamed: 0')

0          0
1          1
2          2
3          3
4          4
        ... 
1585    1585
1586    1586
1587    1587
1588    1588
1589    1589
Name: Unnamed: 0, Length: 1590, dtype: int64

In [None]:
df

Unnamed: 0,Title,Abstract,Authors,Publication Year,Summarized
0,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",['Schwieger L; Postlewait LM; Liu Y; Jou S; Yi...,2024,differences in expression patterns between pri...
2,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",['Wang XF; Sun YH'],2024,EOGC has seen a gradually increasing incidence...
3,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,['Lin X; Chen Z; Zhao Q; Zhou X'],2024,study aimed to explore benefit finding profile...
4,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,['Dashevsky BZ; Muneer MS; Hao M; Liang T; Wap...,2024,"biopsy examinations reported the term ""SCOUT"" ..."
5,Factors Associated with Patient Education in P...,Patient education in chronic obstructive pulmo...,['Lindh A; Giezeman M; Theander K; Zakrisson A...,2024,"a nationwide study identified 29,692 COPD pati..."
...,...,...,...,...,...
1585,Complex Regional Pain Syndrome in Cancer Cases...,Complex regional pain syndrome (CRPS) is a dis...,['Thanaboriboon C; Matos Macêdo MC; Perez J'],2024,complex regional pain syndrome (CRPS) is a dis...
1586,Mapping the Single-cell Differentiation Landsc...,The genetic intratumoral heterogeneity observe...,['Truong DD; Weistuch C; Murgas KA; Admane P; ...,2024,a 'roadmap' of a human mesenchymal differentia...
1587,Poor Glycemic Control Affecting Screening of P...,Introduction Diabetes and cancer are commonly ...,['Bharti A; Shekhar R; Prakash P; Kumari S; Ku...,2024,the link between diabetes and cancer is common...
1588,The C-terminal disordered loop domain of Apc8 ...,The anaphase-promoting complex/cyclosome (APC/...,['Darling S; Fujimitsu K; Chia KH; Zou J; Rapp...,2024,anaphase-promoting complex/cyclosome (APC/C) i...


In [None]:
df.to_csv("new_entrez.csv", index = False)

## 5. Cohere Embedding, Queries, and KNN

In [None]:
raw_df = pd.read_csv("/content/drive/MyDrive/entrez/new_entrez.csv")
print(raw_df.shape)
hq_df = raw_df
hq_df.head()

(1461, 5)


Unnamed: 0,Title,Abstract,Authors,Publication Year,Summary
0,Changes in expression of breast cancer tumor b...,"In metastatic breast cancer, differences in ex...",['Schwieger L; Postlewait LM; Liu Y; Jou S; Yi...,2024,differences in expression patterns between pri...
1,"[Trends, challenges, and reflections on early-...","Early onset gastric cancer (EOGC), as a distin...",['Wang XF; Sun YH'],2024,EOGC has seen a gradually increasing incidence...
2,Benefit-finding profiles and comparison of car...,This study aimed to explore the benefit findin...,['Lin X; Chen Z; Zhao Q; Zhou X'],2024,study aimed to explore benefit finding profile...
3,SCOUT® Radar Localization at Time of Breast Bi...,Evaluate surgical utilization of SCOUT reflect...,['Dashevsky BZ; Muneer MS; Hao M; Liang T; Wap...,2024,"biopsy examinations reported the term ""SCOUT"" ..."
4,Factors Associated with Patient Education in P...,Patient education in chronic obstructive pulmo...,['Lindh A; Giezeman M; Theander K; Zakrisson A...,2024,"a nationwide study identified 29,692 COPD pati..."


In [None]:
hq_df.shape

(1461, 5)

In [None]:
df = hq_df

In [None]:
# Print a few sample documents
texts = hq_df["Summary"].tolist()

random.seed(100)
for item in random.sample(texts, 3):
  print(item)

nationwide cohort and mendelian randomisation analysis investigated link. a nationwide matched cohort used data from the Swedish ESPRESSO cohort. the association between persistent villous atrophy and type 2 diabetes remains undetermined.
AMP facilitated wound healing in vivo. AMP inhibited pro-inflammatory factor secretions and inflammasome pathway.
study aimed to explore association between SHR and all-cause mortality in the community-dwelling population. a total of 18 480 participants were included out of 82 091 from the NHANES 1999-2014 survey.


In [None]:
co = cohere.Client(cohere_api_key)

### Creating and Storing Document Embeddings

In [None]:
response = co.embed(texts=texts, model='multilingual-22-12').embeddings
embeds = np.array(response)
# Embed the documents and store in index
search_index = AnnoyIndex(embeds.shape[1], 'angular')
# Add all the vectors to the search index
for i in range(len(embeds)):
    search_index.add_item(i, embeds[i])

search_index.build(100) # 10 trees
search_index.save('entrez_index.ann')

True

### Defining Example Queries

In [None]:
queries = ["what are the symptoms of diabetes in females?"]

queries_lang = ["English"]

### Returning results similar to given Queries (KNN)

In [None]:
results_list = []

for idx, q in enumerate(queries):

    # Retrieve the nearest neighbors
    query_embed = co.embed(texts=[q], model='multilingual-22-12').embeddings
    similar_item_ids, _ = search_index.get_nns_by_vector(query_embed[0], 5, include_distances=True)

    if len(similar_item_ids) >= 2:
        # Format the results
        results = pd.DataFrame(data={'Title': df.iloc[similar_item_ids[0]]['Title'],
                                     'Abstract': df.iloc[similar_item_ids[0]]['Abstract'],
                                     'Authors': df.iloc[similar_item_ids[0]]['Authors'],
                                     'Publication Year': df.iloc[similar_item_ids[0]]['Publication Year']}, index=[idx])

        results_list.append(results)
        print(f"Query:'{q}'\nNearest neighbors:")
        print(queries_lang[idx])
        print(results)
        print("\n")
    else:
        print("Not enough similar items found for query:", q)


Query:'what are the symptoms of diabetes in females?'
Nearest neighbors:
English
                                               Title  \
0  Diabetes Distress Among the Roma Population Fr...   

                                            Abstract  \
0  Background Distress in patients with diabetes ...   

                                             Authors  Publication Year  
0  ['Cosoreanu A; Rusu E; Mihai DA; Rusu F; Pante...              2024  




## 6. Pinecone Index and Final Results


### Initialize the Pinecone Index

In [None]:
pc = Pinecone(pinecone_api_key)

In [None]:
index_name = 'entrez'

# if the index does not exist, we create it
if index_name not in pc.list_indexes().names():
   pc.create_index(
    index_name,
    dimension=shape[1], # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
    )

# connect to index
index = pc.Index(index_name)

In [None]:
shape = np.array(embeds).shape
shape

(1461, 768)

### Upsert the data into the Pinecone index

In [None]:
batch_size = 128

ids = [str(i) for i in range(shape[0])]
# create list of metadata dictionaries
meta = [{'Title': title, 'Abstract': abstract, 'Authors': authors, 'Publication Year': publication_year} for title, abstract, authors, publication_year in zip(df['Title'], df['Abstract'], df['Authors'], df['Publication Year'])]

# create list of (id, vector, metadata) tuples to be upserted
to_upsert = list(zip(ids, embeds, meta))

for i in range(0, shape[0], batch_size):
    i_end = min(i+batch_size, shape[0])
    index.upsert(vectors=to_upsert[i:i_end])

# let's view the index statistics
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1792}},
 'total_vector_count': 1792}

### Return the results most similar to the query from the Pinecone index

In [None]:
query = "Are women with diabetes at higher risk for heart disease?"

# create the query embedding
xq = co.embed(
    texts=[query],
    model='multilingual-22-12',
    truncate='NONE'
).embeddings

print(np.array(xq).shape)

# query, returning the top 10 most similar results
res = index.query(vector=xq, top_k=10, include_metadata=True)
res

(1, 768)


{'matches': [{'id': '362',
              'metadata': {'Abstract': 'Type 2 diabetes mellitus (T2DM) is a '
                                       'complex health issue include obesity, '
                                       'high cholesterol, high blood pressure, '
                                       'and chronic inflammation that increase '
                                       'the risk of cardiovascular diseases '
                                       '(CVDs). CVDs are of great concern in '
                                       'the disease progression and prognosis '
                                       'of T2DM. This review is a '
                                       'comprehensive examination of the '
                                       'literature on the relationship between '
                                       'T2DM and cardiovascular risk, '
                                       'nutrition-related cardiometabolic risk '
                                      

In [None]:
for match in res['matches']:
    print(f"{match['score']:.2f}: {match['metadata']['Title']}")
    print(f"{match['score']:.2f}: {match['metadata']['Abstract']}")
    print(f"{match['score']:.2f}: {match['metadata']['Authors']}")

0.92: Dietary Modulations in Preventing Cardiometabolic Risk in Individuals with Type 2 Diabetes.
0.92: Type 2 diabetes mellitus (T2DM) is a complex health issue include obesity, high cholesterol, high blood pressure, and chronic inflammation that increase the risk of cardiovascular diseases (CVDs). CVDs are of great concern in the disease progression and prognosis of T2DM. This review is a comprehensive examination of the literature on the relationship between T2DM and cardiovascular risk, nutrition-related cardiometabolic risk (CMR) factors, and impact of dietary modulations on CMR. In recent years the researches has been focus on the importance of a comprehensive treatment approach like dietary modulations to address multiple cardiovascular risk reductions, including hypertension and dyslipidemia. Modulation of dietary patterns are the most promising interventions to prevent CMR factors and T2DM via affecting the body weight, glucose control, and microbial diversity of individuals. 