In [1]:
# Inputs
# Free-text research question. Later cells derive search keywords from it and
# embed it verbatim in the LLM prompt.

question = """
In adult patients presenting to the emergency department with suspected sepsis, what is the reported 
impact of using point-of-care procalcitonin (PCT) testing, compared to standard central laboratory PCT testing, 
on time to antibiotic administration and key clinical outcomes such as in-hospital mortality and length of stay?
"""

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources used by the tokenizer, stop-word filter and lemmatizer.
for _nltk_package in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(_nltk_package)

# Shared by keyword_generator_stopwords: English stop-word list and WordNet lemmatizer.
sw_nltk = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def keyword_generator_stopwords(question):
    """Reduce a question to a space-separated string of lemmatized keywords.

    The text is lower-cased and tokenized with NLTK. Every character that is
    neither a word character nor whitespace is stripped from each token, tokens
    left empty are dropped, tokens found in ``sw_nltk`` are removed, and the
    survivors are lemmatized with the shared ``lemmatizer``.

    Args:
        question: The free-text question to process.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    lemmas = []
    for token in nltk.word_tokenize(question.lower()):
        cleaned = re.sub(r'[^\w\s]', '', token)
        # Skip tokens that were pure punctuation as well as stop words.
        if not cleaned or cleaned in sw_nltk:
            continue
        lemmas.append(lemmatizer.lemmatize(cleaned))
    return ' '.join(lemmas)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tanmayshubhgarg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tanmayshubhgarg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tanmayshubhgarg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tanmayshubhgarg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Keyword form of the question: stop words removed, remaining tokens lemmatized.
without_stopwords = keyword_generator_stopwords(question)

In [4]:
# Instruction prompt asking the LLM to turn the question into one arXiv boolean
# search string. Both the raw question and its stop-word-stripped form are
# interpolated into the text.
prompt = f"""
You are an expert AI assistant that generates a search query for the arXiv database. Your goal is to convert a user's question into a syntactically perfect and logically optimal search query string.

First, analyze the user's question to identify the core concepts, keywords, and important relationships.

Then, construct a single search query string based on those concepts. The query string **must** adhere to the following strict formatting rules:

1.  **Uppercase Operators**: All boolean operators **must** be in uppercase: `AND`, `OR`, `ANDNOT`.
2.  **Quote All Phrases**: Any search term containing more than one word **must** be enclosed in double quotes (`"`). For example, search for `"point-of-care testing"`, not `point-of-care testing`.
3.  **Group with Parentheses**: Use parentheses `()` to logically group related concepts, especially when mixing `AND` and `OR`.
4.  **Combine Synonyms & Acronyms**: Use the `OR` operator inside parentheses to search for multiple variations of a single concept (e.g., synonyms, acronyms, or different spellings). For example: `("procalcitonin" OR "pct")`.
5.  **Connect Core Concepts with AND**: Link the main, distinct ideas of the search together using the `AND` operator to ensure all concepts are present in the results.

The user's question is: "{question}"
The question with stopwords removed is: "{without_stopwords}"

Now, generate the single, optimal search query string. Your output must be **only the search term string itself**, with no additional explanation, preamble, or text.
"""



In [5]:
# Show the fully rendered prompt so it can be inspected before it is sent to the model.
print(prompt)


You are an expert AI assistant that generates a search query for the arXiv database. Your goal is to convert a user's question into a syntactically perfect and logically optimal search query string.

First, analyze the user's question to identify the core concepts, keywords, and important relationships.

Then, construct a single search query string based on those concepts. The query string **must** adhere to the following strict formatting rules:

1.  **Uppercase Operators**: All boolean operators **must** be in uppercase: `AND`, `OR`, `ANDNOT`.
2.  **Quote All Phrases**: Any search term containing more than one word **must** be enclosed in double quotes (`"`). For example, search for `"point-of-care testing"`, not `point-of-care testing`.
3.  **Group with Parentheses**: Use parentheses `()` to logically group related concepts, especially when mixing `AND` and `OR`.
4.  **Combine Synonyms & Acronyms**: Use the `OR` operator inside parentheses to search for multiple variations of a sin

In [6]:
import os
import dotenv
dotenv.load_dotenv()
from openai import AzureOpenAI

# Azure OpenAI connection settings. The API key is read from the environment
# (after dotenv.load_dotenv() has run) rather than being written in the notebook.
api_version = "2024-12-01-preview"
endpoint = "https://aoai-camp.openai.azure.com/"
deployment = "abbott_researcher"
model_name = "gpt-4o-mini"
subscription_key = os.getenv("AZURE_OPEN_AI_KEY")

# Client used by the chat-completion cell.
client = AzureOpenAI(azure_endpoint=endpoint, api_key=subscription_key, api_version=api_version)

In [7]:
# Send the prompt as a single user message to the configured deployment and
# keep the text of the first returned choice.
_request_messages = [{"role": "user", "content": prompt}]
chat_completion = client.chat.completions.create(model=deployment, messages=_request_messages)
reply = chat_completion.choices[0].message.content
print(f"ChatGPT: {reply}")

ChatGPT: ("adult patients" OR "adults") AND ("emergency department" OR "emergency room") AND "suspected sepsis" AND ("point-of-care" OR "POC") AND ("procalcitonin" OR "pct") AND ("central laboratory" OR "standard laboratory") AND ("time to antibiotic administration" OR "time until antibiotic administration") AND ("in-hospital mortality" OR "mortality") AND ("length of stay" OR "hospital stay")


In [8]:
# Use the LLM-generated boolean query as the search keyword.
llm_keywords = keyword = reply


In [9]:
# Alternative keyword source: rebinding `keyword` here replaces whatever value it held before.
keyword = without_stopwords

In [25]:
# Settings for the arXiv request. Note that this assignment replaces any
# `keyword` value produced by earlier cells with a fixed phrase.
keyword = "diagnostic device"
num_articles = 100  # sent as max_results in the query URL
encodingmethod = "utf-8"  # encoding passed to urllib.parse.quote
errortype = "strict"  # error handler passed to urllib.parse.quote

In [26]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

# Percent-encode the search phrase so it can be embedded in the query string.
encoded_search_term = urllib.parse.quote(keyword, encoding=encodingmethod, errors=errortype)
url = f'http://export.arxiv.org/api/query?search_query=all:{encoded_search_term}&start=0&max_results={num_articles}'

print(f"Searching for '{keyword}' on arXiv...")
print(f"URL: {url}")

try:
    # The context manager closes the connection even if reading fails, and the
    # timeout keeps the cell from hanging on an unresponsive server.
    with urllib.request.urlopen(url, timeout=30) as response:
        raw_feed = response.read()

    try:
        url_read = raw_feed.decode("utf-8")
    except UnicodeDecodeError:
        # Decode the bytes already downloaded leniently instead of issuing a
        # second request for the same URL.
        url_read = raw_feed.decode("utf-8", errors="ignore")

    parse_xml = ET.fromstring(url_read)
    print("Successfully retrieved search results!")
except Exception as e:
    # Report the failure in the cell output, then re-raise so later cells do
    # not run against a missing or stale `parse_xml`.
    print(f"Error retrieving data: {e}")
    raise

Searching for 'diagnostic device' on arXiv...
URL: http://export.arxiv.org/api/query?search_query=all:diagnostic%20device&start=0&max_results=100
Successfully retrieved search results!
Successfully retrieved search results!


In [27]:
# Atom namespace used when looking up elements of the feed.
ns = {"ns": "http://www.w3.org/2005/Atom"}
entries = parse_xml.findall('ns:entry', ns)


def _element_text(element):
    """Return the stripped text of ``element``, or None when it is missing or empty."""
    if element is None or element.text is None:
        return None
    return element.text.strip() or None


articles_data = []
for entry in entries:
    link = entry.find('ns:link[@type="application/pdf"]', ns)
    # Entries without a PDF link cannot be downloaded later, so skip them.
    if link is None or "href" not in link.attrib:
        continue
    pdf_url = link.attrib['href']

    # Titles in the feed can contain hard line wraps; collapse all whitespace
    # runs so the title is a single line.
    title_raw = _element_text(entry.find('ns:title', ns))
    title_text = " ".join(title_raw.split()) if title_raw else "Unknown Title"

    # Drop author elements that carry no text instead of storing None.
    author_names = [
        name
        for name in (_element_text(author) for author in entry.findall('ns:author/ns:name', ns))
        if name
    ] or ["Unknown Author"]

    # Keep only the first ten characters of the timestamp (the date part).
    published_raw = _element_text(entry.find('ns:published', ns))
    published_date = published_raw[:10] if published_raw else "Unknown Date"

    summary_text = _element_text(entry.find('ns:summary', ns)) or "No summary available"

    metadata = {
        'title': title_text,
        'authors': author_names,
        'published': published_date,
        'summary': summary_text
    }

    articles_data.append({
        'pdf_url': pdf_url,
        'metadata': metadata
    })

print(f"Found {len(articles_data)} articles with PDF links")
for i, article in enumerate(articles_data):
    print(f"{i+1}. {article['metadata']['title'][:80]}...")

Found 100 articles with PDF links
1. An Electrochemical Potentiostat Interface for Mobile Devices: Enabling
  Remote ...
2. Practical Statistical Considerations for the Clinical Validation of
  AI/ML-enab...
3. Cross-device Federated Learning for Mobile Health Diagnostics: A First
  Study o...
4. Random Forests for Industrial Device Functioning Diagnostics Using
  Wireless Se...
5. D-Mag: a laboratory for studying plasma physics and diagnostics in
  strong magn...
6. Noninvasive Acute Compartment Syndrome Diagnosis Using Random Forest
  Machine L...
7. Diagnostic criterion for crystallized beams...
8. Plasma diagnostics using digital holographic interferometry...
9. Bioimpedance a Diagnostic Tool for Tobacco Induced Oral Lesions: a Mixed
  Model...
10. Active Sampling for MRI-based Sequential Decision Making...
11. Conceptual Study of a Collective Thomson Scattering Diagnostic for SPARC...
12. Integrated Data Analysis and Validation...
13. Special behavior of alkali beam emission spect

In [29]:
from sentence_transformers import SentenceTransformer
import faiss

# Load the sentence-embedding model first so the index width can be taken from
# it instead of being hard-coded; 768 (the previous constant) is the fallback
# if the model does not report a dimension.
model = SentenceTransformer('pritamdeka/S-BioBert-snli-multinli-stsb')
dimension = model.get_sentence_embedding_dimension() or 768
# Flat L2 index sized to the embedding width.
chunk_index = faiss.IndexFlatL2(dimension)

In [31]:
import numpy as np
def cos_sim(e1, e2):
    """Return the cosine similarity of two vectors.

    Args:
        e1: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        e2: Second vector, same length as ``e1``.

    Returns:
        ``dot(e1, e2) / (norm(e1) * norm(e2))``, or ``0.0`` when either vector
        has zero norm. The ratio is undefined in that case and would otherwise
        evaluate to NaN with a NumPy runtime warning.
    """
    denominator = np.linalg.norm(e1) * np.linalg.norm(e2)
    if denominator == 0:
        return 0.0
    return np.dot(e1, e2) / denominator

In [None]:
import requests
from PyPDF2 import PdfReader
import io

# Chunking parameters, defined once instead of inside the per-article loop.
max_chunk_size = 1500        # maximum number of whitespace-separated words per chunk
similarity_threshold = 0.8   # adjacent sentences less similar than this start a new chunk


def _embed_text(text):
    """Encode ``text`` with the notebook's ``model`` and return a float32 NumPy array."""
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding.cpu().numpy().astype('float32')


def _build_chunk(chunk_sentences, article, article_index):
    """Create the record stored in ``chunks`` for one group of sentences.

    Every chunk carries the same keys, so consumers do not have to distinguish
    chunks emitted mid-article from the last chunk of an article. ``count`` and
    ``sentence_count`` hold the same value; both are kept because each name was
    previously used for part of the chunks.

    Args:
        chunk_sentences: Non-empty list of sentence strings forming the chunk.
        article: The ``articles_data`` entry the sentences came from.
        article_index: Position of ``article`` in ``articles_data``.

    Returns:
        A dict with the chunk text, its embedding and the article metadata.
    """
    chunk_text = " ".join(chunk_sentences)
    return {
        'text': chunk_text,
        'embedding': _embed_text(chunk_text),
        'metadata': article['metadata'],
        'count': len(chunk_sentences),
        'article_index': article_index,
        'article_title': article['metadata']['title'],
        'sentence_count': len(chunk_sentences),
    }


chunks = []

for i, article in enumerate(articles_data):
    try:
        # Download the PDF; HTTP errors and timeouts are handled by the except below.
        pdf_response = requests.get(article['pdf_url'], timeout=30)
        pdf_response.raise_for_status()

        pdf_file = io.BytesIO(pdf_response.content)
        pdf_reader = PdfReader(pdf_file)

        # Collect the text of every page that yields non-blank text.
        page_texts = []
        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text and page_text.strip():
                page_texts.append(page_text + " ")
        pdf_text = "".join(page_texts)

        # Normalise whitespace and strip control characters before sentence splitting.
        pdf_text = re.sub(r' {2,}', ' ', pdf_text)
        pdf_text = re.sub(r'\n{3,}', '\n\n', pdf_text)
        pdf_text = re.sub(r'[\f\v\r]', ' ', pdf_text)
        pdf_text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', pdf_text)
        # Ensure exactly one space between sentence-ending punctuation and a following capital.
        pdf_text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', pdf_text)
        pdf_text = pdf_text.strip()

        sentences = nltk.sent_tokenize(pdf_text)

        article_chunks = []
        current_chunk = []
        current_chunk_words = 0
        prev_embedding = None

        for sentence in sentences:
            sentence_embedding = _embed_text(sentence)
            sentence_words = len(sentence.split())

            # A drop in similarity to the previous sentence marks a topic shift.
            topic_shift = (
                prev_embedding is not None
                and cos_sim(sentence_embedding, prev_embedding) < similarity_threshold
            )
            too_long = current_chunk_words + sentence_words > max_chunk_size

            # Flush only when there is something to flush: a first sentence that
            # alone exceeds the size limit must not produce an empty chunk.
            if (topic_shift or too_long) and current_chunk:
                chunk_data = _build_chunk(current_chunk, article, i)
                article_chunks.append(chunk_data)
                chunks.append(chunk_data)
                current_chunk = []
                current_chunk_words = 0

            current_chunk.append(sentence)
            current_chunk_words += sentence_words
            prev_embedding = sentence_embedding

        # Emit whatever is left after the last sentence.
        if current_chunk:
            chunk_data = _build_chunk(current_chunk, article, i)
            article_chunks.append(chunk_data)
            chunks.append(chunk_data)

    except Exception as e:
        # Best effort: report the failing article and continue with the next one.
        print(f"Error processing article {i+1}: {str(e)}")



  return forward_call(*args, **kwargs)


: 