In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [8]:
keyword = "diagnostic device"
num_articles = 100
encodingmethod = "utf-8"
errortype = "strict"

In [9]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

encoded_search_term = urllib.parse.quote(keyword, encoding=encodingmethod, errors=errortype)
url = f'http://export.arxiv.org/api/query?search_query=all:{encoded_search_term}&start=0&max_results={num_articles}'

print(f"Searching for '{keyword}' on arXiv...")
print(f"URL: {url}")

try:
    response = urllib.request.urlopen(url)
    try:
        url_read = response.read().decode("utf-8")
    except UnicodeDecodeError:
        response = urllib.request.urlopen(url)
        url_read = response.read().decode("utf-8", errors="ignore")

    parse_xml = ET.fromstring(url_read)
    print("Successfully retrieved search results!")
except Exception as e:
    print(f"Error retrieving data: {e}")
    raise

Searching for 'diagnostic device' on arXiv...
URL: http://export.arxiv.org/api/query?search_query=all:diagnostic%20device&start=0&max_results=100
Successfully retrieved search results!


In [10]:
ns = {"ns": "http://www.w3.org/2005/Atom"}
entries = parse_xml.findall('ns:entry', ns)

articles_data = []
for entry in entries:
    link = entry.find('ns:link[@type="application/pdf"]', ns)
    if link is not None and "href" in link.attrib:
        pdf_url = link.attrib['href']

        title = entry.find('ns:title', ns)
        title_text = title.text.strip() if title is not None else "Unknown Title"

        authors = entry.findall('ns:author/ns:name', ns)
        author_names = [author.text for author in authors] if authors else ["Unknown Author"]

        published = entry.find('ns:published', ns)
        published_date = published.text[:10] if published is not None else "Unknown Date"

        summary = entry.find('ns:summary', ns)
        summary_text = summary.text.strip() if summary is not None else "No summary available"

        metadata = {
            'title': title_text,
            'authors': author_names,
            'published': published_date,
            'summary': summary_text
        }

        articles_data.append({
            'pdf_url': pdf_url,
            'metadata': metadata
        })

print(f"Found {len(articles_data)} articles with PDF links")
for i, article in enumerate(articles_data):
    print(f"{i+1}. {article['metadata']['title'][:80]}...")

Found 100 articles with PDF links
1. An Electrochemical Potentiostat Interface for Mobile Devices: Enabling
  Remote ...
2. Practical Statistical Considerations for the Clinical Validation of
  AI/ML-enab...
3. Cross-device Federated Learning for Mobile Health Diagnostics: A First
  Study o...
4. Random Forests for Industrial Device Functioning Diagnostics Using
  Wireless Se...
5. D-Mag: a laboratory for studying plasma physics and diagnostics in
  strong magn...
6. Noninvasive Acute Compartment Syndrome Diagnosis Using Random Forest
  Machine L...
7. Diagnostic criterion for crystallized beams...
8. Plasma diagnostics using digital holographic interferometry...
9. Bioimpedance a Diagnostic Tool for Tobacco Induced Oral Lesions: a Mixed
  Model...
10. Active Sampling for MRI-based Sequential Decision Making...
11. Conceptual Study of a Collective Thomson Scattering Diagnostic for SPARC...
12. Integrated Data Analysis and Validation...
13. Special behavior of alkali beam emission spect

In [12]:
from sentence_transformers import SentenceTransformer
import faiss

dimension = 768
model = SentenceTransformer('pritamdeka/S-BioBert-snli-multinli-stsb')
chunk_index = faiss.IndexFlatL2(dimension)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
import numpy as np
def cos_sim(e1, e2):
    return np.dot(e1, e2) / (np.linalg.norm(e1) * np.linalg.norm(e2))

In [21]:
import requests
from PyPDF2 import PdfReader
import io
import nltk

chunks = []
chunk_size_sentences = 50

for i, article in enumerate(articles_data):
    try:
        pdf_response = requests.get(article['pdf_url'], timeout=30)
        pdf_response.raise_for_status()

        pdf_file = io.BytesIO(pdf_response.content)
        pdf_reader = PdfReader(pdf_file)
        pdf_text = ""

        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text and page_text.strip():
                pdf_text += page_text + " "

        pdf_text = re.sub(r' {2,}', ' ', pdf_text)
        pdf_text = re.sub(r'\n{3,}', '\n\n', pdf_text)
        pdf_text = re.sub(r'[\f\v\r]', ' ', pdf_text)
        pdf_text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', pdf_text)
        pdf_text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', pdf_text)
        pdf_text = pdf_text.strip()

        sentences = nltk.sent_tokenize(pdf_text)
        num_sentences = len(sentences)

        for j in range(0, num_sentences, chunk_size_sentences):
            chunk_sentences = sentences[j:j + chunk_size_sentences]
            chunk_text = " ".join(chunk_sentences)

            if chunk_text:
                chunk_embedding = model.encode(chunk_text, convert_to_tensor=True)
                chunk_embedding = chunk_embedding.cpu().numpy().astype('float32')

                chunk_data = {
                    'text': chunk_text,
                    'embedding': chunk_embedding,
                    'metadata': article['metadata'],
                    'sentence_count': len(chunk_sentences)
                }
                chunks.append(chunk_data)

    except Exception as e:
        print(f"Error processing article {i+1}: {str(e)}")



In [23]:
for chunk in chunks:
    chunk_index.add(chunk['embedding'].reshape(1, -1))

In [24]:
chunks[0]

{'text': '1 \n An Electrochemical Potentiostat Interface for Mobile Devices : Enabling \nRemote Medical Diagnostics \nHenry Fu, Henry Chow, Michael Lew, Shruti Menon, Craig Scratchley, M. Ash Parameswaran \nAbstract \nAn electrochemical potentiostat interface for mobile devices has been designed and \nimplemented. The interface consists of a potentiostat module, a microcontroller module , and a \nBluetooth module. The potentiostat module performs electrochemical measurements and detects \nthe response s from the samples. The microcontro ller module controls the test and \ncommunication processes. The Bluetooth module links the system to a mobile device, where the \nmobile device acts as a control -console, data storage system , communication unit , and graphica l \nplotter for the overall diagnosti c processes. This interface is suitable for point -of-care and remote \ndiagnostics , enhanc ing the capabilities of mobile devices in telemedicine. Keywords \nMobile device; Remote medical 

In [26]:
index_filename = "chunks.index"

faiss.write_index(chunk_index, index_filename)

print(f"FAISS index exported to {index_filename}")

FAISS index exported to chunks.index
