In [1]:
question = input("What is your question? ")

What is your question? What are some glucose monitors for athletes?


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')


sw_nltk = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

question = question.lower()
words = nltk.word_tokenize(question)
words_no_punct = [re.sub(r'[^\w\s]', '', word) for word in words]
words_no_punct = [word for word in words_no_punct if word]
filtered_words = [word for word in words_no_punct if word not in sw_nltk]
lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
keyword = ' '.join(lemmatized_words)

print(f"Processed keyword: {keyword}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Processed keyword: glucose monitor athlete


In [3]:
num_articles = 5
encodingmethod = "utf-8"
errortype = "strict"

In [4]:
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET

encoded_search_term = urllib.parse.quote(keyword, encoding=encodingmethod, errors=errortype)
url = f'http://export.arxiv.org/api/query?search_query=all:{encoded_search_term}&start=0&max_results={num_articles}'

print(f"Searching for '{keyword}' on arXiv...")
print(f"URL: {url}")

try:
    response = urllib.request.urlopen(url)
    try:
        url_read = response.read().decode("utf-8")
    except UnicodeDecodeError:
        response = urllib.request.urlopen(url)
        url_read = response.read().decode("utf-8", errors="ignore")

    parse_xml = ET.fromstring(url_read)
    print("Successfully retrieved search results!")
except Exception as e:
    print(f"Error retrieving data: {e}")
    raise

Searching for 'glucose monitor athlete' on arXiv...
URL: http://export.arxiv.org/api/query?search_query=all:glucose%20monitor%20athlete&start=0&max_results=5
Successfully retrieved search results!


In [5]:
ns = {"ns": "http://www.w3.org/2005/Atom"}
entries = parse_xml.findall('ns:entry', ns)

articles_data = []
for entry in entries:
    link = entry.find('ns:link[@type="application/pdf"]', ns)
    if link is not None and "href" in link.attrib:
        pdf_url = link.attrib['href']

        title = entry.find('ns:title', ns)
        title_text = title.text.strip() if title is not None else "Unknown Title"

        authors = entry.findall('ns:author/ns:name', ns)
        author_names = [author.text for author in authors] if authors else ["Unknown Author"]

        published = entry.find('ns:published', ns)
        published_date = published.text[:10] if published is not None else "Unknown Date"

        summary = entry.find('ns:summary', ns)
        summary_text = summary.text.strip() if summary is not None else "No summary available"

        metadata = {
            'title': title_text,
            'authors': author_names,
            'published': published_date,
            'summary': summary_text
        }

        articles_data.append({
            'pdf_url': pdf_url,
            'metadata': metadata
        })

print(f"Found {len(articles_data)} articles with PDF links")
for i, article in enumerate(articles_data):
    print(f"{i+1}. {article['metadata']['title'][:80]}...")

Found 5 articles with PDF links
1. A personalized model and optimization strategy for estimating blood
  glucose co...
2. cgmquantify: Python and R packages for comprehensive analysis of
  interstitial ...
3. Everything You Wanted to Know About Noninvasive Glucose Measurement and
  Contro...
4. Distributed lag models to identify the cumulative effects of training
  and reco...
5. Non-Invasive Glucose Monitoring Techniques: A review and current trends...


In [6]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('pritamdeka/S-BioBert-snli-multinli-stsb')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
import requests
import io
from PyPDF2 import PdfReader
import re
import numpy as np

def add_articles(index, articles):
  chunks = []
  chunk_size_sentences = 50
  for i, article in enumerate(articles_data):
    try:
        pdf_response = requests.get(article['pdf_url'], timeout=30)
        pdf_response.raise_for_status()

        pdf_file = io.BytesIO(pdf_response.content)
        pdf_reader = PdfReader(pdf_file)
        pdf_text = ""

        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text and page_text.strip():
                pdf_text += page_text + " "

        pdf_text = re.sub(r' {2,}', ' ', pdf_text)
        pdf_text = re.sub(r'\n{3,}', '\n\n', pdf_text)
        pdf_text = re.sub(r'[\f\v\r]', ' ', pdf_text)
        pdf_text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', pdf_text)
        pdf_text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', pdf_text)
        pdf_text = pdf_text.strip()

        sentences = nltk.sent_tokenize(pdf_text)
        num_sentences = len(sentences)

        for j in range(0, num_sentences, chunk_size_sentences):
            chunk_sentences = sentences[j:j + chunk_size_sentences]
            chunk_text = " ".join(chunk_sentences)

            if chunk_text:
                chunk_embedding = model.encode(chunk_text, convert_to_tensor=True)
                chunk_embedding = chunk_embedding.cpu().numpy().astype('float32')

                chunk_data = {
                    'text': chunk_text,
                    'embedding': chunk_embedding,
                    'metadata': article['metadata'],
                    'sentence_count': len(chunk_sentences)
                }
                chunks.append(chunk_data)

    except Exception as e:
        print(f"Error processing article {i+1}: {str(e)}")

    for chunk in chunks:
        index.add(chunk['embedding'].reshape(1, -1))

    return chunks

In [11]:
import faiss
chunk_index = faiss.read_index('chunks.index')

In [12]:
add_articles(chunk_index, articles_data)

[{'text': 'A personalized model and optimization strategy for estimating\nblood glucose concentrations from sweat measurements\nXiaoyu Yina,∗, Elisabetta Peria, Eduard Pelssersa, Jaap den Toondera, Lisa Klousb, Hein\nDaanencand Massimo Mischia\naEindhoven University of Technology, Eindhoven, NL\nbNetherlands Organisation for Applied Scientific Research, Soesterberg, NL\ncVrije Universiteit Amsterdam, Amsterdam, NL\nARTICLE INFO\nKeywords :\nSweat sensing\ndiabetes\npatient monitoring\npharmacokinetic modelingABSTRACT\nBackground and objective: Diabetes is one of the four leading causes of death worldwide,\nnecessitating daily blood glucose monitoring. While sweat offers a promising non-invasive\nalternative for glucose monitoring, its application remains limited due to the low to moderate\ncorrelationbetweensweatandbloodglucoseconcentrations,whichhasbeenobtaineduntilnow\nbyassumingalinearrelationship. Thisstudyproposesanovelmodel-basedstrategytoestimate\nblood glucose concentrations fr