## Imports

In [2]:
from config import env_config, logger

import pandas as pd

import os

import random

import pymupdf
import pymupdf4llm

from langchain.text_splitter import MarkdownTextSplitter

from openai_client import OpenAIClient

import xml.etree.ElementTree as ET

## JSON extraction
import json
import re

In [3]:
pdf_folder = "./data/sample_data/pdf"
xml_folder = "./data/sample_data/xml"

train_labels_files = "./data/train_labels.csv"

In [4]:
labels_df = pd.read_csv(train_labels_files)

labels_df.head()

Unnamed: 0,article_id,dataset_id,type
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary
1,10.1002_anie.201916483,Missing,Missing
2,10.1002_anie.202005531,Missing,Missing
3,10.1002_anie.202007717,Missing,Missing
4,10.1002_chem.201902131,Missing,Missing


## Helper Functions

In [5]:
def get_random_files(folder_path, n=1):
    files = os.listdir(folder_path)
    return random.choices(files, k=n)

In [6]:
def get_labels(filename):
    return labels_df[labels_df['article_id'] == '.'.join(filename.split('.')[:-1])]

## Looking at PDFs

In [8]:
## Select a file
#filename = "10.1002_anie.202005531.pdf"
filename = "10.1002_2017jc013030.pdf"

if not filename:
    filename = get_random_files(pdf_folder)[0]

logger.info(f"looking at {filename}")

2025-06-18 16:57:16,656 | INFO | looking at 10.1002_2017jc013030.pdf


In [9]:
# Get toc along with the text
def extract_pdf_text(filepath, get_blocks=False):
    if get_blocks:
        extracted_text = []
    else:
        extracted_text = ""

    file = pymupdf.open(filepath)

    for page in file:
        if get_blocks:
            extracted_text.extend(page.get_text("blocks"))
        else:
            extracted_text += '\n' + page.get_text()

    return file.get_toc(simple=False), extracted_text


In [10]:
get_labels(filename)

Unnamed: 0,article_id,dataset_id,type
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary


In [11]:
toc, text = extract_pdf_text(f"{pdf_folder}/{filename}", get_blocks=True)

text[0:2]

[(44.447200775146484,
  125.79997253417969,
  161.33412170410156,
  137.7599639892578,
  'RESEARCH ARTICLE\n',
  0,
  0),
 (44.447200775146484,
  142.6562042236328,
  131.0812530517578,
  150.9272003173828,
  '10.1002/2017JC013030\n',
  1,
  0)]

In [12]:
toc

[[1,
  'l',
  5,
  {'kind': 4,
   'xref': 392,
   'page': 4,
   'dest': '/FitR 0 686 630 686',
   'nameddest': 'l',
   'zoom': 0.0}],
 [1,
  'l',
  5,
  {'kind': 4,
   'xref': 393,
   'page': 4,
   'dest': '/FitR 0 686 630 686',
   'nameddest': 'l',
   'zoom': 0.0}]]

In [20]:
PROMPT = """The text provided to you is a snippet from a research paper. Your task is to identify mentions of datasets in the snippet, and provide appropriate details.
Provide a JSON with the following details for each mention of a dataset:
1. 'name': name of the dataset.
2. 'identifier': identifier of the dataset (unique identifier/url/accession number)
3. 'type': 'Primary',or 'Secondary'. Primary would mean the dataset was created/generated for this paper. Secondary if the dataset was taken from another paper/source (not directly related to this paper).
4. 'source': Source of the dataset (optional)
5. 'ref_no': Reference number (If applicable, provide the list number that can be looked up in the references section.)
6. 'details': Details (some brief explanation)

Reply with just 'None' if there is no mention of a dataset.

Make sure that:
1. The identifier is a reference to a dataset and not a paper.
2. Prefer DOI references, followed by other identifiers (PBD, GSE, etc.). Give lowest preference to obscure (not known/generally used) URLs.
---
Below is the snippet from the paper:
{TEXT}
"""

In [21]:
## Check with openai

openai_client = OpenAIClient()

responses = []
counter = 0
for te in text:
    # if counter >= 5:
    #     break
    if 'data' in te[4] or 'dataset' in te[4]:
        responses.append(openai_client.ask_openAI(PROMPT.format(TEXT=te[4]), model_name='gpt-4.1-mini'))

        counter += 1
        logger.debug(f"Done {counter}")




2025-06-18 17:42:28,057 | DEBUG | Done 1
2025-06-18 17:42:30,294 | DEBUG | Done 2
2025-06-18 17:42:31,096 | DEBUG | Done 3
2025-06-18 17:42:32,245 | DEBUG | Done 4
2025-06-18 17:42:36,492 | DEBUG | Done 5
2025-06-18 17:42:39,802 | DEBUG | Done 6
2025-06-18 17:42:40,721 | DEBUG | Done 7
2025-06-18 17:42:44,084 | DEBUG | Done 8
2025-06-18 17:42:44,743 | DEBUG | Done 9
2025-06-18 17:42:45,742 | DEBUG | Done 10
2025-06-18 17:42:48,354 | DEBUG | Done 11
2025-06-18 17:42:49,200 | DEBUG | Done 12
2025-06-18 17:42:49,893 | DEBUG | Done 13
2025-06-18 17:42:54,489 | DEBUG | Done 14
2025-06-18 17:42:55,578 | DEBUG | Done 15
2025-06-18 17:42:57,469 | DEBUG | Done 16
2025-06-18 17:42:59,306 | DEBUG | Done 17
2025-06-18 17:42:59,953 | DEBUG | Done 18
2025-06-18 17:43:00,798 | DEBUG | Done 19
2025-06-18 17:43:03,639 | DEBUG | Done 20
2025-06-18 17:43:04,222 | DEBUG | Done 21
2025-06-18 17:43:04,929 | DEBUG | Done 22
2025-06-18 17:43:07,941 | DEBUG | Done 23
2025-06-18 17:43:08,617 | DEBUG | Done 24
2

In [22]:
for response in responses:
    logger.info(openai_client.extract_response(response))

2025-06-18 17:43:56,815 | INFO | ```json
[
  {
    "name": "Biogeochemical-Argo (BGC-Argo) database",
    "identifier": "BGC-Argo database of more than 8,500 multivariable profiles",
    "type": "Secondary",
    "details": "A large dataset of autonomous high-frequency bio-optical measurements from Biogeochemical-Argo profiling floats, including chlorophyll fluorescence and particulate backscattering coefficient collected across various oceanic conditions globally."
  }
]
```
2025-06-18 17:43:56,815 | INFO | [
  {
    "name": "BGC-Argo float network",
    "identifier": "",
    "type": "Secondary",
    "source": "BGC-Argo program",
    "ref_no": null,
    "details": "A global network of biogeochemical Argo floats providing quality data for studying biogeochemical processes both globally and vertically in the ocean."
  }
]
2025-06-18 17:43:56,815 | INFO | None
2025-06-18 17:43:56,816 | INFO | None
2025-06-18 17:43:56,816 | INFO | ```json
[
  {
    "name": "BGC-Argo bio-optical database",


### Experimenting with Markdown, and text-splitters

In [37]:
text.find("10.5061")

AttributeError: 'list' object has no attribute 'find'

In [60]:
text[34583: 34740]

's.\nData accessibility. All data are available in Dryad (doi:10.5061/dryad.\n447sq).\nAuthors’ contributions. E.C.S.-R. devised the approach and supervised\ncoll'

In [62]:
md_text = pymupdf4llm.to_markdown(f"{pdf_folder}/{filename}")

In [87]:
splitter = MarkdownTextSplitter(chunk_size=40, chunk_overlap=0)

splitter.create_documents([md_text])

[Document(metadata={}, page_content='##### rspb.royalsocietypublishing.org'),
 Document(metadata={}, page_content='## Research'),
 Document(metadata={}, page_content='Cite this article: Swanson EM, Espeset'),
 Document(metadata={}, page_content='A,'),
 Document(metadata={}, page_content='Mikati I, Bolduc I, Kulhanek R, White'),
 Document(metadata={}, page_content='WA,'),
 Document(metadata={}, page_content='Kenzie S, Snell-Rood EC. 2016 Nutrition'),
 Document(metadata={}, page_content='shapes'),
 Document(metadata={}, page_content='life-history evolution across species.'),
 Document(metadata={}, page_content='Proc. R. Soc. B 283: 20152764.'),
 Document(metadata={}, page_content='http://dx.doi.org/10.1098/rspb.2015.276'),
 Document(metadata={}, page_content='4'),
 Document(metadata={}, page_content='Received: 16 November 2015'),
 Document(metadata={}, page_content='Accepted: 20 June 2016\n\nSubject Areas:'),
 Document(metadata={}, page_content='ecology, evolution\n\nKeywords:'),
 Docume

In [101]:
for te in text:
    if 'data' in te[4] or 'dataset' in te[4]:
        print(te)

(311.8097229003906, 267.99420166015625, 552.8659057617188, 720.09375, '(b) Quantification of host plant nutrition\nWe obtained plant nutrient data from three sources. Data on nitro-\ngen and phosphorus were obtained from Borer [39]. Plant sodium\ncontent was obtained from Watanabe et al. [25]. We additionally\ncollected 117 samples of host plants commonly used by butterflies\nthat were poorly represented for the nutrients in which we were\ninterested, in addition to a subset of re-sampled species to ensure\nour methods were in alignment with previous studies (samples\nfrom eight states across the USA; see the electronic supplementary\nmaterial, file 1). We measured nitrogen using the Dumas method\n[40] and used inductively coupled plasma mass spectrometry to\nmeasure phosphorus and sodium. Together, we drew from 8381\nplant samples for our measurements of plant nutrient levels. We\nfocused on percentage nutrient content for dried leaf samples.\nAngiosperm phylogeny group III (APG III) 

### Exoploring XML files

In [104]:
filename = None

if not filename:
    filename = get_random_files(xml_folder)[0]

logger.info(f"looking at {filename}")

2025-06-17 18:31:26,446 | INFO | looking at 10.1055_s-0039-1693681.xml


In [105]:
tags_to_avoid = ['']

def extract_xml_text(filepath):
    tree = ET.parse(filepath)
    root = tree.getroot()

    extracted_text = []
    for item in root.iter():
        if item.tag in ['title', 'p']:
            extracted_text.append(f"{item.tag}: {item.text}")
    return extracted_text

In [106]:
extracted_text = extract_xml_text(f"{xml_folder}/{filename}")

In [107]:
responses = []
counter = 0
for te in extracted_text:
    # if counter >= 5:
    #     break
    if 'data' in te or 'dataset' in te:
        logger.debug(f"Looking at text: {te}")
        responses.append(openai_client.ask_openAI(PROMPT.format(TEXT=te), model_name='gpt-4.1-mini'))

        counter += 1
        logger.debug(f"Done {counter}")


2025-06-17 18:31:27,303 | DEBUG | Looking at text: p: The data obtained was organized in electronic spreadsheets using the Microsoft Excel 2007 software (Microsoft Corporation, Redmond, WA, USA). The statistical analysis was performed using the software WinSTAT, version 2007.1 (R. Fitch Software, Cambridge, MA, USA). Continuous numerical data are presented in the average ± standard deviation (SD) format. Normal distribution was tested using the Kolmogorov-Smirnov test. Group comparisons were performed by analysis of variance (ANOVA), and multiple comparisons were corrected using the least significant difference method. The Kruskal-Wallis test was performed for data without a normal distribution, or when homogeneity of variance had not been proven. A significance level of 5% was adopted.
2025-06-17 18:31:28,087 | DEBUG | Done 1
2025-06-17 18:31:28,087 | DEBUG | Looking at text: p: The median values of MMP-2 and MMP-9 mRNA expression were lower in groups treated with estrogen or with est

In [108]:
for response in responses:
    logger.info(openai_client.extract_response(response))

2025-06-17 18:31:51,109 | INFO | None
2025-06-17 18:31:51,110 | INFO | None
2025-06-17 18:31:51,111 | INFO | None
