## Imports

In [1]:
from config import env_config, logger

import pandas as pd

import os

import random

import pymupdf
import pymupdf4llm

from langchain.text_splitter import MarkdownTextSplitter

from openai_client import OpenAIClient



In [3]:
# Folder holding the sample articles as PDF files (read below with pymupdf / pymupdf4llm).
pdf_folder = "./data/sample_data/pdf"
# Folder for the XML versions of the sample articles (not used by the cells shown here).
xml_folder = "./data/sample_data/xml"

# CSV with one row per (article_id, dataset_id, type) label; loaded into `labels_df`.
train_labels_files = "./data/train_labels.csv"

In [4]:
# Load the label table; `get_labels` filters this frame by its 'article_id' column.
labels_df = pd.read_csv(train_labels_files)

# Preview the first rows (columns: article_id, dataset_id, type).
labels_df.head()

Unnamed: 0,article_id,dataset_id,type
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary
1,10.1002_anie.201916483,Missing,Missing
2,10.1002_anie.202005531,Missing,Missing
3,10.1002_anie.202007717,Missing,Missing
4,10.1002_chem.201902131,Missing,Missing


## Helper Functions

In [5]:
def get_random_files(folder_path, n=1):
    """Pick up to ``n`` distinct entries at random from ``folder_path``.

    Args:
        folder_path: Directory whose entries are listed with ``os.listdir``.
        n: Number of entries requested (default 1).

    Returns:
        A list of entry names (not full paths) with no repeats. It holds
        fewer than ``n`` names when the folder has fewer than ``n`` entries,
        and is empty for an empty folder or a non-positive ``n``.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be listed.
    """
    files = os.listdir(folder_path)
    # sample() draws without replacement, so the same file is never returned
    # twice; the request is clamped to what the folder can actually supply.
    k = max(0, min(n, len(files)))
    return random.sample(files, k=k)

In [6]:
def get_labels(filename):
    """Return the rows of ``labels_df`` that belong to a file's article.

    The article id is the file name with its final dot-separated part (the
    extension) removed; rows of the module-level ``labels_df`` whose
    ``article_id`` column equals that id are returned.
    """
    # rpartition cuts at the LAST dot only, so dots inside the id itself are
    # kept; a name without any dot yields an empty id.
    article_id, _, _ = filename.rpartition('.')
    is_match = labels_df['article_id'] == article_id
    return labels_df[is_match]

## Looking at PDFs

In [7]:
## Select a file
# Leave this as an empty string to fall back to a randomly chosen PDF.
filename = "10.1098_rspb.2015.2764.pdf"

# An empty/falsy name is replaced by one random entry of `pdf_folder`.
filename = filename or get_random_files(pdf_folder)[0]

logger.info(f"looking at {filename}")

2025-06-16 18:11:52,728 | INFO | looking at 10.1098_rspb.2015.2764.pdf


In [8]:
# Get toc along with the text
def extract_pdf_text(filepath, get_blocks=False):
    """Extract the table of contents and the page text of a PDF.

    Args:
        filepath: Path of the PDF, passed to ``pymupdf.open``.
        get_blocks: When True, the ``page.get_text("blocks")`` results of all
            pages are collected into one flat list. When False (default), a
            single string is built in which every page's text is preceded by
            a newline.

    Returns:
        A ``(toc, extracted_text)`` tuple: ``toc`` is the document's
        ``get_toc(simple=False)`` result and ``extracted_text`` is a list
        (``get_blocks=True``) or a str (``get_blocks=False``).
    """
    # The context manager closes the document on exit, including when a page
    # fails to extract, so the file handle is always released.
    with pymupdf.open(filepath) as doc:
        if get_blocks:
            extracted_text = [
                block for page in doc for block in page.get_text("blocks")
            ]
        else:
            extracted_text = ''.join('\n' + page.get_text() for page in doc)

        # Must be read while the document is still open.
        toc = doc.get_toc(simple=False)

    return toc, extracted_text


In [9]:
# Rows of `labels_df` whose article_id matches the selected file (extension stripped).
get_labels(filename)

Unnamed: 0,article_id,dataset_id,type
271,10.1098_rspb.2015.2764,https://doi.org/10.5061/dryad.447sq,Primary


In [10]:
# get_blocks=True: `text` becomes a flat list of layout blocks (all pages) instead of a string.
toc, text = extract_pdf_text(f"{pdf_folder}/{filename}", get_blocks=True)

# Peek at the first two blocks; each is a tuple whose element at index 4 is the block's text.
text[0:2]

[(42.51969909667969,
  82.55974578857422,
  188.48191833496094,
  94.5989761352539,
  'rspb.royalsocietypublishing.org\n',
  0,
  0),
 (42.51969909667969,
  146.05780029296875,
  89.96231079101562,
  162.11380004882812,
  'Research\n',
  1,
  0)]

In [11]:
# Outline returned by get_toc(simple=False); per the output, each entry is a list of
# [int, title string, int, dict of link details] (presumably level and page number, verify against PyMuPDF docs).
toc

[[1,
  'Nutrition shapes life-history evolution across species',
  1,
  {'kind': 4,
   'xref': 396,
   'page': 0,
   'dest': '/FitR 0 842 596 0',
   'nameddest': 'head1',
   'collapse': True,
   'zoom': 0.0}],
 [2,
  'Significance statement',
  1,
  {'kind': 4,
   'xref': 397,
   'page': 0,
   'dest': '/FitR 233 427 562 406',
   'nameddest': 'head2',
   'zoom': 0.0}],
 [2,
  'Introduction',
  1,
  {'kind': 4,
   'xref': 399,
   'page': 0,
   'dest': '/FitR 233 271 562 250',
   'nameddest': 'head3',
   'zoom': 0.0}],
 [2,
  'Material and methods',
  2,
  {'kind': 4,
   'xref': 401,
   'page': 1,
   'dest': '/FitR 35 139 292 119',
   'nameddest': 'head4',
   'collapse': True,
   'zoom': 0.0}],
 [3,
  'Overview of butterfly specimens',
  2,
  {'kind': 4,
   'xref': 408,
   'page': 1,
   'dest': '/FitR 35 118 292 98',
   'nameddest': 'head5',
   'zoom': 0.0}],
 [3,
  'Quantification of host plant nutrition',
  2,
  {'kind': 4,
   'xref': 410,
   'page': 1,
   'dest': '/FitR 304 587 562 566

In [24]:
# Prompt template for extracting dataset mentions from one snippet of a paper.
# The `{TEXT}` placeholder is filled with `PROMPT.format(TEXT=...)`; because str.format
# is used, any literal brace added to this template would have to be doubled.
PROMPT = """The text provided to you is a snippet from a research paper. Your task is to identify mentions of datasets in the snippet, and provide appropriate details.
Provide a JSON with the following details for each mention of a dataset:
1. 'name': name of the dataset.
2. 'identifier': identifier of the dataset (unique identifier/url)
3. 'type': 'Primary',or 'Secondary'. Primary would mean the dataset was created/generated for this paper. Secondary if the dataset was taken from another paper/source (not directly related to this paper).
4. 'source': Source of the dataset (optional)
5. 'ref_no': Reference number (If applicable, provide the list number that can be looked up in the references section.)
6. 'details': Details (some brief explanation)

Reply with just 'None' if there is no mention of a dataset.
---
Below is the snippet from the paper:
{TEXT}
"""

In [25]:
## Check with openai

openai_client = OpenAIClient()

# Send every block that mentions data to the model and keep the raw replies.
# `text` must be the block list produced with get_blocks=True: te[4] is the
# block's text.
responses = []
counter = 0
for te in text:
    # Case-insensitive so blocks that only contain "Data"/"Dataset" (e.g. a
    # "Data accessibility" heading) are not skipped. 'dataset' needs no
    # separate check because it already contains 'data'.
    if 'data' not in te[4].lower():
        continue

    responses.append(openai_client.ask_openAI(PROMPT.format(TEXT=te[4]), model_name='gpt-4.1-mini'))

    counter += 1
    logger.debug(f"Done {counter}")




2025-06-16 18:32:50,347 | DEBUG | Done 1
2025-06-16 18:32:53,490 | DEBUG | Done 2
2025-06-16 18:32:54,106 | DEBUG | Done 3
2025-06-16 18:32:56,640 | DEBUG | Done 4
2025-06-16 18:32:57,638 | DEBUG | Done 5
2025-06-16 18:33:00,799 | DEBUG | Done 6
2025-06-16 18:33:01,632 | DEBUG | Done 7
2025-06-16 18:33:02,349 | DEBUG | Done 8
2025-06-16 18:33:03,271 | DEBUG | Done 9


In [26]:
# Log the extracted content of each collected model reply, one log record per reply.
for response in responses:
    extracted = openai_client.extract_response(response)
    logger.info(extracted)

2025-06-16 18:33:41,690 | INFO | ```json
[
  {
    "name": "Plant nutrient data from Borer",
    "identifier": null,
    "type": "Secondary",
    "source": "Borer [39]",
    "ref_no": 39,
    "details": "Dataset containing nitrogen and phosphorus content in plants."
  },
  {
    "name": "Plant sodium content data",
    "identifier": null,
    "type": "Secondary",
    "source": "Watanabe et al. [25]",
    "ref_no": 25,
    "details": "Dataset containing sodium content in plants."
  },
  {
    "name": "Host plant samples collected in this study",
    "identifier": "electronic supplementary material, file 1",
    "type": "Primary",
    "source": "Collected from eight states across the USA",
    "ref_no": null,
    "details": "117 samples of host plants collected to complement existing datasets, measuring nitrogen, phosphorus, and sodium content."
  },
  {
    "name": "Host plant records for butterflies",
    "identifier": null,
    "type": "Secondary",
    "source": "Scott [43]",
    "ref

### Experimenting with Markdown and text splitters

In [56]:
# `text` holds a list of layout blocks after the earlier get_blocks=True call,
# so extract the plain-string form here rather than relying on leftover kernel state.
_, plain_text = extract_pdf_text(f"{pdf_folder}/{filename}")

# Character offset of the DOI prefix in the plain text (-1 when absent).
plain_text.find("10.5061")

34643

In [60]:
# Slice the plain-string extraction: slicing the block list that `text` holds
# after the get_blocks=True call would return tuples, not characters.
_, plain_text = extract_pdf_text(f"{pdf_folder}/{filename}")

# Characters surrounding the DOI match found at offset 34643.
plain_text[34583: 34740]

's.\nData accessibility. All data are available in Dryad (doi:10.5061/dryad.\n447sq).\nAuthors’ contributions. E.C.S.-R. devised the approach and supervised\ncoll'

In [62]:
# Convert the same PDF to Markdown text with pymupdf4llm (input for the splitter below).
md_text = pymupdf4llm.to_markdown(f"{pdf_folder}/{filename}")

In [87]:
# Split the Markdown into small, non-overlapping chunks.
# NOTE(review): with chunk_size=40 the output shows sentences and even the article's
# DOI URL cut across chunks ('...rspb.2015.276' / '4'); presumably a much larger
# chunk_size is needed before feeding chunks to the model — confirm.
splitter = MarkdownTextSplitter(chunk_size=40, chunk_overlap=0)

splitter.create_documents([md_text])

[Document(metadata={}, page_content='##### rspb.royalsocietypublishing.org'),
 Document(metadata={}, page_content='## Research'),
 Document(metadata={}, page_content='Cite this article: Swanson EM, Espeset'),
 Document(metadata={}, page_content='A,'),
 Document(metadata={}, page_content='Mikati I, Bolduc I, Kulhanek R, White'),
 Document(metadata={}, page_content='WA,'),
 Document(metadata={}, page_content='Kenzie S, Snell-Rood EC. 2016 Nutrition'),
 Document(metadata={}, page_content='shapes'),
 Document(metadata={}, page_content='life-history evolution across species.'),
 Document(metadata={}, page_content='Proc. R. Soc. B 283: 20152764.'),
 Document(metadata={}, page_content='http://dx.doi.org/10.1098/rspb.2015.276'),
 Document(metadata={}, page_content='4'),
 Document(metadata={}, page_content='Received: 16 November 2015'),
 Document(metadata={}, page_content='Accepted: 20 June 2016\n\nSubject Areas:'),
 Document(metadata={}, page_content='ecology, evolution\n\nKeywords:'),
 Docume

In [101]:
# Print every layout block whose text mentions data. `text` must be the block
# list produced with get_blocks=True (te[4] is the block's text).
for te in text:
    # Case-insensitive match; 'dataset' needs no separate check because it
    # already contains 'data'.
    if 'data' in te[4].lower():
        print(te)

(311.8097229003906, 267.99420166015625, 552.8659057617188, 720.09375, '(b) Quantification of host plant nutrition\nWe obtained plant nutrient data from three sources. Data on nitro-\ngen and phosphorus were obtained from Borer [39]. Plant sodium\ncontent was obtained from Watanabe et al. [25]. We additionally\ncollected 117 samples of host plants commonly used by butterflies\nthat were poorly represented for the nutrients in which we were\ninterested, in addition to a subset of re-sampled species to ensure\nour methods were in alignment with previous studies (samples\nfrom eight states across the USA; see the electronic supplementary\nmaterial, file 1). We measured nitrogen using the Dumas method\n[40] and used inductively coupled plasma mass spectrometry to\nmeasure phosphorus and sodium. Together, we drew from 8381\nplant samples for our measurements of plant nutrient levels. We\nfocused on percentage nutrient content for dried leaf samples.\nAngiosperm phylogeny group III (APG III) 