# 📄 Extracting Text from Scientific Papers (Test Batch) 

## Setup

In [1]:
# Imports
import os
import pandas as pd

# Add script paths
import sys
sys.path.append("../scripts")

from pdf_reader import extract_text_from_pdf
from rename_pdfs import rename_pdfs_in_folder

## 📁 1. Rename the Test PDFs


In [2]:
# Change this to process a different batch
batch_name = "first_batch"

pdf_folder = f"../data/batches/{batch_name}/"
output_file = f"../outputs/{batch_name}.csv"

# Rename PDFs in the folder
renamed_files = rename_pdfs_in_folder(pdf_folder)
print("Renamed files:", renamed_files)

Renamed files: ['../data/batches/first_batch/paper_001.pdf', '../data/batches/first_batch/paper_002.pdf', '../data/batches/first_batch/paper_003.pdf', '../data/batches/first_batch/paper_004.pdf', '../data/batches/first_batch/paper_005.pdf', '../data/batches/first_batch/paper_006.pdf', '../data/batches/first_batch/paper_007.pdf', '../data/batches/first_batch/paper_008.pdf', '../data/batches/first_batch/paper_009.pdf', '../data/batches/first_batch/paper_010.pdf']


## 📜 2. Extract Text from Each PDF

In [3]:
records = []

for file_path in renamed_files:
    raw_text = extract_text_from_pdf(file_path)

    # You can later extract title & abstract using rules or LLM
    title = "UNKNOWN"
    abstract = "UNKNOWN"

    records.append({
        "document_id": os.path.basename(file_path),
        "title": title,
        "abstract": abstract,
        "raw_text": raw_text
    })

df = pd.DataFrame(records)
df.head()

Unnamed: 0,document_id,title,abstract,raw_text
0,paper_001.pdf,UNKNOWN,UNKNOWN,Journal of Clinical Epidemiology 138 (2021) 24...
1,paper_002.pdf,UNKNOWN,UNKNOWN,Journal Pre-proof\nLarge language models for c...
2,paper_003.pdf,UNKNOWN,UNKNOWN,InformationandSoftwareTechnology136(2021)10658...
3,paper_004.pdf,UNKNOWN,UNKNOWN,"Preprint of: Sundaram, G. and Berleant, D., Au..."
4,paper_005.pdf,UNKNOWN,UNKNOWN,Cutting Through the Clutter: The Potential of ...


## 💾 3. Export as CSV

In [4]:
output_path = "../outputs/paper_data.csv"
df.to_csv(output_path, index=False)
print(f"Saved CSV to: {output_path}")

Saved CSV to: ../outputs/paper_data.csv
