# 📄 Extracting Text from Scientific Papers (Test Batch) 

## 🔧 1. Setup

In [None]:
# Imports
import os
import pandas as pd
import sys

# Make sure Python can find the `scripts` package imported below.
# ".." is a relative entry, so it is resolved against the current working
# directory: this assumes the notebook is run from a folder whose parent
# contains `scripts/`.
sys.path.append("..")

# Import the functions
from scripts.pdf_reader import extract_full_text, extract_partial_text
from scripts.rename_pdfs import rename_pdfs_in_folder
from scripts.llm_extractor import extract_title_abstract_with_llm

### 🗂️ Define batch and folder path

In [None]:
# Name of the batch to process; it is interpolated into both paths below, so
# changing it switches the input folder and the output file together.
batch_name = "second_batch"

# Folder passed to the PDF renaming step, and the CSV path the final table is
# written to. Both are relative paths.
pdf_folder = f"../data/batches/{batch_name}/"
output_file = f"../outputs/{batch_name}.csv"

#### 🖋️ Rename the PDFs


In [None]:
# Run the renaming helper on the batch folder and keep what it returns; the
# later cells index into this collection and pass each item to the text
# extractors as a file path.
# NOTE(review): presumably this renames the files on disk, so re-running the
# cell may not be a no-op — confirm in scripts/rename_pdfs.py.
renamed_files = rename_pdfs_in_folder(pdf_folder)
print("Renamed files:", renamed_files)

#### 🧪 Run a quick test on a single PDF

In [None]:
# Smoke-test the extraction on one PDF before running the whole batch.
# Fail with an actionable message when the renaming step returned nothing,
# instead of an opaque "list index out of range" from the indexing below.
if not renamed_files:
    raise IndexError(
        f"No renamed PDFs available from {pdf_folder!r}; nothing to test."
    )

single_pdf_path = renamed_files[0]  # or whichever index you want

# Only a limited portion of the document (page_limit=2) is sent to the LLM.
partial_text = extract_partial_text(single_pdf_path, page_limit=2)

# NOTE(review): no `model` is passed here, while the batch loop passes
# model="gpt-4o-mini"; if the helper's default differs, this preview may not
# reflect what the batch run produces — confirm in scripts/llm_extractor.py.
title, abstract = extract_title_abstract_with_llm(partial_text)

print("Single PDF Title:", title)
print("Single PDF Abstract:", abstract)

## 📜 2. Extract Title, Abstract, and Full Text

In [None]:
# Build one row per renamed PDF: the file name, the title and abstract
# returned by the LLM helper, and the text of the whole document.
records = []

for pdf_path in renamed_files:
    # Full text is kept for the output table; the page-limited text is what
    # gets sent to the LLM.
    document_text = extract_full_text(pdf_path)
    front_pages_text = extract_partial_text(pdf_path, page_limit=2)

    paper_title, paper_abstract = extract_title_abstract_with_llm(
        front_pages_text,
        model="gpt-4o-mini",
    )

    row = dict(
        document_id=os.path.basename(pdf_path),
        title=paper_title,
        abstract=paper_abstract,
        raw_text=document_text,
    )
    records.append(row)

# Assemble the table and preview its first rows.
df = pd.DataFrame(records)
df.head()

In [None]:
# Display the first 11 rows for a larger preview than the default head().
df.head(11)

## 💾 3. Export as CSV

In [None]:
# Write the results table to CSV, omitting the DataFrame index column.
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; otherwise the export fails after the whole
# extraction run has already completed.
output_dir = os.path.dirname(output_file)
if output_dir:  # empty when output_file has no directory component
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_file, index=False)
print(f"Saved CSV to: {output_file}")