# 📄 Extracting Text from Scientific Papers with Configuration

## 🔧 1. Setup"

In [1]:
# Imports
import os
import pandas as pd
import sys

# Make sure Python can find your scripts folder
sys.path.append("..")

# Import the functions
from scripts.pdf_reader import extract_full_text, extract_partial_text
from scripts.rename_pdfs import rename_pdfs_in_folder
from scripts.llm_extractor import extract_title_abstract_with_llm
from scripts.config_loader import load_config

### 🗂️ Load Configuration and Define Batch

In [2]:
# Load configuration with a specific profile
# Options: 'high_quality' or 'fast_processing' or None for default
# ⬇️ Decide which profile to use
config = load_config('high_quality')  # Change this to use different profiles

# Print out some key configuration settings
print("🔍 Processing Configuration:")
print(f"LLM Model: {config.get('llm.model')}")
print(f"Page Limit for Metadata: {config.get('pdf.extraction.page_limit_for_metadata')}")
print(f"Extract Full Text: {config.get('pdf.extraction.full_text')}\n")

# Define batch
# ⬇️ Decide which batch to use
batch_name = "second_batch"

# Define folder paths using the configuration 
pdf_folder = os.path.join(config.get("paths.data_dir"), batch_name)
output_file = os.path.join(config.get("paths.output_dir"), f"{batch_name}.csv")

print(f"📂 PDF Folder: {pdf_folder}")
print(f"📄 Output File: {output_file}")

🔍 Processing Configuration:
LLM Model: gpt-4o
Page Limit for Metadata: 5
Extract Full Text: True

📂 PDF Folder: ../data/batches/second_batch
📄 Output File: ../outputs/second_batch.csv


#### 🖋️ Rename the PDFs

In [3]:
# Rename PDFs using the configuration
renamed_files = rename_pdfs_in_folder(pdf_folder)
print(f"🔄 Renamed {len(renamed_files)} files")

🔄 Renamed 11 files


#### Test Extraction on a Single PDF

In [4]:
# Test on a single PDF
if renamed_files:
    single_pdf_path = renamed_files[0]  # Use the first PDF
    
    # Get page limit from config
    page_limit = config.get("pdf.extraction.page_limit_for_metadata")
    partial_text = extract_partial_text(single_pdf_path, page_limit)
    
    # Extract title and abstract using the configured model
    title, abstract = extract_title_abstract_with_llm(partial_text)
    
    print(f"🔬 Testing extraction on: {os.path.basename(single_pdf_path)}")
    print(f"\nTitle: {title}")
    print(f"\nAbstract: {abstract[:300]}...")
else:
    print("❌ No PDFs found to process")

🔬 Testing extraction on: paper_001.pdf

Title: Implementation and evaluation of an additional GPT-4-based reviewer in PRISMA-based medical systematic literature reviews

Abstract: Background: PRISMA-based literature reviews require meticulous scrutiny of extensive textual data by multiple reviewers, which is associated with considerable human effort. Objective: To evaluate feasibility and reliability of using GPT-4 API as a complementary reviewer in systematic literature revi...


## 📜 2. Extract Text and Abstract

In [5]:
records = []
processed_count = 0
total_files = len(renamed_files)

# Get configuration options
extract_full = config.get("pdf.extraction.full_text", True)
page_limit = config.get("pdf.extraction.page_limit_for_metadata")
model = config.get("llm.model")

for file_path in renamed_files:
    processed_count += 1
    print(f"Processing {processed_count} of {total_files}: {os.path.basename(file_path)}")
    
    try:
        # Extract text based on configuration
        if extract_full:
            full_text = extract_full_text(file_path)  # entire PDF text
        else:
            full_text = ""  # Skip full text extraction if disabled in config
            
        partial_text = extract_partial_text(file_path, page_limit)

        # Extract title and abstract
        title, abstract = extract_title_abstract_with_llm(partial_text, model)

        # Create record with dynamic fields based on config
        record = {
            "document_id": os.path.basename(file_path),
            "title": title,
            "abstract": abstract
        }
        
        # Only include raw_text if full text extraction is enabled
        if extract_full:
            record["raw_text"] = full_text
            
        records.append(record)
        
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        # Add error record
        records.append({
            "document_id": os.path.basename(file_path),
            "title": "ERROR: Processing failed",
            "abstract": f"Error: {str(e)}"
        })

# Create DataFrame
df = pd.DataFrame(records)
print(f"\n✅ Processed {len(records)} papers successfully")
df.head()

Processing 1 of 11: paper_001.pdf
Processing 2 of 11: paper_002.pdf
Processing 3 of 11: paper_003.pdf
Processing 4 of 11: paper_004.pdf
Processing 5 of 11: paper_005.pdf
Processing 6 of 11: paper_006.pdf
Processing 7 of 11: paper_007.pdf
Processing 8 of 11: paper_008.pdf
Processing 9 of 11: paper_009.pdf
Processing 10 of 11: paper_010.pdf
Processing 11 of 11: paper_011.pdf

✅ Processed 11 papers successfully


Unnamed: 0,document_id,title,abstract,raw_text
0,paper_001.pdf,Implementation and evaluation of an additional...,Background: PRISMA-based literature reviews re...,InternationalJournalofMedicalInformatics189(20...
1,paper_002.pdf,Automating Systematic Literature Reviews with ...,Objectives: An SLR is presented focusing on te...,"Preprint of: Sundaram, G. and Berleant, D., Au..."
2,paper_003.pdf,Cutting Through the Clutter: The Potential of ...,"In academic research, systematic literature re...",Cutting Through the Clutter: The Potential of ...
3,paper_004.pdf,Can large language models replace humans in sy...,Systematic reviews are vital for guiding pract...,Received:10October2023 Revised:6February2024 A...
4,paper_005.pdf,Title and abstract screening for literature re...,Background Systematically screening published ...,Dennstädt et al. Systematic Reviews (2024) 13:...


## 💾 3. Export Results

In [6]:
# Get output format from config
output_format = config.get("output.format", "csv")

if output_format == "csv":
    df.to_csv(output_file, index=False)
    print(f"💾 Saved CSV to: {output_file}")
elif output_format == "json":
    json_file = output_file.replace(".csv", ".json")
    df.to_json(json_file, orient="records", indent=2)
    print(f"💾 Saved JSON to: {json_file}")
else:
    print(f"⚠️ Unsupported output format: {output_format}")

💾 Saved CSV to: ../outputs/second_batch.csv


## 📊 4. Summary Statistics

In [8]:
# Display simple statistics about the extraction
print(f"Total papers processed: {len(df)}")
print(f"Papers with title extracted: {df['title'].count() - df['title'].str.contains('ERROR|I don\\'t know').sum()}")
print(f"Papers with abstract extracted: {df['abstract'].count() - df['abstract'].str.contains('ERROR|I don\\'t know').sum()}")

# Calculate average lengths
avg_title_length = df['title'].str.len().mean()
avg_abstract_length = df['abstract'].str.len().mean()
print(f"Average title length: {avg_title_length:.1f} characters")
print(f"Average abstract length: {avg_abstract_length:.1f} characters")

SyntaxError: f-string expression part cannot include a backslash (2702739586.py, line 3)