# 📄 Extracting Text from Scientific Papers with Configuration

## 🔧 1. Setup

In [1]:
# Imports
import os
import pandas as pd
import sys

# Make sure Python can find your scripts folder
sys.path.append("..")

# Import the functions
from scripts.pdf_reader import extract_full_text, extract_partial_text
from scripts.rename_pdfs import rename_pdfs_in_folder
from scripts.llm_extractor import extract_title_abstract_with_llm
from scripts.config_loader import load_config

### 🗂️ Load Configuration and Define Batch

In [3]:
# Pick the configuration profile for this run.
# Valid choices: 'high_quality', 'fast_processing', or None for the defaults.
# ⬇️ Edit the argument below to switch profiles
config = load_config('fast_processing')

# Echo the settings that drive the rest of the notebook so the run is
# self-documenting when the output is read later
print(
    "🔍 Processing Configuration:",
    f"LLM Model: {config.get('llm.model')}",
    f"Page Limit for Metadata: {config.get('pdf.extraction.page_limit_for_metadata')}",
    f"Extract Full Text: {config.get('pdf.extraction.full_text')}\n",
    sep="\n",
)

# ⬇️ Edit this to point the notebook at a different batch of PDFs
batch_name = "third_batch"

# The input folder and the output file are both derived from the configured
# base paths plus the batch name
pdf_folder = os.path.join(config.get("paths.data_dir"), batch_name)
output_file = os.path.join(config.get("paths.output_dir"), f"{batch_name}.csv")

print(
    f"📂 PDF Folder: {pdf_folder}",
    f"📄 Output File: {output_file}",
    sep="\n",
)

🔍 Processing Configuration:
LLM Model: gpt-3.5-turbo
Page Limit for Metadata: 2
Extract Full Text: True

📂 PDF Folder: ../data/batches/third_batch
📄 Output File: ../outputs/third_batch.csv


#### 🖋️ Rename the PDFs

In [4]:
# Rename the PDFs in the batch folder and keep the mapping from each new
# filename back to the name the file originally had.
# NOTE(review): this call renames files on disk; presumably re-running it on an
# already-renamed folder would lose the true original names — confirm against
# scripts/rename_pdfs.py before re-executing this cell.
renamed_files, original_filenames_map = rename_pdfs_in_folder(pdf_folder)
print(f"🔄 Renamed {len(renamed_files)} files")

# Preview only the first few mappings so the output stays short for big batches
preview_count = 3
print("\n📋 Sample of original filenames:")
for new_name, original_name in list(original_filenames_map.items())[:preview_count]:
    print(f"  {new_name} ← {original_name}")

# Tell the reader how many mappings were left out of the preview
remaining = len(original_filenames_map) - preview_count
if remaining > 0:
    print(f"  ... and {remaining} more")

🔄 Renamed 3 files

📋 Sample of original filenames:
  paper_001.pdf ← Technische Innovation, thereotische Sackgasse?.pdf
  paper_002.pdf ← The Dynamics of Political Incivility on Twitter.pdf
  paper_003.pdf ← text-as-data-the-promise-and-pitfalls-of-automatic-content-analysis-methods-for-political-texts.pdf


#### Test Extraction on a Single PDF

In [5]:
# Smoke-test the extraction pipeline on one PDF before running the whole batch
if renamed_files:
    single_pdf_path = renamed_files[0]  # Use the first PDF

    # Read the same settings the batch loop uses so this preview is
    # representative of what the full run will produce
    page_limit = config.get("pdf.extraction.page_limit_for_metadata")
    model = config.get("llm.model")
    partial_text = extract_partial_text(single_pdf_path, page_limit)

    # Pass the configured model explicitly; without it the helper would fall
    # back to whatever default it defines, which may differ from the profile
    title, abstract = extract_title_abstract_with_llm(partial_text, model)

    print(f"🔬 Testing extraction on: {os.path.basename(single_pdf_path)}")
    print(f"\nTitle: {title}")
    # Only the first 300 characters of the abstract are shown to keep output short
    print(f"\nAbstract: {abstract[:300]}...")
else:
    print("❌ No PDFs found to process")

🔬 Testing extraction on: paper_001.pdf

Title: Technische Innovation, theoretische Sackgasse? Chancen und Grenzen der automatisierten Inhaltsanalyse in Lehre und Forschung

Abstract: Die automatisierte Inhaltsanalyse wird auch in der Journalismusforschung zunehmend genutzt, um Texte (teil)-automatisiert zu analysieren. Sie hat damit zu einer Methodeninnovation im Fach beigetragen, die jedoch selten kritisch diskutiert wird. Der Beitrag gibt einen Überblick über Chancen und Grenz...


## 📜 2. Extract Text and Abstract

In [6]:
# Run the extraction over every renamed PDF and collect one record per file.
# A failure on one file is logged and recorded as an error row so the rest of
# the batch still completes.
records = []
failed_files = []  # filenames whose processing raised; reported after the loop
processed_count = 0
total_files = len(renamed_files)

# Get configuration options
extract_full = config.get("pdf.extraction.full_text", True)
page_limit = config.get("pdf.extraction.page_limit_for_metadata")
model = config.get("llm.model")

# Fix the column order up front so the DataFrame (and the exported file) has
# the same schema even when the batch is empty or every paper fails
columns = ["document_id", "original_filename", "title", "abstract"]
if extract_full:
    columns.append("raw_text")

for processed_count, file_path in enumerate(renamed_files, start=1):
    filename = os.path.basename(file_path)
    # The map is keyed by the new (renamed) filename
    original_filename = original_filenames_map.get(filename, "Unknown")

    print(f"Processing {processed_count} of {total_files}: {filename}")

    try:
        # Full-text extraction is optional and only runs when enabled in config
        if extract_full:
            full_text = extract_full_text(file_path)  # entire PDF text

        # Title and abstract are extracted from the first pages only
        partial_text = extract_partial_text(file_path, page_limit)
        title, abstract = extract_title_abstract_with_llm(partial_text, model)

        record = {
            "document_id": filename,
            "original_filename": original_filename,
            "title": title,
            "abstract": abstract
        }

        # Only include raw_text if full text extraction is enabled
        if extract_full:
            record["raw_text"] = full_text

        records.append(record)

    except Exception as e:
        # Deliberately broad: any failure on one file must not abort the batch
        print(f"Error processing {file_path}: {str(e)}")
        failed_files.append(filename)
        # Add error record so the failed file still appears in the output
        records.append({
            "document_id": filename,
            "original_filename": original_filename,
            "title": "ERROR: Processing failed",
            "abstract": f"Error: {str(e)}"
        })

# Create DataFrame
df = pd.DataFrame(records, columns=columns)

# Report real successes only; error rows are in `records` but did not succeed
succeeded = len(records) - len(failed_files)
print(f"\n✅ Processed {succeeded} papers successfully")
if failed_files:
    print(f"⚠️ {len(failed_files)} of {total_files} papers failed: {', '.join(failed_files)}")
df.head()

Processing 1 of 3: paper_001.pdf
Processing 2 of 3: paper_002.pdf
Processing 3 of 3: paper_003.pdf

✅ Processed 3 papers successfully


Unnamed: 0,document_id,original_filename,title,abstract,raw_text
0,paper_001.pdf,"Technische Innovation, thereotische Sackgasse?...","Technische Innovation, theoretische Sackgasse?...",Die automatisierte Inhaltsanalyse wird auch in...,"www.ssoar.info\nTechnische Innovation, theoret..."
1,paper_002.pdf,The Dynamics of Political Incivility on Twitte...,The Dynamics of Political Incivility,Online incivility and harassment in political ...,919447\nresearch-article20202020 SGOXXX10.1177...
2,paper_003.pdf,text-as-data-the-promise-and-pitfalls-of-autom...,Text as Data: The Promise and Pitfalls of Auto...,Politics and political conflict often occur in...,"AdvanceAccesspublicationJanuary22,2013 Politic..."


## 💾 3. Export Results

In [7]:
# Write the results table to disk in the format selected in the configuration
# (CSV when the setting is absent).
output_format = config.get("output.format", "csv")

# Create the destination folder on first use so the write does not fail on a
# fresh checkout; skipped for unsupported formats, which write nothing
if output_format in ("csv", "json"):
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

if output_format == "csv":
    df.to_csv(output_file, index=False)
    print(f"💾 Saved CSV to: {output_file}")
elif output_format == "json":
    # Swap only the file extension; a plain str.replace would also rewrite any
    # ".csv" that happens to appear in a folder or batch name
    json_file = os.path.splitext(output_file)[0] + ".json"
    df.to_json(json_file, orient="records", indent=2)
    print(f"💾 Saved JSON to: {json_file}")
else:
    print(f"⚠️ Unsupported output format: {output_format}")

💾 Saved CSV to: ../outputs/third_batch.csv


## 📊 4. Summary Statistics

In [8]:
# Display simple statistics about the extraction
print(f"Total papers processed: {len(df)}")

# Rows written by the batch loop's error handler carry this sentinel title.
# Their abstract starts with "Error:" (mixed case), which the case-sensitive
# pattern below does not match, so failed rows are identified by title instead.
failed_rows = df['title'].str.startswith('ERROR: Processing failed', na=False)

# Use raw strings (r prefix) for regex patterns to avoid escape issues
not_extracted_pattern = r'ERROR|I dont know'

# A field counts as extracted when it is present, does not match the
# "not extracted" pattern, and does not belong to a failed row
title_ok = (
    df['title'].notna()
    & ~failed_rows
    & ~df['title'].str.contains(not_extracted_pattern, na=False)
)
abstract_ok = (
    df['abstract'].notna()
    & ~failed_rows
    & ~df['abstract'].str.contains(not_extracted_pattern, na=False)
)
print(f"Papers with title extracted: {title_ok.sum()}")
print(f"Papers with abstract extracted: {abstract_ok.sum()}")

# Calculate average lengths over successfully extracted fields only, so error
# placeholders do not skew the numbers (NaN when nothing was extracted)
avg_title_length = df.loc[title_ok, 'title'].str.len().mean()
avg_abstract_length = df.loc[abstract_ok, 'abstract'].str.len().mean()
print(f"Average title length: {avg_title_length:.1f} characters")
print(f"Average abstract length: {avg_abstract_length:.1f} characters")

# Show original filename mapping
print("\n🔍 Sample with original filenames:")
display(df[['document_id', 'original_filename', 'title']].head(3))

Total papers processed: 3
Papers with title extracted: 3
Papers with abstract extracted: 3
Average title length: 85.3 characters
Average abstract length: 823.3 characters

🔍 Sample with original filenames:


Unnamed: 0,document_id,original_filename,title
0,paper_001.pdf,"Technische Innovation, thereotische Sackgasse?...","Technische Innovation, theoretische Sackgasse?..."
1,paper_002.pdf,The Dynamics of Political Incivility on Twitte...,The Dynamics of Political Incivility
2,paper_003.pdf,text-as-data-the-promise-and-pitfalls-of-autom...,Text as Data: The Promise and Pitfalls of Auto...
