In [14]:
import fitz  # PyMuPDF for text extraction
import pdfplumber  # For basic table extraction
import camelot  # For structured table extraction
from tabula import convert_into  # For backup table extraction
import pandas as pd
import os
import re
from datetime import datetime

# Registry of every PDF to process: short name -> path of the PDF file.
# Iteration order of this dict is the order in which the files are processed.
pdf_files = {
    # -- Standard PDFs: plain text dump plus table extraction --
    "CHFC7F_1": r"Developer 4/CHFC7F_1/CHFC7F_1.pdf",
    "Chapter_15_Summary": r"Developer 4/Chapter 15 Summary & Conclusions/Chapter 15 Summary & Conclusions.pdf",
    "CorrectionalEducationandRecidivism": r"Developer 4/CorrectionalEducationandRecidivism-TowardAToolforReduction/CorrectionalEducationandRecidivism-TowardAToolforReduction.pdf",
    "Dodson_et_al_2011-libre": r"Developer 4/Dodson_et_al_2011-libre/Dodson_et_al_2011-libre.pdf",
    "Does_incarceration_based_drug_treatment": r"Developer 4/Does_incarceration_based_drug_treatment (1)/Does_incarceration_based_drug_treatment (1).pdf",
    "drug_court_metaanalysis": r"Developer 4/drug court metaanalysis/drug court metaanalysis.pdf",
    "education_metaanalysis": r"Developer 4/education metaanlysis/education metaanlysis.pdf",
    "Meta-analysis-of-CBT": r"Developer 4/Meta-analysis-of-CBT-Landenberger-Lipsey_CBT_JEC-paper/Meta-analysis-of-CBT-Landenberger-Lipsey_CBT_JEC-paper.pdf",

    # -- IEEE format PDFs: structured report (metadata, sections, references) --
    "CPA_Study": r"Developer 4/CPA Study 2015/CPA Study 2015.pdf",
    "Mackinac": r"Developer 4/Mackinac I/Mackinac I.pdf",
}

# Names (keys of pdf_files) that fall back to Camelot/Tabula when pdfplumber
# finds no tables.
alternative_table_pdfs = [
    "Does_incarceration_based_drug_treatment",
    "Meta-analysis-of-CBT",
]

# Names (keys of pdf_files) that are routed to the IEEE-format pipeline.
ieee_format_pdfs = [
    "CPA_Study",
    "Mackinac",
]

def extract_metadata(text):
    """Heuristically extract title, authors, date and abstract from paper text.

    All four fields are best-effort pattern matches on the raw extracted text;
    a field that cannot be found (or would be empty) is simply left out.

    Args:
        text: Full plain text of the document.

    Returns:
        dict with any of the keys ``'title'``, ``'authors'``, ``'date'`` and
        ``'abstract'`` mapped to the extracted strings.
    """
    metadata = {}

    # Title: the first non-blank line, cut short if the word "Introduction"
    # appears on that same line. Leading blank lines are skipped so extracted
    # text that starts with whitespace does not produce an empty title.
    first_line = text.lstrip().split('\n', 1)[0]
    title = re.split(r'Introduction', first_line, maxsplit=1, flags=re.IGNORECASE)[0].strip()
    if title:
        metadata['title'] = title

    # Only the start of the document is searched for the byline and the date.
    head = text[:1000]

    # Authors: a "By <names>" byline. The word boundary avoids matching "By"
    # at the end of a longer word, and the character class deliberately
    # excludes newlines so the lines following the byline are not swallowed.
    author_match = re.search(r'\bBy\s+([A-Za-z .,]+)', head)
    if author_match:
        authors = author_match.group(1).strip()
        if authors:
            metadata['authors'] = authors

    # Date in the form "Month D, YYYY" (month may be abbreviated with a dot).
    date_match = re.search(r'([A-Z][a-z]+\.?\s+\d{1,2},\s+\d{4})', head)
    if date_match:
        metadata['date'] = date_match.group(1)

    # Abstract: text after the first "Abstract"/"Introduction" marker up to the
    # next "Introduction" or blank line. Whitespace right after the marker is
    # skipped so a blank line under the heading does not end the match at once.
    abstract_match = re.search(
        r'(?:Abstract|Introduction)\s*(.*?)(?:Introduction|\n\n)',
        text,
        re.IGNORECASE | re.DOTALL,
    )
    if abstract_match:
        abstract = abstract_match.group(1).strip()
        if abstract:
            metadata['abstract'] = abstract

    return metadata

def extract_sections(text):
    """Split the paper text into its major sections.

    A section starts at a line that begins with one of the known heading
    names, optionally preceded by indentation and a numbering prefix such as
    ``1.``, ``2.3`` or ``IV.``. Requiring the heading at the start of a line
    keeps ordinary prose ("... the results show ...") from being mistaken for
    a heading. Each section runs until the next detected heading, or to the
    end of the text.

    Args:
        text: Full plain text of the document.

    Returns:
        dict mapping the heading name exactly as it appears in the text (case
        preserved, numbering prefix excluded) to the stripped section content,
        heading line included. Entries are ordered by position in the text.
        If the same heading text occurs more than once, the last occurrence
        is the one used.
    """
    # Common section headers in IEEE papers
    section_patterns = [
        r'Introduction', r'Methods?', r'Results?', r'Discussion',
        r'Conclusion', r'References', r'Endnotes', r'Future Research',
        r'Recommendations', r'Program Description', r'Evaluation Method'
    ]
    names = '|'.join(section_patterns)

    heading_re = re.compile(
        r'^[ \t]*'
        # Optional numbering: arabic ("1.", "2.3") or upper-case roman ("IV.").
        # The roman class is kept case-sensitive so ordinary lower-case words
        # made of those letters are not treated as numerals.
        r'(?:(?:\d+(?:\.\d+)*|(?-i:[IVXLCDM]+))[.)]?[ \t]+)?'
        rf'({names})',
        re.IGNORECASE | re.MULTILINE,
    )

    # Heading text -> offset of the start of its line match.
    starts = {}
    for match in heading_re.finditer(text):
        starts[match.group(1)] = match.start()

    ordered = sorted(starts.items(), key=lambda item: item[1])

    extracted_sections = {}
    for i, (section_name, start) in enumerate(ordered):
        end = ordered[i + 1][1] if i + 1 < len(ordered) else len(text)
        extracted_sections[section_name] = text[start:end].strip()

    return extracted_sections

def extract_tables_from_text(text):
    """Collect caption-led text blocks that look like tables or graphics.

    A block begins at "Table <n>" or "Graphic <n>" and extends through the
    next blank line (or to the end of the text if there is none). All "Table"
    blocks are returned first, followed by all "Graphic" blocks.
    """
    caption_regexes = (
        r'(Table\s+\d+.*?(?:\n\n|\Z))',
        r'(Graphic\s+\d+.*?(?:\n\n|\Z))',
    )
    return [
        hit.group(1)
        for regex in caption_regexes
        for hit in re.finditer(regex, text, re.DOTALL)
    ]

def extract_references(text):
    """Extract the individual entries of the References (or Endnotes) section.

    The first occurrence of "References" is tried first, then "Endnotes"
    (both case-insensitive). The section runs from the marker up to the first
    blank line that is followed by a word character, or to the end of the
    text. A marker whose section turns out to be empty is skipped so the next
    marker still gets a chance.

    Args:
        text: Full plain text of the document.

    Returns:
        list of reference strings. Numbered entries ("1. Author ...") are
        returned without their number; if no numbered entries are found, every
        non-blank line of the section is returned as one entry. Empty list
        when no section is found.
    """
    ref_section = None
    for section_name in ['References', 'Endnotes']:
        # Raw f-string so \w and \Z reach the regex engine as written instead
        # of being (invalid) string escapes. \s* skips the whitespace directly
        # under the heading; without it a heading followed by a blank line
        # would terminate the lazy match immediately with empty content.
        ref_pattern = rf"{section_name}\s*(.*?)(?:\n\n\w|\Z)"
        ref_match = re.search(ref_pattern, text, re.IGNORECASE | re.DOTALL)
        if ref_match and ref_match.group(1).strip():
            ref_section = ref_match.group(1).strip()
            break

    if not ref_section:
        return []

    # Most common format is numbered references like "1. Author...".
    # The lookahead's \Z already terminates the last entry, so no sentinel is
    # appended: an appended number would merge with a reference that ends in
    # digits (e.g. "p. 12") and cut those digits off.
    ref_items = re.findall(r'\d+\.\s+(.*?)(?=\d+\.|\Z)', ref_section, re.DOTALL)
    if ref_items:
        return [item.strip() for item in ref_items]

    # Alternative format: one reference per line.
    return [line.strip() for line in ref_section.split('\n') if line.strip()]

def process_ieee_pdf(pdf_path, output_dir):
    """Process an IEEE format PDF file and extract structured information.

    Writes two files into ``output_dir``, both named after the PDF's base
    filename: ``<filename>_full_text.txt`` with the raw text of every page and
    ``<filename>_structured_report.txt`` with the extracted metadata,
    sections, tables and references.

    Args:
        pdf_path: Path of the PDF to read.
        output_dir: Directory for the output files; created if missing.

    Returns:
        True on success; False if anything raised while reading the PDF or
        writing the outputs (the error is printed, not re-raised, so a batch
        run can continue with the next file).
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Extract filename without extension
    filename = os.path.splitext(os.path.basename(pdf_path))[0]
    text_output_path = os.path.join(output_dir, f"{filename}_full_text.txt")
    report_path = os.path.join(output_dir, f"{filename}_structured_report.txt")

    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            full_text = "".join(page.get_text("text") + "\n" for page in doc)

        # Save the full text
        with open(text_output_path, "w", encoding="utf-8") as f:
            f.write(full_text)

        metadata = extract_metadata(full_text)
        sections = extract_sections(full_text)
        tables = extract_tables_from_text(full_text)
        references = extract_references(full_text)

        # Save structured data as a text report
        with open(report_path, "w", encoding="utf-8") as f:
            f.write("=== EXTRACTED METADATA ===\n")
            for key, value in metadata.items():
                f.write(f"{key}: {value}\n\n")

            f.write("\n=== EXTRACTED SECTIONS ===\n")
            for name, content in sections.items():
                f.write(f"## {name} ##\n")
                f.write(f"{content}\n\n")

            f.write("\n=== EXTRACTED TABLES ===\n")
            for i, table in enumerate(tables):
                f.write(f"Table {i+1}:\n{table}\n\n")

            f.write("\n=== EXTRACTED REFERENCES ===\n")
            for i, ref in enumerate(references):
                f.write(f"{i+1}. {ref}\n")

        print(f"✅ Successfully processed IEEE format PDF {pdf_path}")
        print(f"   - Full text saved to: {text_output_path}")
        print(f"   - Structured report saved to: {report_path}")

        return True

    except Exception as e:
        # Top-level boundary for one file: report and signal failure.
        print(f"❌ Error processing IEEE format PDF {pdf_path}: {e}")
        return False

def _extract_tables_with_fallbacks(name, pdf_path, output_dir):
    """Try Camelot, then Tabula, to export tables; return True if one succeeded."""
    # Try Camelot first (works best for structured tables)
    try:
        camelot_tables = camelot.read_pdf(pdf_path, pages="all", flavor="stream")
        if camelot_tables.n > 0:
            camelot_csv_path = os.path.join(output_dir, f"{name}_camelot_tables.csv")
            camelot_tables.export(camelot_csv_path, f="csv", compress=False)
            print(f"✅ Extracted tables saved using Camelot to: {camelot_csv_path}")
            return True
    except Exception as e:
        print(f"⚠️ Camelot failed for {name}: {e}")

    # Camelot raised or found nothing: use Tabula
    try:
        tabula_csv_path = os.path.join(output_dir, f"{name}_tabula_tables.csv")
        convert_into(pdf_path, tabula_csv_path, output_format="csv", pages="all")
        print(f"✅ Extracted tables saved using Tabula to: {tabula_csv_path}")
        return True
    except Exception as e:
        print(f"❌ Tabula also failed for {name}: {e}")
        return False


def process_standard_pdf(name, pdf_path, output_dir):
    """Process a standard PDF file with text and table extraction.

    Writes ``<name>_extracted_text.txt`` (text of every page) and, when
    pdfplumber finds tables, ``<name>_extracted_tables.csv`` into
    ``output_dir``. If pdfplumber finds no tables and ``name`` is listed in
    ``alternative_table_pdfs``, Camelot and then Tabula are tried instead.

    Args:
        name: Short identifier of the PDF; used as the output filename prefix.
        pdf_path: Path of the PDF to read.
        output_dir: Directory for the output files; created if missing.

    Returns:
        True if text extraction succeeded and table extraction did not fail
        (finding no tables at all is not a failure); False otherwise. Errors
        are printed rather than raised so a batch run can continue.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Paths for output files
    text_output_path = os.path.join(output_dir, f"{name}_extracted_text.txt")
    csv_output_path = os.path.join(output_dir, f"{name}_extracted_tables.csv")

    ### Extract text from PDF ###
    text_ok = True
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text("text") + "\n" for page in doc)

        # Save extracted text
        with open(text_output_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"✅ Extracted text saved to: {text_output_path}")

    except Exception as e:
        # Keep going: tables may still be extractable, but remember the
        # failure so the return value reflects it.
        print(f"❌ Error extracting text from {pdf_path}: {e}")
        text_ok = False

    ### Extract tables from PDF ###
    try:
        tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # NOTE(review): extract_table() yields a single table per
                # page; pages holding several tables may need
                # extract_tables() - confirm against the pdfplumber docs.
                table = page.extract_table()
                if table:
                    tables.append(pd.DataFrame(table))

        if tables:
            # Save tables to CSV
            final_df = pd.concat(tables, ignore_index=True)
            final_df.to_csv(csv_output_path, index=False, header=False)
            print(f"✅ Extracted table data saved to: {csv_output_path}")
            tables_ok = True
        else:
            print(f"⚠️ No tables found using pdfplumber in {pdf_path}")
            tables_ok = True

            # Use alternative method if the PDF is in the special list
            if name in alternative_table_pdfs:
                print(f"🔄 Trying alternative table extraction for {name}...")
                tables_ok = _extract_tables_with_fallbacks(name, pdf_path, output_dir)

    except Exception as e:
        print(f"❌ Error extracting tables from {pdf_path}: {e}")
        tables_ok = False

    return text_ok and tables_ok

def main():
    """Run the matching extraction pipeline over every entry of pdf_files."""
    for name, pdf_path in pdf_files.items():
        if os.path.exists(pdf_path):
            # Outputs are written next to the source PDF.
            output_dir = os.path.dirname(pdf_path)

            print(f"\nProcessing {name} ({pdf_path})...")

            # IEEE-format papers get the structured report; everything else
            # gets the plain text + table extraction.
            if name not in ieee_format_pdfs:
                process_standard_pdf(name, pdf_path, output_dir)
            else:
                process_ieee_pdf(pdf_path, output_dir)
        else:
            # Missing files are reported and skipped.
            print(f"⚠️ File not found: {pdf_path}")

    print("\n=== PROCESSING COMPLETE ===")


if __name__ == "__main__":
    main()


Processing CHFC7F_1 (Developer 4/CHFC7F_1/CHFC7F_1.pdf)...
✅ Extracted text saved to: Developer 4/CHFC7F_1\CHFC7F_1_extracted_text.txt
✅ Extracted table data saved to: Developer 4/CHFC7F_1\CHFC7F_1_extracted_tables.csv

Processing Chapter_15_Summary (Developer 4/Chapter 15 Summary & Conclusions/Chapter 15 Summary & Conclusions.pdf)...
✅ Extracted text saved to: Developer 4/Chapter 15 Summary & Conclusions\Chapter_15_Summary_extracted_text.txt
✅ Extracted table data saved to: Developer 4/Chapter 15 Summary & Conclusions\Chapter_15_Summary_extracted_tables.csv

Processing CorrectionalEducationandRecidivism (Developer 4/CorrectionalEducationandRecidivism-TowardAToolforReduction/CorrectionalEducationandRecidivism-TowardAToolforReduction.pdf)...
✅ Extracted text saved to: Developer 4/CorrectionalEducationandRecidivism-TowardAToolforReduction\CorrectionalEducationandRecidivism_extracted_text.txt
✅ Extracted table data saved to: Developer 4/CorrectionalEducationandRecidivism-TowardAToolforRe

2025-03-17T19:40:39 - INFO - Processing page-1
2025-03-17 19:40:39,145 - INFO - Processing page-1
2025-03-17T19:40:39 - INFO - Processing page-2
2025-03-17 19:40:39,479 - INFO - Processing page-2
2025-03-17T19:40:40 - INFO - Processing page-3
2025-03-17 19:40:40,058 - INFO - Processing page-3
2025-03-17T19:40:40 - INFO - Processing page-4
2025-03-17 19:40:40,683 - INFO - Processing page-4
2025-03-17T19:40:41 - INFO - Processing page-5
2025-03-17 19:40:41,325 - INFO - Processing page-5
2025-03-17T19:40:42 - INFO - Processing page-6
2025-03-17 19:40:42,207 - INFO - Processing page-6
2025-03-17T19:40:43 - INFO - Processing page-7
2025-03-17 19:40:43,093 - INFO - Processing page-7
2025-03-17T19:40:43 - INFO - Processing page-8
2025-03-17 19:40:43,725 - INFO - Processing page-8
2025-03-17T19:40:44 - INFO - Processing page-9
2025-03-17 19:40:44,332 - INFO - Processing page-9
2025-03-17T19:40:44 - INFO - Processing page-10
2025-03-17 19:40:44,661 - INFO - Processing page-10
2025-03-17T19:40:4

✅ Extracted tables saved using Camelot to: Developer 4/Does_incarceration_based_drug_treatment (1)\Does_incarceration_based_drug_treatment_camelot_tables.csv

Processing drug_court_metaanalysis (Developer 4/drug court metaanalysis/drug court metaanalysis.pdf)...
✅ Extracted text saved to: Developer 4/drug court metaanalysis\drug_court_metaanalysis_extracted_text.txt
✅ Extracted table data saved to: Developer 4/drug court metaanalysis\drug_court_metaanalysis_extracted_tables.csv

Processing education_metaanalysis (Developer 4/education metaanlysis/education metaanlysis.pdf)...
✅ Extracted text saved to: Developer 4/education metaanlysis\education_metaanalysis_extracted_text.txt
✅ Extracted table data saved to: Developer 4/education metaanlysis\education_metaanalysis_extracted_tables.csv

Processing Meta-analysis-of-CBT (Developer 4/Meta-analysis-of-CBT-Landenberger-Lipsey_CBT_JEC-paper/Meta-analysis-of-CBT-Landenberger-Lipsey_CBT_JEC-paper.pdf)...
✅ Extracted text saved to: Developer 4/

2025-03-17T19:41:23 - INFO - Processing page-1
2025-03-17 19:41:23,477 - INFO - Processing page-1
2025-03-17T19:41:23 - INFO - Processing page-2
2025-03-17 19:41:23,711 - INFO - Processing page-2
2025-03-17T19:41:24 - INFO - Processing page-3
2025-03-17 19:41:24,484 - INFO - Processing page-3
2025-03-17T19:41:25 - INFO - Processing page-4
2025-03-17 19:41:25,229 - INFO - Processing page-4
2025-03-17T19:41:25 - INFO - Processing page-5
2025-03-17 19:41:25,885 - INFO - Processing page-5
2025-03-17T19:41:26 - INFO - Processing page-6
2025-03-17 19:41:26,190 - INFO - Processing page-6
2025-03-17T19:41:26 - INFO - Processing page-7
2025-03-17 19:41:26,301 - INFO - Processing page-7
2025-03-17T19:41:26 - INFO - Processing page-8
2025-03-17 19:41:26,385 - INFO - Processing page-8
2025-03-17T19:41:26 - INFO - Processing page-9
2025-03-17 19:41:26,456 - INFO - Processing page-9
2025-03-17T19:41:26 - INFO - Processing page-10
2025-03-17 19:41:26,678 - INFO - Processing page-10
2025-03-17T19:41:2

✅ Extracted tables saved using Camelot to: Developer 4/Meta-analysis-of-CBT-Landenberger-Lipsey_CBT_JEC-paper\Meta-analysis-of-CBT_camelot_tables.csv

Processing CPA_Study (Developer 4/CPA Study 2015/CPA Study 2015.pdf)...
✅ Successfully processed IEEE format PDF Developer 4/CPA Study 2015/CPA Study 2015.pdf
   - Full text saved to: Developer 4/CPA Study 2015\CPA Study 2015_full_text.txt
   - Structured report saved to: Developer 4/CPA Study 2015\CPA Study 2015_structured_report.txt

Processing Mackinac (Developer 4/Mackinac I/Mackinac I.pdf)...
✅ Successfully processed IEEE format PDF Developer 4/Mackinac I/Mackinac I.pdf
   - Full text saved to: Developer 4/Mackinac I\Mackinac I_full_text.txt
   - Structured report saved to: Developer 4/Mackinac I\Mackinac I_structured_report.txt

=== PROCESSING COMPLETE ===
