# Setup and Imports

In [1]:
import fitz  # PyMuPDF
import json
import os
from pathlib import Path
from datetime import datetime

## Configuration

BASE_PATH = "C:/CSR_Report"  # Change this to your actual path

# Companies and years
COMPANIES = ["Danone", "Indofood", "Mayora", "Ultra_jaya", "Unilever"]
YEARS = [2019, 2020, 2021, 2022, 2023, 2024]

# Output file
OUTPUT_FILE = "extracted_csr_data.json"

## Helper Functions

In [2]:
def extract_text_from_pdf(pdf_path):
    """
    Extract text from PDF file using PyMuPDF
    Returns: dict with text and page count
    """
    try:
        doc = fitz.open(pdf_path)
        text = ""
        page_texts = []
        
        # Extract text from each page
        for page_num, page in enumerate(doc, start=1):
            page_text = page.get_text()
            page_texts.append({
                "page_number": page_num,
                "text": page_text
            })
            text += f"\n--- Page {page_num} ---\n{page_text}"
        
        doc.close()
        
        return {
            "success": True,
            "full_text": text,
            "page_texts": page_texts,
            "page_count": len(page_texts),
            "file_size": os.path.getsize(pdf_path)
        }
    
    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "full_text": "",
            "page_texts": [],
            "page_count": 0
        }

# %%
def clean_text(text):
    """
    Basic text cleaning
    """
    # Remove excessive whitespace
    text = " ".join(text.split())
    
    # Remove excessive newlines (keep some structure)
    text = "\n".join([line.strip() for line in text.split("\n") if line.strip()])
    
    return text

## Process All PDFs

In [3]:
def process_all_pdfs():
    """
    Process all 25 PDFs and extract text with metadata
    """
    all_data = []
    stats = {
        "total_files": 0,
        "successful": 0,
        "failed": 0,
        "total_pages": 0
    }
    
    print("=" * 60)
    print("Starting PDF Extraction Process")
    print("=" * 60)
    
    for company in COMPANIES:
        for year in YEARS:
            # Construct file path
            pdf_path = os.path.join(BASE_PATH, company, f"CSR_{year}.pdf")
            
            stats["total_files"] += 1
            
            # Check if file exists
            if not os.path.exists(pdf_path):
                print(f"‚ö†Ô∏è  NOT FOUND: {company} - {year}")
                stats["failed"] += 1
                continue
            
            print(f"\nüìÑ Processing: {company} - {year}")
            print(f"   Path: {pdf_path}")
            
            # Extract text
            result = extract_text_from_pdf(pdf_path)
            
            if result["success"]:
                # Clean the text
                cleaned_text = clean_text(result["full_text"])
                
                # Create document with metadata
                document = {
                    "company": company,
                    "year": year,
                    "source_file": pdf_path,
                    "filename": f"CSR_{year}.pdf",
                    "extracted_at": datetime.now().isoformat(),
                    "page_count": result["page_count"],
                    "file_size_bytes": result["file_size"],
                    "text": cleaned_text,
                    "page_texts": result["page_texts"]  # Keep individual pages for reference
                }
                
                all_data.append(document)
                stats["successful"] += 1
                stats["total_pages"] += result["page_count"]
                
                print(f"   ‚úÖ Success! Pages: {result['page_count']}, Size: {result['file_size']/1024:.1f} KB")
                print(f"   Text length: {len(cleaned_text):,} characters")
            else:
                print(f"   ‚ùå Failed: {result['error']}")
                stats["failed"] += 1
    
    return all_data, stats

## Run Extraction

In [4]:
# Run the extraction process
extracted_data, statistics = process_all_pdfs()

Starting PDF Extraction Process

üìÑ Processing: Danone - 2019
   Path: C:/CSR_Report\Danone\CSR_2019.pdf
   ‚úÖ Success! Pages: 54, Size: 30149.1 KB
   Text length: 75,916 characters

üìÑ Processing: Danone - 2020
   Path: C:/CSR_Report\Danone\CSR_2020.pdf
   ‚úÖ Success! Pages: 54, Size: 30149.1 KB
   Text length: 75,916 characters

üìÑ Processing: Danone - 2021
   Path: C:/CSR_Report\Danone\CSR_2021.pdf
   ‚úÖ Success! Pages: 30, Size: 1764.4 KB
   Text length: 75,717 characters

üìÑ Processing: Danone - 2022
   Path: C:/CSR_Report\Danone\CSR_2022.pdf
   ‚úÖ Success! Pages: 30, Size: 1764.4 KB
   Text length: 75,717 characters
‚ö†Ô∏è  NOT FOUND: Danone - 2023

üìÑ Processing: Danone - 2024
   Path: C:/CSR_Report\Danone\CSR_2024.pdf
   ‚úÖ Success! Pages: 48, Size: 1424.4 KB
   Text length: 135,522 characters
‚ö†Ô∏è  NOT FOUND: Indofood - 2019

üìÑ Processing: Indofood - 2020
   Path: C:/CSR_Report\Indofood\CSR_2020.pdf
   ‚úÖ Success! Pages: 31, Size: 555.5 KB
   Text length: 

## Display Statistics

In [5]:
print("\n" + "=" * 60)
print("EXTRACTION COMPLETE - STATISTICS")
print("=" * 60)
print(f"Total files attempted: {statistics['total_files']}")
print(f"Successfully extracted: {statistics['successful']}")
print(f"Failed: {statistics['failed']}")
print(f"Total pages processed: {statistics['total_pages']}")
print(f"Average pages per document: {statistics['total_pages']/max(statistics['successful'], 1):.1f}")

# Show breakdown by company
print("\nüìä Breakdown by Company:")
company_stats = {}
for doc in extracted_data:
    company = doc["company"]
    if company not in company_stats:
        company_stats[company] = {"count": 0, "pages": 0}
    company_stats[company]["count"] += 1
    company_stats[company]["pages"] += doc["page_count"]

for company, stats in sorted(company_stats.items()):
    print(f"   {company}: {stats['count']} reports, {stats['pages']} total pages")


EXTRACTION COMPLETE - STATISTICS
Total files attempted: 30
Successfully extracted: 25
Failed: 5
Total pages processed: 1400
Average pages per document: 56.0

üìä Breakdown by Company:
   Danone: 5 reports, 216 total pages
   Indofood: 5 reports, 447 total pages
   Mayora: 5 reports, 150 total pages
   Ultra_jaya: 5 reports, 27 total pages
   Unilever: 5 reports, 560 total pages


## Save to JSON

In [6]:
# Save extracted data
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(extracted_data, f, ensure_ascii=False, indent=2)

print(f"\nüíæ Data saved to: {OUTPUT_FILE}")
print(f"   File size: {os.path.getsize(OUTPUT_FILE)/1024/1024:.2f} MB")


üíæ Data saved to: extracted_csr_data.json
   File size: 7.79 MB


## Quick Preview


In [7]:
# Show a sample of extracted text
if extracted_data:
    sample = extracted_data[0]
    print("\n" + "=" * 60)
    print("SAMPLE EXTRACTED TEXT")
    print("=" * 60)
    print(f"Company: {sample['company']}")
    print(f"Year: {sample['year']}")
    print(f"Pages: {sample['page_count']}")
    print(f"\nFirst 500 characters:")
    print("-" * 60)
    print(sample['text'][:500])
    print("...")


SAMPLE EXTRACTED TEXT
Company: Danone
Year: 2019
Pages: 54

First 500 characters:
------------------------------------------------------------
--- Page 1 --- Melestarikan Kebaikan Lingkungan Laporan Keberlanjutan 2020 PT Tirta Investama (Danone-AQUA) 28 --- Page 2 --- Komitmen Danone-AQUA terhadap pelestarian lingkungan tercermin dalam setiap langkah operasi kami. Perusahaan menaruh perhatian tinggi terhadap upaya kami dalam meminimalkan risiko dan dampak operasi terhadap lingkungan melalui penggunaan energi yang bertanggung jawab, pemanfaatan air secara lestari hingga pengurangan limbah dan emisi. Perumusan dan implementasi kebijakan 
...


## Verify Data Quality

In [8]:
print("\n" + "=" * 60)
print("DATA QUALITY CHECKS")
print("=" * 60)

issues = []

for doc in extracted_data:
    # Check for very short documents (might indicate extraction failure)
    if len(doc['text']) < 1000:
        issues.append(f"‚ö†Ô∏è  {doc['company']} {doc['year']}: Very short text ({len(doc['text'])} chars)")
    
    # Check for documents with very few pages
    if doc['page_count'] < 3:
        issues.append(f"‚ö†Ô∏è  {doc['company']} {doc['year']}: Only {doc['page_count']} pages")

if issues:
    print("\n‚ö†Ô∏è  Potential Issues Found:")
    for issue in issues:
        print(f"   {issue}")
else:
    print("\n‚úÖ All documents look good!")



DATA QUALITY CHECKS

‚úÖ All documents look good!


## Next Steps


In [9]:
print("\n" + "=" * 60)
print("‚úÖ STEP 1 COMPLETE!")
print("=" * 60)
print("\nNext: Open 02_chunking_and_embeddings.ipynb")


‚úÖ STEP 1 COMPLETE!

Next: Open 02_chunking_and_embeddings.ipynb
