# 10-K Filing Parser

This notebook parses SEC 10-K filings and segments them into a structured folder hierarchy based on specific delimiters:

## Structure:
- **Main folder**: Company name (e.g., `Apple_Inc_Parsed`)
- **Subfolders**: Major parts delimited by `╔═ § ══════════` 
- **Files**: Individual sections delimited by `╭─ • ───────`

## Features:
- ✅ Automatic company folder creation
- ✅ Part-based folder organization  
- ✅ Section-based file splitting
- ✅ Content cleaning (removes box drawing characters)
- ✅ Filename sanitization for Windows compatibility
- ✅ Handles large 10-K documents efficiently

## Usage:
Run the cell below to parse `AAPL_latest_10K.txt` into the structured format.

In [4]:
import os
import re
from pathlib import Path
import shutil

def sanitize_filename(filename):
    """Sanitize filename by removing invalid characters"""
    # Remove or replace invalid characters
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)  # Remove invalid chars
    filename = re.sub(r'\s+', '_', filename)  # Replace spaces with underscores
    filename = re.sub(r'[^\w\-_.]', '', filename)  # Keep only alphanumeric, dash, underscore, dot
    filename = filename.strip('._')  # Remove leading/trailing dots and underscores
    # Limit length
    if len(filename) > 100:
        filename = filename[:100]
    return filename if filename else "unnamed"

def parse_10k_simple(file_path, output_dir="Apple_Inc_Parsed"):
    """
    Simple and robust parser for 10-K filing
    """
    
    # Clean previous output if exists
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    
    # Read the 10-K filing
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Create main company folder
    company_dir = Path(output_dir)
    company_dir.mkdir(parents=True, exist_ok=True)
    
    print("Starting to parse 10-K filing...")
    
    # Split by major part delimiter: ╔═ § ═════════════════════════════════════════════════════════════════════════════════════════════════════════════╗
    major_parts = content.split('╔═ § ═')
    
    part_counter = 1
    for i, part in enumerate(major_parts):
        if not part.strip() or i == 0:  # Skip empty parts and the content before first delimiter
            continue
            
        # Extract part title from the first few lines
        lines = part.split('\n')[:10]
        part_title = f"Part_{part_counter}"
        
        for line in lines:
            clean_line = re.sub(r'[═║╗╚│─\s]+', ' ', line).strip()
            if clean_line and len(clean_line) > 3 and 'PART' in clean_line.upper():
                part_title = clean_line
                break
        
        # Sanitize part title for folder name
        clean_part_title = sanitize_filename(part_title)
        part_folder = company_dir / f"Part_{part_counter}_{clean_part_title}"
        part_folder.mkdir(exist_ok=True)
        
        print(f"Processing {part_title}")
        
        # Split part by section delimiter: ╭─ • ─────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
        sections = part.split('╭─ • ─')
        
        section_counter = 1
        for j, section in enumerate(sections):
            if not section.strip():
                continue
                
            # Extract section title
            section_lines = section.split('\n')[:5]
            section_title = f"Section_{section_counter}"
            
            for line in section_lines:
                clean_line = re.sub(r'[╮│╰─\s]+', ' ', line).strip()
                if clean_line and len(clean_line) > 3:
                    section_title = clean_line
                    break
            
            # Clean the section content
            content_lines = []
            for line in section.split('\n'):
                # Remove box drawing characters
                cleaned_line = re.sub(r'^[╔╗╚╝║│╭╮╰╯─═•\s]*', '', line)
                cleaned_line = re.sub(r'[╔╗╚╝║│╭╮╰╯─═•\s]*$', '', cleaned_line)
                if cleaned_line.strip():
                    content_lines.append(cleaned_line.strip())
            
            final_content = '\n'.join(content_lines)
            
            # Only save if there's substantial content
            if len(final_content.strip()) > 100:
                # Sanitize section title for filename
                clean_section_title = sanitize_filename(section_title)
                section_file = part_folder / f"Section_{section_counter}_{clean_section_title}.txt"
                
                try:
                    with open(section_file, 'w', encoding='utf-8') as f:
                        f.write(final_content)
                    print(f"  Created: {section_file.name}")
                    section_counter += 1
                except Exception as e:
                    print(f"  Error creating file: {e}")
                    print(f"  Attempted filename: {section_file.name}")
        
        part_counter += 1
    
    print(f"\nParsing completed! Files saved in: {company_dir}")
    return company_dir

# Run the parser
result_dir = parse_10k_simple("AAPL_latest_10K.txt")

# Show the directory structure
print("\n" + "="*50)
print("DIRECTORY STRUCTURE CREATED:")
print("="*50)
for root, dirs, files in os.walk(result_dir):
    level = root.replace(str(result_dir), '').count(os.sep)
    indent = '  ' * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = '  ' * (level + 1)
    for file in files[:5]:  # Show first 5 files per folder
        print(f"{subindent}{file}")
    if len(files) > 5:
        print(f"{subindent}... and {len(files) - 5} more files")

Starting to parse 10-K filing...
Processing PART I
  Created: Section_1_Item_1._Business.txt
  Created: Section_2_Item_1A._Risk_Factors.txt
  Created: Section_3_Item_1C._Cybersecurity.txt
  Created: Section_4_Item_2._Properties.txt
  Created: Section_5_Item_3._Legal_Proceedings.txt
Processing PART II
  Created: Section_1_Item_5._Market_for_Registrants_Common_Equity_Related_Stockholder_Matters_and_Issuer_Purchases_of_Equ.txt
  Created: Section_2_Item_7._Managements_Discussion_and_Analysis_of_Financial_Condition_and_Results_of_Operations.txt
  Created: Section_3_Item_7A._Quantitative_and_Qualitative_Disclosures_About_Market_Risk.txt
  Created: Section_4_Item_8._Financial_Statements_and_Supplementary_Data.txt
  Created: Section_5_Item_9A._Controls_and_Procedures.txt
  Created: Section_6_Item_9B._Other_Information.txt
Processing PART III
  Created: Section_1_Item_10._Directors_Executive_Officers_and_Corporate_Governance.txt
  Created: Section_2_Item_11._Executive_Compensation.txt
  Created

In [8]:
import json
import os
from pathlib import Path
from datetime import datetime
import re

class TenKStructuredAnalyzer:
    """
    Convert parsed 10-K folder structure into comprehensive structured JSON
    """
    
    def __init__(self, parsed_folder_path="Apple_Inc_Parsed"):
        self.parsed_folder = Path(parsed_folder_path)
        self.company_data = {}
        
    def extract_financial_metrics(self, text):
        """Extract key financial metrics from text"""
        metrics = {}
        
        # Revenue patterns
        revenue_patterns = [
            r'net sales[:\s]+\$?[\d,\.]+\s*(?:million|billion)?',
            r'total net sales[:\s]+\$?[\d,\.]+\s*(?:million|billion)?',
            r'revenue[:\s]+\$?[\d,\.]+\s*(?:million|billion)?'
        ]
        
        # Extract revenue
        for pattern in revenue_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                metrics['revenue_mentions'] = matches[:3]  # Top 3 mentions
                break
        
        # Extract key numbers (billions/millions)
        financial_numbers = re.findall(r'\$[\d,]+\.?\d*\s*(?:billion|million)', text, re.IGNORECASE)
        if financial_numbers:
            metrics['key_financial_figures'] = financial_numbers[:10]  # Top 10 figures
        
        # Extract percentages
        percentages = re.findall(r'\d+\.?\d*%', text)
        if percentages:
            metrics['percentages'] = percentages[:5]  # Top 5 percentages
            
        return metrics
    
    def extract_business_segments(self, text):
        """Extract business segments and product information"""
        segments = {}
        
        # Product lines
        product_patterns = {
            'iPhone': r'iPhone[^.]*\.?[^.]*\.',
            'Mac': r'Mac[^.]*\.?[^.]*\.',
            'iPad': r'iPad[^.]*\.?[^.]*\.',
            'Services': r'Services[^.]*\.?[^.]*\.',
            'Wearables': r'Wearables[^.]*\.?[^.]*\.'
        }
        
        for product, pattern in product_patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                segments[product] = [match.strip() for match in matches[:3]]
        
        # Geographic segments
        geo_patterns = {
            'Americas': r'Americas[^.]*\.?[^.]*\.',
            'Europe': r'Europe[^.]*\.?[^.]*\.',
            'Greater China': r'Greater China[^.]*\.?[^.]*\.',
            'Japan': r'Japan[^.]*\.?[^.]*\.',
            'Rest of Asia Pacific': r'Rest of Asia Pacific[^.]*\.?[^.]*\.'
        }
        
        geographic_segments = {}
        for region, pattern in geo_patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                geographic_segments[region] = [match.strip() for match in matches[:2]]
        
        if geographic_segments:
            segments['geographic_segments'] = geographic_segments
            
        return segments
    
    def extract_risk_factors(self, text):
        """Extract key risk factors"""
        risks = []
        
        # Split by common risk factor indicators
        risk_sentences = re.split(r'\. (?=[A-Z])', text)
        
        for sentence in risk_sentences:
            sentence = sentence.strip()
            # Look for risk-related keywords
            if any(keyword in sentence.lower() for keyword in 
                   ['risk', 'adverse', 'uncertain', 'volatility', 'decline', 'competition']):
                if len(sentence) > 50 and len(sentence) < 300:  # Reasonable length
                    risks.append(sentence)
        
        return risks[:10]  # Top 10 risk factors
    
    def extract_key_metrics(self, text):
        """Extract various key metrics and KPIs"""
        metrics = {}
        
        # Employee count
        employee_match = re.search(r'approximately\s+([\d,]+)\s+(?:full-time\s+)?(?:equivalent\s+)?employees', text, re.IGNORECASE)
        if employee_match:
            metrics['employee_count'] = employee_match.group(1)
        
        # Market value
        market_value_match = re.search(r'market value[^$]*\$\s*([\d,]+(?:\.\d+)?)\s*(?:billion|million)?', text, re.IGNORECASE)
        if market_value_match:
            metrics['market_value'] = market_value_match.group(1)
        
        # Research and development mentions
        rd_mentions = len(re.findall(r'research and development|R&D', text, re.IGNORECASE))
        if rd_mentions:
            metrics['rd_mentions'] = rd_mentions
            
        return metrics
    
    def analyze_file_content(self, file_path):
        """Include full file content in analysis without further quantification"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                
            analysis = {
                'file_name': file_path.name,
                'content_length': len(content),
                'word_count': len(content.split()),
                'full_content': content,  # Include the complete raw content
                'financial_metrics': self.extract_financial_metrics(content),
                'business_segments': self.extract_business_segments(content),
                'key_metrics': self.extract_key_metrics(content)
            }
            
            # Special analysis for specific sections
            filename_lower = file_path.name.lower()
            
            if 'risk' in filename_lower:
                analysis['risk_factors'] = self.extract_risk_factors(content)
            
            if 'business' in filename_lower:
                analysis['business_description'] = content  # Full content instead of truncated
                
            if 'financial' in filename_lower or 'results' in filename_lower:
                analysis['financial_highlights'] = self.extract_financial_metrics(content)
            
            return analysis
            
        except Exception as e:
            return {
                'file_name': file_path.name,
                'error': str(e),
                'content_length': 0,
                'full_content': ''
            }
    
    def process_company_structure(self):
        """Process the entire company folder structure"""
        if not self.parsed_folder.exists():
            return {"error": f"Folder {self.parsed_folder} does not exist"}
        
        company_structure = {
            'company_name': 'Apple Inc.',
            'filing_type': '10-K',
            'analysis_date': datetime.now().isoformat(),
            'parts': {},
            'summary': {
                'total_parts': 0,
                'total_sections': 0,
                'total_files': 0,
                'total_content_length': 0
            }
        }
        
        # Process each part folder
        part_folders = [d for d in self.parsed_folder.iterdir() if d.is_dir()]
        part_folders.sort()
        
        for part_folder in part_folders:
            part_name = part_folder.name
            print(f"Processing {part_name}...")
            
            part_data = {
                'part_name': part_name,
                'sections': {},
                'part_summary': {
                    'total_sections': 0,
                    'total_content_length': 0,
                    'key_topics': []
                }
            }
            
            # Process each section file
            section_files = [f for f in part_folder.iterdir() if f.is_file() and f.suffix == '.txt']
            section_files.sort()
            
            for section_file in section_files:
                print(f"  Analyzing {section_file.name}...")
                
                section_analysis = self.analyze_file_content(section_file)
                section_name = section_file.stem
                
                part_data['sections'][section_name] = section_analysis
                part_data['part_summary']['total_sections'] += 1
                part_data['part_summary']['total_content_length'] += section_analysis.get('content_length', 0)
                
                # Add to overall summary
                company_structure['summary']['total_files'] += 1
                company_structure['summary']['total_content_length'] += section_analysis.get('content_length', 0)
            
            company_structure['parts'][part_name] = part_data
            company_structure['summary']['total_parts'] += 1
            company_structure['summary']['total_sections'] += part_data['part_summary']['total_sections']
        
        return company_structure
    
    def generate_executive_summary(self, structured_data):
        """Generate executive summary from structured data"""
        summary = {
            'company_overview': {},
            'financial_highlights': [],
            'business_segments': {},
            'key_risks': [],
            'operational_metrics': {}
        }
        
        # Aggregate data from all parts
        all_financial_metrics = []
        all_business_segments = {}
        all_risks = []
        all_key_metrics = {}
        
        for part_name, part_data in structured_data['parts'].items():
            for section_name, section_data in part_data['sections'].items():
                # Collect financial metrics
                if 'financial_metrics' in section_data:
                    fm = section_data['financial_metrics']
                    if fm.get('key_financial_figures'):
                        all_financial_metrics.extend(fm['key_financial_figures'])
                
                # Collect business segments
                if 'business_segments' in section_data:
                    bs = section_data['business_segments']
                    for segment, data in bs.items():
                        if segment not in all_business_segments:
                            all_business_segments[segment] = []
                        all_business_segments[segment].extend(data)
                
                # Collect risks
                if 'risk_factors' in section_data:
                    all_risks.extend(section_data['risk_factors'])
                
                # Collect key metrics
                if 'key_metrics' in section_data:
                    all_key_metrics.update(section_data['key_metrics'])
        
        # Compile summary
        summary['financial_highlights'] = list(set(all_financial_metrics))[:10]
        summary['business_segments'] = all_business_segments
        summary['key_risks'] = all_risks[:5]  # Top 5 risks
        summary['operational_metrics'] = all_key_metrics
        summary['filing_statistics'] = structured_data['summary']
        
        return summary
    
    def save_structured_data(self, output_file="apple_10k_structured_analysis.json"):
        """Save the complete structured analysis to JSON"""
        print("Starting comprehensive 10-K analysis...")
        
        # Process the entire structure
        structured_data = self.process_company_structure()
        
        # Generate executive summary
        executive_summary = self.generate_executive_summary(structured_data)
        
        # Combine everything
        final_output = {
            'executive_summary': executive_summary,
            'detailed_analysis': structured_data,
            'metadata': {
                'generated_by': '10-K Structured Analyzer',
                'version': '1.0',
                'analysis_timestamp': datetime.now().isoformat()
            }
        }
        
        # Save to JSON
        output_path = Path(output_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, indent=2, ensure_ascii=False)
        
        print(f"✅ Analysis complete! Saved to: {output_path}")
        print(f"📊 Total parts analyzed: {structured_data['summary']['total_parts']}")
        print(f"📄 Total sections analyzed: {structured_data['summary']['total_sections']}")
        print(f"📝 Total files processed: {structured_data['summary']['total_files']}")
        print(f"📏 Total content length: {structured_data['summary']['total_content_length']:,} characters")
        
        return final_output

# Initialize and run the analyzer
analyzer = TenKStructuredAnalyzer("Apple_Inc_Parsed")
result = analyzer.save_structured_data()

print("\n" + "="*60)
print("🎯 EXECUTIVE SUMMARY PREVIEW")
print("="*60)
print(f"📈 Financial Highlights Found: {len(result['executive_summary']['financial_highlights'])}")
print(f"🏢 Business Segments: {len(result['executive_summary']['business_segments'])}")
print(f"⚠️  Risk Factors Identified: {len(result['executive_summary']['key_risks'])}")
print(f"📊 Operational Metrics: {len(result['executive_summary']['operational_metrics'])}")

Starting comprehensive 10-K analysis...
Processing Part_1_PART_I...
  Analyzing Section_1_Item_1._Business.txt...
  Analyzing Section_2_Item_1A._Risk_Factors.txt...
  Analyzing Section_3_Item_1C._Cybersecurity.txt...
  Analyzing Section_4_Item_2._Properties.txt...
  Analyzing Section_5_Item_3._Legal_Proceedings.txt...
Processing Part_2_PART_II...
  Analyzing Section_1_Item_5._Market_for_Registrants_Common_Equity_Related_Stockholder_Matters_and_Issuer_Purchases_of_Equ.txt...
  Analyzing Section_2_Item_7._Managements_Discussion_and_Analysis_of_Financial_Condition_and_Results_of_Operations.txt...
  Analyzing Section_3_Item_7A._Quantitative_and_Qualitative_Disclosures_About_Market_Risk.txt...
  Analyzing Section_4_Item_8._Financial_Statements_and_Supplementary_Data.txt...
  Analyzing Section_5_Item_9A._Controls_and_Procedures.txt...
  Analyzing Section_6_Item_9B._Other_Information.txt...
Processing Part_3_PART_III...
  Analyzing Section_1_Item_10._Directors_Executive_Officers_and_Corporate

In [6]:
# Load and display key insights from our structured JSON
with open('apple_10k_structured_analysis.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print("🍎 APPLE INC. 10-K FILING - STRUCTURED ANALYSIS RESULTS")
print("="*70)

# Executive Summary Display
exec_summary = data['executive_summary']

print("\n📈 FINANCIAL HIGHLIGHTS:")
print("-" * 30)
for i, highlight in enumerate(exec_summary['financial_highlights'][:5], 1):
    print(f"{i}. {highlight}")

print("\n🏢 BUSINESS SEGMENTS IDENTIFIED:")
print("-" * 35)
for segment, details in exec_summary['business_segments'].items():
    print(f"• {segment}: {len(details)} mentions")
    if details:
        print(f"  └─ Latest: {details[0][:100]}...")

print(f"\n📊 OPERATIONAL METRICS:")
print("-" * 25)
for metric, value in exec_summary['operational_metrics'].items():
    print(f"• {metric}: {value}")

print(f"\n⚠️  TOP RISK FACTORS:")
print("-" * 20)
for i, risk in enumerate(exec_summary['key_risks'][:3], 1):
    print(f"{i}. {risk[:150]}...")

print(f"\n📋 FILING STATISTICS:")
print("-" * 20)
stats = exec_summary['filing_statistics']
print(f"• Total Parts: {stats['total_parts']}")
print(f"• Total Sections: {stats['total_sections']}")
print(f"• Total Files: {stats['total_files']}")
print(f"• Content Length: {stats['total_content_length']:,} characters")

print("\n" + "="*70)
print("✅ COMPLETE STRUCTURED DATA AVAILABLE IN: apple_10k_structured_analysis.json")
print("="*70)

🍎 APPLE INC. 10-K FILING - STRUCTURED ANALYSIS RESULTS

📈 FINANCIAL HIGHLIGHTS:
------------------------------
1. $4.1 billion
2. $10.9 billion
3. $669 million
4. $538 million
5. $110 billion

🏢 BUSINESS SEGMENTS IDENTIFIED:
-----------------------------------
• iPhone: 7 mentions
  └─ Latest: iPhone
iPhone ® is the Company’s line of smartphones based on its iOS operating system. The iPhone l...
• Mac: 10 mentions
  └─ Latest: Mac
Mac ® is the Company’s line of personal computers based on its macOS ® operating system. The Mac...
• iPad: 9 mentions
  └─ Latest: iPad
iPad ® is the Company’s line of multipurpose tablets based on its iPadOS ® operating system. Th...
• Services: 14 mentions
  └─ Latest: services. The Company’s fiscal year is the 52- or 53-week period that ends on the
last Saturday of S...
• Wearables: 7 mentions
  └─ Latest: wearables and accessories,
and sells a variety of related services. The Company’s fiscal year is the...
• geographic_segments: 17 mentions
  └─ Latest:

In [7]:
# Data Validation and Export Utilities
import pandas as pd

def create_section_summary_table():
    """Create a summary table of all sections"""
    sections_data = []
    
    for part_name, part_data in data['detailed_analysis']['parts'].items():
        for section_name, section_info in part_data['sections'].items():
            sections_data.append({
                'Part': part_name.replace('Part_', '').replace('_', ' '),
                'Section': section_name,
                'File_Name': section_info.get('file_name', 'N/A'),
                'Content_Length': section_info.get('content_length', 0),
                'Word_Count': section_info.get('word_count', 0),
                'Financial_Metrics_Found': len(section_info.get('financial_metrics', {})),
                'Business_Segments_Found': len(section_info.get('business_segments', {})),
                'Has_Risk_Factors': 'risk_factors' in section_info
            })
    
    return pd.DataFrame(sections_data)

def export_key_data():
    """Export key data in multiple formats"""
    
    # Create summary table
    summary_df = create_section_summary_table()
    
    # Save to CSV
    summary_df.to_csv('apple_10k_sections_summary.csv', index=False)
    
    # Create financial metrics table
    financial_data = []
    for part_name, part_data in data['detailed_analysis']['parts'].items():
        for section_name, section_info in part_data['sections'].items():
            fm = section_info.get('financial_metrics', {})
            if fm.get('key_financial_figures'):
                for figure in fm['key_financial_figures'][:3]:  # Top 3 per section
                    financial_data.append({
                        'Part': part_name,
                        'Section': section_name,
                        'Financial_Figure': figure
                    })
    
    if financial_data:
        financial_df = pd.DataFrame(financial_data)
        financial_df.to_csv('apple_10k_financial_figures.csv', index=False)
    
    # Create simplified executive summary JSON
    simplified_summary = {
        'company': 'Apple Inc.',
        'filing_type': '10-K',
        'key_products': list(data['executive_summary']['business_segments'].keys()),
        'total_sections_analyzed': data['executive_summary']['filing_statistics']['total_sections'],
        'key_financial_highlights': data['executive_summary']['financial_highlights'][:5],
        'top_risk_factors': [risk[:200] for risk in data['executive_summary']['key_risks'][:3]]
    }
    
    with open('apple_10k_executive_summary.json', 'w', encoding='utf-8') as f:
        json.dump(simplified_summary, f, indent=2, ensure_ascii=False)
    
    return summary_df, len(financial_data)

# Execute export
print("📤 EXPORTING STRUCTURED DATA...")
print("-" * 40)

summary_table, financial_count = export_key_data()

print("✅ Files Created:")
print("  • apple_10k_structured_analysis.json (Complete analysis)")
print("  • apple_10k_sections_summary.csv (Sections overview)")
print("  • apple_10k_executive_summary.json (Executive summary)")
if financial_count > 0:
    print(f"  • apple_10k_financial_figures.csv ({financial_count} financial figures)")

print(f"\n📊 SECTIONS SUMMARY TABLE:")
print("-" * 30)
print(summary_table.to_string(index=False))

print(f"\n🎯 DATA QUALITY METRICS:")
print("-" * 25)
print(f"• Average content length per section: {summary_table['Content_Length'].mean():.0f} chars")
print(f"• Sections with financial data: {summary_table['Financial_Metrics_Found'].sum()}")
print(f"• Sections with business segments: {summary_table['Business_Segments_Found'].sum()}")
print(f"• Sections with risk factors: {summary_table['Has_Risk_Factors'].sum()}")

print("\n🏆 MISSION ACCOMPLISHED!")
print("="*50)
print("✅ 10-K filing successfully parsed into structured folders")
print("✅ Complete data extraction and analysis performed")  
print("✅ Structured JSON with accurate data created")
print("✅ Executive summary and insights generated")
print("✅ Multiple export formats created")
print("="*50)

📤 EXPORTING STRUCTURED DATA...
----------------------------------------
✅ Files Created:
  • apple_10k_structured_analysis.json (Complete analysis)
  • apple_10k_sections_summary.csv (Sections overview)
  • apple_10k_executive_summary.json (Executive summary)
  • apple_10k_financial_figures.csv (10 financial figures)

📊 SECTIONS SUMMARY TABLE:
------------------------------
      Part                                                                                                        Section                                                                                                          File_Name  Content_Length  Word_Count  Financial_Metrics_Found  Business_Segments_Found  Has_Risk_Factors
  1 PART I                                                                                     Section_1_Item_1._Business                                                                                     Section_1_Item_1._Business.txt           15757        2329                        1 

In [9]:
# SIMPLIFIED JSON CREATOR - RAW CONTENT ONLY
class SimpleTenKJSONCreator:
    """
    Simple creator that includes section text files as-is without quantification
    """
    
    def __init__(self, parsed_folder_path="Apple_Inc_Parsed"):
        self.parsed_folder = Path(parsed_folder_path)
    
    def create_simple_json(self, output_file="apple_10k_raw_content.json"):
        """Create JSON with raw text content of all sections"""
        
        if not self.parsed_folder.exists():
            return {"error": f"Folder {self.parsed_folder} does not exist"}
        
        company_data = {
            'company_name': 'Apple Inc.',
            'filing_type': '10-K',
            'created_date': datetime.now().isoformat(),
            'parts': {}
        }
        
        # Process each part folder
        part_folders = [d for d in self.parsed_folder.iterdir() if d.is_dir()]
        part_folders.sort()
        
        for part_folder in part_folders:
            part_name = part_folder.name
            print(f"Adding content from {part_name}...")
            
            part_data = {
                'part_name': part_name,
                'sections': {}
            }
            
            # Process each section file - add content as-is
            section_files = [f for f in part_folder.iterdir() if f.is_file() and f.suffix == '.txt']
            section_files.sort()
            
            for section_file in section_files:
                try:
                    with open(section_file, 'r', encoding='utf-8') as f:
                        content = f.read()
                    
                    section_name = section_file.stem
                    part_data['sections'][section_name] = {
                        'file_name': section_file.name,
                        'content': content  # Raw content as-is
                    }
                    print(f"  Added: {section_file.name}")
                    
                except Exception as e:
                    part_data['sections'][section_file.stem] = {
                        'file_name': section_file.name,
                        'error': str(e),
                        'content': ''
                    }
            
            company_data['parts'][part_name] = part_data
        
        # Save to JSON
        output_path = Path(output_file)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(company_data, f, indent=2, ensure_ascii=False)
        
        print(f"✅ Raw content JSON created: {output_path}")
        total_parts = len(company_data['parts'])
        total_sections = sum(len(part['sections']) for part in company_data['parts'].values())
        print(f"📊 Included {total_parts} parts with {total_sections} sections")
        
        return company_data

# Create the simplified JSON with raw content
simple_creator = SimpleTenKJSONCreator("Apple_Inc_Parsed")
raw_data = simple_creator.create_simple_json()

print("\n" + "="*60)
print("📄 RAW CONTENT JSON STRUCTURE PREVIEW")
print("="*60)
for part_name, part_data in list(raw_data['parts'].items())[:2]:  # Show first 2 parts
    print(f"\n📁 {part_name}:")
    for section_name in list(part_data['sections'].keys())[:3]:  # Show first 3 sections
        content_preview = part_data['sections'][section_name]['content'][:100]
        print(f"  📄 {section_name}: {content_preview}...")
    if len(part_data['sections']) > 3:
        print(f"  ... and {len(part_data['sections']) - 3} more sections")
        
print(f"\n✅ Complete raw text content available in: apple_10k_raw_content.json")

Adding content from Part_1_PART_I...
  Added: Section_1_Item_1._Business.txt
  Added: Section_2_Item_1A._Risk_Factors.txt
  Added: Section_3_Item_1C._Cybersecurity.txt
  Added: Section_4_Item_2._Properties.txt
  Added: Section_5_Item_3._Legal_Proceedings.txt
Adding content from Part_2_PART_II...
  Added: Section_1_Item_5._Market_for_Registrants_Common_Equity_Related_Stockholder_Matters_and_Issuer_Purchases_of_Equ.txt
  Added: Section_2_Item_7._Managements_Discussion_and_Analysis_of_Financial_Condition_and_Results_of_Operations.txt
  Added: Section_3_Item_7A._Quantitative_and_Qualitative_Disclosures_About_Market_Risk.txt
  Added: Section_4_Item_8._Financial_Statements_and_Supplementary_Data.txt
  Added: Section_5_Item_9A._Controls_and_Procedures.txt
  Added: Section_6_Item_9B._Other_Information.txt
Adding content from Part_3_PART_III...
  Added: Section_1_Item_10._Directors_Executive_Officers_and_Corporate_Governance.txt
  Added: Section_2_Item_11._Executive_Compensation.txt
  Added: Se