In [2]:
from pptx import Presentation

def find_my_template_ids(filename):
    try:
        prs = Presentation(filename)
    except Exception as e:
        print(f"Error: Could not open {filename}. Make sure it's in the folder. {e}")
        return

    print(f"{'Slide #':<10} | {'Index (idx)':<12} | {'Name':<25} | {'Type'}")
    print("-" * 70)

    for i, slide in enumerate(prs.slides):
        # We look at placeholders on the actual slides
        for shape in slide.placeholders:
            idx = shape.placeholder_format.idx
            name = shape.name
            ph_type = shape.placeholder_format.type
            print(f"{i:<10} | {idx:<12} | {name:<25} | {ph_type}")

if __name__ == "__main__":
    # CHANGE THIS to your actual template filename
    FILE_NAME = "template.pptx" 
    find_my_template_ids(FILE_NAME)

Slide #    | Index (idx)  | Name                      | Type
----------------------------------------------------------------------
0          | 0            | Title 1                   | CENTER_TITLE (3)
0          | 1            | Subtitle 2                | SUBTITLE (4)
1          | 0            | Title 1                   | TITLE (1)
1          | 1            | Picture Placeholder 2     | PICTURE (18)
1          | 2            | Text Placeholder 3        | BODY (2)
2          | 0            | Title 1                   | TITLE (1)
2          | 1            | Content Placeholder 2     | OBJECT (7)
3          | 0            | Title 1                   | TITLE (1)
3          | 1            | Content Placeholder 2     | OBJECT (7)
4          | 0            | Title 1                   | TITLE (1)
4          | 1            | Content Placeholder 2     | OBJECT (7)


In [4]:
import os
from pptx import Presentation
from pptx.chart.data import CategoryChartData, ChartData
from pptx.enum.chart import XL_CHART_TYPE

# --- CONFIGURATION ---
TEMPLATE_FILE = "template.pptx"
OUTPUT_FILE = "Final_Presentation.pptx"

# Replace these with your actual local image paths
IMAGE_PATH_1 = "bada_loda.png" 

def main():
    if not os.path.exists(TEMPLATE_FILE):
        print(f"Error: {TEMPLATE_FILE} not found in current directory.")
        return

    prs = Presentation(TEMPLATE_FILE)

    # --- SLIDE 0: TITLE SLIDE ---
    # idx 0: Title, idx 1: Subtitle
    s0 = prs.slides[0]
    s0.placeholders[0].text = "IIT Bombay: Robotics Project Report"
    s0.placeholders[1].text = "Automated System Analysis v1.0"

    # --- SLIDE 1: TEXT & IMAGE ---
    # idx 0: Title, idx 1: Picture, idx 2: Body Text
    s1 = prs.slides[1]
    s1.placeholders[0].text = "Hardware Architecture"
    s1.placeholders[2].text = (
        "• STM32 Blue Pill integrated with MPU9250.\n"
        "• Real-time sensor fusion for drone stability.\n"
        "• 3.7V LiPo power management logic."
    )
    if os.path.exists(IMAGE_PATH_1):
        s1.placeholders[1].insert_picture(IMAGE_PATH_1)
    else:
        print(f"Warning: {IMAGE_PATH_1} not found. Skipping image injection.")

    # --- SLIDE 2: BAR CHART (Clustered Column) ---
    # idx 1: Content Placeholder
    s2 = prs.slides[2]
    s2.placeholders[0].text = "Sensor Latency Analysis"
    
    chart_data_1 = CategoryChartData()
    chart_data_1.categories = ['MPU6050', 'MPU9250', 'BNO055']
    chart_data_1.add_series('Latency (ms)', (12.5, 8.2, 4.1))
    
    replace_with_chart(s2, 1, XL_CHART_TYPE.COLUMN_CLUSTERED, chart_data_1)

    # --- SLIDE 3: PIE CHART ---
    # idx 1: Content Placeholder
    s3 = prs.slides[3]
    s3.placeholders[0].text = "Power Consumption Breakdown"
    
    chart_data_2 = ChartData()
    chart_data_2.categories = ['Motors', 'MCU', 'RF Module', 'Sensors']
    chart_data_2.add_series('Usage', (0.70, 0.15, 0.10, 0.05))
    
    replace_with_chart(s3, 1, XL_CHART_TYPE.PIE, chart_data_2)

    # --- SLIDE 4: LINE CHART ---
    # idx 1: Content Placeholder
    s4 = prs.slides[4]
    s4.placeholders[0].text = "Thrust vs PWM Signal"
    
    chart_data_3 = CategoryChartData()
    chart_data_3.categories = ['1000us', '1250us', '1500us', '1750us', '2000us']
    chart_data_3.add_series('Thrust (g)', (0, 45, 110, 185, 240))
    
    replace_with_chart(s4, 1, XL_CHART_TYPE.LINE, chart_data_3)

    # --- SAVE ---
    prs.save(OUTPUT_FILE)
    print(f"Success! '{OUTPUT_FILE}' has been generated.")

def replace_with_chart(slide, idx, chart_type, data):
    """
    Finds placeholder by index, deletes the empty box, 
    and adds a native editable chart in the same spot.
    """
    try:
        ph = slide.placeholders[idx]
        # Store dimensions
        left, top, width, height = ph.left, ph.top, ph.width, ph.height
        # Delete placeholder
        sp = ph._element
        sp.getparent().remove(sp)
        # Create Chart
        slide.shapes.add_chart(chart_type, left, top, width, height, data)
    except KeyError:
        print(f"Error: Index {idx} not found on this slide.")

if __name__ == "__main__":
    main()

Success! 'Final_Presentation.pptx' has been generated.


In [5]:
"""
Agent 2: Data Extractor
Extracts structured data from markdown one-pagers using pandas and regex.
ZERO LLM USAGE - 100% deterministic extraction.
"""

import re
import pandas as pd
from typing import Dict, List, Any, Optional
from pathlib import Path
import logging

logger = logging.getLogger(__name__)


class DataExtractor:
    """Extract structured data from markdown one-pagers"""
    
    def __init__(self):
        self.data = {}
        self.validation_errors = []
    
    def extract(self, md_path: str) -> Dict[str, Any]:
        """Main extraction method - returns structured JSON"""
        logger.info(f"Extracting data from {md_path}")
        
        with open(md_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        self.data = {
            'source_file': md_path,
            'business_description': self._extract_section(content, '## Business Description'),
            'website': self._extract_section(content, '## Website').strip(),
            'products_services': self._extract_list_section(content, '## Product & Services'),
            'industries': self._extract_section(content, '## Application areas / Industries served'),
            'shareholders': self._extract_shareholders(content),
            'financials': self._extract_financials(content),
            'key_milestones': self._extract_milestones(content),
            'key_metrics': self._extract_key_metrics(content),
            'operational_indicators': self._extract_operational_indicators(content)
        }
        
        return self.data
    
    def _extract_section(self, content: str, header: str) -> str:
        """Extract text between header and next header or end"""
        pattern = f"{re.escape(header)}\n\n(.*?)(?=\n##|\Z)"
        match = re.search(pattern, content, re.DOTALL)
        return match.group(1).strip() if match else ""
    
    def _extract_list_section(self, content: str, header: str) -> List[str]:
        """Extract bullet point list items"""
        section = self._extract_section(content, header)
        
        # Match markdown list items (- ** or - *)
        pattern = r'-\s+\*\*(.+?)\*\*'
        items = re.findall(pattern, section)
        
        if not items:
            # Try simple dash lists
            pattern = r'-\s+(.+?)(?=\n-|\Z)'
            items = re.findall(pattern, section, re.DOTALL)
            items = [item.strip() for item in items]
        
        return items
    
    def _extract_shareholders(self, content: str) -> List[Dict[str, Any]]:
        """Extract shareholder tables"""
        section = self._extract_section(content, '## Shareholders')
        
        shareholders = []
        
        # Find all markdown tables in section
        table_pattern = r'\|(.+?)\|(.+?)\|(.+?)\|'
        matches = re.findall(table_pattern, section)
        
        for match in matches:
            name, value, share_type = [m.strip() for m in match]
            
            # Skip header rows
            if 'SHAREHOLDER NAME' in name or '---' in name:
                continue
            
            try:
                # Extract percentage value
                value_num = float(re.search(r'[\d.]+', value).group())
                
                shareholders.append({
                    'name': name,
                    'percentage': value_num,
                    'type': share_type
                })
            except (AttributeError, ValueError):
                continue
        
        return shareholders
    
    def _extract_financials(self, content: str) -> Dict[str, Dict[int, float]]:
        """
        Extract financial data from markdown tables.
        CRITICAL: This is ZERO LLM - direct regex parsing.
        """
        financials = {}
        
        # Key financial metrics to extract
        metrics = {
            'revenue': 'Revenue From Operations',
            'ebitda': 'Operating EBITDA',
            'pat_margin': 'PAT Margin',
            'roce': 'RoCE',
            'roe': 'ROE',
            'asset_turnover': 'Asset Turnover',
            'total_assets': 'Total Assets',
            'total_equity': 'Total Equity',
            'borrowings': 'Borrowings',
            'cash_flow_operations': 'Net cash flow from operating activities',
            'inventory_days': 'inventory_days',
            'receivable_days': 'receivable_days',
            'payable_days': 'payable_days'
        }
        
        for key, metric_name in metrics.items():
            pattern = f"{re.escape(metric_name)} \\| (.+)"
            match = re.search(pattern, content)
            
            if match:
                row_data = match.group(1)
                parsed_data = self._parse_financial_row(row_data)
                
                if parsed_data:
                    financials[key] = parsed_data
                    logger.debug(f"Extracted {key}: {len(parsed_data)} years")
        
        # Validate we have core metrics
        required = ['revenue', 'ebitda']
        for req in required:
            if req not in financials or len(financials[req]) < 3:
                self.validation_errors.append(f"Missing or insufficient data for {req}")
        
        return financials
    
    def _parse_financial_row(self, row_data: str) -> Dict[int, float]:
        """
        Parse financial row: 2014: 4251.81863 | 2015: 4879.97017 | ...
        Returns dict of {year: value}
        """
        data = {}
        
        # Split by pipe
        entries = row_data.split('|')
        
        for entry in entries:
            entry = entry.strip()
            
            if ':' not in entry:
                continue
            
            try:
                year_str, value_str = entry.split(':', 1)
                year = int(year_str.strip())
                value_str = value_str.strip()
                
                # Handle 'None' values
                if value_str.lower() == 'none':
                    continue
                
                # Remove commas and convert
                value = float(value_str.replace(',', ''))
                
                data[year] = value
                
            except (ValueError, AttributeError) as e:
                logger.debug(f"Failed to parse entry '{entry}': {e}")
                continue
        
        return data
    
    def _extract_milestones(self, content: str) -> List[Dict[str, str]]:
        """Extract key milestones table"""
        section = self._extract_section(content, '## Key Milestones')
        
        milestones = []
        
        # Match table rows
        pattern = r'\|\s*(.+?)\s*\|\s*(.+?)\s*\|'
        matches = re.findall(pattern, section)
        
        for match in matches:
            date, milestone = [m.strip() for m in match]
            
            # Skip headers
            if 'DATE' in date or '---' in date:
                continue
            
            milestones.append({
                'date': date,
                'description': milestone
            })
        
        return milestones
    
    def _extract_key_metrics(self, content: str) -> Dict[str, Any]:
        """Extract key operational metrics"""
        section = self._extract_section(content, '## Key Metrics')
        
        if section.lower() == 'not available':
            return {}
        
        metrics = {}
        
        # Try to extract specific metrics mentioned
        # Examples: "Employee count: 450", "Revenue per employee: ₹24L"
        patterns = {
            'employee_count': r'(\d+)\s*(?:employees|developers)',
            'facilities': r'(\d+)\s*facilities',
            'customers': r'(\d+)\+?\s*customers',
            'certifications': r'(ISO|CMMI|FSSC|GMP|USFDA)',
        }
        
        for key, pattern in patterns.items():
            match = re.search(pattern, section, re.IGNORECASE)
            if match:
                metrics[key] = match.group(1)
        
        return metrics
    
    def _extract_operational_indicators(self, content: str) -> List[str]:
        """Extract operational indicators as list"""
        section = self._extract_section(content, '## Key Operational Indicators')
        
        if not section:
            return []
        
        indicators = []
        
        # Match bullet points starting with **
        pattern = r'\*\s+\*\*(.+?):\*\*\s*(.+?)(?=\n\*|\Z)'
        matches = re.findall(pattern, section, re.DOTALL)
        
        for title, description in matches:
            indicators.append(f"{title}: {description.strip()}")
        
        return indicators
    
    def validate(self) -> bool:
        """Validate extracted data meets requirements"""
        errors = []
        
        # Check critical fields
        if not self.data.get('website'):
            errors.append("Missing website URL")
        
        if not self.data.get('business_description'):
            errors.append("Missing business description")
        
        # Check financials
        financials = self.data.get('financials', {})
        
        if 'revenue' not in financials:
            errors.append("Missing revenue data")
        elif len(financials['revenue']) < 3:
            errors.append(f"Insufficient revenue data: {len(financials['revenue'])} years (need 3+)")
        
        if 'ebitda' not in financials:
            errors.append("Missing EBITDA data")
        
        # Check years are consistent
        if 'revenue' in financials and 'ebitda' in financials:
            revenue_years = set(financials['revenue'].keys())
            ebitda_years = set(financials['ebitda'].keys())
            
            if revenue_years != ebitda_years:
                errors.append(f"Year mismatch between revenue and EBITDA")
        
        if errors:
            logger.error(f"Validation failed: {errors}")
            self.validation_errors.extend(errors)
            return False
        
        logger.info("Data validation passed")
        return True
    
    def calculate_derived_metrics(self) -> Dict[str, Any]:
        """Calculate derived financial metrics from raw data"""
        financials = self.data.get('financials', {})
        
        if not financials:
            return {}
        
        derived = {}
        
        # Get latest 5 years of data
        if 'revenue' in financials:
            years = sorted(financials['revenue'].keys())
            latest_years = years[-5:] if len(years) >= 5 else years
            
            revenues = [financials['revenue'][yr] for yr in latest_years]
            
            # Revenue CAGR
            if len(revenues) >= 2:
                n_years = len(revenues) - 1
                cagr = ((revenues[-1] / revenues[0]) ** (1/n_years) - 1) * 100
                derived['revenue_cagr'] = round(cagr, 1)
            
            # Latest revenue
            derived['latest_revenue'] = revenues[-1]
            derived['latest_year'] = latest_years[-1]
        
        # EBITDA margin (latest year)
        if 'revenue' in financials and 'ebitda' in financials:
            latest_year = max(financials['revenue'].keys())
            if latest_year in financials['ebitda']:
                revenue = financials['revenue'][latest_year]
                ebitda = financials['ebitda'][latest_year]
                
                if revenue > 0:
                    margin = (ebitda / revenue) * 100
                    derived['ebitda_margin'] = round(margin, 1)
        
        # PAT margin (latest)
        if 'pat_margin' in financials:
            latest_year = max(financials['pat_margin'].keys())
            derived['pat_margin'] = round(financials['pat_margin'][latest_year], 1)
        
        # ROCE (latest)
        if 'roce' in financials:
            latest_year = max(financials['roce'].keys())
            derived['roce'] = round(financials['roce'][latest_year], 1)
        
        # Working capital days (if available)
        if all(k in financials for k in ['inventory_days', 'receivable_days', 'payable_days']):
            latest_year = max(financials['inventory_days'].keys())
            inv = financials['inventory_days'].get(latest_year, 0)
            rec = financials['receivable_days'].get(latest_year, 0)
            pay = financials['payable_days'].get(latest_year, 0)
            
            derived['net_working_capital_days'] = int(inv + rec - pay)
        
        return derived
    
    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of extracted data for logging/debugging"""
        return {
            'source': self.data.get('source_file'),
            'company_type': 'Unknown',  # Will be determined by Agent 1
            'has_website': bool(self.data.get('website')),
            'num_products': len(self.data.get('products_services', [])),
            'num_shareholders': len(self.data.get('shareholders', [])),
            'financial_years': len(self.data.get('financials', {}).get('revenue', {})),
            'num_milestones': len(self.data.get('key_milestones', [])),
            'validation_errors': self.validation_errors
        }


# Example usage
if __name__ == "__main__":
    import sys
    import json
    
    logging.basicConfig(level=logging.INFO)
    
    # if len(sys.argv) < 2:
    #     print("Usage: python 02_data_extractor.py <path_to_onepager.md>")
    #     sys.exit(1)
    
    extractor = DataExtractor()
    data = extractor.extract("Ksolves-OnePager.md")
    
    # Validate
    is_valid = extractor.validate()
    
    # Calculate derived metrics
    derived = extractor.calculate_derived_metrics()
    data['derived_metrics'] = derived
    
    # Print summary
    summary = extractor.get_summary()
    print("\n=== EXTRACTION SUMMARY ===")
    print(json.dumps(summary, indent=2))
    
    print("\n=== DERIVED METRICS ===")
    print(json.dumps(derived, indent=2))
    
    if is_valid:
        print("\n✅ Extraction successful and validated")
    else:
        print("\n❌ Validation failed")
        print("Errors:", extractor.validation_errors)

  pattern = f"{re.escape(header)}\n\n(.*?)(?=\n##|\Z)"
INFO:__main__:Extracting data from Ksolves-OnePager.md
ERROR:__main__:Validation failed: ['Missing website URL']



=== EXTRACTION SUMMARY ===
{
  "source": "Ksolves-OnePager.md",
  "company_type": "Unknown",
  "has_website": false,
  "num_products": 10,
  "num_shareholders": 11,
  "financial_years": 6,
  "num_milestones": 32,
  "validation_errors": [
    "Missing website URL"
  ]
}

=== DERIVED METRICS ===
{
  "revenue_cagr": 48.6,
  "latest_revenue": 1374.3,
  "latest_year": 2025,
  "ebitda_margin": 35.6,
  "pat_margin": 24.9,
  "roce": 79.2
}

❌ Validation failed
Errors: ['Missing website URL']
