In [None]:
import json
import re
from dataclasses import asdict, dataclass
from typing import Any, Dict, List, Optional

import dspy
import pandas as pd

# Generation settings, grouped so they can be tuned in one place.
_lm_options = {"temperature": 0.1, "max_tokens": 1500}

# Language-model handle used by every DSPy module in this notebook.
# NOTE(review): the "ollama/" model id presumably targets a locally served
# Ollama model -- confirm the server and model tag are available before running.
llm = dspy.LM(model="ollama/ministral-3:8b", **_lm_options)

# Register `llm` as the LM in DSPy's global settings.
dspy.settings.configure(lm=llm)

## Section 2: Data Models & Signatures

In [25]:
@dataclass
class ProjectRole:
    """One staffing role within a project: what it is, how many people, and why."""
    role: str                      # Role identifier, e.g. "BACKEND_ENGINEER"
    quota: int                     # How many people fill this role
    responsibility: str            # What the role is concretely accountable for
    required_skills: List[str]     # Concrete skills; validate() expects 3 to 6 entries

    def validate(self) -> bool:
        """Check the role against its structural constraints.

        Returns:
            True only when the role name and responsibility are non-blank,
            the quota is positive, and there are between 3 and 6 skills
            (inclusive); False otherwise.
        """
        # Checks run in field order and stop at the first failure.
        if not self.role.strip():
            return False
        if not self.quota > 0:
            return False
        if not self.responsibility.strip():
            return False
        skill_count = len(self.required_skills)
        return 3 <= skill_count <= 6


@dataclass
class ProjectSpecification:
    """Complete, machine-consumable project specification."""
    project_summary: str           # Human-readable summary
    project_type: List[str]        # Categories: WEB_APPLICATION, AI_SYSTEM, etc.
    headcount: int                 # Total people needed
    duration_months: int           # Normalized to months
    roles: List[ProjectRole]       # List of roles
    assumptions: List[str]         # Non-obvious assumptions
    risks: List[str]               # Identified risks

    def validate(self) -> Dict[str, Any]:
        """Validate specification integrity.

        Checks that the role quotas add up to ``headcount``, that every role
        passes its own ``validate()``, and that ``project_type`` is non-empty.

        Returns:
            A dict with two keys: ``"is_valid"`` (bool, True when no problem
            was found) and ``"errors"`` (list of human-readable messages, one
            per failed check).
        """
        errors: List[str] = []

        # The per-role quotas must account for exactly the requested headcount.
        total_quota = sum(r.quota for r in self.roles)
        if total_quota != self.headcount:
            errors.append(
                f"Quota sum ({total_quota}) != headcount ({self.headcount})"
            )

        # Report every structurally invalid role by name.
        errors.extend(
            f"Invalid role: {role.role}"
            for role in self.roles
            if not role.validate()
        )

        # An empty (or missing) category list is rejected.
        if not self.project_type:
            errors.append("project_type cannot be empty")

        return {
            "is_valid": not errors,
            "errors": errors
        }

In [26]:
# DSPy signature describing the extraction task: three inputs (description,
# headcount, duration) and one output (`specification`).
# NOTE(review): per the DSPy documentation a Signature's docstring is used as the
# task instructions and each field's `desc` is included in the prompt, so the
# docstring and `desc` strings below are runtime text -- edit them deliberately.
class ExtractProjectSpec(dspy.Signature):
    """
    Extract a structured project specification from an ambiguous project description.
    
    The output MUST be a valid JSON object with fields:
    - project_summary (str)
    - project_type (list of strings)
    - headcount (int)
    - duration_months (int)
    - roles (list of role objects with: role, quota, responsibility, required_skills)
    - assumptions (list of strings)
    - risks (list of strings)
    
    CRITICAL: The sum of all role quotas MUST equal headcount.
    Skills per role must be between 3 and 6.
    """
    # Inputs supplied by the caller (see ProjectExtractor.forward).
    project_description = dspy.InputField(desc="Unstructured project description")
    headcount = dspy.InputField(desc="Total number of people needed")
    duration = dspy.InputField(desc="Project duration (e.g., '3 months', '6 weeks', '2 quarters')")
    
    # Single output field; ProjectExtractor._parse_json extracts the JSON
    # object from this value.
    specification = dspy.OutputField(
        desc="Valid JSON object representing the structured project specification. "
             "Must include project_summary, project_type, headcount, duration_months, roles, assumptions, risks."
    )


class ProjectExtractor(dspy.Module):
    """
    Main module for extracting structured project specifications from free-text descriptions.

    The module runs a chain-of-thought predictor over ``ExtractProjectSpec``,
    pulls the JSON object out of the model's ``specification`` output, and turns
    it into a validated ``ProjectSpecification`` suitable for downstream
    automation (matching, policy, resource allocation).
    """

    def __init__(self):
        super().__init__()
        self.extractor = dspy.ChainOfThought(ExtractProjectSpec)

    def forward(self, project_description: str, headcount: int, duration: str) -> ProjectSpecification:
        """
        Extract and parse project specification.

        Args:
            project_description: Free-text project description
            headcount: Total number of people needed
            duration: Duration string (e.g., "3 months")

        Returns:
            ProjectSpecification object (validated)

        Raises:
            ValueError: If no JSON object can be parsed from the model output,
                or if the parsed specification fails validation.
        """
        # Call the extractor
        result = self.extractor(
            project_description=project_description,
            headcount=headcount,
            duration=duration
        )

        # Parse JSON output
        spec_json = self._parse_json(result.specification)

        # Convert to ProjectSpecification with validation
        return self._build_specification(spec_json, headcount)

    @staticmethod
    def _parse_json(json_str: str) -> Dict:
        """
        Extract and parse the first JSON object found in LLM output.

        Markdown code fences are stripped, then parsing starts at the first
        ``{`` and stops at the end of that JSON value, so leading text and
        trailing content are ignored. A real JSON decoder is used rather than
        brace counting so that ``{`` / ``}`` inside string values do not
        truncate the object.

        Args:
            json_str: Raw text produced by the model.

        Returns:
            The decoded JSON object as a dict.

        Raises:
            ValueError: If the input is not a string, contains no ``{``, or the
                text starting at the first ``{`` is not valid JSON.
        """
        if not isinstance(json_str, str):
            raise ValueError("No JSON object found in LLM output")

        # Remove markdown fences
        cleaned = re.sub(r"```(?:json)?", "", json_str).strip()

        start = cleaned.find("{")
        if start == -1:
            raise ValueError("No JSON object found in LLM output")

        try:
            # raw_decode parses exactly one JSON value beginning at `start` and
            # tolerates anything that follows it.
            parsed, _end = json.JSONDecoder().raw_decode(cleaned, start)
        except json.JSONDecodeError as e:
            raise ValueError(
                f"Failed to parse extracted JSON: {e}\nExtracted JSON:\n{cleaned[start:]}"
            ) from e

        return parsed

    @staticmethod
    def _as_int(value, field_name: str) -> int:
        """Coerce ``value`` to int; a missing/null value becomes 0, junk raises ValueError."""
        if value is None:
            return 0
        try:
            return int(value)
        except (TypeError, ValueError) as e:
            raise ValueError(
                f"Field '{field_name}' must be an integer, got {value!r}"
            ) from e

    @staticmethod
    def _as_list(value):
        """Normalize a list-like JSON field: null -> [], bare string -> one-item list (blank -> [])."""
        if value is None:
            return []
        if isinstance(value, str):
            # A bare string is one item, not a sequence of characters; without
            # this, len("SQL") == 3 would satisfy the 3-6 skills check.
            return [value] if value.strip() else []
        return value

    @staticmethod
    def _build_specification(spec_dict: Dict, headcount: int) -> ProjectSpecification:
        """
        Convert parsed dict to ProjectSpecification with validation.

        Missing or null fields fall back to empty values (``""``, ``0``, ``[]``)
        so that problems surface as validation errors rather than as
        ``AttributeError`` / ``TypeError``. The caller-supplied ``headcount`` is
        used for the specification; any headcount in ``spec_dict`` is ignored.

        Args:
            spec_dict: Decoded JSON object from the model.
            headcount: Total number of people requested by the caller.

        Returns:
            A ProjectSpecification that passed ``validate()``.

        Raises:
            ValueError: If ``spec_dict`` or a role entry is not a JSON object,
                a numeric field cannot be converted to int, or validation fails.
        """
        if not isinstance(spec_dict, dict):
            raise ValueError(
                f"Specification must be a JSON object, got {type(spec_dict).__name__}"
            )

        # Build roles
        roles = []
        for role_dict in ProjectExtractor._as_list(spec_dict.get("roles")):
            if not isinstance(role_dict, dict):
                raise ValueError(f"Role entry must be a JSON object, got: {role_dict!r}")
            roles.append(
                ProjectRole(
                    role=str(role_dict.get("role") or "").upper(),
                    quota=ProjectExtractor._as_int(role_dict.get("quota"), "quota"),
                    responsibility=str(role_dict.get("responsibility") or ""),
                    required_skills=ProjectExtractor._as_list(role_dict.get("required_skills")),
                )
            )

        # Create specification
        spec = ProjectSpecification(
            project_summary=str(spec_dict.get("project_summary") or ""),
            project_type=ProjectExtractor._as_list(spec_dict.get("project_type")),
            headcount=headcount,
            duration_months=ProjectExtractor._as_int(
                spec_dict.get("duration_months"), "duration_months"
            ),
            roles=roles,
            assumptions=ProjectExtractor._as_list(spec_dict.get("assumptions")),
            risks=ProjectExtractor._as_list(spec_dict.get("risks")),
        )

        # Validate
        validation = spec.validate()
        if not validation["is_valid"]:
            raise ValueError(f"Specification validation failed: {validation['errors']}")

        return spec

    def to_dict(self, spec: ProjectSpecification) -> Dict:
        """
        Convert ProjectSpecification back to dict for JSON serialization.

        Uses ``dataclasses.asdict``, which recurses into the nested role
        dataclasses and yields keys in field-declaration order. The returned
        structure is a copy, independent of ``spec``.
        """
        return asdict(spec)

In [None]:
# Instantiate the extraction module once; later cells call this `extractor` instance.
extractor = ProjectExtractor()

## Section 5: Example Inputs & Execution

In [None]:
example_project_text = """
Build an e-commerce platform similar to Shopify for small businesses.
The system needs to support product catalogs, shopping carts, payment processing,
and order management. We want a modern, responsive web interface and a robust
backend API. The team will also need someone to handle deployment and infrastructure.

Expected timeline is around 12 weeks. We need 10 people total.
"""

example_headcount = 10
example_duration = "12 weeks"

print("=" * 80)
print("EXAMPLE PROJECT DESCRIPTION (UNSTRUCTURED)")
print("=" * 80)
print(f"\nProject Text:\n{example_project_text}")
print(f"\nHeadcount: {example_headcount}")
print(f"Duration: {example_duration}")
print("=" * 80)

EXAMPLE PROJECT DESCRIPTION (UNSTRUCTURED)

Project Text:

Build an e-commerce platform similar to Shopify for small businesses.
The system needs to support product catalogs, shopping carts, payment processing,
and order management. We want a modern, responsive web interface and a robust
backend API. The team will also need someone to handle deployment and infrastructure.

Expected timeline is around 12 weeks. We need 10 people total.


Headcount: 10
Duration: 12 weeks


## Section 6: Run Extraction & Validate Output

In [None]:
# Inputs for the single-example run, gathered so the call site stays short.
_example_inputs = {
    "project_description": example_project_text,
    "headcount": example_headcount,
    "duration": example_duration,
}

# Best-effort run: any failure is reported and leaves `project_spec` as None so
# the following cell can skip its report instead of raising.
try:
    project_spec = extractor(**_example_inputs)
    print("✓ Extraction successful!")
except Exception as err:
    project_spec = None
    print(f"✗ Extraction failed: {err}")

✓ Extraction successful!


In [None]:
# Report the extracted specification and its validation result; skipped when
# the previous cell left `project_spec` as None.
if project_spec:
    _rule = "=" * 80

    print(f"\n{_rule}\nEXTRACTED PROJECT SPECIFICATION (STRUCTURED)\n{_rule}")
    spec_dict = extractor.to_dict(project_spec)
    print(json.dumps(spec_dict, indent=2))

    print(f"\n{_rule}\nVALIDATION REPORT\n{_rule}")
    validation = project_spec.validate()
    print(f"Valid: {validation['is_valid']}")
    # Either list the problems or confirm that none were found.
    print(
        f"Errors: {validation['errors']}"
        if validation['errors']
        else "✓ All constraints satisfied"
    )


EXTRACTED PROJECT SPECIFICATION (STRUCTURED)
{
  "project_summary": "Build a Shopify-like e-commerce platform for small businesses with responsive UI, backend API, payment processing, and cloud infrastructure. Focus on MVP with scalability for future growth.",
  "project_type": [
    "Full-stack development",
    "SaaS-like platform",
    "Cloud-native application",
    "MVP development"
  ],
  "headcount": 10,
  "duration_months": 3,
  "roles": [
    {
      "role": "FRONTEND DEVELOPER (REACT.JS)",
      "quota": 3,
      "responsibility": "Design and implement responsive UI/UX, product catalog, shopping cart, and checkout flows. Ensure cross-browser compatibility.",
      "required_skills": [
        "React.js (Hooks, Context API)",
        "TypeScript",
        "CSS Frameworks (Tailwind/Bootstrap)",
        "REST/GraphQL API integration",
        "Responsive design principles"
      ]
    },
    {
      "role": "BACKEND DEVELOPER (NODE.JS)",
      "quota": 2,
      "responsibility"

## Section 8: Batch Processing & Persistence (Optional)

In [None]:
def extract_batch(projects_list: List[Dict]) -> pd.DataFrame:
    """
    Run the extractor over several projects and tabulate the outcome.

    Each project is processed independently: a failure in one (missing key,
    extraction or validation error, ...) is recorded as an ``EXTRACTION_ERROR``
    row and does not stop the batch.

    Args:
        projects_list: List of dicts with 'description', 'headcount', 'duration'

    Returns:
        DataFrame with one row per project, in input order, holding the
        status, summary numbers, the JSON-encoded specification and any
        error text.
    """

    def _process(position: int, item: Dict) -> Dict:
        """Build the result row for a single project."""
        try:
            extracted = extractor(
                project_description=item["description"],
                headcount=item["headcount"],
                duration=item["duration"]
            )
            as_dict = extractor.to_dict(extracted)
            report = extracted.validate()
            problems = report["errors"]

            return {
                "project_index": position,
                "status": "SUCCESS" if report["is_valid"] else "VALIDATION_ERROR",
                "headcount": extracted.headcount,
                "duration_months": extracted.duration_months,
                "num_roles": len(extracted.roles),
                "specification": json.dumps(as_dict),
                "errors": "; ".join(problems) if problems else ""
            }
        except Exception as exc:
            # Keep the batch going; blank out the fields that could not be computed.
            return {
                "project_index": position,
                "status": "EXTRACTION_ERROR",
                "headcount": item.get("headcount", ""),
                "duration_months": "",
                "num_roles": "",
                "specification": "",
                "errors": str(exc)
            }

    rows = [_process(position, item) for position, item in enumerate(projects_list)]
    return pd.DataFrame(rows)


# Batch input: one dict per project, keyed the way extract_batch reads them
# ('description', 'headcount', 'duration').
test_projects = [
    dict(
        description=example_project_text,
        headcount=10,
        duration="12 weeks",
    ),
    # Additional projects can be added here
]

In [38]:
# Run the batch extraction; the bare name on the last line displays the result frame.
batch_results = extract_batch(test_projects)
batch_results

Unnamed: 0,project_index,status,headcount,duration_months,num_roles,specification,errors
0,0,SUCCESS,10,3,6,"{""project_summary"": ""Build a Shopify-like e-commerce platform for small businesses with responsive UI, backend API, payment processing, and cloud infrastructure. Focus on MVP with scalability for future growth."", ""project_type"": [""Full-stack development"", ""SaaS-like platform"", ""Cloud-native application"", ""MVP development""], ""headcount"": 10, ""duration_months"": 3, ""roles"": [{""role"": ""FRONTEND DEVELOPER (REACT.JS)"", ""quota"": 3, ""responsibility"": ""Design and implement responsive UI/UX, product catalog, shopping cart, and checkout flows. Ensure cross-browser compatibility."", ""required_skills"": [""React.js (Hooks, Context API)"", ""TypeScript"", ""CSS Frameworks (Tailwind/Bootstrap)"", ""REST/GraphQL API integration"", ""Responsive design principles""]}, {""role"": ""BACKEND DEVELOPER (NODE.JS)"", ""quota"": 2, ""responsibility"": ""Develop RESTful APIs for product management, orders, payments, and user authentication. Integrate with payment gateways (Stripe/PayPal)."", ""required_skills"": [""Node.js (Express/NestJS)"", ""MongoDB/PostgreSQL"", ""JWT/OAuth for authentication"", ""Payment API compliance (PCI-DSS basics)"", ""Microservices architecture (optional)""]}, {""role"": ""DATABASE ENGINEER"", ""quota"": 1, ""responsibility"": ""Design scalable database schema for product catalogs, orders, and user data. Optimize queries for performance."", ""required_skills"": [""PostgreSQL/MongoDB"", ""Indexing/Query optimization"", ""Data migration strategies"", ""Multi-tenant database design""]}, {""role"": ""DEVOPS/CLOUD ENGINEER"", ""quota"": 2, ""responsibility"": ""Set up cloud infrastructure (AWS/Azure), CI/CD pipelines, and deployment strategies. 
Ensure high availability and security."", ""required_skills"": [""AWS/Azure/GCP (EC2, S3, RDS, CDK/Terraform)"", ""Docker/Kubernetes (optional for scaling)"", ""CI/CD (GitHub Actions/Jenkins)"", ""Load balancing and auto-scaling""]}, {""role"": ""PAYMENT INTEGRATION SPECIALIST"", ""quota"": 1, ""responsibility"": ""Handle PCI-DSS compliance, payment gateway setup (Stripe/PayPal), and fraud detection integration."", ""required_skills"": [""Stripe/PayPal API"", ""PCI-DSS basics"", ""Fraud detection tools (e.g., Signifyd)"", ""Webhook handling for payment events""]}, {""role"": ""QA ENGINEER"", ""quota"": 1, ""responsibility"": ""Automated and manual testing for frontend/backend, payment flows, and edge cases. Performance and security testing."", ""required_skills"": [""Jest/Cypress (frontend)"", ""Postman/REST Assured (API)"", ""Load testing (JMeter/Gatling)"", ""Security scanning (OWASP ZAP)""]}], ""assumptions"": [""The platform will prioritize core e-commerce features (product catalog, cart, checkout, orders) in MVP phase."", ""Payment processing will use third-party gateways (Stripe/PayPal) with basic PCI-DSS compliance."", ""Cloud infrastructure will be AWS-based for scalability (can be adjusted to Azure/GCP)."", ""Multi-tenancy is supported but not fully isolated (shared database with tenant identifiers)."", ""No advanced analytics or CRM features in MVP; basic reporting via backend APIs.""], ""risks"": [""Scope expansion beyond MVP (e.g., adding inventory management, shipping integrations) could delay the 3-month timeline."", ""Payment gateway setup may require additional compliance efforts (e.g., PCI-DSS audits)."", ""Tech stack conflicts between frontend (React) and backend (Node.js) may introduce integration delays."", ""Cloud cost overruns if auto-scaling is not properly configured during development."", ""Third-party API changes (e.g., Stripe/PayPal) could require last-minute adjustments.""]}",


In [33]:
# Disable column-width truncation so long cell values (e.g. the JSON in
# `specification`) are shown in full when a frame is displayed.
pd.set_option('display.max_colwidth', None)

In [39]:
def _load_specification(raw):
    """Decode one `specification` cell into a dict.

    - dict: returned unchanged, so re-running this cell is harmless.
    - non-blank str: decoded with ``json.loads``.
    - anything else (the "" that extract_batch writes for failed rows, None,
      NaN): an empty dict, so the flattening cells below can call ``.get``
      on every row without raising.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str) and raw.strip():
        return json.loads(raw)
    return {}


batch_results['specification'] = batch_results['specification'].apply(_load_specification)

In [40]:
# Project-level table: one row per batch entry, carrying the top-level fields
# of its specification (absent keys come through as None).
_project_fields = ('project_summary', 'project_type', 'headcount', 'duration_months')

projects_records = [
    {
        'project_index': row['project_index'],
        **{field: row['specification'].get(field) for field in _project_fields},
    }
    for _, row in batch_results.iterrows()
]

projects_df = pd.DataFrame(projects_records)


In [42]:
# Role-level table: one row per (project, role) pair, in specification order.
roles_records = [
    {
        'project_index': row['project_index'],
        'role': entry.get('role'),
        'quota': entry.get('quota'),
        'responsibility': entry.get('responsibility'),
    }
    for _, row in batch_results.iterrows()
    for entry in row['specification'].get('roles', [])
]

roles_df = pd.DataFrame(roles_records)


In [43]:
def _iter_skill_rows(frame):
    """Yield one record per (project, role, skill) triple found in `frame`."""
    for _, entry in frame.iterrows():
        for role_item in entry['specification'].get('roles', []):
            for skill_name in role_item.get('required_skills', []):
                yield {
                    'project_index': entry['project_index'],
                    'role': role_item.get('role'),
                    'skill': skill_name,
                }


# Skill-level table: the longest of the three flattened views.
skills_records = list(_iter_skill_rows(batch_results))

skills_df = pd.DataFrame(skills_records)

In [44]:
# Preview the project-level table (first rows only).
projects_df.head()

Unnamed: 0,project_index,project_summary,project_type,headcount,duration_months
0,0,"Build a Shopify-like e-commerce platform for small businesses with responsive UI, backend API, payment processing, and cloud infrastructure. Focus on MVP with scalability for future growth.","[Full-stack development, SaaS-like platform, Cloud-native application, MVP development]",10,3


In [45]:
# Preview the role-level table; head() shows at most the first five roles.
roles_df.head()

Unnamed: 0,project_index,role,quota,responsibility
0,0,FRONTEND DEVELOPER (REACT.JS),3,"Design and implement responsive UI/UX, product catalog, shopping cart, and checkout flows. Ensure cross-browser compatibility."
1,0,BACKEND DEVELOPER (NODE.JS),2,"Develop RESTful APIs for product management, orders, payments, and user authentication. Integrate with payment gateways (Stripe/PayPal)."
2,0,DATABASE ENGINEER,1,"Design scalable database schema for product catalogs, orders, and user data. Optimize queries for performance."
3,0,DEVOPS/CLOUD ENGINEER,2,"Set up cloud infrastructure (AWS/Azure), CI/CD pipelines, and deployment strategies. Ensure high availability and security."
4,0,PAYMENT INTEGRATION SPECIALIST,1,"Handle PCI-DSS compliance, payment gateway setup (Stripe/PayPal), and fraud detection integration."


In [46]:
# Display the full skill-level table (one row per project/role/skill).
skills_df

Unnamed: 0,project_index,role,skill
0,0,FRONTEND DEVELOPER (REACT.JS),"React.js (Hooks, Context API)"
1,0,FRONTEND DEVELOPER (REACT.JS),TypeScript
2,0,FRONTEND DEVELOPER (REACT.JS),CSS Frameworks (Tailwind/Bootstrap)
3,0,FRONTEND DEVELOPER (REACT.JS),REST/GraphQL API integration
4,0,FRONTEND DEVELOPER (REACT.JS),Responsive design principles
5,0,BACKEND DEVELOPER (NODE.JS),Node.js (Express/NestJS)
6,0,BACKEND DEVELOPER (NODE.JS),MongoDB/PostgreSQL
7,0,BACKEND DEVELOPER (NODE.JS),JWT/OAuth for authentication
8,0,BACKEND DEVELOPER (NODE.JS),Payment API compliance (PCI-DSS basics)
9,0,BACKEND DEVELOPER (NODE.JS),Microservices architecture (optional)
