# In [None]:  (notebook cell marker left by the export; commented out so the file parses as Python)
#Code was developed in collaboration with Claude4-Sonnet
#Claude was used for code generation, documentation, and error handling
#Note: the code will run best when executing each section separately
#Required inputs: CSV file with securities law data

# 1. Randomly select a regulation in Python
import pandas as pd
import random
import json

def load_and_select_random_regulation(csv_path, filters=None, random_seed=None):
    """
    Load regulations from CSV and randomly select one
    
    Parameters:
    -----------
    csv_path : str
        Path to the CSV file containing regulations
    filters : dict, optional
        Dictionary of filters to apply before random selection
        Example: {'Year': 2017, 'Regulatory Body': 'SEC'}
    random_seed : int, optional
        Seed for reproducibility
    
    Returns:
    --------
    dict
        Dictionary containing regulation details
    """
    try:
        # Read the CSV file
        df = pd.read_csv(csv_path)
        
        print(f"‚úì Loaded {len(df)} regulations from database")
        
        # Apply filters if provided
        if filters:
            for column, value in filters.items():
                if column in df.columns:
                    df = df[df[column] == value]
                    print(f"‚úì Filtered by {column}={value}, {len(df)} regulations remaining")
        
        if len(df) == 0:
            print("‚úó No regulations match the specified filters")
            return None
        
        # Set random seed if provided for reproducibility
        if random_seed is not None:
            random.seed(random_seed)
        
        # Randomly select one row
        selected_row = df.sample(n=1, random_state=random_seed).iloc[0]
        
        # Extract relevant information
        regulation_info = {
            'title': selected_row.get('Regulation Title', 'N/A'),
            'date': selected_row.get('Date', 'N/A'),
            'year': selected_row.get('Year', 'N/A'),
            'regulatory_body': selected_row.get('Regulatory Body', 'N/A'),
            'description': selected_row.get('Description', 'N/A'),
            'impact': selected_row.get('Impact', 'N/A'),
            'references': selected_row.get('References', 'N/A'),
            # Channel categories
            'litigation_risk': selected_row.get('Litigation Risk', 'No'),
            'corporate_governance': selected_row.get('Corporate Governance', 'No'),
            'proprietary_costs': selected_row.get('Proprietary Costs', 'No'),
            'information_asymmetry': selected_row.get('Information Asymmetry', 'No'),
            'unsophisticated_investors': selected_row.get('Unsophisticated Investors', 'No'),
            'equity_issuance': selected_row.get('Equity Issuance', 'No'),
            'reputation_risk': selected_row.get('Reputation Risk', 'No'),
        }
        
        # Create a detailed description combining all information
        detailed_description = f"""{regulation_info['description']}

Regulation Details:
- Date Implemented: {regulation_info['date']} ({regulation_info['year']})
- Regulatory Body: {regulation_info['regulatory_body']}
- Overall Impact: {regulation_info['impact']}

Disclosure Channels:
- Litigation Risk: {regulation_info['litigation_risk']}
- Corporate Governance: {regulation_info['corporate_governance']}
- Proprietary Costs: {regulation_info['proprietary_costs']}
- Information Asymmetry: {regulation_info['information_asymmetry']}
- Unsophisticated Investors: {regulation_info['unsophisticated_investors']}
- Equity Issuance: {regulation_info['equity_issuance']}
- Reputation Risk: {regulation_info['reputation_risk']}

Reference: {regulation_info['references']}
"""
        
        regulation_info['detailed_description'] = detailed_description
        
        return regulation_info
        
    except FileNotFoundError:
        print(f"‚úó CSV file not found at: {csv_path}")
        return None
    except Exception as e:
        print(f"‚úó Error loading regulation data: {e}")
        return None


def preview_regulations_database(csv_path, n_samples=5):
    """
    Preview the regulations database to help with filtering decisions

    Prints the row count, the distinct years and regulatory bodies (when
    those columns exist), how many rows are flagged 'Yes' for each
    disclosure channel, and up to n_samples randomly drawn rows.

    Parameters:
    -----------
    csv_path : str
        Path to the CSV file containing regulations
    n_samples : int, optional
        Maximum number of random rows to print

    Returns:
    --------
    None
        Output goes to stdout; any failure is reported there as well.
    """
    try:
        df = pd.read_csv(csv_path)
        total = len(df)
        
        print("\n" + "="*80)
        print("REGULATIONS DATABASE SUMMARY")
        print("="*80)
        print(f"Total Regulations: {total}")
        # Columns are optional here, as they are for the selection step
        if 'Year' in df.columns:
            print(f"\nYears Available: {sorted(df['Year'].unique())}")
        if 'Regulatory Body' in df.columns:
            print(f"Regulatory Bodies: {df['Regulatory Body'].unique()}")
        
        print(f"\nImpact Categories Summary:")
        impact_cols = ['Litigation Risk', 'Corporate Governance', 'Proprietary Costs', 
                      'Information Asymmetry', 'Unsophisticated Investors', 
                      'Equity Issuance', 'Reputation Risk']
        for col in impact_cols:
            if col in df.columns:
                yes_count = (df[col] == 'Yes').sum()
                # Avoid a 0/0 division on an empty table
                share = yes_count / total * 100 if total else 0.0
                print(f"  {col}: {yes_count} regulations ({share:.1f}%)")
        
        print(f"\n{n_samples} Random Sample Regulations:")
        print("-"*80)
        samples = df.sample(n=min(n_samples, total))
        for _, row in samples.iterrows():
            print(f"\n{row.get('Year', 'N/A')} - {row.get('Regulation Title', 'N/A')}")
            print(f"  Body: {row.get('Regulatory Body', 'N/A')}")
            # An empty Impact cell is NaN (a float); slicing it used to raise
            # and abort the rest of the preview.
            impact = row.get('Impact', 'N/A')
            impact_text = 'N/A' if pd.isna(impact) else str(impact)
            print(f"  Impact: {impact_text[:100]}...")
        print("="*80 + "\n")
        
    except Exception as e:
        # Preview is a convenience; report the problem instead of raising
        print(f"‚úó Error previewing database: {e}")



csv_path = r"enter file path here"

# Show an overview of the database before drawing from it
preview_database = True  # Set to True to see what's available
if preview_database:
    preview_regulations_database(csv_path, n_samples=10)

# Optional exact-match filters applied before the random draw
# Examples:
# filters = {'Year': 2017}
# filters = {'Regulatory Body': 'SEC'}
# filters = {'Information Asymmetry': 'Yes'}
filters = None  # No filters = completely random

# Seed passed to the selection step
random_seed = 5  # Set to number for reproducible results

# ============================================================================
# STEP 1 SELECT REGULATION
# ============================================================================

print("\n" + "="*80)
print("RANDOMLY SELECTING REGULATION...")
print("="*80 + "\n")

selected_regulation = load_and_select_random_regulation(
    csv_path=csv_path, filters=filters, random_seed=random_seed
)

if not selected_regulation:
    print("\n‚úó Failed to select regulation. Please check the file path and try again.")
else:
    print("\n" + "="*80)
    print("SELECTED REGULATION")
    print("="*80)
    print(f"\nüìã Title: {selected_regulation['title']}")
    print(f"üìÖ Year: {selected_regulation['year']}")
    print(f"üèõÔ∏è  Body: {selected_regulation['regulatory_body']}")
    print(f"\nüìù Description:\n{selected_regulation['description']}")

    print(f"\nüéØ Predicted Impact Areas:")
    # (result key, display label) for each disclosure channel, in display order
    channel_labels = [
        ('litigation_risk', 'Litigation Risk'),
        ('corporate_governance', 'Corporate Governance'),
        ('proprietary_costs', 'Proprietary Costs'),
        ('information_asymmetry', 'Information Asymmetry'),
        ('unsophisticated_investors', 'Unsophisticated Investors'),
        ('equity_issuance', 'Equity Issuance'),
        ('reputation_risk', 'Reputation Risk'),
    ]
    impact_areas = [
        label for key, label in channel_labels
        if selected_regulation[key] == 'Yes'
    ]

    if not impact_areas:
        print("   ‚Ä¢ None specified")
    for area in impact_areas:
        print(f"   ‚Ä¢ {area}")

    print(f"\nüîó Reference: {selected_regulation['references']}")
    print("="*80)

    print("\n‚úÖ Regulation selected successfully!")
    print("   ‚Üí Run the next cell to generate research ideas")
    print("   ‚Üí Or re-run this cell to select a different regulation")
    
    
# 2. Generate interesting research questions, identify the best possible outcome variable, write research proposals

import anthropic
import os
import json
import time
import logging
from datetime import datetime
import numpy as np

def convert_to_json_serializable(obj):
    """Convert numpy types to native Python types for JSON serialization

    Recurses through dicts (keys and values), lists and tuples. NumPy
    booleans, integers and floats become bool/int/float and arrays become
    nested lists. Anything else is returned unchanged.
    """
    if isinstance(obj, dict):
        # Keys are converted too: json rejects NumPy scalar keys
        return {
            convert_to_json_serializable(k): convert_to_json_serializable(v)
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_json_serializable(item) for item in obj]
    if isinstance(obj, tuple):
        # Stays a tuple (json writes it as an array); only the contents change
        return tuple(convert_to_json_serializable(item) for item in obj)
    if isinstance(obj, np.bool_):
        # np.bool_ is not a bool subclass, so json cannot serialize it as-is
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    return obj

# Root logging setup: INFO and above, each record stamped with time and level
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after this module, used by the functions below
logger = logging.getLogger(__name__)


def generate_research_questions(regulation_name, regulation_description, llm_client, 
                                n_questions=10, model="claude-sonnet-4-20250514"):
    """
    STEP 2: Given a regulation, generate interesting research questions first

    Parameters:
    -----------
    regulation_name : str
        Title of the regulation, inserted into the prompt
    regulation_description : str
        Description of the regulation, inserted into the prompt
    llm_client : object
        Client exposing messages.create(model=..., max_tokens=...,
        temperature=..., messages=...) whose reply carries the text at
        response.content[0].text
    n_questions : int, optional
        Number of research questions requested in the prompt
    model : str, optional
        Model identifier passed to the client

    Returns:
    --------
    dict or None
        {"regulation", "research_questions", "timestamp"} where
        "research_questions" is the parsed JSON array, or None when the call
        fails or the reply does not contain a JSON array.
    """
    
    prompt = f"""
You are an expert researcher in accounting and auditing. See the following regulation:

Regulation: {regulation_name}

Description: {regulation_description}

Your task is to generate {n_questions} interesting and important research questions that examine 
the effects of this regulation.

For each research question, provide:
1. Research question 
2. Brief rationale 
3. Motivation

Format your response as a JSON array:
[
  {{
    "research_question": "...",
    "rationale": "...",
    "theoretical_motivation": "..."
  }},
  ...
]

Focus on research questions that:
- Examine important economic consequences of the regulation
- Have clear theoretical links to the regulation
- Are relevant to accounting/auditing research
- Can be answered with archival-empirical methods
"""
    
    try:
        response = llm_client.messages.create(
            model=model,
            max_tokens=8000,
            temperature=0.7,
            messages=[{"role": "user", "content": prompt}]
        )
        
        clean_text = response.content[0].text.strip()
        
        # Strip a surrounding Markdown code fence, with or without a "json" tag
        if clean_text.startswith("```json"):
            clean_text = clean_text[len("```json"):]
        elif clean_text.startswith("```"):
            clean_text = clean_text[3:]
        if clean_text.endswith("```"):
            clean_text = clean_text[:-3]
        clean_text = clean_text.strip()
        
        try:
            questions = json.loads(clean_text)
        except json.JSONDecodeError:
            # The reply may wrap the array in prose; retry on the outermost
            # [...] span and re-raise the original error if there is none.
            start, end = clean_text.find("["), clean_text.rfind("]")
            if start == -1 or end <= start:
                raise
            questions = json.loads(clean_text[start:end + 1])
        
        if not isinstance(questions, list):
            # Callers iterate this value and call .get() on every element
            logger.error(f"Expected a JSON array of research questions, got {type(questions).__name__}")
            return None
        
        logger.info(f"Successfully generated {len(questions)} research questions")
        
        return {
            "regulation": regulation_name,
            "research_questions": questions,
            "timestamp": datetime.now().isoformat()
        }
        
    except json.JSONDecodeError as e:
        logger.error(f"JSON parsing error: {e}")
        return None
    except Exception as e:
        logger.error(f"Error generating research questions: {e}")
        return None


def identify_outcome_variable_for_question(regulation_name, regulation_description, 
                                          research_question, llm_client,
                                          model="claude-sonnet-4-20250514"):
    """
    STEP 3: Given a research question, identify the best possible outcome variable (Y) to answer it

    Parameters:
    -----------
    regulation_name : str
        Title of the regulation, inserted into the prompt
    regulation_description : str
        Description of the regulation, inserted into the prompt
    research_question : dict
        Research question record; its 'research_question' entry is inserted
        into the prompt
    llm_client : object
        Client exposing messages.create(model=..., max_tokens=...,
        temperature=..., messages=...) whose reply carries the text at
        response.content[0].text
    model : str, optional
        Model identifier passed to the client

    Returns:
    --------
    dict or None
        The parsed JSON object describing the outcome variable, or None when
        the call fails or the reply does not contain a JSON object.
    """
    
    prompt = f"""
You are an expert researcher in accounting and auditing.

Regulation: {regulation_name}

Description: {regulation_description}

Research Question: {research_question.get('research_question', '')}

Your task is to identify the best possible outcome variable (dependent variable Y) that would allow 
researchers to answer this specific research question.

Provide:
1. Variable name 
2. Brief definition
3. Explanation about why this outcome variable 
   is the best outcome variable for answering this specific research question 
4. Data source where the data can be obtained from

Format your response as a JSON object:
{{
  "variable_name": "...",
  "definition": "...",
  "why_best_for_question": "...",
  "data_source": "..."
}}

Focus on outcomes that:
- Are observable: measurable with archival data
- Have clear theoretical links to the regulation
- Are relevant to firm behavior, market outcomes, accounting quality, or audit quality
"""
    
    try:
        response = llm_client.messages.create(
            model=model,
            max_tokens=8000,
            temperature=0.7,
            messages=[{"role": "user", "content": prompt}]
        )
        
        clean_text = response.content[0].text.strip()
        
        # Strip a surrounding Markdown code fence, with or without a "json" tag
        if clean_text.startswith("```json"):
            clean_text = clean_text[len("```json"):]
        elif clean_text.startswith("```"):
            clean_text = clean_text[3:]
        if clean_text.endswith("```"):
            clean_text = clean_text[:-3]
        clean_text = clean_text.strip()
        
        try:
            outcome = json.loads(clean_text)
        except json.JSONDecodeError:
            # The reply may wrap the object in prose; retry on the outermost
            # {...} span and re-raise the original error if there is none.
            start, end = clean_text.find("{"), clean_text.rfind("}")
            if start == -1 or end <= start:
                raise
            outcome = json.loads(clean_text[start:end + 1])
        
        if not isinstance(outcome, dict):
            # Callers read fields from this value with .get()
            logger.error(f"Expected a JSON object for the outcome variable, got {type(outcome).__name__}")
            return None
        
        logger.info(f"Successfully identified outcome variable: {outcome.get('variable_name', 'N/A')}")
        
        return outcome
        
    except json.JSONDecodeError as e:
        logger.error(f"JSON parsing error: {e}")
        return None
    except Exception as e:
        logger.error(f"Error identifying outcome variable: {e}")
        return None


def generate_proposal_from_question_and_outcome(regulation_name, regulation_description,
                                               research_question, outcome_variable, llm_client,
                                               model="claude-sonnet-4-20250514"):
    """
    STEP 4: Generate complete research proposal from research question + outcome variable

    Parameters:
    -----------
    regulation_name : str
        Title of the regulation, inserted into the prompt
    regulation_description : str
        Description of the regulation, inserted into the prompt
    research_question : dict
        Research question record; its 'research_question' entry is inserted
        into the prompt
    outcome_variable : dict
        Outcome variable record; its 'variable_name' and 'definition' entries
        are inserted into the prompt
    llm_client : object
        Client exposing messages.create(model=..., max_tokens=...,
        temperature=..., messages=...) whose reply carries the text at
        response.content[0].text
    model : str, optional
        Model identifier passed to the client

    Returns:
    --------
    dict or None
        The parsed proposal with 'original_research_question',
        'outcome_variable' and 'regulation' added, or None when the call
        fails or the reply does not contain a JSON object.

    NOTE(review): the prompt repeatedly refers to "the papers provided", but
    no papers are inserted into it here - confirm whether a literature list
    was meant to be included.
    """
    
    prompt = f"""
You are an expert researcher in accounting and auditing preparing a research proposal.

Regulation Name: {regulation_name}

Regulation Details: {regulation_description}

Research Question: {research_question.get('research_question', '')}

Outcome Variable (Y): {outcome_variable.get('variable_name', '')}

Outcome Definition: {outcome_variable.get('definition', '')}

Your task is to develop a complete research proposal (at least 600 words) for this specific research questions
with the following sections:

1. Title: A concise statement of the main research question to be used as the paper title.

2. Research question: Clearly define the research question motivating the study. Explain clearly
    why this problem is interesting and important. The research question should be closely related to the
    regulation (the X variable) and the specified outcome variable (Y variable). Propose a theoretically supported 
    hypothesis about the relationship between the independent variable of interest and the dependent variable. This section should
    be approximately 100 words.

3. Motivation: Explain why it is important to answer this question in accounting/auditing.
    Specifically, clearly explain why answering this question is conceptually important. Include 2-3 citations in 
    parentheses, like (Author, Year) or (Author et al., Year). This section should be approximately 100 words.

4. Theoretical Framework and Existing Work: Mention the most relevant existing work from the provided papers. How has this question 
    been addressed thus far in the relevant literature? What are the competing theories for explanation of this question?
    Build logical arguments step by step think through whether prior literature suggests competing theoretical 
    predictions or if the literature suggests only one direction for the relationship. Include 2-3 citations
    in parentheses, like (Author, Year) or (Author et al., Year). 
    IMPORTANT: Make direct references to the papers provided.
    This section should be approximately 100 words.

5. Research Design: Propose a research method and describe it in detail. 
    The research approach should be archival-empirical.
    Make sure every step is executable. 
    Cover all essential details such as: the data sources, variables (independent and dependent 
    variables), the fixed effect structure, the time window for the study, the clustering of standard errors, and the 
    type of model specification (e.g., difference-in-differences, staggered difference-in-differences,
    stacked difference-in-differences) to be used. 
    
    Specifically:
        - Present the complete regression equation in proper mathematical notation
        - Do not include the subscripts i and t in the regression 
        - Format the equation professionally
        - IMPORTANT: Define the dependent variable in detail
            -Provide a comprehensive definition (not just a brief description)
            -Describe how this variable is measured
            -Specify the exact data source or data sources and data fields used
            -Describe the methodology for measuring the variable step-by-step
        - IMPORTANT: Define the independent variable of interest in detail
            -Provide a comprehensive definition (not just a brief description)
            -Describe how this variable is measured
            -Specify the exact data source or data sources and data fields used
            -Describe the methodology for measuring the variable step-by-step
        - IMPORTANT: Define each control variable in detail
            -Provide a comprehensive definition (not just a brief description)
            -Describe how this variable is measured
            -Specify the exact data source or data sources and data fields used
            -Describe the methodology for measuring the variable step-by-step
        - Specify fixed effects structure
        - Indicate standard error clustering approach
        - Describe the data sources. For example, data provided by the PCAOB, or data from Audit Analytics,
          Compustat, CRSP, I/B/E/S)
        - Describe the time window for the analysis 
        - Include 2-3 citations in parentheses, like (Author, Year) or (Author et al., Year)
        
    
    This section should be the most detailed at approximately 300 words.
    
6. Contribution: Describe the contribution of the proposed study. This section should be approximately 50-100 words.

7. References: Provide a complete list of all papers cited in the proposal above. 
    Use standard academic citation format (e.g., Author, A., & Author, B. (Year). Title. Journal, Volume(Issue), pages.)
    List references alphabetically by first author's last name.
    Include ALL papers that were cited in parentheses throughout the proposal.
    
    
    Each idea should:
    - Address a clear research gap not covered by existing literature
    - Propose a novel contribution with an executable research approach
    - Be clearly different from the papers listed above
    - Make each idea standalone and not dependent on the other ideas
    - Must include citations to the relevant papers provided to you
    - MUST BE AT LEAST 600 WORDS TOTAL - shorter ideas will be rejected
       
    IMPORTANT: Make sure to include ACTUAL CITATIONS to the papers provided. 
    These should be in the form of parenthetical citations such as (Author, Year) or (Author et al., Year). 
    Your proposals must reference specific papers from the list provided. 
    Don't just mention that prior literature has shown something without citing specific papers.
    
    **VERY IMPORTANT: Each idea must be approximately 600 words in length. Short ideas will be rejected.
    Make sure each section is detailed and thorough. The research design section should be the most detailed. It 
    should be at least 300 words.**
    
    LENGTH CHECK: After completing each idea, count the number of words and ensure it is at least 600 words. 
    If an idea is less than 600 words, expand it with more details until it meets the minimum length requirement.
    
    
Writing Guidelines:
- Each idea MUST be at least 600 words. 
- Use active voice (e.g., "We aim to examine" instead of "This paper will examine")
- Maintain formal academic tone suitable for a high-quality research proposal
- Include 2-3 citations per paragraph 
- Use present tense for established findings
- Make clear distinctions between correlation and causation
- Cite papers from, especially those from top accounting journal such as:
        The Accounting Review, Journal of Accounting Research, 
        Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
        and Review of Accounting Studies

Format as JSON:
{{
  "title": "...",
  "research_question": "...",
  "motivation": "...",
  "theoretical_framework": "...",
  "research_design": "...",
  "contribution": "...",
  "references": "...",
  "word_count": <number>
}}
"""
    # "references" is part of the requested JSON because section 7 asks for a
    # reference list and the saved proposal text reads proposal['references'];
    # without the key the list was never returned.
    
    try:
        response = llm_client.messages.create(
            model=model,
            max_tokens=8000,
            temperature=0.7,
            messages=[{"role": "user", "content": prompt}]
        )
        
        clean_text = response.content[0].text.strip()
        
        # Strip a surrounding Markdown code fence, with or without a "json" tag
        if clean_text.startswith("```json"):
            clean_text = clean_text[len("```json"):]
        elif clean_text.startswith("```"):
            clean_text = clean_text[3:]
        if clean_text.endswith("```"):
            clean_text = clean_text[:-3]
        clean_text = clean_text.strip()
        
        try:
            proposal = json.loads(clean_text)
        except json.JSONDecodeError:
            # The reply may wrap the object in prose; retry on the outermost
            # {...} span and re-raise the original error if there is none.
            start, end = clean_text.find("{"), clean_text.rfind("}")
            if start == -1 or end <= start:
                raise
            proposal = json.loads(clean_text[start:end + 1])
        
        if not isinstance(proposal, dict):
            logger.error(f"Error generating proposal: expected a JSON object, got {type(proposal).__name__}")
            return None
        
        # Add metadata
        proposal['original_research_question'] = research_question
        proposal['outcome_variable'] = outcome_variable
        proposal['regulation'] = regulation_name
        
        return proposal
        
    except Exception as e:
        logger.error(f"Error generating proposal: {e}")
        return None


def create_summary_report(results, output_folder):
    """Write SUMMARY_REPORT.md into output_folder from the results dictionary.

    The report lists the regulation, the generation counts, every research
    question paired with its outcome variable, and every proposal section by
    section. The saved path is logged once the file is closed.
    """
    report_path = os.path.join(output_folder, "SUMMARY_REPORT.md")
    divider = "---\n\n"

    # (heading, results key) for the three counters in the summary section
    counters = (
        ("Research Questions Generated", 'n_questions'),
        ("Outcome Variables Identified", 'n_outcomes'),
        ("Research Proposals Generated", 'n_proposals_generated'),
    )
    # (heading, proposal key) for each proposal section, in display order
    proposal_sections = (
        ("Research Question", 'research_question'),
        ("Motivation", 'motivation'),
        ("Theoretical Framework", 'theoretical_framework'),
        ("Research Design", 'research_design'),
        ("Contribution", 'contribution'),
    )

    with open(report_path, "w", encoding="utf-8") as out:
        # Header
        out.write(f"# Research Proposals: {results['regulation']}\n\n")
        out.write(f"Generated: {results['generation_timestamp']}\n\n")
        out.write("**Approach**: Research Questions First ‚Üí Outcome Variables ‚Üí Proposals\n\n")
        out.write(divider)

        # Regulation text
        out.write("## Regulation Details\n\n")
        out.write(f"{results['regulation_description']}\n\n")
        out.write(divider)

        # Counters; the last one is followed by a blank line
        out.write("## Generation Summary\n\n")
        for position, (heading, key) in enumerate(counters, start=1):
            gap = "\n\n" if position == len(counters) else "\n"
            out.write(f"- **{heading}**: {results.get(key, 0)}{gap}")
        out.write(divider)

        # Questions paired with their outcome variables
        out.write("## Research Questions & Outcome Variables\n\n")
        pairs = zip(results.get('research_questions', []),
                    results.get('outcome_variables', []))
        for number, (question, outcome) in enumerate(pairs, start=1):
            out.write(f"### {number}. Research Question\n")
            out.write(f"**Question**: {question.get('research_question', 'N/A')}\n\n")
            out.write(f"**Rationale**: {question.get('rationale', 'N/A')}\n\n")
            out.write(f"**Outcome Variable**: {outcome.get('variable_name', 'N/A')}\n\n")
            out.write(f"**Why Best**: {outcome.get('why_best_for_question', 'N/A')}\n\n")
            out.write(divider)

        # Full proposals
        out.write("## Research Proposals\n\n")
        for number, proposal in enumerate(results['research_proposals'], start=1):
            out.write(f"### Proposal {number}. {proposal.get('title', 'Untitled')}\n\n")
            out.write(f"**Word Count**: {proposal.get('word_count', 'N/A')}\n\n")
            for heading, key in proposal_sections:
                out.write(f"**{heading}**:\n{proposal.get(key, 'N/A')}\n\n")
            out.write(divider)

    logger.info(f"Summary report saved to: {report_path}")


# ============================================================================
# MAIN EXECUTION
# ============================================================================

def _safe_path_component(text, max_len):
    """Return the first max_len characters of text, made usable as a file or folder name."""
    # Substitutions the script has always applied to the folder name ...
    trimmed = str(text)[:max_len].replace(' ', '_').replace(':', '').replace('/', '_')
    # ... plus the remaining characters Windows rejects in file names and any
    # non-printable characters (e.g. a newline inside an LLM-generated name).
    return ''.join('_' if (ch in '<>"\\|?*' or not ch.isprintable()) else ch for ch in trimmed)


# globals() instead of locals(): this code runs at module/cell level. The
# truthiness test also covers the case where the selection step ran but
# failed (selected_regulation is None), which would otherwise crash on the
# first subscript below.
if not globals().get('selected_regulation'):
    print("="*80)
    print("‚ö†Ô∏è  ERROR: No regulation selected!")
    print("="*80)
    print("\nPlease run Cell 1 first to select a regulation.")
    
else:
    # ============================================================================
    # CONFIGURATION
    # ============================================================================
    
    print("\n" + "="*80)
    print("RESEARCH QUESTIONS FIRST APPROACH")
    print("="*80)
    print(f"\nRegulation: {selected_regulation['title']}")
    print(f"Year: {selected_regulation['year']}")
    
    # Configuration
    n_questions = 10  # Number of research questions to generate
    
    print(f"\nWorkflow:")
    print(f"  1. Generate {n_questions} research questions")
    print(f"  2. Identify best outcome variable for each question")
    print(f"  3. Write research proposal for each question-outcome pair")
    
    print(f"\nEstimated time:")
    print(f"  - Research questions: ~1 minute")
    print(f"  - Outcome variables: ~{n_questions * 0.5:.0f} minutes")
    print(f"  - Proposals: ~{n_questions * 1.5:.0f}-{n_questions * 2:.0f} minutes")
    print(f"  - Total: ~{n_questions * 2 + 1:.0f}-{n_questions * 2.5 + 1:.0f} minutes")
    
    # Create output folder
    folder_name = _safe_path_component(selected_regulation['title'], 50)
    output_folder = f"RQ_first_{selected_regulation['year']}_{folder_name}"
    os.makedirs(output_folder, exist_ok=True)
    
    print(f"\nOutput folder: {output_folder}")
    print("\n" + "="*80 + "\n")
    
    # ============================================================================
    # LOAD API KEY
    # ============================================================================
    
    # Prefer the ANTHROPIC_API_KEY environment variable so the secret does not
    # have to be stored in the notebook; the literal remains as a fallback.
    api_key = os.environ.get("ANTHROPIC_API_KEY") or "enter API key here"
    # Do not echo any part of the key: notebook output gets saved and shared.
    print("API key loaded.")
    llm_client = anthropic.Anthropic(api_key=api_key)
    print("LLM client created successfully!\n") 
    
    # ============================================================================
    # STEP 2: GENERATE RESEARCH QUESTIONS
    # ============================================================================
    
    regulation_name = selected_regulation['title']
    regulation_description = selected_regulation['detailed_description']
    
    print("STEP 2: Generating research questions...\n")
    
    questions_results = generate_research_questions(
        regulation_name=regulation_name,
        regulation_description=regulation_description,
        llm_client=llm_client,
        n_questions=n_questions
    )
    
    if not questions_results or not questions_results['research_questions']:
        print("\n‚ùå Failed to generate research questions. Exiting.")
    else:
        print(f"\n‚úÖ Generated {len(questions_results['research_questions'])} research questions:\n")
        
        for i, rq in enumerate(questions_results['research_questions'], 1):
            print(f"{i:2d}. {rq.get('research_question', 'N/A')[:80]}...")
        
        print("\n" + "="*80 + "\n")
        
        # ============================================================================
        # STEP 3: IDENTIFY OUTCOME VARIABLES
        # ============================================================================
        
        print("STEP 3: Identifying best outcome variable for each research question...\n")
        
        outcome_variables = []
        # Positions whose outcome lookup failed; they keep a placeholder so the
        # list stays aligned with the questions, but get no proposal in step 4.
        failed_outcome_indices = set()
        
        for idx, rq in enumerate(questions_results['research_questions']):
            logger.info(f"[{idx+1}/{len(questions_results['research_questions'])}] Identifying outcome for: {rq.get('research_question', 'N/A')[:60]}...")
            
            outcome = identify_outcome_variable_for_question(
                regulation_name=regulation_name,
                regulation_description=regulation_description,
                research_question=rq,
                llm_client=llm_client
            )
            
            if outcome:
                outcome_variables.append(outcome)
                logger.info(f"  ‚Üí Outcome variable: {outcome.get('variable_name', 'N/A')}")
            else:
                # Add placeholder if failed
                outcome_variables.append({"variable_name": "Unknown", "definition": "Failed to generate"})
                failed_outcome_indices.add(idx)
            
            time.sleep(1)  # Rate limiting
        
        print(f"\n‚úÖ Identified {len(outcome_variables)} outcome variables")
        print("\n" + "="*80 + "\n")
        
        # ============================================================================
        # STEP 4: GENERATE RESEARCH PROPOSALS
        # ============================================================================
        
        print("STEP 4: Generating research proposals...\n")
        
        research_proposals = []
        
        for idx, (rq, outcome) in enumerate(zip(questions_results['research_questions'], outcome_variables)):
            if idx in failed_outcome_indices:
                # A proposal written around the "Unknown" placeholder would be
                # meaningless, so do not spend an API call on it.
                logger.warning(f"[{idx+1}/{len(questions_results['research_questions'])}] Skipping proposal: no outcome variable was identified")
                continue
            
            logger.info(f"[{idx+1}/{len(questions_results['research_questions'])}] Generating proposal for: {rq.get('research_question', 'N/A')[:60]}...")
            
            proposal = generate_proposal_from_question_and_outcome(
                regulation_name=regulation_name,
                regulation_description=regulation_description,
                research_question=rq,
                outcome_variable=outcome,
                llm_client=llm_client
            )
            
            if proposal:
                research_proposals.append(proposal)
                
                # Save individual proposal as JSON and TXT
                safe_varname = _safe_path_component(outcome.get('variable_name', 'outcome'), 30)
                
                # Save JSON
                json_filename = f"proposal_{idx+1:03d}_{safe_varname}.json"
                with open(os.path.join(output_folder, json_filename), "w", encoding="utf-8") as f:
                    json.dump(proposal, f, indent=2)
                
                # Save TXT
                txt_filename = f"proposal_{idx+1:03d}_{safe_varname}.txt"
                with open(os.path.join(output_folder, txt_filename), "w", encoding='utf-8') as f:
                    f.write("="*80 + "\n")
                    f.write(f"Research Proposal #{idx+1}\n")
                    f.write("="*80 + "\n\n")
                    f.write(f"Title:\n{proposal.get('title', 'N/A')}\n\n")
                    f.write("-"*80 + "\n\n")
                    f.write(f"Original Research Question: {rq.get('research_question', 'N/A')}\n\n")
                    f.write(f"Outcome Variable: {outcome.get('variable_name', 'N/A')}\n\n")
                    f.write(f"Research Question:\n{proposal.get('research_question', 'N/A')}\n\n")
                    f.write("-"*80 + "\n\n")
                    f.write(f"Motivation:\n{proposal.get('motivation', 'N/A')}\n\n")
                    f.write("-"*80 + "\n\n")
                    f.write(f"Theoretical Framework and Existing Work:\n{proposal.get('theoretical_framework', 'N/A')}\n\n")
                    f.write("-"*80 + "\n\n")
                    f.write(f"Research Design:\n{proposal.get('research_design', 'N/A')}\n\n")
                    f.write("-"*80 + "\n\n")
                    f.write(f"Contribution:\n{proposal.get('contribution', 'N/A')}\n\n")
                    f.write("="*80 + "\n")
                    f.write("References\n")
                    f.write("="*80 + "\n\n")
                    f.write(f"{proposal.get('references', 'N/A')}\n\n")
                    f.write("="*80 + "\n")
                    f.write(f"Word Count: {proposal.get('word_count', 'N/A')}\n")
                    f.write("="*80 + "\n")
                
                logger.info(f"  ‚Üí Saved: {json_filename} and {txt_filename} (Word count: {proposal.get('word_count', 'N/A')})")
            
            time.sleep(2)  # Rate limiting
        
        # ============================================================================
        # SAVE RESULTS
        # ============================================================================
        
        results = {
            "regulation": regulation_name,
            "regulation_description": regulation_description,
            "generation_timestamp": datetime.now().isoformat(),
            "approach": "Research Questions First",
            "n_questions": len(questions_results['research_questions']),
            "n_outcomes": len(outcome_variables),
            "n_proposals_generated": len(research_proposals),
            "research_questions": questions_results['research_questions'],
            "outcome_variables": outcome_variables,
            "research_proposals": research_proposals
        }
        
        # Save research questions
        with open(os.path.join(output_folder, "research_questions.json"), "w", encoding="utf-8") as f:
            json.dump(questions_results, f, indent=2)
        
        # Save outcome variables
        with open(os.path.join(output_folder, "outcome_variables.json"), "w", encoding="utf-8") as f:
            json.dump({"outcome_variables": outcome_variables}, f, indent=2)
        
        # Save regulation details (may hold NumPy scalars read from the CSV)
        with open(os.path.join(output_folder, "selected_regulation.json"), "w", encoding="utf-8") as f:
            json.dump(convert_to_json_serializable(selected_regulation), f, indent=2)
        
        # Save complete results
        with open(os.path.join(output_folder, "complete_results.json"), "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        
        # Create summary report
        create_summary_report(results, output_folder)
        
        # ============================================================================
        # DISPLAY RESULTS
        # ============================================================================
        
        print("\n" + "="*80)
        print("‚úÖ GENERATION COMPLETE!")
        print("="*80)
        print(f"\nRegulation: {regulation_name}")
        print(f"Approach: Research Questions First")
        print(f"Output Folder: {output_folder}")
        print(f"\nüìä Summary:")
        print(f"  - Research Questions: {results['n_questions']}")
        print(f"  - Outcome Variables: {results['n_outcomes']}")
        print(f"  - Research Proposals: {results['n_proposals_generated']}")
        
        # Show word count statistics (counts are the ones reported in each proposal)
        word_counts = [p.get('word_count', 0) for p in research_proposals if isinstance(p.get('word_count'), int)]
        if word_counts:
            print(f"\nüìù Word Count Statistics:")
            print(f"  - Average: {sum(word_counts)/len(word_counts):.0f} words")
            print(f"  - Min: {min(word_counts)} words")
            print(f"  - Max: {max(word_counts)} words")
        
        print(f"\nüìÅ Generated files:")
        print("  üìä SUMMARY_REPORT.md")
        print("  üìÅ complete_results.json")
        print("  üìÅ research_questions.json")
        print("  üìÅ outcome_variables.json")
        print("  üìÅ selected_regulation.json")
        # Only list proposal files when at least one was written
        if research_proposals:
            print(f"  üìÑ proposal_001_*.json + .txt ... proposal_{len(research_proposals):03d}_*.json + .txt")
        print("="*80)    