# In [None]:
#Code was developed in collaboration with Claude4-Sonnet
#Claude was used for code generation, documentation, and error handling

# 7. Send regression results to Claude for interpretation - ROBUSTNESS TEST FOR SPURIOUS CORRELATIONS
import json
import os
import glob
import re
from typing import Dict, List
from anthropic import Anthropic


class RegressionInterpreter:
    """Ask Claude to interpret saved regression results, screening for spurious correlations.

    For every ``panel_*`` directory under ``output_dir`` the interpreter reads
    ``regression_results.json`` and the matching hypothesis text file, sends
    both to Claude, strips markdown from the reply and writes it below
    ``spurious_output_dir``.
    """

    # Paths longer than this many characters receive the Windows
    # extended-length prefix so that file APIs accept them.
    _LONG_PATH_THRESHOLD = 250
    _LONG_PATH_PREFIX = "\\\\?\\"

    # Marker the prompt asks Claude to emit when it refuses to write an analysis.
    _SPURIOUS_MARKER = "SPURIOUS RELATIONSHIP DETECTED"
    # Prefix of the string generate_claude_interpretation returns on API failure.
    _API_ERROR_PREFIX = "Error in Claude analysis:"

    # Folder that contains the 'background and hypothesis development'
    # sub-folder. Override on the class or on an instance before running.
    hypothesis_base_dir = r"enter folder path here"

    # Directory names whose hypothesis file uses an abbreviated name and
    # therefore cannot be derived mechanically from the directory name.
    _HYPOTHESIS_FILE_OVERRIDES = {
        # Shortened AIFMD names -> corrected hypothesis files
        "panel_AIFMD_EU_CorpGov":
            "Alternative_Investment_Fund_Managers_Directive_AIFMD_Eur_Corporate_Governance_background_hypothesis.txt",
        "panel_AIFMD_EU_InfoAsym":
            "Alternative_Investment_Fund_Managers_Directive_AIFMD_Eu_Information_Asymmetry_background_hypothesis.txt",
        "panel_AIFMD_EU_UnsophInv":
            "Alternative_Investment_Fund_Managers_Directive_AIFMD_Eu_Unsophisticated_Investors_background_hypothesis.txt",

        # Long AIFMD names -> corrected hypothesis files
        "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Reputation_Risk":
            "Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Reputation_Risk_background_hypothesis.txt",
        "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Equity_Issuance":
            "Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Equity_Issuance_background_hypothesis.txt",
        "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Litigation_Risk":
            "Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Litigation_Risk_background_hypothesis.txt",
        "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Proprietary_Costs":
            "Alternative_Investment_Fund_Managers_Directive_AIFMD_Europe_Proprietary_Costs_background_hypothesis.txt",
    }

    def __init__(self, input_dir: str, output_dir: str, spurious_output_dir: str, api_key: str):
        """Store the working directories and create the Anthropic client.

        Args:
            input_dir: Stored but not read by any method of this class.
            output_dir: Folder holding one ``panel_*`` sub-folder per regulation,
                each with a ``regression_results.json``.
            spurious_output_dir: Folder under which interpretations are written.
            api_key: Anthropic API key.
        """
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.spurious_output_dir = spurious_output_dir
        self.client = Anthropic(api_key=api_key)

    @classmethod
    def _long_path(cls, path: str) -> str:
        """Return *path* with the Windows extended-length prefix when it is needed.

        The prefix is only meaningful on Windows, requires an absolute path and
        must not be applied twice, so the path is returned unchanged on other
        systems, when it is short enough, or when it already carries the prefix.
        """
        if os.name != "nt" or len(path) <= cls._LONG_PATH_THRESHOLD:
            return path
        if path.startswith(cls._LONG_PATH_PREFIX):
            return path
        return cls._LONG_PATH_PREFIX + os.path.abspath(path)

    def _get_significance_stars(self, pvalue: float) -> str:
        """Return ``***``, ``**``, ``*`` or ``""`` for p < 0.01, 0.05, 0.1 or otherwise."""
        if pvalue < 0.01:
            return "***"
        elif pvalue < 0.05:
            return "**"
        elif pvalue < 0.1:
            return "*"
        return ""

    def _get_significance_level(self, pvalue: float) -> str:
        """Return a phrase describing the significance level of *pvalue*."""
        if pvalue < 0.01:
            return "at the 1% level"
        elif pvalue < 0.05:
            return "at the 5% level"
        elif pvalue < 0.1:
            return "at the 10% level"
        return "not statistically significant"

    def read_regression_results(self, regulation_name: str) -> Dict:
        """Load ``regression_results.json`` for one regulation directory.

        Returns:
            The parsed JSON, or an empty dict when the file is missing,
            is not valid JSON, or cannot be accessed (a message is printed).
        """
        results_path = self._long_path(
            os.path.join(self.output_dir, regulation_name, 'regression_results.json')
        )

        try:
            with open(results_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"No results file found for {regulation_name}")
            return {}
        except json.JSONDecodeError:
            print(f"Error reading results file for {regulation_name}")
            return {}
        except Exception as e:
            print(f"Error accessing {regulation_name}: {str(e)}")
            return {}

    def _create_exact_hypothesis_mapping(self) -> Dict[str, str]:
        """Return an empty mapping; kept only so existing callers keep working.

        The corrected mapping from _create_corrected_hypothesis_mapping is the
        one actually used.
        """
        return {}

    def _create_corrected_hypothesis_mapping(self) -> Dict[str, str]:
        """Return a copy of the directory-name -> hypothesis-file overrides."""
        return dict(self._HYPOTHESIS_FILE_OVERRIDES)

    def _get_hypothesis_filename_for_directory(self, regulation_name: str) -> str:
        """Return the hypothesis file name belonging to a regulation directory.

        A manual override wins; otherwise the leading ``panel_`` (if present)
        is dropped and ``_background_hypothesis.txt`` is appended.
        """
        override = self._HYPOTHESIS_FILE_OVERRIDES.get(regulation_name)
        if override is not None:
            return override

        # Only strip the prefix when it is really there; slicing blindly would
        # chop six characters off any other name.
        prefix = "panel_"
        if regulation_name.startswith(prefix):
            regulation_part = regulation_name[len(prefix):]
        else:
            regulation_part = regulation_name
        return f"{regulation_part}_background_hypothesis.txt"

    @staticmethod
    def _extract_hypothesis_text(content: str) -> str:
        """Return the text after the first 'Hypothesis Development' marker.

        The caller guarantees the marker is present. Splitting happens on the
        FIRST occurrence only, so a later repetition of the marker or of
        'H1:' does not truncate the text.
        """
        _, _, section = content.partition("Hypothesis Development")

        development, h1_marker, h1_rest = section.partition("H1:")
        if not h1_marker:
            return f"Hypothesis Development:\n\n{section.strip()}"
        return (
            f"Hypothesis Development:\n\n{development.strip()}"
            f"\n\nH1: {h1_rest.strip()}"
        )

    def read_hypothesis(self, regulation_name: str) -> str:
        """Read the hypothesis text for a regulation directory.

        The file is looked up in the 'background and hypothesis development'
        folder below ``hypothesis_base_dir``.

        Returns:
            The hypothesis-development section (plus the H1 statement when
            present), the whole file when no such section exists, or ``""``
            when the file is missing or unreadable.
        """
        hypothesis_filename = self._get_hypothesis_filename_for_directory(regulation_name)

        hypothesis_dir = os.path.join(
            self.hypothesis_base_dir, 'background and hypothesis development'
        )
        hypothesis_file = self._long_path(os.path.join(hypothesis_dir, hypothesis_filename))

        print(f"Looking for hypothesis file: {hypothesis_filename}")

        try:
            with open(hypothesis_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except FileNotFoundError:
            print(f"Hypothesis file not found: {hypothesis_file}")
            return ""
        except Exception as e:
            print(f"Error reading hypothesis file {hypothesis_filename}: {str(e)}")
            return ""

        if "Hypothesis Development" not in content:
            print(f"No Hypothesis Development section found in {hypothesis_filename}")
            return content  # Return full content if no specific section found

        return self._extract_hypothesis_text(content)

    def format_results_text(self, regulation_title: str, regulation_year: int, results: Dict) -> str:
        """Render the regression results as plain text for the prompt.

        Args:
            regulation_title: Title shown in the heading.
            regulation_year: Value shown in the heading; it is only
                interpolated, and interpret_regulation_impact passes "N/A".
            results: Mapping of specification name to a dict providing
                'coefficients', 't_stats' and 'pvalues' (each keyed by
                'treatment_effect' and by every control name), 'r_squared',
                'n_obs' and 'n_firms'. 'controls' and 'fixed_effects' are
                optional.
        """
        parts = [f"Regression Analysis for {regulation_title} (Year: {regulation_year})\n\n"]

        for spec_name, res in results.items():
            coefficients = res['coefficients']
            t_stats = res['t_stats']
            pvalues = res['pvalues']

            parts.append(f"\nSpecification {spec_name}:\n")
            parts.append(f"Treatment Effect: {coefficients['treatment_effect']:.4f}\n")
            parts.append(f"T-statistic: {t_stats['treatment_effect']:.2f}\n")
            parts.append(f"P-value: {pvalues['treatment_effect']:.4f}\n")
            parts.append(f"R-squared: {res['r_squared']:.4f}\n")
            parts.append(f"Number of observations: {int(res['n_obs'])}\n")
            parts.append(f"Number of firms: {res['n_firms']}\n")

            controls = res.get('controls') or []
            if controls:
                parts.append("\nControl Variables:\n")
                for control in controls:
                    pvalue = pvalues[control]
                    stars = self._get_significance_stars(pvalue)
                    parts.append(
                        f"{control}: {coefficients[control]:.4f}{stars} "
                        f"(t={t_stats[control]:.2f}, p={pvalue:.4f})\n"
                    )

            parts.append("\nFixed Effects:\n")
            for fe, included in (res.get('fixed_effects') or {}).items():
                parts.append(f"{fe}: {'Yes' if included else 'No'}\n")

            parts.append("-" * 50 + "\n")

        return "".join(parts)

    def generate_claude_interpretation(self, regulation_title: str, regulation_year: int, results_text: str, hypothesis_text: str) -> str:
        """Ask Claude for an interpretation of the formatted results.

        ``regulation_title`` and ``regulation_year`` are accepted for interface
        compatibility; the prompt itself only embeds ``hypothesis_text`` and
        ``results_text``.

        Returns:
            The text of the first content block of the reply, or a string
            starting with "Error in Claude analysis:" when the request fails.
        """
        prompt = f"""You are an accounting academic with a PhD in accounting. 
        You should use active voice (e.g. "We find" instead of "It is found"). 
        Use present tense for all established findings. 
        Distinguish between correlation and causation. 
        
        
        IMPORTANT: Before writing any analysis, assess whether the results show a spurious relationship vs. 
        a causal relationship.
        
        A spurious correlation occurs when two variables are correlated but don’t have a causal relationship. 
        In other words, it appears like values of one variable cause changes in the other variable, but that’s not 
        actually happening. For example, ice cream sales in the U.S. and shark attacks in Australia correlate positively.
        As ice cream sales increase in the U.S., there are more shark attacks in Australia. However, common sense tells 
        us that ice cream sales in the U.S. do not cause shark attacks in Australia.
        
        Here’s another example of a spurious correlation: U.S. crude oil imports from Norway correlating with drivers 
        killed in a collision with a railway train. 
        
        For this assessment you will consider whether there are spurious correlations that look like causal relationships. 
        The best way to detect a spurious correlation is through subject-area knowledge. 
        Use your subject-area knowledge to assess correlations and ask lots of questions:
        
        Do they make sense as causal relationships?
        Do they fit established theory?
        Can you find a mechanism for causation?
        Is there a direct link, or are mediator variables involved? 
        
        If you identify spurious correlations, 
        respond with "SPURIOUS RELATIONSHIP DETECTED - CANNOT WRITE A RIGOROUS ACCOUNTING PAPER"
        
        ONLY proceed with writing the analysis if the results appear to represent a causal relationship. If so, follow
        the instructions below:
        
        Write the results description for this analysis as if you were writing an academic paper for an accounting journal, 
        you are studying the association between a change in mandatory disclosure and voluntary disclosure. 
        
        Here is the hypothesis that was developed:
        {hypothesis_text}
        
        Please provide a detailed academic analysis of these regression results:

        {results_text}

        Please structure your analysis as follows (3 paragraphs, ~600 words total):
        1. Label this section Regression Analysis
        2. Main finding (treatment effect interpretation)
        3. Statistical significance and economic magnitude
        4. Model specification comparison
        5. Control variable effects
           Describe whether the relationship is consistent with prior literature
        6. Explain whether the results support the hypothesis stated in the Hypothesis section above

        Write in an academic style suitable for a top accounting journal."""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text
        except Exception as e:
            print(f"Error getting Claude interpretation: {str(e)}")
            return f"{self._API_ERROR_PREFIX} {str(e)}"

    def clean_markdown_formatting(self, content: str) -> str:
        """Strip markdown headers and every asterisk, then collapse blank-line runs."""
        # Remove headers (##, ###, etc.)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)

        # Bold/italic markers are asterisks; removing every asterisk covers
        # **text**, ***text*** and any stray ones in a single pass.
        content = content.replace('*', '')

        # Clean up any extra whitespace that might be left
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

        return content

    def interpret_regulation_impact(self, regulation_name: str) -> str:
        """Produce, save and return Claude's interpretation for one regulation.

        Returns:
            The cleaned interpretation, or ``""`` when the regression results
            or the hypothesis are missing or the Claude request failed. In
            those cases nothing is written to disk.
        """
        # Check if regression results exist
        results = self.read_regression_results(regulation_name)
        if not results:
            print(f"No regression results found for {regulation_name}")
            return ""

        # Check if hypothesis exists
        hypothesis_text = self.read_hypothesis(regulation_name)
        if not hypothesis_text:
            print(f"No hypothesis found for {regulation_name}")
            return ""

        print(f"Both regression results and hypothesis found for {regulation_name} - proceeding with interpretation")

        # Use regulation name as title since we don't have panel files
        regulation_title = regulation_name.replace('panel_', '').replace('_', ' ').title()
        regulation_year = "N/A"

        results_text = self.format_results_text(regulation_title, regulation_year, results)

        interpretation = self.generate_claude_interpretation(
            regulation_title,
            regulation_year,
            results_text,
            hypothesis_text
        )

        # A failed request must not be stored or reported as a valid analysis.
        if interpretation.startswith(self._API_ERROR_PREFIX):
            print(f"Claude request failed for {regulation_name} - nothing saved")
            return ""

        # Clean markdown formatting before saving
        clean_interpretation = self.clean_markdown_formatting(interpretation)

        # Build paths from the un-prefixed directory so the long-path prefix
        # is applied at most once to each final path.
        regulation_dir = os.path.join(self.spurious_output_dir, regulation_name)
        os.makedirs(self._long_path(regulation_dir), exist_ok=True)

        # The file name records whether Claude flagged the result as spurious
        if self._SPURIOUS_MARKER in clean_interpretation:
            filename = 'claude_spurious_detection.txt'
            print(f"SPURIOUS RELATIONSHIP DETECTED for {regulation_name}")
        else:
            filename = 'claude_interpretation.txt'
            print(f"Valid relationship detected for {regulation_name}")

        interpretation_path = self._long_path(os.path.join(regulation_dir, filename))

        try:
            with open(interpretation_path, 'w', encoding='utf-8') as f:
                f.write(clean_interpretation)
            print(f"Saved interpretation to {interpretation_path}")
        except Exception as e:
            print(f"Error saving interpretation to file: {str(e)}")

        return clean_interpretation

    def analyze_all_regulations(self) -> None:
        """Interpret every ``panel_*`` directory in ``output_dir`` and print a summary.

        Regulations that yield no interpretation (missing inputs, failed
        request, unexpected error) are reported as skipped and are not counted
        as valid relationships.
        """
        # Sorted so that runs process the directories in a reproducible order
        regulation_dirs = sorted(
            d for d in os.listdir(self.output_dir)
            if d.startswith('panel_') and os.path.isdir(os.path.join(self.output_dir, d))
        )

        print(f"Found {len(regulation_dirs)} regulation directories to analyze")
        print("Directories found:")
        for dir_name in regulation_dirs:
            print(f"  - {dir_name}")

        spurious_count = 0
        valid_count = 0
        skipped_count = 0

        for regulation_name in regulation_dirs:
            try:
                print("\n" + "="*80)
                interpretation = self.interpret_regulation_impact(regulation_name)

                if not interpretation:
                    skipped_count += 1
                elif self._SPURIOUS_MARKER in interpretation:
                    spurious_count += 1
                else:
                    valid_count += 1

                print(interpretation)
                print("="*80 + "\n")
            except Exception as e:
                skipped_count += 1
                print(f"Error analyzing {regulation_name}: {str(e)}")

        # Print summary
        print(f"\n{'='*80}")
        print(f"ANALYSIS SUMMARY:")
        print(f"Valid relationships: {valid_count}")
        print(f"Spurious relationships detected: {spurious_count}")
        print(f"Skipped (missing inputs or errors): {skipped_count}")
        print(f"Total analyzed: {valid_count + spurious_count}")
        print(f"{'='*80}")


def main():
    """Run the spurious-correlation interpretation over every regulation folder.

    The API key and base folder below are placeholders to be filled in before
    running. Regression results are read from, and interpretations written to,
    the same 'regression_analyses' folder under the base folder.
    """
    # Configuration
    api_key = "enter API key here"
    base_dir = r"enter folder path here"

    # Input and output deliberately point at the same sub-folder
    analyses_dir = os.path.join(base_dir, "regression_analyses")
    output_dir = os.path.join(base_dir, "regression_analyses")

    # Make sure the output folder exists before anything is written
    os.makedirs(output_dir, exist_ok=True)

    # No panel files are used, hence None for input_dir
    RegressionInterpreter(None, analyses_dir, output_dir, api_key).analyze_all_regulations()


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()