In [None]:
#Code was developed in collaboration with Claude4-Sonnet
#Claude was used for code generation, documentation, and error handling
#Note: the code will run best when executing each section separately
#Required inputs: CSV files with securities law data, data with "GVKEY" and "FYEAR" as identifiers

# 1. Claude identify global securities laws
import os
import anthropic
import pandas as pd
import re
from typing import List, Dict

def get_securities_laws(conversation_history=None):
    """Send one turn of the securities-law conversation to Claude.

    Args:
        conversation_history: Optional list of ``{"role", "content"}`` message
            dicts to send. When falsy, a new conversation is started with the
            initial database-building prompt defined below.

    Returns:
        A ``(reply_text, history)`` tuple. On success ``history`` is the
        messages that were sent plus the assistant reply. If the API call
        raises, the error is printed and ``(None, messages_sent)`` is
        returned so the caller can keep going.
    """
    # Prefer a key supplied through the environment so the secret never has to
    # be written into the source; the original placeholder stays as fallback.
    client = anthropic.Anthropic(
        api_key=os.environ.get("ANTHROPIC_API_KEY", "enter API here")
    )
    
    # Initial prompt - modified to ensure consistent date format
    initial_content = """Your task is to identify and compile a comprehensive database of at least 100 securities 
    laws around the world - NOT in the U.S. Securities regulation is the field of law that covers transactions and other dealings with securities. 
    Securities laws aim at ensuring that investors receive accurate and necessary information regarding the type and value
    of the interest under consideration for purchase.

IMPORTANT: Only identify new non-U.S.securities regulations. Exclude amendments, updates, or revisions to existing rules. 
Focus on major new laws only.

IMPORTANT: Do not include laws with titles containing the following words: "Amendment", "Update", or "Revision"

IMPORTANT: Before adding any regulation to your list, check if you have already included it. 
Do not include any duplicate entries - each regulation should appear only once.
The goal is to create a dataset that captures the following key details for each law. 

Please follow these guidelines:

Data Fields to Collect:
• Date: The announcement or implementation date of the law (use YYYY-MM-DD format).
• Regulation Title or Name: The official name or designation of the regulatory change. Include the country or jurisdiction
  this law applies to after the law name.
• Regulatory Body/Authority: The government entity responsible for the law.
• Description: A brief overview of the law, including key provisions and the rationale behind it.
• Impact: The potential or observed effects on industries, markets, or stakeholders.
•Litigation Risk: Is this law related to the risk of litigation against managers? By risk of litigation we mean the probability that a manager will be sued or face legal action because of this law. Answer this question with Yes or No. If yes, label the entry "Litigation Risk".
•Corporate Governance: Is this law related to corporate governance of firms? Corporate governance refers to the internal monitoring system charged with overseeing managers and commonly focuses on matters such as board independence or insider trading policy. Answer this question with Yes or No.If yes, label the entry "Corporate Governance".
•Proprietary Costs: Is this law related to proprietary costs of firms? By proprietary costs, we mean costs that result from the disclosure of information to competitors which could harm a firm’s competitive position. Answer this question with Yes or No.If yes, label the entry "Proprietary Costs".
•Information Asymmetry: Is this law related to information asymmetry between owners and managers? By information asymmetry we mean that one party has more or better information than the other party. Answer this question with Yes or No. If yes, label the entry "Information Asymmetry".
•Unsophisticated Investors: Is the law related to protecting unsophisticated investors? By unsophisticated investors, we mean investors that are either new to investing or are not well informed. Answer this question with Yes or No. If yes, label the entry "Unsophisticated Investors".
•Equity Issuance in Public vs. Private Markets: Is this law related to the costs and benefits of issuing equity in public versus private markets? Answer this question with Yes or No. If yes, label the entry "Equity Issuance in Public vs. Private Markets".
•Reputation Risk: Is this law related to the reputation of firm managers? By of firm manager, we mean the career prospects and prestige of an individual manager. Answer this question with Yes or No. If yes, label the entry "Reputation Risk".

• References: References must link directly to the specific regulation or announcement, not to general websites. 
If you cannot find a specific document or article about the regulation, DO NOT INCLUDE THE LAW.

Requirements:
• Scope: Cover as many laws as possible that were announced or implemented in the last 25 years.
• Consistency: Ensure uniform formatting for all entries in the dataset.
• Dates must be in YYYY-MM-DD format (e.g., 2002-07-30).

Output:
Provide data in a tabular format with rows for each law and columns for the data fields listed above. 
Use credible, authoritative sources such as government websites, legal databases, academic journals, or credible news sources.
Do not include duplicate laws.
"""

    # Build the outgoing messages before the try block so the except branch
    # below can always return them.
    if conversation_history:
        messages = conversation_history
    else:
        messages = [{
            "role": "user",
            "content": initial_content
        }]

    try:
        response = client.messages.create(
            max_tokens=8000,
            model="claude-sonnet-4-20250514",
            temperature=0.5,
            messages=messages
        )
        reply_text = response.content[0].text
        return reply_text, messages + [
            {"role": "assistant", "content": reply_text}
        ]
    except Exception as e:
        # Best effort: report the failure and hand back the unsent history.
        print(f"Error making API call: {e}")
        return None, messages

def add_follow_up_prompt(conversation_history, follow_up_prompt):
    """Return a new history with ``follow_up_prompt`` appended as a user turn.

    The list passed in is left unmodified; concatenation builds a new list.
    """
    user_turn = {"role": "user", "content": follow_up_prompt}
    return conversation_history + [user_turn]

def standardize_date(date_str):
    """Attempt to standardize a date to ``YYYY-MM-DD``.

    Args:
        date_str: Any value ``pd.to_datetime`` may be able to parse.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``date_str`` unchanged when
        it cannot be parsed or formatted.
    """
    try:
        # Convert to datetime and then back to string in desired format
        return pd.to_datetime(date_str).strftime('%Y-%m-%d')
    except Exception:
        # Best effort: keep the raw value on any parse/format failure, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
        return date_str

def _split_table_row(line: str) -> list:
    """Split one markdown table row into stripped cells, keeping empty cells."""
    inner = line.strip()
    # Drop only the outer pipes so that blank cells in between survive.
    if inner.startswith('|'):
        inner = inner[1:]
    if inner.endswith('|'):
        inner = inner[:-1]
    return [cell.strip() for cell in inner.split('|')]


def _is_separator_row(line: str) -> bool:
    """Return True for a markdown header separator such as ``|---|:--:|``."""
    return '-' in line and set(line) <= set('|:- \t')


def parse_table_fallback(response_text: str) -> pd.DataFrame:
    """Fallback parser for when Claude returns table format instead of numbered format.

    Lines starting with ``|`` are treated as markdown table rows. The first
    non-separator row is the header; following rows are kept when their cell
    count matches the header. Empty cells are preserved (as empty strings) so
    a blank field does not cause the whole row to be rejected.

    Args:
        response_text: Raw model response that may contain a markdown table.

    Returns:
        A DataFrame with the standard 13 law columns in fixed order, or an
        empty DataFrame when no usable table is found.
    """
    print("Debugging table parsing...")
    
    stripped_lines = (line.strip() for line in response_text.split('\n'))
    table_lines = [line for line in stripped_lines if line.startswith('|') and len(line) > 5]
    
    print(f"Found {len(table_lines)} table lines")
    
    if len(table_lines) < 2:
        print("Not enough table lines found")
        return pd.DataFrame()
    
    # Remove separator lines; only rows made purely of pipes/dashes/colons
    # count, so a data row that merely contains '---' in its text is kept.
    data_lines = [line for line in table_lines if not _is_separator_row(line)]
    print(f"Found {len(data_lines)} data lines (after removing separators)")
    
    if len(data_lines) < 2:
        print("Not enough data lines after removing separators")
        return pd.DataFrame()
    
    # Parse header line
    headers = _split_table_row(data_lines[0])
    
    print(f"Original headers ({len(headers)}): {headers}")
    
    # Parse data rows
    data_rows = []
    for i, line in enumerate(data_lines[1:], 1):
        columns = _split_table_row(line)
        
        if not any(columns):
            print(f"Row {i}: ✗ Skipped - all cells empty")
            continue
        
        if len(columns) == len(headers):
            data_rows.append(columns)
            print(f"Row {i}: ✓ Added ({len(columns)} columns)")
        else:
            print(f"Row {i}: ✗ Skipped - {len(columns)} columns vs {len(headers)} headers")
    
    print(f"Successfully parsed {len(data_rows)} data rows")
    
    if not data_rows:
        print("No valid data rows found")
        return pd.DataFrame()
    
    # Create DataFrame with original headers first
    df = pd.DataFrame(data_rows, columns=headers)
    print(f"Created DataFrame with columns: {list(df.columns)}")

    # The first header mentioning "title" or "name" is taken as the law title
    for col in df.columns:
        if 'title' in col.lower() or 'name' in col.lower():
            df = df.rename(columns={col: 'Regulation Title'})
            print(f"Mapped column '{col}' to 'Regulation Title'")
            break
            
    # Map the known header variants onto the standard column names
    standard_columns = {
        'Date': 'Date',
        'Regulation Title/Name': 'Regulation Title', 
        'Regulatory Body': 'Regulatory Body',
        'Regulatory Body/Authority': 'Regulatory Body',  # Handle variation
        'Description': 'Description',
        'Impact': 'Impact',
        'Litigation Risk': 'Litigation Risk',
        'Corporate Governance': 'Corporate Governance', 
        'Proprietary Costs': 'Proprietary Costs',
        'Information Asymmetry': 'Information Asymmetry',
        'Unsophisticated Investors': 'Unsophisticated Investors',
        'Equity Issuance Public vs Private': 'Equity Issuance',
        'Equity Issuance in Public vs. Private Markets': 'Equity Issuance',  # Handle variation
        'Reputation Risk': 'Reputation Risk',
        'References': 'References'
    }
    
    # Rename columns using the mapping
    df_renamed = df.rename(columns=standard_columns)
    
    # Ensure all required columns exist
    required_columns = [
        'Date', 'Regulation Title', 'Regulatory Body', 'Description', 'Impact',
        'Litigation Risk', 'Corporate Governance', 'Proprietary Costs', 
        'Information Asymmetry', 'Unsophisticated Investors', 'Equity Issuance',
        'Reputation Risk', 'References'
    ]
    
    # Add missing columns with None values
    for col in required_columns:
        if col not in df_renamed.columns:
            print(f"Adding missing column: {col}")
            df_renamed[col] = None
    
    # Select only the required columns in the correct order
    final_df = df_renamed[required_columns].copy()
    
    # Standardize date format (blank or missing dates are left untouched)
    if 'Date' in final_df.columns:
        print("Standardizing dates...")
        final_df['Date'] = final_df['Date'].apply(lambda x: standardize_date(x) if pd.notna(x) and str(x).strip() else x)
    
    # Clean up any completely empty rows
    final_df = final_df.dropna(how='all')
    
    print(f"Final DataFrame: {len(final_df)} rows x {len(final_df.columns)} columns")
    print(f"Final columns: {list(final_df.columns)}")
    
    return final_df
    
def parse_response_to_dataframe(response_text: str) -> pd.DataFrame:
    """Parse the response text into a pandas DataFrame.

    The numbered ``Key: value`` format is tried first: a line such as ``1.``
    or ``**1.**`` starts a new entry and subsequent ``Key: value`` lines fill
    it. Markdown decoration around keys and values (``**Date:**``,
    ``- Authority:``) is stripped before matching, so decorated fields are not
    lost. If no numbered entries are found, the markdown-table parser is used
    as a fallback.

    Args:
        response_text: Raw text of one model response.

    Returns:
        A DataFrame with the 13 standard law columns, de-duplicated on
        date + title, or an empty DataFrame when nothing could be parsed.
    """
    print("\nParsing response...")

    # Accepted field labels -> standard column names (built once, not per line)
    key_mapping = {
        'Date': 'Date',
        'Title': 'Regulation Title',
        'Authority': 'Regulatory Body',
        'Description': 'Description',
        'Impact': 'Impact',
        'Litigation Risk': 'Litigation Risk',
        'Corporate Governance': 'Corporate Governance',
        'Proprietary Costs': 'Proprietary Costs',
        'Information Asymmetry': 'Information Asymmetry',
        'Unsophisticated Investors': 'Unsophisticated Investors',
        'Equity Issuance': 'Equity Issuance',
        'Reputation Risk': 'Reputation Risk',
        'References': 'References'
    }
    entry_start = re.compile(r'^\*?\*?(\d+)\.\*?\*?')

    def _finish(entry, number):
        """Give an untitled entry a placeholder title and return it."""
        if 'Regulation Title' not in entry and number:
            entry['Regulation Title'] = f"Law {number}"
        return entry

    # First try the original numbered format parsing
    data = []
    current_entry = None
    entry_number = None

    lines = [line.strip() for line in response_text.split('\n') if line.strip()]

    for line in lines:
        number_match = entry_start.match(line)
        if number_match:
            if current_entry:
                data.append(_finish(current_entry, entry_number))
            current_entry = {}
            entry_number = number_match.group(1)
            continue

        if ':' not in line or current_entry is None:
            continue

        raw_key, raw_value = line.split(':', 1)
        # Strip bullets / bold / heading markers the model may wrap around keys
        key = raw_key.strip(' \t*_#-•')
        # "**Date:** 2002-07-30" leaves "** 2002-07-30" after the split
        value = raw_value.strip().strip('*').strip()

        column_name = key_mapping.get(key)
        if column_name is None:
            continue
        if column_name == 'Date':
            current_entry[column_name] = standardize_date(value)
        else:
            current_entry[column_name] = value

    if current_entry:
        data.append(_finish(current_entry, entry_number))

    print(f"\nFound {len(data)} entries in numbered format")

    # If numbered format parsing found data, use it
    if data:
        df = pd.DataFrame(data)
        required_columns = ['Date', 'Regulation Title', 'Regulatory Body', 'Description', 'Impact',
                          'Litigation Risk', 'Corporate Governance', 'Proprietary Costs',
                          'Information Asymmetry', 'Unsophisticated Investors', 'Equity Issuance',
                          'Reputation Risk', 'References']

        for col in required_columns:
            if col not in df.columns:
                print(f"Adding missing column: {col}")
                df[col] = None

        # Collapse internal whitespace so near-identical titles compare equal
        df['Regulation Title'] = df['Regulation Title'].fillna('Unknown')
        df['Regulation Title'] = df['Regulation Title'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

        # Same law (date + title) listed twice within one response -> keep first
        df['dedup_key'] = df.apply(lambda row: f"{row['Date']}_{row['Regulation Title']}", axis=1)
        df = df.drop_duplicates(subset=['dedup_key'], keep='first')
        df = df.drop('dedup_key', axis=1)

        df = df[required_columns]
        print(f"Created DataFrame with {len(df)} rows")
        return df.copy()

    # If numbered format failed, try table format as fallback
    print("Numbered format parsing failed, trying table format...")
    df = parse_table_fallback(response_text)

    if not df.empty:
        print(f"Table format parsing succeeded with {len(df)} rows")
        return df
    else:
        print("Both parsing methods failed - no valid data to create DataFrame")
        return pd.DataFrame()
                
def compile_all_responses() -> pd.DataFrame:
    """Compile multiple API responses into a single DataFrame.

    Sends the initial prompt, then a fixed series of follow-up prompts in the
    same conversation, parsing each response exactly once. The parsed frames
    are concatenated, de-duplicated on date + title + first 50 characters of
    the description, and sorted newest first.

    Returns:
        The combined DataFrame of laws, or an empty DataFrame when no response
        could be parsed. Rows whose date cannot be parsed are dropped (a
        message reports how many).
    """
    # One parsed frame per successful response; reused for counting and for
    # the final concatenation so no response has to be parsed twice.
    parsed_frames = []

    # Get initial response
    initial_response, conversation_history = get_securities_laws()
    if initial_response:
        print("\nInitial response:")
        print(initial_response)

        follow_up_prompts = [
            """Starting with number {last_num}, list 20 more non-U.S. securities laws using this exact format for each:
Date: YYYY-MM-DD
Title: [title]
Authority: [body]
Description: [brief]
Impact: [impact]
Litigation Risk: Yes/No
Corporate Governance: Yes/No
Proprietary Costs: Yes/No
Information Asymmetry: Yes/No
Unsophisticated Investors: Yes/No
Equity Issuance: Yes/No
Reputation Risk: Yes/No
References: [link]""",

            "Continue from number {last_num}. Provide 20 more laws using the exact same format.",

            "List 20 more laws starting at number {last_num}. Use the same format.",

            "Add 20 more laws beginning with number {last_num}. Keep the same format.",

            "Provide 20 more laws from number {last_num}. Same format.",

            "Recall that you have to identify at least 100 securities laws. Recall securities regulation is the field of law that covers transactions and other dealings with securities. Securities laws aim at ensuring that investors receive accurate and necessary information regarding the type and value of the interest under consideration for purchase."
        ]

        initial_df = parse_response_to_dataframe(initial_response)
        parsed_frames.append(initial_df)
        # Next entry number to ask the model to start from
        last_num = len(initial_df) + 1

        for i, prompt_template in enumerate(follow_up_prompts, 1):
            prompt = prompt_template.format(last_num=last_num)
            conversation_history = add_follow_up_prompt(conversation_history, prompt)
            response, conversation_history = get_securities_laws(conversation_history)

            if response:
                print(f"\nFollow-up response {i}:")
                print(response)
                df = parse_response_to_dataframe(response)
                parsed_frames.append(df)
                last_num += len(df)

    # Keep only the responses that yielded at least one row
    dfs = [frame for frame in parsed_frames if not frame.empty]

    if not dfs:
        print("No valid data frames were created!")
        return pd.DataFrame()

    # Concatenate all DataFrames
    final_df = pd.concat(dfs, ignore_index=True)

    # Remove duplicates using multiple fields to better identify unique laws
    final_df['Title_clean'] = final_df['Regulation Title'].fillna('').str.lower().str.strip()
    final_df['Description_clean'] = final_df['Description'].fillna('').str.lower().str.strip()

    # Create composite key for deduplication
    final_df['dedup_key'] = final_df.apply(
        lambda row: f"{row['Date']}_{row['Title_clean']}_{row['Description_clean'][:50]}",
        axis=1
    )

    # Remove duplicates and cleanup
    final_df = final_df.drop_duplicates(subset=['dedup_key'], keep='first')
    final_df = final_df.drop(['Title_clean', 'Description_clean', 'dedup_key'], axis=1)

    # Sort by date (newest first); rows without a parseable date are dropped
    try:
        final_df['DateSort'] = pd.to_datetime(final_df['Date'], errors='coerce')
        undated = int(final_df['DateSort'].isna().sum())
        if undated:
            print(f"Warning: dropping {undated} rows with unparseable dates")
        final_df = final_df.dropna(subset=['DateSort'])
        final_df = final_df.sort_values('DateSort', ascending=False)
        final_df = final_df.drop('DateSort', axis=1)
    except Exception as e:
        print(f"Warning: Could not sort by date due to: {e}")
        print("Problematic dates:")
        print(final_df['Date'].value_counts())

    # Return the final DataFrame
    return final_df

if __name__ == "__main__":
    # Section 1 entry point: query the model, build the law table, save it.
    df = compile_all_responses()

    if not df.empty:
        # Quick summary of what was collected
        print(f"\nTotal number of unique laws: {len(df)}")
        print("\nMost recent laws:")
        print(df.head().to_string())

        # Persist the table (without the index column)
        output_path = 'enter file path here'
        df.to_csv(output_path, index=False)
        print(f"\nDatabase saved to: {output_path}")
    else:
        print("\nError: No data was collected!")
        
# 2. Add column for Year

import pandas as pd

df = pd.read_csv("enter file path here")

# Strip parentheses and dashes from free-text columns. 'Date' and 'References'
# are left alone: removing dashes corrupts URLs and mangles ISO dates.
# regex=True is passed explicitly so the result does not depend on the pandas
# version's default for str.replace.
protected_columns = {'Date', 'References'}
for column in df.select_dtypes(include=['object']).columns:
    if column in protected_columns:
        continue
    df[column] = df[column].str.replace(r'[()\-]', '', regex=True)

df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year
df

# Keep only laws dated 2002-2017. Years before 2002 and 2020 onward are
# excluded because we have no forecast data for them. 2018 and 2019 are
# excluded because each law needs two years of data after it and our data
# ends in 2019. A range test is used instead of an explicit list of years so
# that years outside the list (e.g. before 1986 or after 2024) cannot slip
# through; rows with a missing year are dropped as well.
filtered_df = df[df['Year'].between(2002, 2017)]

filtered_df_with_titles = filtered_df.dropna(subset=["Regulatory Body"])

filtered_df_with_titles.to_csv("enter file path here")

# 3. Create Panel Datasets for Each Law and Each Channel
import pandas as pd
import os
import re

def add_underscores_before_capitals(text):
    """Insert ``_`` wherever a lowercase letter or digit is followed by a capital."""
    # Zero-width match: sits between the two characters, consumes neither.
    boundary = re.compile(r'(?<=[a-z0-9])(?=[A-Z])')
    return boundary.sub('_', text)

def create_channel_panels(laws_file: str, panel_file: str, output_dir: str) -> None:
    """
    Creates separate panel datasets for each law and each channel marked as "Yes".

    For every law, the firm-year panel is copied, the law's fields are
    broadcast onto every row, and treatment indicators are added. One CSV
    named ``panel_<title>_<channel>.csv`` is written for each channel whose
    value is "yes" (case-insensitive). The panel contents do not depend on
    the channel, so the panel is built once per law and reused for all of its
    channels.

    Args:
        laws_file: CSV of laws; must contain 'Year', the channel columns and
            the descriptive law columns listed below.
        panel_file: CSV of firm-year data; must contain 'FYEAR'.
        output_dir: Directory for the output CSVs (created if missing).

    Errors while processing a law are printed and that law is skipped.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Read the input files
    laws_df = pd.read_csv(laws_file)
    panel_df = pd.read_csv(panel_file)
    
    # Make sure both year columns are numeric so they can be compared
    laws_df['Year'] = pd.to_numeric(laws_df['Year'])
    panel_df['FYEAR'] = pd.to_numeric(panel_df['FYEAR'])
    
    # Define channels to check for "Yes"
    channels = [
        'Litigation Risk',
        'Corporate Governance',
        'Proprietary Costs',
        'Information Asymmetry',
        'Unsophisticated Investors',
        'Equity Issuance',
        'Reputation Risk'
    ]
    
    # List of columns to bring from laws dataset
    law_columns = [
        'Date', 'Regulation Title', 'Regulatory Body', 'Description', 
        'Impact', 'Litigation Risk', 'Corporate Governance', 
        'Proprietary Costs', 'Information Asymmetry', 
        'Unsophisticated Investors', 'Equity Issuance', 
        'Reputation Risk', 'References', 'Year'
    ]
    
    # Process each law
    for _, law in laws_df.iterrows():
        try:
            # Built lazily on the first "Yes" channel, then reused
            law_panel = None
            safe_title = None
            
            for channel in channels:
                # Only create panel if channel is "Yes"
                if str(law[channel]).strip().lower() != "yes":
                    continue
                
                if law_panel is None:
                    law_panel = panel_df.copy()
                    
                    # Add law information to each row
                    for col in law_columns:
                        law_panel[col] = law[col]
                    
                    # Treatment indicators: every firm is marked treated, so
                    # treatment_effect equals post_law (FYEAR >= law year)
                    law_panel['post_law'] = (law_panel['FYEAR'] >= law['Year']).astype(int)
                    law_panel['treated'] = 1
                    law_panel['treatment_effect'] = law_panel['post_law'] * law_panel['treated']
                    
                    # File-system-safe title: slashes become underscores, then
                    # anything other than alphanumerics, '_' and '-' is dropped
                    safe_title = law['Regulation Title'].replace('/', '_').replace('\\', '_')
                    safe_title = ''.join(c for c in safe_title if c.isalnum() or c in ('_', '-'))
                    
                    # Add underscores before capital letters
                    safe_title = add_underscores_before_capitals(safe_title)
                
                safe_channel = channel.replace(' ', '_')
                
                # Save to CSV
                output_file = os.path.join(output_dir, f"panel_{safe_title}_{safe_channel}.csv")
                law_panel.to_csv(output_file, index=False)
                
                print(f"Created panel dataset for: {law['Regulation Title']} - {channel}")
            
        except Exception as e:
            # Best effort: report the failure and move on to the next law
            print(f"Error processing law {law['Regulation Title']}: {str(e)}")
            continue

if __name__ == "__main__":
    # Section 3 entry point: fill in the three paths, then build the panels.
    laws_file = "enter file path here"
    panel_file = "enter file path here"
    output_dir = "enter folder path here"

    # Writes one CSV per law/channel combination into output_dir
    create_channel_panels(
        laws_file=laws_file,
        panel_file=panel_file,
        output_dir=output_dir,
    )

    print("\nPanel creation complete!")
    
# 4. Add time trend variable, run regression analyses and save regression tables
import pandas as pd
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
from linearmodels.iv import AbsorbingLS
import os
import json
import glob
from fpdf import FPDF
import traceback
import shutil
import re


def add_underscores_before_capitals(text):
    """Return ``text`` with ``_`` inserted at each lowercase/digit-to-capital boundary."""
    return re.sub(
        pattern=r'(?<=[a-z0-9])(?=[A-Z])',  # zero-width: between the two chars
        repl='_',
        string=text,
    )

class RegressionAnalyzer:
    def __init__(self):
        """Initialize regression analyzer"""
        pass

    def _get_significance_stars(self, pvalue: float) -> str:
        """Get significance stars based on p-value."""
        if pvalue < 0.01:
            return "***"
        elif pvalue < 0.05:
            return "**"
        elif pvalue < 0.1:
            return "*"
        return ""

    def add_time_trends(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add time trend variable to the dataset"""
        df_with_trends = df.copy()
        
        # Convert FYEAR to numeric if not already
        df_with_trends['FYEAR'] = pd.to_numeric(df_with_trends['FYEAR'])
        
        # Sort by firm and year to ensure proper ordering
        df_with_trends = df_with_trends.sort_values(['GVKEY', 'FYEAR'])
        
        # Time trend: FYEAR - first_year_in_panel (since each panel has different event windows)
        min_year_in_panel = df_with_trends['FYEAR'].min()
        df_with_trends['time_trend'] = df_with_trends['FYEAR'] - min_year_in_panel
        
        print(f"Added time trend. Range: {df_with_trends['time_trend'].min()} to {df_with_trends['time_trend'].max()}")
        
        return df_with_trends

    def filter_event_window(self, df: pd.DataFrame) -> pd.DataFrame:
        """Filter data to ±2 years around regulation year"""
        try:
            regulation_year = int(df['Year'].iloc[0])
            return df[
                (df['FYEAR'] >= regulation_year - 2) &
                (df['FYEAR'] <= regulation_year + 2)
            ]
        except KeyError as e:
            print(f"Missing column in DataFrame: {e}")
            raise
        except Exception as e:
            print(f"Error during filtering: {e}")
            raise

    def prepare_fixed_effects_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare fixed effects columns for AbsorbingLS"""
        df_prep = df.copy()
        
        # Ensure categorical variables for fixed effects
        df_prep['firm_id'] = df_prep['GVKEY'].astype('category')
        
        print(f"Number of firms for fixed effects: {len(df_prep['firm_id'].unique())}")
        
        return df_prep

    def run_regressions(self, df: pd.DataFrame) -> dict:
        """Run multiple regression specifications with and without fixed effects

        Three specifications of ``freqMF`` on ``treatment_effect`` are fitted:
        (1) OLS with no controls, (2) OLS with the control set plus
        ``time_trend``, and (3) the same regressors estimated with
        ``AbsorbingLS`` while absorbing ``firm_id``. Standard errors are
        clustered on ``GVKEY`` in every specification.

        Args:
            df: Panel data containing ``GVKEY``, ``freqMF``,
                ``treatment_effect`` and, for specifications (2) and (3),
                the control columns and ``time_trend``.

        Returns:
            Dict keyed by specification label ('(1)', '(2)', '(3)'). Each
            value holds 'coefficients', 'pvalues', 't_stats', 'r_squared',
            'n_obs', 'n_firms', 'controls' and 'fixed_effects'. A
            specification that raises is logged and left out.

        Raises:
            ValueError: If no specification completes successfully.
        """
        results_dict = {}
        
        # Prepare data for fixed effects
        df_prep = self.prepare_fixed_effects_data(df.copy())
        
        # Specification table: dependent variable, controls, estimator and
        # (for AbsorbingLS) the columns to absorb as fixed effects
        specifications = {
            '(1)': {
                'dep_var': 'freqMF',
                'controls': [],
                'method': 'OLS',
                'absorb': None
            },
            '(2)': {
                'dep_var': 'freqMF',
                'controls': ['linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss', 'lcalrisk','time_trend'],
                'method': 'OLS',
                'absorb': None
            },
            '(3)': {
                'dep_var': 'freqMF',
                'controls': ['linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss', 'lcalrisk','time_trend'],
                'method': 'AbsorbingLS',
                'absorb': ['firm_id']
            }
        }

        for spec_name, spec in specifications.items():
            print(f"\nRunning regression for specification {spec_name}")
            try:
                dep_var = spec['dep_var']
                controls = spec.get('controls', [])
                # Regressors: the controls followed by the treatment indicator
                variables = controls + ['treatment_effect']

                # Clean data
                print("Getting required columns...")
                # GVKEY is always needed because it is the clustering variable
                required_columns = variables + [dep_var, 'GVKEY']
                
                # Add fixed effects columns if needed
                if spec.get('absorb'):
                    for absorb_var in spec['absorb']:
                        if absorb_var not in required_columns:
                            required_columns.append(absorb_var)
                
                # Check for missing columns
                if not all(col in df_prep.columns for col in required_columns):
                    missing_cols = [col for col in required_columns if col not in df_prep.columns]
                    print(f"Missing columns: {missing_cols}")
                    raise ValueError(f"Missing required columns: {missing_cols}")
            
                # Listwise deletion: infinities become NaN, then any row with
                # a NaN in a required column is dropped
                reg_data = df_prep[required_columns].copy()
                reg_data = reg_data.replace([np.inf, -np.inf], np.nan)
                reg_data = reg_data.dropna()
            
                print(f"Observations: {len(reg_data)}")
                print(f"Number of unique firms: {len(reg_data['GVKEY'].unique())}")

                # Prepare dependent and independent variables
                y = reg_data[dep_var]
                X = reg_data[variables]

                if spec['method'] == 'OLS':
                    # Standard OLS for specifications without fixed effects
                    X_with_const = add_constant(X)
                    model = OLS(y, X_with_const)
                    # Fit with clustered standard errors at firm level
                    results = model.fit(cov_type='cluster', cov_kwds={'groups': reg_data['GVKEY']})

                    # Store results
                    results_dict[spec_name] = {
                        'coefficients': results.params.to_dict(),
                        'pvalues': results.pvalues.to_dict(),
                        # t-statistics computed as coefficient / standard error
                        't_stats': (results.params / results.bse).to_dict(),
                        'r_squared': results.rsquared,
                        'n_obs': int(results.nobs),
                        'n_firms': len(reg_data['GVKEY'].unique()),
                        'controls': controls,
                        'fixed_effects': {
                            'firm': False
                        }
                    }
                    
                elif spec['method'] == 'AbsorbingLS':
                    # Use AbsorbingLS for high-dimensional fixed effects
                    absorb_vars = spec['absorb']
                    print(f"Absorbing fixed effects: {absorb_vars}")
                    
                    # Ensure absorption variables are always passed as a DF
                    absorb_data = reg_data[absorb_vars]
                    
                    # Create the AbsorbingLS model (X is passed as-is; no
                    # constant column is added in this branch)
                    model = AbsorbingLS(
                        dependent=y,
                        exog=X,
                        absorb=absorb_data
                    )
                    
                    # Fit with clustered standard errors at firm level
                    results = model.fit(cov_type='clustered', clusters=reg_data['GVKEY'])
                    
                    results_dict[spec_name] = {
                        'coefficients': results.params.to_dict(),
                        'pvalues': results.pvalues.to_dict(),
                        't_stats': results.tstats.to_dict(),
                        'r_squared': results.rsquared,
                        'n_obs': int(results.nobs),
                        'n_firms': len(reg_data['GVKEY'].unique()),
                        'controls': controls,
                        'fixed_effects': {
                            'firm': 'firm_id' in absorb_vars
                        }
                    }

                print(f"Successfully completed regression for specification {spec_name}")
                
            except Exception as e:
                # A failing specification is reported and skipped so the
                # remaining ones can still run
                print(f"Error in specification {spec_name}: {str(e)}")
                traceback.print_exc()
                continue
        
        if not results_dict:
            raise ValueError("No successful regressions completed")
        
        return results_dict

    def save_regression_table_as_pdf(self, results: dict, regulation_title: str, output_path: str):
        """Save regression table as PDF matching the target format for 3 columns.

        Args:
            results: Mapping from specification label ('(1)', '(2)', '(3)') to a dict
                holding 'coefficients', 't_stats', 'pvalues', 'fixed_effects',
                'n_obs' and 'r_squared'. A specification that is absent from the
                mapping produces blank cells ("No" in the fixed-effects row).
            regulation_title: Law name inserted into the table title.
            output_path: Path the PDF is written to.

        Returns:
            None. Any error is printed together with a traceback and swallowed.
        """
        spec_labels = [f'({i})' for i in range(1, 4)]  # 3 specifications: (1), (2), (3)
        margin = 15           # left / top / right page margin
        row_height = 7
        first_col_width = 70  # Wider for variable names since we have fewer columns

        # Control variables for specifications (2) and (3)
        control_labels = {
            'linstown': 'Institutional ownership',
            'lsize': 'Firm size',
            'lbtm': 'Book-to-market',
            'lroa': 'ROA',
            'lsaret12': 'Stock return',
            'levol': 'Earnings volatility',
            'lloss': 'Loss',
            'lcalrisk': 'Class action litigation risk',
            'time_trend': 'Time Trend'
        }

        def coefficient_text(spec: str, var: str, optional: bool) -> str:
            """Return 'coef<stars> (|t|)' for one cell, or '' when there is nothing to show.

            With optional=False a variable missing from an existing specification
            raises KeyError, which aborts the table (handled by the outer except).
            """
            if spec not in results:
                return ""
            spec_result = results[spec]
            if optional and var not in spec_result['coefficients']:
                return ""
            coef = spec_result['coefficients'][var]
            tstat = abs(spec_result['t_stats'][var])
            stars = self._get_significance_stars(spec_result['pvalues'][var])
            return f"{coef:.4f}{stars} ({tstat:.2f})"

        try:
            pdf = FPDF(format='A4', orientation='L')  # Landscape for better fit
            pdf.set_margins(margin, margin, margin)
            pdf.add_page()

            # Title - all in bold Times New Roman.
            # Try the Windows font folder first, then the working directory / font path.
            font_sources = (
                (r'C:\Windows\Fonts\times.ttf', r'C:\Windows\Fonts\timesbd.ttf'),
                ('times.ttf', 'timesbd.ttf'),
            )
            for regular_ttf, bold_ttf in font_sources:
                try:
                    pdf.add_font('Times New Roman', '', regular_ttf, uni=True)
                    pdf.add_font('Times New Roman', 'B', bold_ttf, uni=True)
                    pdf.set_font('Times New Roman', 'B', 11)
                    break
                except Exception:
                    # Font files unavailable at this location; try the next one.
                    continue
            else:
                print("Times New Roman font not found, using Arial Bold")
                pdf.set_font('Arial', 'B', 11)

            # Both title and table number in bold
            pdf.cell(0, 8, "Table 3", ln=True, align='C')
            pdf.cell(0, 8, f"The Impact of {regulation_title} on Management Forecast Frequency", ln=True, align='C')
            pdf.ln(3)

            # Switch back to regular font for table content
            try:
                pdf.set_font('Times New Roman', '', 10)
            except Exception:
                pdf.set_font('Arial', '', 10)

            # Landscape A4: 297mm width - 30mm margins = 267mm available;
            # whatever the first column leaves is shared by the 3 specifications.
            col_width = (pdf.w - 2 * margin - first_col_width) / 3

            def write_row(label: str, cells: list):
                """Emit one bordered table row: a label cell plus one centred cell per spec."""
                pdf.cell(first_col_width, row_height, label, 1)
                for text in cells:
                    pdf.cell(col_width, row_height, text, 1, align='C')
                pdf.ln()

            # Table header
            write_row("", spec_labels)

            # Treatment Effect
            write_row(
                "Treatment Effect",
                [coefficient_text(spec, 'treatment_effect', optional=False) for spec in spec_labels]
            )

            # Control variables (blank where a specification does not include them)
            for var, label in control_labels.items():
                write_row(label, [coefficient_text(spec, var, optional=True) for spec in spec_labels])

            # Fixed effects row
            write_row(
                "Firm fixed effects",
                [
                    "Yes" if spec in results and results[spec]['fixed_effects']['firm'] else "No"
                    for spec in spec_labels
                ]
            )

            # N and R²
            write_row(
                'N',
                [f"{results[spec]['n_obs']:,}" if spec in results else "" for spec in spec_labels]
            )
            write_row(
                'R²',
                [f"{results[spec]['r_squared']:.4f}" if spec in results else "" for spec in spec_labels]
            )

            # Notes
            pdf.ln(10)
            pdf.set_font('Times', size=10)
            notes = "Notes: t-statistics in parentheses. *, **, and *** represent significance at the 10%, 5%, and 1% level, respectively."
            pdf.multi_cell(0, 5, notes)

            pdf.output(output_path)
            print(f"PDF saved at {output_path}")

        except Exception as e:
            print(f"Error saving PDF: {e}")
            traceback.print_exc()

    def analyze_panel(self, panel_file: str, output_dir: str):
        """Analyze a single panel dataset and write its outputs to ``output_dir``.

        Steps: read the CSV, filter the event window, add time trends, run the
        regressions, then save 'filtered_data_with_trends.csv',
        'regression_results.json' and 'regression_table.pdf'.

        Args:
            panel_file: Path to the panel CSV; its 'Regulation Title' column supplies
                the title used in the PDF table.
            output_dir: Directory for the output files (created if missing).

        Raises:
            Exception: Any error raised by a step is printed with a traceback and
                then re-raised, so a batch driver can count the panel as failed
                instead of recording a silent "success".
        """
        try:
            print(f"\nStarting analysis of {os.path.basename(panel_file)}...")
            print("Reading data...")
            df = pd.read_csv(panel_file)

            print("Filtering event window...")
            df_filtered = self.filter_event_window(df)

            print("Adding time trends...")
            df_with_trends = self.add_time_trends(df_filtered)

            print("Running regressions...")
            results = self.run_regressions(df_with_trends)

            print("Saving results...")
            os.makedirs(output_dir, exist_ok=True)

            # Save filtered data with time trends and results
            print("Saving filtered data with time trends...")
            df_with_trends.to_csv(os.path.join(output_dir, 'filtered_data_with_trends.csv'), index=False)
            with open(os.path.join(output_dir, 'regression_results.json'), 'w') as f:
                json.dump(results, f, indent=4)

            # Save regression table
            table_path = os.path.join(output_dir, 'regression_table.pdf')
            self.save_regression_table_as_pdf(
                results,
                df['Regulation Title'].iloc[0],
                table_path
            )
            print(f"Saved regression table to {table_path}")

        except Exception as e:
            print(f"Error analyzing panel: {str(e)}")
            traceback.print_exc()
            # Propagate: swallowing the error here made every panel look successful.
            raise


def analyze_all_panels(input_dir: str, output_dir: str):
    """Analyze all panel datasets in a directory.

    Every ``*.csv`` file directly inside ``input_dir`` is passed to
    ``RegressionAnalyzer.analyze_panel``; its outputs go to a sub-folder of
    ``output_dir`` named after the file (extension removed, then formatted with
    ``add_underscores_before_capitals``). A running progress summary is printed
    after each panel. A panel is counted as failed when processing it raises.

    Args:
        input_dir: Folder containing the panel CSV files.
        output_dir: Root folder for the per-panel output directories.
    """
    analyzer = RegressionAnalyzer()
    # Sorted so that the processing order is reproducible; glob order is
    # filesystem-dependent.
    panel_files = sorted(glob.glob(os.path.join(input_dir, "*.csv")))
    total_files = len(panel_files)
    successful_runs = 0
    failed_runs = 0
    print(f"\nFound {total_files} panel files to analyze")

    for i, panel_file in enumerate(panel_files, 1):
        print(f"\n{'='*80}")
        print(f"Processing panel {i} of {total_files}: {os.path.basename(panel_file)}")
        print(f"{'='*80}")

        try:
            # Strip only the trailing extension; str.replace('.csv', '') would also
            # remove '.csv' occurring in the middle of the file name.
            base_filename = os.path.splitext(os.path.basename(panel_file))[0]

            # Apply the same naming convention (add underscores before capitals)
            formatted_filename = add_underscores_before_capitals(base_filename)

            analyzer.analyze_panel(panel_file, os.path.join(output_dir, formatted_filename))
            successful_runs += 1
            print(f"Successfully processed panel {i}")
        except Exception as e:
            failed_runs += 1
            print(f"Failed to process panel {i}: {str(e)}")

        # Print progress summary
        print(f"\nProgress Summary:")
        print(f"Processed: {i}/{total_files} ({(i/total_files)*100:.1f}%)")
        print(f"Successful: {successful_runs}")
        print(f"Failed: {failed_runs}")


def process_significance(base_dir, delete_nonsig=False):
    """
    Process panels based on significance and either delete or move non-significant results.
    t-stat >= 1.96 is considered significant.

    Each ``panel_*`` entry in ``base_dir`` is judged by the absolute t-statistic of
    'treatment_effect' in specification '(3)' of its 'regression_results.json'.
    Panels below the threshold, or without specification '(3)', are deleted or moved
    to a 'nonsignificant_results' folder next to ``base_dir``. Panels whose results
    cannot be read are reported and left in place.

    Args:
        base_dir: Folder containing the panel result directories.
        delete_nonsig: True to delete non-significant panels, False to move them.
    """
    action = 'deleting' if delete_nonsig else 'moving'

    # Create directory for non-significant results if not deleting
    nonsig_dir = None
    if not delete_nonsig:
        # abspath() drops a trailing separator; without it dirname('x/') is 'x'
        # itself and the folder would be created inside base_dir.
        parent_dir = os.path.dirname(os.path.abspath(base_dir))
        nonsig_dir = os.path.join(parent_dir, 'nonsignificant_results')
        os.makedirs(nonsig_dir, exist_ok=True)

    def discard(panel_dir, panel_name):
        """Delete the panel directory or move it into nonsig_dir."""
        if delete_nonsig:
            shutil.rmtree(panel_dir)
        else:
            shutil.move(panel_dir, os.path.join(nonsig_dir, panel_name))

    # Find all panel directories
    panel_dirs = glob.glob(os.path.join(base_dir, 'panel_*'))
    print(f"Found {len(panel_dirs)} panel directories")

    # Track results
    significant_count = 0
    not_significant_count = 0

    # Process each panel
    for panel_dir in panel_dirs:
        panel_name = os.path.basename(panel_dir)
        json_file = os.path.join(panel_dir, 'regression_results.json')

        try:
            # Read regression results
            with open(json_file, 'r') as f:
                results = json.load(f)

            # Check specification (3) since that's the firm FE specification
            if '(3)' not in results:
                print(f"{panel_name}: No specification (3) found - {action}")
                discard(panel_dir, panel_name)
                not_significant_count += 1
                continue

            t_stat = abs(results['(3)']['t_stats']['treatment_effect'])
            if t_stat >= 1.96:
                print(f"{panel_name}: t-stat = {t_stat:.2f} (significant - keeping)")
                significant_count += 1
            else:
                print(f"{panel_name}: t-stat = {t_stat:.2f} (not significant - {action})")
                discard(panel_dir, panel_name)
                not_significant_count += 1

        except Exception as e:
            print(f"Error processing {panel_name}: {str(e)}")

    # Print summary
    total_processed = significant_count + not_significant_count
    if total_processed > 0:
        print("\nSummary:")
        print(f"Total panels processed: {total_processed}")
        print(f"Significant results: {significant_count} ({(significant_count/total_processed)*100:.1f}%)")
        print(f"Not significant results: {not_significant_count} ({(not_significant_count/total_processed)*100:.1f}%)")
        if not delete_nonsig:
            print(f"\nNon-significant results moved to: {nonsig_dir}")
        else:
            print("\nNon-significant results deleted")
    else:
        print("No panels processed successfully")


if __name__ == "__main__":
    # Configuration: replace both placeholders with real folder paths.
    # INPUT_DIR holds the panel CSV files; OUTPUT_DIR receives one result
    # folder per panel.
    INPUT_DIR = "enter folder path here"
    OUTPUT_DIR = "enter folder path here"

    # Run analysis on all panels
    analyze_all_panels(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR)
    
# 5. Check and keep significant results 
import os
import json
import glob
import shutil

def process_significance(base_dir, delete_nonsig=False):
    """
    Process panels based on significance and either delete or move non-significant results.
    t-stat >= 1.96 is considered significant.

    For every ``panel_*`` entry in ``base_dir`` the absolute t-statistic of
    'treatment_effect' in specification '(3)' is read from
    'regression_results.json'. Non-significant panels, and panels without
    specification '(3)', are deleted (``delete_nonsig=True``) or moved to a
    'nonsignificant_results' folder located next to ``base_dir``. Panels whose
    results cannot be read are reported and left untouched.

    Args:
        base_dir: Folder containing the panel result directories.
        delete_nonsig: True to delete non-significant panels, False to move them.
    """
    # Create directory for non-significant results if not deleting
    nonsig_dir = None
    if not delete_nonsig:
        # abspath() removes a trailing separator so dirname() returns the real
        # parent; otherwise the folder would end up inside base_dir.
        nonsig_dir = os.path.join(
            os.path.dirname(os.path.abspath(base_dir)), 'nonsignificant_results'
        )
        os.makedirs(nonsig_dir, exist_ok=True)

    def remove_panel(panel_dir, panel_name):
        """Delete the panel directory or move it into nonsig_dir."""
        if delete_nonsig:
            shutil.rmtree(panel_dir)
        else:
            shutil.move(panel_dir, os.path.join(nonsig_dir, panel_name))

    verb = 'deleting' if delete_nonsig else 'moving'

    # Find all panel directories
    panel_dirs = glob.glob(os.path.join(base_dir, 'panel_*'))
    print(f"Found {len(panel_dirs)} panel directories")

    # Track results
    significant_count = 0
    not_significant_count = 0

    # Process each panel
    for panel_dir in panel_dirs:
        panel_name = os.path.basename(panel_dir)
        json_file = os.path.join(panel_dir, 'regression_results.json')

        try:
            # Read regression results
            with open(json_file, 'r') as f:
                results = json.load(f)

            # Check specification (3)
            if '(3)' in results:
                t_stat = abs(results['(3)']['t_stats']['treatment_effect'])
                if t_stat >= 1.96:
                    print(f"{panel_name}: t-stat = {t_stat:.2f} (significant - keeping)")
                    significant_count += 1
                    continue
                print(f"{panel_name}: t-stat = {t_stat:.2f} (not significant - {verb})")
            else:
                print(f"{panel_name}: No specification (3) found - {verb}")

            remove_panel(panel_dir, panel_name)
            not_significant_count += 1

        except Exception as e:
            print(f"Error processing {panel_name}: {str(e)}")

    # Print summary
    total_processed = significant_count + not_significant_count
    if total_processed == 0:
        # Nothing to summarise; the percentages below would divide by zero.
        print("No panels processed successfully")
        return

    print("\nSummary:")
    print(f"Total panels processed: {total_processed}")
    print(f"Significant results: {significant_count} ({(significant_count/total_processed)*100:.1f}%)")
    print(f"Not significant results: {not_significant_count} ({(not_significant_count/total_processed)*100:.1f}%)")
    if not delete_nonsig:
        print(f"\nNon-significant results moved to: {nonsig_dir}")
    else:
        print("\nNon-significant results deleted")

# Usage
# Guarded like the other sections' entry points so that importing this file does
# not start moving or deleting result folders.
if __name__ == "__main__":
    base_dir = r"enter folder path here"

    # Choose whether to delete (True) or move (False) non-significant results
    delete_nonsig = False  # Change to True to delete instead of move
    process_significance(base_dir, delete_nonsig)

# 6. Ask Claude to write a background, theoretical framework, and hypothesis development section
import pandas as pd
import json
import os
import re
from anthropic import Anthropic
import glob

class ComprehensiveAnalyzer:
    """Draft background, theory and hypothesis sections for analysed laws via Claude.

    Only laws for which a ``panel_*`` regression folder exists are processed; one
    text file is written per (law, economic mechanism) pair.
    """

    # Economic mechanisms; each one is read as a 'Yes'/'No' column of the laws CSV.
    # All names are exactly two words, which the panel-folder parsing relies on.
    MECHANISMS = (
        'Litigation Risk',
        'Corporate Governance',
        'Proprietary Costs',
        'Information Asymmetry',
        'Unsophisticated Investors',
        'Equity Issuance',
        'Reputation Risk',
    )

    # get_background_hypothesis() reports failures as text starting with one of these.
    _ERROR_PREFIXES = ("Error in analysis:", "Error: No content in response")

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    @staticmethod
    def _clean_title(title) -> str:
        """Turn a regulation title into the law part of a panel folder name.

        Spaces become underscores, the characters ``/ - ( )`` are dropped and runs
        of underscores are collapsed. ``str()`` keeps a missing (NaN) title from
        crashing the run; such a row simply matches no panel.
        """
        cleaned = str(title).replace(' ', '_')
        for char in ('/', '-', '(', ')'):
            cleaned = cleaned.replace(char, '')
        # Remove multiple consecutive underscores
        return re.sub(r'_+', '_', cleaned)

    @staticmethod
    def _law_name_from_panel(panel_name: str) -> str:
        """Extract the law part of a ``panel_<law>_<mechanism>`` folder name."""
        prefix = 'panel_'
        # Strip the prefix only; replacing every 'panel_' would also damage law
        # names that happen to contain that text.
        stem = panel_name[len(prefix):] if panel_name.startswith(prefix) else panel_name
        # Take all parts except the last 2 (which are the two-word mechanism)
        law_part = '_'.join(stem.split('_')[:-2])
        # Apply the same cleaning as CSV names - remove special chars entirely
        return law_part.replace('-', '').replace('(', '').replace(')', '')

    def get_laws_with_regression_results(self, csv_file: str, regression_dir: str) -> list:
        """Read laws from CSV and filter to only those with regression analysis results.

        Args:
            csv_file: CSV with 'Regulation Title', 'Year', 'Regulatory Body',
                'Description', 'Impact' and one 'Yes'/'No' column per mechanism.
            regression_dir: Folder containing the ``panel_*`` result directories.

        Returns:
            One dict per matching law with keys 'title', 'year', 'body',
            'description', 'impact', 'mechanisms' (mechanisms marked 'Yes' in the
            CSV) and 'panel_name' (the cleaned title).
        """
        df = pd.read_csv(csv_file)

        # Get all panel directories in regression_analyses folder
        panel_dirs = glob.glob(os.path.join(regression_dir, 'panel_*'))
        existing_panels = {os.path.basename(panel_dir) for panel_dir in panel_dirs}

        print(f"Found {len(existing_panels)} regression analysis folders")
        print(f"Sample panel names: {list(existing_panels)[:5]}")

        # Extract unique law names from existing panel folders
        law_names_in_panels = {self._law_name_from_panel(name) for name in existing_panels}

        print(f"Unique law names found: {len(law_names_in_panels)}")
        print(f"Sample law names: {list(law_names_in_panels)[:5]}")

        print("\n=== DEBUGGING NAMES ===")
        print("Sample law names from panels:")
        for name in list(law_names_in_panels)[:10]:
            print(f"  '{name}'")

        print("\nSample law names from CSV (after cleaning):")
        for _, row in df.head(10).iterrows():
            clean_title = self._clean_title(row['Regulation Title'])
            print(f"  '{clean_title}' (Original: '{row['Regulation Title']}')")

        laws = []
        laws_found = 0
        laws_with_regression = 0

        for _, row in df.iterrows():
            laws_found += 1

            # Expected panel name for this law; must match the naming convention
            # used when the panels were created.
            clean_title = self._clean_title(row['Regulation Title'])

            # Check if this law has a corresponding regression analysis folder
            if clean_title not in law_names_in_panels:
                print(f"✗ Skipping law: {row['Regulation Title']} (No panel: {clean_title})")
                continue

            laws_with_regression += 1

            # Get active mechanisms (where value is 'Yes')
            active_mechanisms = [mech for mech in self.MECHANISMS if row[mech] == 'Yes']

            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                'mechanisms': active_mechanisms,
                'panel_name': clean_title
            })
            print(f"✓ Including law: {row['Regulation Title']} (Panel: {clean_title})")

        print(f"\nSummary:")
        print(f"Total laws in CSV: {laws_found}")
        print(f"Laws with regression results: {laws_with_regression}")
        print(f"Laws to generate hypotheses for: {len(laws)}")

        return laws

    def clean_markdown_formatting(self, content: str) -> str:
        """Remove all markdown formatting from content"""
        # Remove headers (##, ###, etc.)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)

        # Remove bold/italic formatting - more precise pattern
        # This handles **text**, ***text***, ****text**** but preserves content
        content = re.sub(r'\*{2,4}([^*]+?)\*{2,4}', r'\1', content)
        content = re.sub(r'\*([^*]+?)\*', r'\1', content)  # Handle single asterisks

        # Clean up any remaining asterisks
        content = re.sub(r'\*+', '', content)

        # Clean up any extra whitespace that might be left
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

        return content

    def create_short_filename(self, panel_name: str, mechanism: str, main_dir: str) -> str:
        """Create a shorter filename if the full path would be too long.

        The panel name is truncated so that ``main_dir`` plus the file name stays
        near 250 characters; the mechanism and the fixed suffix are always kept.
        """
        clean_mechanism = mechanism.replace(' ', '_')
        suffix = "_background_hypothesis.txt"

        # Calculate maximum allowed length for panel name
        max_panel_length = 250 - len(main_dir) - len(clean_mechanism) - len(suffix) - 10  # 10 char buffer

        if len(panel_name) > max_panel_length:
            truncated_panel = panel_name[:max_panel_length].rstrip('_')
            filename = f"{truncated_panel}_{clean_mechanism}{suffix}"
        else:
            filename = f"{panel_name}_{clean_mechanism}{suffix}"

        return filename

    def get_background_hypothesis(self, law: dict, mechanism: str) -> str:
        """Get background, theoretical framework, and hypothesis development for a law and specific mechanism.

        Args:
            law: Dict with 'title', 'year', 'body', 'description' and 'impact'.
            mechanism: Economic mechanism (channel) the text should focus on.

        Returns:
            The generated text. On failure no exception is raised; instead a string
            starting with "Error in analysis:" or "Error: No content in response"
            is returned.
        """
        prompt = f"""You are an accounting academic writing a research paper examining {law['title']} and its impact on 
        voluntary disclosure in the U.S. through the {mechanism} channel.
Please write the background, theoretical framework, and hypothesis development section following these guidelines:

Law Details:
Title: {law['title']} ({law['year']})
Regulatory Body: {law['body']}
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Please structure your response as follows:

1. Background (3 paragraphs, ~400 words total):
    - Label this subsection "Background"
    - Describe the relevant securities law in  {law['title']}
    -Include the date that the change is effective ({law['year']}), which firms are affected, and why the change was instituted.  
    -Discuss the effective date ({law['year']}) and implementation details
    -Please also discuss whether there were other contemporaneous securities law adoptions. 
    -Support each claim with citations to foundational papers 

2. Theoretical Framework
    - Begin with a brief introduction connecting the law to the relevant theoretical perspective {mechanism}
    - Explain core concepts of {mechanism}
    - Connect to voluntary disclosure decisions in U.S. firms
    - Link to the specific {mechanism} being studied
    - Support with 2-3 seminal citations

3. Hypothesis Development (3 paragraphs, ~800 words total):
    - Label this subsection "Hypothesis Development"
    - Present economic mechanisms linking {law['title']} to voluntary disclosure decisions in the U.S. through the {mechanism} channel
    - Draw on established theoretical frameworks specifically related to {mechanism}
    - Propose a theoretically supported hypothesis about the relationship between the  
    securities law from file {law['title']} and voluntary disclosure in the U.S. for the specific {mechanism} channel
    - Build logical arguments step by step think through whether prior literature suggests competing theoretical 
    predictions or if the literature suggests only one direction for the relationship. 
    - Present the formal hypothesis statement on its own line, clearly labeled "H1:"
    - Support each claim with citations to foundational papers 

Writing Guidelines:
- Use active voice (e.g., "We examine" instead of "This paper examines")
- Maintain formal academic tone suitable for a top journal
- Include 2-3 citations per paragraph 
- Use present tense for established findings
- Make clear distinctions between correlation and causation
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Accounting Studies
    
IMPORTANT: Include in-text citations but do not include a separate References section at the end."""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            print(f"Error getting background and hypothesis: {str(e)}")
            return f"Error in analysis: {str(e)}"

    def create_background_hypothesis_files(self, csv_file: str, regression_dir: str, output_dir: str):
        """Generate and save background and hypothesis sections for laws with regression results.

        Files go to '<output_dir>/background and hypothesis development'. Pairs whose
        file already exists are skipped. When text generation fails nothing is
        written, so the pair is attempted again on the next run.
        """
        # Create main directory
        main_dir = os.path.join(output_dir, 'background and hypothesis development')

        # Create the directory if it doesn't exist
        try:
            os.makedirs(main_dir, exist_ok=True)
            print(f"Created/verified directory: {main_dir}")
        except Exception as e:
            print(f"Error creating directory {main_dir}: {str(e)}")
            return

        # Get laws that have regression results
        laws = self.get_laws_with_regression_results(csv_file, regression_dir)

        # Process each law and mechanism
        for law in laws:
            print(f"\nProcessing law: {law['title']}")

            # Generate separate background and hypothesis for each mechanism
            for mechanism in law['mechanisms']:
                print(f"Processing mechanism: {mechanism}")

                # Create filename, handling long paths
                filename = self.create_short_filename(law['panel_name'], mechanism, main_dir)
                file_path = os.path.join(main_dir, filename)

                # Debug: print the path length
                print(f"  File path length: {len(file_path)} characters")
                if len(file_path) > 250:
                    print(f"  WARNING: Path might be too long!")

                # Check if file already exists
                if os.path.exists(file_path):
                    print(f"  Skipping {law['title']} - {mechanism}: File already exists")
                    continue

                try:
                    # Get background and hypothesis content
                    content = self.get_background_hypothesis(law, mechanism)

                    # Never persist an error message as if it were generated text:
                    # the existence check above would then block every retry.
                    if content.startswith(self._ERROR_PREFIXES):
                        print(f"  ✗ No text generated for {law['title']} - {mechanism}: file not written")
                        continue

                    # Remove markdown formatting
                    content = self.clean_markdown_formatting(content)

                    # Write the file
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(content)
                    print(f"  ✓ Saved: {filename}")

                except Exception as e:
                    print(f"  ✗ Error saving file for {law['title']} - {mechanism}: {str(e)}")
                    continue

def main():
    """Entry point for section 6: write background/hypothesis files for analysed laws.

    Replace the four placeholder values below before running. Any exception is
    reported on stdout instead of being propagated.
    """
    # Configuration
    API_KEY = "enter API here"
    CSV_FILE = "enter file path here"
    REGRESSION_DIR = r"enter folder path here"
    OUTPUT_DIR = r"enter folder path here"

    try:
        ComprehensiveAnalyzer(API_KEY).create_background_hypothesis_files(
            csv_file=CSV_FILE,
            regression_dir=REGRESSION_DIR,
            output_dir=OUTPUT_DIR,
        )
        print("\nBackground and hypothesis development sections complete!")
    except Exception as exc:
        print(f"Error in main execution: {exc}")


if __name__ == "__main__":
    main()
    
# 7. Send regression results to Claude for interpretation
import json
import os
import glob
import re
from typing import Dict, List
from anthropic import Anthropic

class RegressionInterpreter:
    def __init__(self, input_dir: str, output_dir: str, api_key: str):
        """Remember the working directories and create the Anthropic client.

        Args:
            input_dir: Folder that is searched for the panel CSV files.
            output_dir: Folder with one sub-folder per regulation; each holds
                regression_results.json and receives claude_interpretation.txt.
            api_key: API key handed to the Anthropic client.
        """
        self.input_dir, self.output_dir = input_dir, output_dir
        self.client = Anthropic(api_key=api_key)
        
    def _get_significance_stars(self, pvalue: float) -> str:
        """Get significance stars based on p-value."""
        if pvalue < 0.01:
            return "***"
        elif pvalue < 0.05:
            return "**"
        elif pvalue < 0.1:
            return "*"
        return ""
    
    def _get_significance_level(self, pvalue: float) -> str:
        """Convert p-value to significance level description"""
        if pvalue < 0.01:
            return "at the 1% level"
        elif pvalue < 0.05:
            return "at the 5% level"
        elif pvalue < 0.1:
            return "at the 10% level"
        return "not statistically significant"

    def read_regression_results(self, regulation_name: str) -> Dict:
        """Read regression results JSON file for a specific regulation"""
        results_path = os.path.join(self.output_dir, regulation_name, 'regression_results.json')
        
        try:
            with open(results_path, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"No results file found for {regulation_name}")
            return {}
        except json.JSONDecodeError:
            print(f"Error reading results file for {regulation_name}")
            return {}

    def _create_exact_hypothesis_mapping(self):
        """Create exact 1:1 mapping from panel names to hypothesis files"""
        panel_names = [
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Reputation_Risk",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Unsophisticated_Investors",
            "panel_Capital_Market_Law_Lebanon_Corporate_Governance",
            "panel_Capital_Market_Law_Lebanon_Equity_Issuance",
            "panel_Capital_Market_Law_Lebanon_Information_Asymmetry",
            "panel_Capital_Market_Law_Lebanon_Litigation_Risk",
            "panel_Capital_Market_Law_Lebanon_Proprietary_Costs",
            "panel_Capital_Market_Law_Lebanon_Reputation_Risk",
            "panel_Capital_Market_Law_Lebanon_Unsophisticated_Investors",
            "panel_Capital_Markets_Act_Uganda_Corporate_Governance",
            "panel_Capital_Markets_Act_Uganda_Equity_Issuance",
            "panel_Capital_Markets_Act_Uganda_Information_Asymmetry",
            "panel_Capital_Markets_Act_Uganda_Litigation_Risk",
            "panel_Capital_Markets_Act_Uganda_Proprietary_Costs",
            "panel_Capital_Markets_Act_Uganda_Reputation_Risk",
            "panel_Capital_Markets_Act_Uganda_Unsophisticated_Investors",
            "panel_Capital_Markets_Law_Mexico_Corporate_Governance",
            "panel_Capital_Markets_Law_Mexico_Equity_Issuance",
            "panel_Capital_Markets_Law_Mexico_Information_Asymmetry",
            "panel_Capital_Markets_Law_Mexico_Litigation_Risk",
            "panel_Capital_Markets_Law_Mexico_Proprietary_Costs",
            "panel_Capital_Markets_Law_Mexico_Reputation_Risk",
            "panel_Capital_Markets_Law_Mexico_Unsophisticated_Investors",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Corporate_Governance",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Information_Asymmetry",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Litigation_Risk",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Proprietary_Costs",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Reputation_Risk",
            "panel_Financial_Instruments_and_Exchange_Act_Japan_Corporate_Governance",
            "panel_Financial_Instruments_and_Exchange_Act_Japan_Equity_Issuance",
            "panel_Financial_Instruments_and_Exchange_Act_Japan_Information_Asymmetry",
            "panel_Financial_Instruments_and_Exchange_Act_Japan_Litigation_Risk",
            "panel_Financial_Instruments_and_Exchange_Act_Japan_Reputation_Risk",
            "panel_Financial_Instruments_and_Exchange_Act_Japan_Unsophisticated_Investors",
            "panel_Financial_Market_Supervision_Act_Switzerland_Corporate_Governance",
            "panel_Financial_Market_Supervision_Act_Switzerland_Equity_Issuance",
            "panel_Financial_Market_Supervision_Act_Switzerland_Information_Asymmetry",
            "panel_Financial_Market_Supervision_Act_Switzerland_Litigation_Risk",
            "panel_Financial_Market_Supervision_Act_Switzerland_Reputation_Risk",
            "panel_Financial_Market_Supervision_Act_Switzerland_Unsophisticated_Investors",
            "panel_Financial_Services_Act_2012_United_Kingdom_Corporate_Governance",
            "panel_Financial_Services_Act_2012_United_Kingdom_Information_Asymmetry",
            "panel_Financial_Services_Act_2012_United_Kingdom_Litigation_Risk",
            "panel_Financial_Services_Act_2012_United_Kingdom_Reputation_Risk",
            "panel_Financial_Services_Act_2012_United_Kingdom_Unsophisticated_Investors",
            "panel_Financial_Services_Law_Brazil_Corporate_Governance",
            "panel_Financial_Services_Law_Brazil_Equity_Issuance",
            "panel_Financial_Services_Law_Brazil_Information_Asymmetry",
            "panel_Financial_Services_Law_Brazil_Litigation_Risk",
            "panel_Financial_Services_Law_Brazil_Proprietary_Costs",
            "panel_Financial_Services_Law_Brazil_Reputation_Risk",
            "panel_Financial_Services_Law_Brazil_Unsophisticated_Investors",
            "panel_Markets_in_Financial_Instruments_Directive_Italy_Corporate_Governance",
            "panel_Markets_in_Financial_Instruments_Directive_Italy_Equity_Issuance",
            "panel_Markets_in_Financial_Instruments_Directive_Italy_Information_Asymmetry",
            "panel_Markets_in_Financial_Instruments_Directive_Italy_Litigation_Risk",
            "panel_Markets_in_Financial_Instruments_Directive_Italy_Proprietary_Costs",
            "panel_Markets_in_Financial_Instruments_Directive_Italy_Reputation_Risk",
            "panel_Markets_in_Financial_Instruments_Directive_Italy_Unsophisticated_Investors",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Corporate_Governance",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Equity_Issuance",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Information_Asymmetry",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Litigation_Risk",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Reputation_Risk",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Unsophisticated_Investors",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Corporate_Governance",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Information_Asymmetry",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Litigation_Risk",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Reputation_Risk",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Unsophisticated_Investors",
            "panel_Securities_and_Exchange_Act_Ghana_Corporate_Governance",
            "panel_Securities_and_Exchange_Act_Ghana_Equity_Issuance",
            "panel_Securities_and_Exchange_Act_Ghana_Information_Asymmetry",
            "panel_Securities_and_Exchange_Act_Ghana_Litigation_Risk",
            "panel_Securities_and_Exchange_Act_Ghana_Proprietary_Costs",
            "panel_Securities_and_Exchange_Act_Ghana_Reputation_Risk",
            "panel_Securities_and_Exchange_Act_Ghana_Unsophisticated_Investors",
            "panel_Securities_and_Exchange_Ordinance_Bangladesh_Corporate_Governance",
            "panel_Securities_and_Exchange_Ordinance_Bangladesh_Equity_Issuance",
            "panel_Securities_and_Exchange_Ordinance_Bangladesh_Information_Asymmetry",
            "panel_Securities_and_Exchange_Ordinance_Bangladesh_Litigation_Risk",
            "panel_Securities_and_Exchange_Ordinance_Bangladesh_Proprietary_Costs",
            "panel_Securities_and_Exchange_Ordinance_Bangladesh_Reputation_Risk",
            "panel_Securities_and_Exchange_Ordinance_Bangladesh_Unsophisticated_Investors",
            "panel_Securities_Exchange_Act_Zambia_Corporate_Governance",
            "panel_Securities_Exchange_Act_Zambia_Equity_Issuance",
            "panel_Securities_Exchange_Act_Zambia_Information_Asymmetry",
            "panel_Securities_Exchange_Act_Zambia_Litigation_Risk",
            "panel_Securities_Exchange_Act_Zambia_Proprietary_Costs",
            "panel_Securities_Exchange_Act_Zambia_Reputation_Risk",
            "panel_Securities_Exchange_Act_Zambia_Unsophisticated_Investors",
            "panel_Securities_Industry_Act_Trinidad_and_Tobago_Corporate_Governance",
            "panel_Securities_Industry_Act_Trinidad_and_Tobago_Equity_Issuance",
            "panel_Securities_Industry_Act_Trinidad_and_Tobago_Information_Asymmetry",
            "panel_Securities_Industry_Act_Trinidad_and_Tobago_Litigation_Risk",
            "panel_Securities_Industry_Act_Trinidad_and_Tobago_Proprietary_Costs",
            "panel_Securities_Industry_Act_Trinidad_and_Tobago_Reputation_Risk",
            "panel_Securities_Industry_Act_Trinidad_and_Tobago_Unsophisticated_Investors",
            "panel_Securities_Law_Cambodia_Corporate_Governance",
            "panel_Securities_Law_Cambodia_Equity_Issuance",
            "panel_Securities_Law_Cambodia_Information_Asymmetry",
            "panel_Securities_Law_Cambodia_Litigation_Risk",
            "panel_Securities_Law_Cambodia_Proprietary_Costs",
            "panel_Securities_Law_Cambodia_Reputation_Risk",
            "panel_Securities_Law_Cambodia_Unsophisticated_Investors",
            "panel_Securities_Law_China_Corporate_Governance",
            "panel_Securities_Law_China_Equity_Issuance",
            "panel_Securities_Law_China_Information_Asymmetry",
            "panel_Securities_Law_China_Litigation_Risk",
            "panel_Securities_Law_China_Proprietary_Costs",
            "panel_Securities_Law_China_Reputation_Risk",
            "panel_Securities_Law_China_Unsophisticated_Investors",
            "panel_Securities_Market_Law_Laos_Corporate_Governance",
            "panel_Securities_Market_Law_Laos_Equity_Issuance",
            "panel_Securities_Market_Law_Laos_Information_Asymmetry",
            "panel_Securities_Market_Law_Laos_Litigation_Risk",
            "panel_Securities_Market_Law_Laos_Proprietary_Costs",
            "panel_Securities_Market_Law_Laos_Reputation_Risk",
            "panel_Securities_Market_Law_Laos_Unsophisticated_Investors",
            "panel_Securities_Market_Law_Myanmar_Corporate_Governance",
            "panel_Securities_Market_Law_Myanmar_Equity_Issuance",
            "panel_Securities_Market_Law_Myanmar_Information_Asymmetry",
            "panel_Securities_Market_Law_Myanmar_Litigation_Risk",
            "panel_Securities_Market_Law_Myanmar_Proprietary_Costs",
            "panel_Securities_Market_Law_Myanmar_Reputation_Risk",
            "panel_Securities_Market_Law_Myanmar_Unsophisticated_Investors",
            "panel_Securities_Market_Law_Pakistan_Corporate_Governance",
            "panel_Securities_Market_Law_Pakistan_Equity_Issuance",
            "panel_Securities_Market_Law_Pakistan_Information_Asymmetry",
            "panel_Securities_Market_Law_Pakistan_Litigation_Risk",
            "panel_Securities_Market_Law_Pakistan_Proprietary_Costs",
            "panel_Securities_Market_Law_Pakistan_Reputation_Risk",
            "panel_Securities_Market_Law_Pakistan_Unsophisticated_Investors",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Corporate_Governance",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Equity_Issuance",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Information_Asymmetry",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Litigation_Risk",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Proprietary_Costs"
        ]
        
        # Create the mapping dictionary
        mapping = {}
        for panel_name in panel_names:
            # Remove "panel_" prefix to get the regulation name
            regulation_name = panel_name[6:]  # Remove "panel_"
            # Create hypothesis filename by adding suffix
            hypothesis_filename = f"{regulation_name}_background_hypothesis.txt"
            mapping[panel_name] = hypothesis_filename
        
        return mapping
    
    def _create_corrected_hypothesis_mapping(self):
        """Create corrected mapping that handles abbreviated hypothesis file names"""
    
        # Manual mapping for files with abbreviations
        abbreviation_corrections = {
            # AIFMD European Union -> Various abbreviations
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Corporate_Governance": 
                "Alternative_Investment_Fund_Managers_Directive_AIFMD_Eur_Corporate_Governance_background_hypothesis.txt",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Equity_Issuance": 
                "Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Equity_Issuance_background_hypothesis.txt",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Information_Asymmetry": 
                "Alternative_Investment_Fund_Managers_Directive_AIFMD_Eu_Information_Asymmetry_background_hypothesis.txt",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Litigation_Risk": 
                "Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Litigation_Risk_background_hypothesis.txt",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Proprietary_Costs": 
                "Alternative_Investment_Fund_Managers_Directive_AIFMD_Europe_Proprietary_Costs_background_hypothesis.txt",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Reputation_Risk": 
                "Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Reputation_Risk_background_hypothesis.txt",
            "panel_Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union_Unsophisticated_Investors": 
                "Alternative_Investment_Fund_Managers_Directive_AIFMD_Eu_Unsophisticated_Investors_background_hypothesis.txt",
            
            # EMIR European Union -> Various abbreviations
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Corporate_Governance": 
                "European_Market_Infrastructure_Regulation_EMIR_European_Corporate_Governance_background_hypothesis.txt",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Information_Asymmetry": 
                "European_Market_Infrastructure_Regulation_EMIR_European_Information_Asymmetry_background_hypothesis.txt",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Litigation_Risk": 
                "European_Market_Infrastructure_Regulation_EMIR_European_Union_Litigation_Risk_background_hypothesis.txt",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Proprietary_Costs": 
                "European_Market_Infrastructure_Regulation_EMIR_European_Uni_Proprietary_Costs_background_hypothesis.txt",
            "panel_European_Market_Infrastructure_Regulation_EMIR_European_Union_Reputation_Risk": 
                "European_Market_Infrastructure_Regulation_EMIR_European_Union_Reputation_Risk_background_hypothesis.txt",
            
            # MiFID European Union -> Various abbreviations
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Corporate_Governance": 
                "Markets_in_Financial_Instruments_Directive_MiFID_Europea_Corporate_Governance_background_hypothesis.txt",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Equity_Issuance": 
                "Markets_in_Financial_Instruments_Directive_MiFID_European_Uni_Equity_Issuance_background_hypothesis.txt",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Information_Asymmetry": 
                "Markets_in_Financial_Instruments_Directive_MiFID_Europe_Information_Asymmetry_background_hypothesis.txt",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Litigation_Risk": 
                "Markets_in_Financial_Instruments_Directive_MiFID_European_Uni_Litigation_Risk_background_hypothesis.txt",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Reputation_Risk": 
                "Markets_in_Financial_Instruments_Directive_MiFID_European_Uni_Reputation_Risk_background_hypothesis.txt",
            "panel_Markets_in_Financial_Instruments_Directive_MiFID_European_Union_Unsophisticated_Investors": 
                "Markets_in_Financial_Instruments_Directive_MiFID_Eu_Unsophisticated_Investors_background_hypothesis.txt",
            
            # Canada abbreviations
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Corporate_Governance": 
                "National_Instrument_31103_Registration_Requirements_Cana_Corporate_Governance_background_hypothesis.txt",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Information_Asymmetry": 
                "National_Instrument_31103_Registration_Requirements_Can_Information_Asymmetry_background_hypothesis.txt",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Litigation_Risk": 
                "National_Instrument_31103_Registration_Requirements_Canada_Litigation_Risk_background_hypothesis.txt",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Reputation_Risk": 
                "National_Instrument_31103_Registration_Requirements_Canada_Reputation_Risk_background_hypothesis.txt",
            "panel_National_Instrument_31103_Registration_Requirements_Canada_Unsophisticated_Investors": 
                "National_Instrument_31103_Registration_Requirements_Unsophisticated_Investors_background_hypothesis.txt"
        }
    
        # Start with the standard mapping
        standard_mapping = self._create_exact_hypothesis_mapping()
    
        # Override with corrected abbreviations
        standard_mapping.update(abbreviation_corrections)
    
        return standard_mapping

    def read_hypothesis(self, regulation_name: str) -> str:
        """
        Read hypothesis file using corrected mapping that handles abbreviations
        """
        # Use the corrected mapping that handles abbreviations
        mapping = self._create_corrected_hypothesis_mapping()
    
        # Check if we have a mapping (either standard or corrected)
        if regulation_name in mapping:
            hypothesis_filename = mapping[regulation_name]
        
            # Set up hypothesis directory  
            hypothesis_dir = os.path.join(os.path.dirname(self.output_dir), 
                                    'background and hypothesis development')
        
            hypothesis_file = os.path.join(hypothesis_dir, hypothesis_filename)
        
            print(f"Looking for corrected match: {hypothesis_filename}")
        
            try:
                with open(hypothesis_file, 'r', encoding='utf-8') as f:
                    content = f.read()
                
                # Extract hypothesis development section
                if "Hypothesis Development" in content:
                    hypothesis_section = content.split("Hypothesis Development")[1]
                
                    if "H1:" in hypothesis_section:
                        hypothesis_development = hypothesis_section.split("H1:")[0].strip()
                        h1_statement = "H1:" + hypothesis_section.split("H1:")[1].strip()
                        return f"Hypothesis Development:\n\n{hypothesis_development}\n\n{h1_statement}"
                    else:
                        return f"Hypothesis Development:\n\n{hypothesis_section.strip()}"
                else:
                    print(f"No Hypothesis Development section found in {hypothesis_filename}")
                    return content  # Return full content if no specific section found
                
            except FileNotFoundError:
                print(f"Hypothesis file not found: {hypothesis_file}")
                return ""
            except Exception as e:
                print(f"Error reading hypothesis file {hypothesis_filename}: {str(e)}")
                return ""
        else:
            print(f"No mapping found for regulation: {regulation_name}")
            return ""

    def format_results_text(self, regulation_title: str, regulation_year: int, results: Dict) -> str:
        """Format regression results into text for the academic prompt"""
        results_text = f"Regression Analysis for {regulation_title} (Year: {regulation_year})\n\n"
        
        for spec_name, res in results.items():
            results_text += f"\nSpecification {spec_name}:\n"
            results_text += f"Treatment Effect: {res['coefficients']['treatment_effect']:.4f}\n"
            results_text += f"T-statistic: {res['t_stats']['treatment_effect']:.2f}\n"
            results_text += f"P-value: {res['pvalues']['treatment_effect']:.4f}\n"
            results_text += f"R-squared: {res['r_squared']:.4f}\n"
            results_text += f"Number of observations: {int(res['n_obs'])}\n"
            results_text += f"Number of firms: {res['n_firms']}\n"
            
            if res['controls']:
                results_text += "\nControl Variables:\n"
                for control in res['controls']:
                    coef = res['coefficients'][control]
                    tstat = res['t_stats'][control]
                    pvalue = res['pvalues'][control]
                    stars = self._get_significance_stars(pvalue)
                    results_text += f"{control}: {coef:.4f}{stars} (t={tstat:.2f}, p={pvalue:.4f})\n"
            
            results_text += "\nFixed Effects:\n"
            for fe, included in res['fixed_effects'].items():
                results_text += f"{fe}: {'Yes' if included else 'No'}\n"
            
            results_text += "-" * 50 + "\n"
        
        return results_text

    def generate_claude_interpretation(self, regulation_title: str, regulation_year: int, results_text: str, hypothesis_text: str) -> str:
        """Generate interpretation using Claude API"""
        prompt = f"""You are an accounting academic with a PhD in accounting. 
        You should use active voice (e.g. "We find" instead of "It is found"). 
        Use present tense for all established findings. 
        Distinguish between correlation and causation. 
        Write the results description for this analysis as if you were writing an academic paper for an accounting journal, 
        you are studying the association between a change in mandatory disclosure and voluntary disclosure in U.S. firms. 
        
        Here is the hypothesis that was developed:
        {hypothesis_text}
        
        Please provide a detailed academic analysis of these regression results:

{results_text}

Please structure your analysis as follows (3 paragraphs, ~600 words total):
1. Label this section Regression Analysis
2. Main finding (treatment effect interpretation)
3. Statistical significance and economic magnitude
4. Model specification comparison
5. Control variable effects
   Describe whether the relationship is consistent with prior literature
6. Explain whether the results support the hypothesis stated in the Hypothesis section above

Write in an academic style suitable for a top accounting journal."""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text
        except Exception as e:
            print(f"Error getting Claude interpretation: {str(e)}")
            return f"Error in Claude analysis: {str(e)}"
        
    def clean_markdown_formatting(self, content: str) -> str:
        """Remove all markdown formatting from content"""
        # Remove headers (##, ###, etc.)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)
        
        # Remove bold/italic formatting - this handles **text**, ***text***, ****text****
        content = re.sub(r'\*{1,4}([^*]*?)\*{1,4}', r'\1', content)
        
        # Clean up any remaining asterisks
        content = re.sub(r'\*+', '', content)
        
        # Clean up any extra whitespace that might be left
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
        
        return content    
    

    def interpret_regulation_impact(self, regulation_name: str, force_rerun: bool = False) -> str:
        """Generate interpretation for a single regulation's results"""

        # FIRST: Check if regression results exist BEFORE creating any folders
        regulation_dir = os.path.join(self.output_dir, regulation_name)
        results_path = os.path.join(regulation_dir, 'regression_results.json')
    
        if not os.path.exists(results_path):
            print(f"No regression_results.json found for {regulation_name}")
            return ""

        # Create subfolder only AFTER confirming results exist
        os.makedirs(regulation_dir, exist_ok=True)
    
        # Check if interpretation already exists
        claude_path = os.path.join(regulation_dir, 'claude_interpretation.txt')
        if os.path.exists(claude_path) and not force_rerun:
            print(f"Interpretation already exists for {regulation_name}, skipping...")
            with open(claude_path, 'r', encoding='utf-8') as f:
                return f.read()

        # Check if regression results exist
        results = self.read_regression_results(regulation_name)
        if not results:
            print(f"No regression results found for {regulation_name}")
            return ""

        # Check if hypothesis exists
        hypothesis_text = self.read_hypothesis(regulation_name)
        if not hypothesis_text:
            print(f"No hypothesis found for {regulation_name}")
            return ""

        print(f"Both regression results and hypothesis found for {regulation_name} - proceeding with interpretation")

        # Read the original panel file to get regulation title
        panel_file = os.path.join(self.input_dir, f"{regulation_name}.csv")
        try:
            import pandas as pd
            df = pd.read_csv(panel_file)
            regulation_title = df['Regulation Title'].iloc[0]
            regulation_year = df['Year'].iloc[0]
        except:
            regulation_title = regulation_name
            regulation_year = "N/A"

        # Format results text
        results_text = self.format_results_text(regulation_title, regulation_year, results)

        # Generate interpretation using Claude
        interpretation = self.generate_claude_interpretation(
            regulation_title, 
            regulation_year, 
            results_text,
            hypothesis_text
        )

        # Clean markdown formatting before saving
        clean_interpretation = self.clean_markdown_formatting(interpretation)

        # Save interpretation to file
        try:
            with open(claude_path, 'w', encoding='utf-8') as f:
                f.write(clean_interpretation)
            print(f"Saved interpretation to {claude_path}")
        except Exception as e:
            print(f"Error saving interpretation to file: {str(e)}")

        return clean_interpretation

    def analyze_all_regulations(self, force_rerun: bool = False) -> None:
        """Analyze results for all regulations in the directory"""
    
        # Get all panel files first
        panel_files = glob.glob(os.path.join(self.input_dir, "panel_*_*.csv"))
        print(f"Found {len(panel_files)} panel CSV files")
    
        # Filter to only those that have regression results
        regulations_with_results = []
        regulations_without_results = []
    
        for panel_file in panel_files:
            regulation_name = os.path.splitext(os.path.basename(panel_file))[0]
            results_path = os.path.join(self.output_dir, regulation_name, 'regression_results.json')
        
            if os.path.exists(results_path):
                regulations_with_results.append(regulation_name)
            else:
                regulations_without_results.append(regulation_name)
    
        print(f"Found {len(regulations_with_results)} regulations WITH regression results")
        print(f"Found {len(regulations_without_results)} regulations WITHOUT regression results")
    
        if regulations_without_results:
            print("\nRegulations WITHOUT regression results (will be skipped):")
            for reg in regulations_without_results[:5]:  # Show first 5
                print(f"  - {reg}")
            if len(regulations_without_results) > 5:
                print(f"  ... and {len(regulations_without_results) - 5} more")
    
        print(f"\nProcessing {len(regulations_with_results)} regulations with results...")
        print(f"Force rerun: {force_rerun}")
        print("-" * 80)
    
        successful = 0
        failed = 0
        skipped = 0
    
        for regulation_name in regulations_with_results:
            # Quick check if interpretation already exists
            regulation_dir = os.path.join(self.output_dir, regulation_name)
            claude_path = os.path.join(regulation_dir, 'claude_interpretation.txt')
        
            if os.path.exists(claude_path) and not force_rerun:
                print(f"SKIPPED: {regulation_name} - interpretation already exists")
                skipped += 1
                continue
        
            try:
                print(f"\n[{successful + failed + 1}/{len(regulations_with_results)}] PROCESSING: {regulation_name}")
                print("="*80)
                result = self.interpret_regulation_impact(regulation_name, force_rerun)
            
                if result:
                    print(f"✅ SUCCESS: Generated interpretation")
                    successful += 1
                else:
                    print(f"❌ FAILED: No interpretation generated")
                    failed += 1
                
            except Exception as e:
                print(f"❌ ERROR: {str(e)}")
                failed += 1

        print(f"\n{'='*80}")
        print(f"FINAL SUMMARY:")
        print(f"✅ Successful: {successful}")
        print(f"❌ Failed: {failed}")
        print(f"⏭️  Skipped (already done): {skipped}")
        print(f"📁 Regulations without results: {len(regulations_without_results)}")
        print(f"📊 Total regulations with results: {len(regulations_with_results)}")
        print(f"📋 Total panel files found: {len(panel_files)}")
        print(f"{'='*80}")
    
def main():
    """Run the Claude interpretation step over every regulation.

    The API key and folder paths below are placeholders and must be
    replaced before running. ``force_rerun=True`` is passed so that
    regulations which already have an interpretation file are processed
    again instead of being skipped.
    """
    # User-supplied settings (placeholders)
    claude_api_key = "enter API here"  # Replace with your Claude API key
    root_dir = r"enter folder path here"
    input_dir = os.path.join(root_dir, "folder path here")
    output_dir = os.path.join(root_dir, "folder path here")

    RegressionInterpreter(input_dir, output_dir, claude_api_key).analyze_all_regulations(
        force_rerun=True
    )


if __name__ == "__main__":
    main()

# 8. Create Correlation tables
import os
import re
import pandas as pd
import numpy as np
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_LEFT 
from scipy import stats
from reportlab.lib.pagesizes import letter, landscape

def add_underscores_before_capitals(text):
    """Insert '_' wherever a lowercase letter or digit is followed by a capital.

    Runs of capitals are left untouched because the character before the
    capital must be in ``[a-z0-9]`` for a boundary to be recognised.
    """
    boundary = re.compile(r'(?<=[a-z0-9])(?=[A-Z])')
    return boundary.sub('_', text)

def create_correlation_table(data_path, output_dir):
    """
    Creates a clean correlation table PDF in the style of academic papers.

    Reads the panel CSV, computes Pearson correlations for a fixed list of
    variables and writes a landscape PDF in which correlations that are
    significant at the 5% level are set in bold.

    Args:
        data_path (str): Path to the panel data CSV. The name of its parent
            folder is used as the panel name (title and output file name).
        output_dir (str): Path to save output files (created if missing).

    Returns:
        str: Path of the PDF that was written.

    Raises:
        KeyError: If the CSV lacks one of the expected variable columns.
    """
    # Read the CSV file
    df = pd.read_csv(data_path)

    # Select numerical variables for correlation
    numeric_vars = ['treatment_effect', 'freqMF', 'linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss',
                    'lcalrisk']

    # Create shorter variable names for the table
    var_mapping = {
        'treatment_effect': 'Treatment Effect',
        'freqMF': 'FreqMF',
        'linstown': 'Institutional ownership',
        'lsize': 'Firm size',
        'lbtm': 'Book-to-market',
        'lroa': 'ROA',
        'lsaret12': 'Stock return',
        'levol': 'Earnings volatility',
        'lloss': 'Loss',
        'lcalrisk': 'Class action litigation risk',
    }

    # Calculate correlation matrix (pandas uses pairwise-complete observations)
    corr_matrix = df[numeric_vars].corr()

    def calculate_pvalue(x, y):
        """Return the Pearson p-value over rows where BOTH series are present.

        Dropping NaNs from each series separately would misalign the pairs
        (or leave the two inputs with different lengths, which pearsonr
        rejects), so the two columns are filtered together.
        """
        pair = pd.concat([x, y], axis=1, keys=['x', 'y']).dropna()
        if len(pair) < 2:
            # pearsonr needs at least two observations; treat as "not significant"
            return np.nan
        return stats.pearsonr(pair['x'], pair['y'])[1]

    # Only off-diagonal p-values are used below; the matrix is symmetric, so
    # each pair is computed once and mirrored.
    p_values = pd.DataFrame(np.nan, index=numeric_vars, columns=numeric_vars, dtype=float)
    for row_pos, var1 in enumerate(numeric_vars):
        for var2 in numeric_vars[row_pos + 1:]:
            p_value = calculate_pvalue(df[var1], df[var2])
            p_values.loc[var1, var2] = p_value
            p_values.loc[var2, var1] = p_value

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get panel name from path
    panel_name = os.path.basename(os.path.dirname(data_path))

    # Create PDF
    clean_name = panel_name.replace('panel_', '')
    pdf_path = os.path.join(output_dir, f'{clean_name}_correlation_table.pdf')
    doc = SimpleDocTemplate(pdf_path, pagesize=landscape(letter), rightMargin=30, leftMargin=30, topMargin=50, bottomMargin=50)

    # Prepare table data: header row starts with an empty corner cell
    table_data = [[''] + [var_mapping[var] for var in numeric_vars]]

    # Add rows
    for var1 in numeric_vars:
        row = [var_mapping[var1]]  # Row header
        for var2 in numeric_vars:
            if var1 == var2:
                row.append('1.00')
            else:
                # Format to 2 decimal places
                row.append(f'{corr_matrix.loc[var1, var2]:.2f}')
        table_data.append(row)

    # Create table style
    style = [
        ('FONTNAME', (0,0), (-1,-1), 'Times-Roman'),
        ('FONTSIZE', (0,0), (-1,-1), 8),
        ('ALIGN', (0,0), (-1,-1), 'CENTER'),
        ('TOPPADDING', (0,0), (-1,-1), 3),
        ('BOTTOMPADDING', (0,0), (-1,-1), 3),
        ('GRID', (0,0), (-1,-1), 0.25, colors.black),  # Lighter grid lines
        ('BOX', (0,0), (-1,-1), 0.25, colors.black),
        # Make column headers and row headers bold
        ('FONTNAME', (0,0), (-1,0), 'Times-Bold'),
        ('FONTNAME', (0,0), (0,-1), 'Times-Bold'),
    ]

    # Add bold style for significant correlations (table cells are addressed
    # as (column, row); index 0 is the header row/column). A NaN p-value
    # compares False and therefore stays in the regular font.
    for row_idx, var1 in enumerate(numeric_vars, start=1):
        for col_idx, var2 in enumerate(numeric_vars, start=1):
            if var1 != var2 and p_values.loc[var1, var2] < 0.05:  # 5% significance level
                style.append(('FONTNAME', (col_idx, row_idx), (col_idx, row_idx), 'Times-Bold'))

    # Create table
    table = Table(table_data)
    table.setStyle(TableStyle(style))

    # Create title
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Normal'],
        fontSize=12,
        alignment=TA_CENTER,
        spaceBefore=12,
        spaceAfter=20,
        fontName='Times-Bold'
    )

    # Create Panel title in smaller text if needed
    panel_title = ""
    if panel_name:
        clean_panel_name = panel_name.replace('panel_', '').replace('_', ' ')
        # Format the law name with proper spacing; the first matching key wins
        known_law_names = {
            'NominatingCommitteeRequirements': 'Nominating Committee Requirements',
            'ResourceExtractionDisclosureRules': 'Resource Extraction Disclosure Rules',
            'PayRatioDisclosureRule': 'Pay Ratio Disclosure Rule',
        }
        law_name = next(
            (pretty for key, pretty in known_law_names.items() if key in clean_panel_name),
            clean_panel_name,
        )
        panel_title = f"<br/>{law_name}"

    title = Paragraph(f"Table 2<br/>Pearson Correlations{panel_title}", title_style)

    # Add footnote
    footnote_style = ParagraphStyle(
        'Footnote',
        parent=styles['Normal'],
        fontSize=8,
        alignment=TA_LEFT,
        fontName='Times-Roman',
        spaceBefore=6,
        leading=10  # Controls line spacing
    )
    footnote = Paragraph("This table shows the Pearson correlations for the sample. "
                        "Correlations that are significant at the 0.05 level or better are highlighted in bold. ", footnote_style)

    # Build PDF
    doc.build([title, table, Spacer(1, 12), footnote])

    print(f"Created correlation table PDF for {panel_name}")
    return pdf_path

def batch_process_panels(base_dir, output_base_dir):
    """
    Process all panel folders and create correlation tables, skipping existing ones.

    Args:
        base_dir (str): Directory whose ``panel_*`` sub-folders each hold a
            ``filtered_data_with_trends.csv`` file.
        output_base_dir (str): Directory that receives the
            ``<name>_correlation_table.pdf`` files (created if missing).

    A panel whose PDF already exists in ``output_base_dir`` is skipped; a
    panel without a data file, or whose table creation raises, is counted
    as an error and processing continues with the next panel.
    """
    print(f"Starting to process panels in: {base_dir}")

    # Create output directory (the caller-supplied one, not a hard-coded path)
    output_dir = output_base_dir
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}")

    # Collect panel folders; plain files that merely start with 'panel_' are
    # ignored and the list is sorted so runs are reproducible.
    panel_folders = sorted(
        entry for entry in os.listdir(base_dir)
        if entry.startswith('panel_') and os.path.isdir(os.path.join(base_dir, entry))
    )
    total_panels = len(panel_folders)
    processed = 0
    skipped = 0
    errors = 0

    print(f"\nFound {total_panels} panel folders to process")

    # Process each panel folder
    for i, panel_folder in enumerate(panel_folders, 1):
        print(f"\n{'='*80}")
        print(f"Processing panel {i} of {total_panels}: {panel_folder}")
        print(f"{'='*80}")

        panel_path = os.path.join(base_dir, panel_folder)

        # Path the correlation table would have if it was created earlier
        clean_name = panel_folder.replace('panel_', '')  # Keep as-is since already formatted
        existing_table = os.path.join(output_dir, f'{clean_name}_correlation_table.pdf')

        # Look for the data file
        data_path = os.path.join(panel_path, 'filtered_data_with_trends.csv')

        if os.path.exists(existing_table):
            print(f"Skipping {panel_folder}: Correlation table already exists")
            skipped += 1
        elif os.path.exists(data_path):
            try:
                table_path = create_correlation_table(data_path, output_dir)
                print(f"Created correlation table for {panel_folder}")
                print(f"Table saved to: {table_path}")
                processed += 1
            except Exception as e:
                # Best effort: one broken panel must not stop the batch
                print(f"Error processing {panel_folder}: {str(e)}")
                errors += 1
        else:
            print(f"No data file found in {panel_folder}")
            errors += 1

        # Print progress summary (also after skipped panels)
        print(f"\nProgress Summary:")
        print(f"Processed: {i}/{total_panels} ({(i/total_panels)*100:.1f}%)")
        print(f"Successfully created: {processed}")
        print(f"Skipped (already exist): {skipped}")
        print(f"Errors: {errors}")
        
# Script entry point for section 8: runs only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # Base directory containing panel folders
    BASE_DIR = r"enter folder path here"
    # Passed to batch_process_panels as its output_base_dir argument
    OUTPUT_BASE_DIR = r"enter folder path here"
    
    # Process all panels
    batch_process_panels(BASE_DIR, OUTPUT_BASE_DIR)
    
# 9. Send sample and descriptive statistics results to Claude for interpretation

import pandas as pd
import numpy as np
import os
import glob
from typing import Dict, List
import json
from anthropic import Anthropic
import traceback
from fpdf import FPDF
import re

class DescriptiveStatsAnalyzer:
    """Descriptive statistics for one panel dataset.

    For a panel folder the analyzer computes per-variable statistics, saves
    them as JSON, renders them as a PDF table and asks Claude to write the
    accompanying "Sample Description and Descriptive Statistics" text.
    """

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    def convert_numpy_types(self, obj):
        """Convert numpy/pandas types to native Python types for JSON serialization

        Dicts and lists are converted recursively; any other object is
        returned unchanged.
        """
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, dict):
            return {key: self.convert_numpy_types(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [self.convert_numpy_types(item) for item in obj]
        return obj

    @staticmethod
    def _format_year(value) -> str:
        """Render a fiscal year without a trailing '.0'.

        FYEAR is read as float whenever the column contains missing values;
        whole-number values are shown as integers, anything else (including
        NaN) falls back to ``str(value)``.
        """
        try:
            numeric = float(value)
        except (TypeError, ValueError):
            return str(value)
        if np.isfinite(numeric) and numeric.is_integer():
            return str(int(numeric))
        return str(value)

    def calculate_descriptive_stats(self, df: pd.DataFrame) -> Dict:
        """Calculate descriptive statistics for the dataset

        Args:
            df: Panel data; must contain the columns 'GVKEY', 'FYEAR' and 'sic3'.

        Returns:
            Dict mapping each numeric column (identifier columns excluded)
            to its n/mean/median/std/p25/p75/min/max, plus a 'summary' entry
            with sample-level counts. All values are native Python types.

        Raises:
            KeyError: If 'GVKEY', 'FYEAR' or 'sic3' is missing from ``df``.
        """
        # List of numeric columns to analyze (excluding GVKEY, FYEAR, etc.)
        identifier_cols = {'GVKEY', 'FYEAR', 'sic3', 'Year'}
        numeric_cols = [
            col for col in df.select_dtypes(include=[np.number]).columns
            if col not in identifier_cols
        ]

        # Sort columns to match example order if possible; columns that are
        # not in the preferred list keep their relative order at the end.
        preferred_order = [
            'linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss', 'lcalrisk'
        ]
        sorted_cols = sorted(
            numeric_cols,
            key=lambda x: preferred_order.index(x) if x in preferred_order else float('inf'),
        )

        results = {}
        for col in sorted_cols:
            series = df[col]
            results[col] = {
                'n': series.count(),  # non-missing observations
                'mean': series.mean(),
                'median': series.median(),
                'std': series.std(),
                'p25': series.quantile(0.25),
                'p75': series.quantile(0.75),
                'min': series.min(),
                'max': series.max()
            }

        # Add additional summary statistics
        first_year = self._format_year(df['FYEAR'].min())
        last_year = self._format_year(df['FYEAR'].max())
        results['summary'] = {
            'total_observations': len(df),
            'unique_firms': len(df['GVKEY'].unique()),
            'year_range': f"{first_year} to {last_year}",
            'industries': len(df['sic3'].unique())
        }

        # Convert numpy types to Python types (single recursive pass)
        return self.convert_numpy_types(results)

    def clean_markdown_formatting(self, content: str) -> str:
        """Remove all markdown formatting from content"""
        # Remove headers (##, ###, etc.)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)

        # Remove bold/italic formatting - this handles **text**, ***text***, ****text****
        content = re.sub(r'\*{1,4}([^*]*?)\*{1,4}', r'\1', content)

        # Clean up any remaining asterisks
        content = re.sub(r'\*+', '', content)

        # Clean up any extra whitespace that might be left
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

        return content

    def get_claude_interpretation(self, stats: Dict, regulation_title: str) -> str:
        """Get Claude's interpretation of descriptive statistics

        Args:
            stats: Output of ``calculate_descriptive_stats``.
            regulation_title: Label used in the heading of the statistics text.

        Returns:
            The text of Claude's reply. If the request raises, the error is
            printed and a string starting with "Error in Claude analysis:"
            is returned instead of raising.
        """
        # Format statistics for Claude
        summary = stats['summary']
        lines = [
            f"Descriptive Statistics for {regulation_title}",
            "",
            # Summary information
            "Sample Characteristics:",
            f"Total observations: {summary['total_observations']:,}",
            f"Number of unique firms: {summary['unique_firms']:,}",
            f"Sample period: {summary['year_range']}",
            f"Number of industries: {summary['industries']}",
            "",
            # Variable statistics follow
            "Variable Statistics:",
        ]
        for var, var_stats in stats.items():
            if var == 'summary':
                continue
            lines.extend([
                "",
                f"{var}:",
                f"N: {var_stats['n']:,}",
                f"Mean: {var_stats['mean']:.3f}",
                f"Median: {var_stats['median']:.3f}",
                f"Std Dev: {var_stats['std']:.3f}",
                f"25th percentile: {var_stats['p25']:.3f}",
                f"75th percentile: {var_stats['p75']:.3f}",
                f"Min: {var_stats['min']:.3f}",
                f"Max: {var_stats['max']:.3f}",
            ])
        stats_text = "\n".join(lines) + "\n"

        # Create prompt for Claude
        prompt = f"""You are an accounting academic with a PhD in accounting. 
        You should use active voice (e.g. "We find" instead of "It is found"). 
        Use present tense for all established findings. Write the descriptive statistics section for this analysis as if 
        you were writing an academic paper for an accounting journal. The descriptive statistics are for U.S. firms. 
        Here are the descriptive statistics:

{stats_text}

Please structure your analysis as follows (400 words):
1. Label this section "Sample Description and Descriptive Statistics"
2. Describe the sample characteristics (number of firms, time period)
3. Describe the key variables' distributions
4. Highlight any notable patterns or potential outliers
5. Compare statistics to relevant benchmarks from prior literature where applicable

IMPORTANT: DO NOT include the number of industries in the sample. For example, DO NOT write that the sample represents
a specific number of industries. 

Write in an academic style suitable for a top accounting journal."""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text
        except Exception as e:
            print(f"Error getting Claude interpretation: {str(e)}")
            return f"Error in Claude analysis: {str(e)}"

    def create_descriptive_stats_table(self, stats: Dict, output_path: str, regulation_title: str):
        """Create a PDF table of descriptive statistics in academic paper format

        Args:
            stats: Output of ``calculate_descriptive_stats``.
            output_path: Destination path of the PDF.
            regulation_title: Accepted for interface compatibility; not used
                in the rendered table.

        Errors are printed, not raised; on failure a one-page PDF carrying
        the error message is written to ``output_path`` if possible.
        """
        try:
            pdf = FPDF(format='A4', orientation='L')
            # Margins must be set before the first page is started, otherwise
            # the top margin and the first line's left edge keep the defaults.
            pdf.set_margins(20, 20, 20)
            pdf.add_page()

            # Set Times New Roman font; fall back to the built-in Times face
            # when the TrueType files cannot be registered.
            try:
                pdf.add_font('Times', '', 'times.ttf', uni=True)
                pdf.add_font('Times', 'B', 'timesbd.ttf', uni=True)
                pdf.set_font('Times', size=11)
            except Exception:
                pdf.set_font('Times', size=11)

            # Title
            pdf.set_font('Times', 'B', 14)
            pdf.cell(0, 10, 'Table 1', align='C', ln=True)
            pdf.set_font('Times', '', 12)
            pdf.cell(0, 10, 'Descriptive Statistics', align='C', ln=True)
            pdf.ln(5)

            # Column widths: one wide label column, six numeric columns
            var_width = 70
            num_width = 30

            # Table headers
            pdf.set_font('Times', 'B')
            headers = ['Variables', 'N', 'Mean', 'Std. Dev.', 'P25', 'Median', 'P75']
            pdf.cell(var_width, 8, headers[0], border=1)
            for header in headers[1:]:
                pdf.cell(num_width, 8, header, border=1, align='C')
            pdf.ln()

            # Variable name mappings with ordered display
            var_display_names = {
                'freqMF': 'FreqMF',
                'treatment_effect': 'Treatment Effect',
                'linstown': 'Institutional ownership',
                'lsize': 'Firm size',
                'lbtm': 'Book-to-market',
                'lroa': 'ROA',
                'lsaret12': 'Stock return',
                'levol': 'Earnings volatility',
                'lloss': 'Loss',
                'lcalrisk': 'Class action litigation risk',
                'time_trend': 'Time Trend'
            }

            # Excluded variables
            excluded_vars = {'sic4', 'permno', 'post-law', 'treated'}

            variables = {k: v for k, v in stats.items()
                        if k != 'summary' and k not in excluded_vars}

            # Define display order: FreqMF and the treatment effect first,
            # then the remaining mapped variables in mapping order. Variables
            # without a display name are not shown.
            display_order = ['freqMF', 'treatment_effect'] + [
                k for k in var_display_names
                if k not in ('freqMF', 'treatment_effect')
            ]

            pdf.set_font('Times', '')
            for var_name in display_order:
                if var_name not in variables:
                    continue
                var_stats = variables[var_name]
                pdf.cell(var_width, 8, var_display_names.get(var_name, var_name), border=1)
                pdf.cell(num_width, 8, f"{var_stats['n']:,}", border=1, align='C')
                for stat_key in ('mean', 'std', 'p25', 'median', 'p75'):
                    pdf.cell(num_width, 8, f"{var_stats[stat_key]:.4f}", border=1, align='C')
                pdf.ln()

            # Footnote
            pdf.ln(10)
            pdf.set_font('Times', '', 10)
            footnote = "This table shows the descriptive statistics. All continuous variables are winsorized at the 1st and 99th percentiles."
            pdf.multi_cell(0, 5, footnote)

            pdf.output(output_path)
            print(f"Successfully saved descriptive statistics table to {output_path}")

        except Exception as e:
            print(f"Error creating descriptive statistics table: {str(e)}")
            print(f"Traceback: {traceback.format_exc()}")

            # Last resort: leave a PDF at the expected path that states the error
            try:
                pdf = FPDF()
                pdf.add_page()
                pdf.set_font("Times", size=12)
                pdf.cell(0, 10, "Error occurred while creating descriptive statistics table")
                pdf.ln()
                pdf.cell(0, 10, f"Error: {str(e)}")
                pdf.output(output_path)
            except Exception as e2:
                print(f"Emergency PDF save also failed: {str(e2)}")

    def analyze_panel(self, panel_dir: str, output_dir: str) -> None:
        """Analyze descriptive statistics for a single panel dataset

        Reads ``filtered_data_with_trends.csv`` from ``panel_dir`` and writes
        ``descriptive_stats.json``, ``descriptive_stats_table.pdf`` and
        ``descriptive_stats_analysis.txt`` into ``output_dir/<panel name>``.
        Any exception is printed with its traceback instead of being raised.
        """
        try:
            # Get panel name from directory name
            panel_name = os.path.basename(panel_dir)
            print(f"\nAnalyzing {panel_name}...")

            # Read filtered data
            data_file = os.path.join(panel_dir, 'filtered_data_with_trends.csv')
            if not os.path.exists(data_file):
                print(f"No filtered_data_with_trends.csv found in {panel_dir}")
                return

            print(f"Reading data from {data_file}")
            df = pd.read_csv(data_file)

            # Create output directory
            panel_output_dir = os.path.join(output_dir, panel_name)
            os.makedirs(panel_output_dir, exist_ok=True)
            print(f"Created output directory: {panel_output_dir}")

            # Calculate descriptive statistics
            print("Calculating descriptive statistics...")
            stats = self.calculate_descriptive_stats(df)

            # Save descriptive statistics to JSON
            stats_path = os.path.join(panel_output_dir, 'descriptive_stats.json')
            with open(stats_path, 'w', encoding='utf-8') as f:
                json.dump(stats, f, indent=4, default=str)
            print(f"Saved descriptive statistics to {stats_path}")

            # Create and save descriptive statistics table
            table_path = os.path.join(panel_output_dir, 'descriptive_stats_table.pdf')
            print(f"Attempting to create PDF table at {table_path}")
            self.create_descriptive_stats_table(stats, table_path, panel_name)

            # Get Claude's interpretation
            # NOTE(review): on an API failure the returned text is the
            # "Error in Claude analysis: ..." string, which is saved below
            # like a normal analysis.
            print("Getting Claude's interpretation...")
            interpretation = self.get_claude_interpretation(stats, panel_name)
            interpretation = self.clean_markdown_formatting(interpretation)

            # Save Claude's interpretation. UTF-8 is set explicitly because
            # the reply may contain characters the platform default encoding
            # cannot represent.
            interpretation_path = os.path.join(panel_output_dir, 'descriptive_stats_analysis.txt')
            with open(interpretation_path, 'w', encoding='utf-8') as f:
                f.write(interpretation)
            print(f"Saved Claude's analysis to {interpretation_path}")

        except Exception as e:
            print(f"Error analyzing {panel_dir}: {str(e)}")
            print(f"Traceback: {traceback.format_exc()}")

def analyze_all_panels(base_dir: str, output_dir: str, api_key: str):
    """Run the descriptive-statistics analysis on every panel folder.

    Args:
        base_dir: Directory searched for sub-folders matching 'panel_*_*'.
        output_dir: Directory for the results (created if missing).
        api_key: Claude API key handed to DescriptiveStatsAnalyzer.
    """
    analyzer = DescriptiveStatsAnalyzer(api_key)
    os.makedirs(output_dir, exist_ok=True)

    # Keep only directories among the 'panel_*_*' matches
    matches = glob.glob(os.path.join(base_dir, 'panel_*_*'))
    panel_dirs = list(filter(os.path.isdir, matches))
    total = len(panel_dirs)
    print(f"Found {total} panel directories to analyze")

    for position, panel_dir in enumerate(panel_dirs, start=1):
        print(f"\nProcessing panel {position} of {total}: {panel_dir}")
        analyzer.analyze_panel(panel_dir, output_dir)

# Script entry point for section 9: runs only when the file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # Configuration
    API_KEY = "enter API here"
    
    # Updated paths for Windows using raw strings to handle backslashes
    BASE_DIR = r"enter folder path here"
    # A stray closing quote on this line previously left an unterminated
    # string, which made the whole file fail to parse.
    OUTPUT_DIR = r"enter folder path here"
    
    # Run analysis on all panels
    analyze_all_panels(BASE_DIR, OUTPUT_DIR, API_KEY)
    
# 10. Ask Claude to write introduction

import pandas as pd
import json
import os
from anthropic import Anthropic
import glob
import re

class ComprehensiveAnalyzer:
    def __init__(self, api_key: str):
        """Create the Anthropic client and keep it on the instance.

        Args:
            api_key: Claude API key passed to ``Anthropic(api_key=...)``.
        """
        claude_client = Anthropic(api_key=api_key)
        self.client = claude_client
    
    def get_laws_with_regression_results(self, csv_file: str, regression_dir: str) -> list:
        """Return the laws from the CSV that have a regression analysis folder.

        A law is matched to a folder name in three steps: the cleaned title
        itself, then a fixed table of known spelling differences, then the
        folder whose underscore-separated words overlap the title's words
        most (overlap / union must exceed 0.7).

        Args:
            csv_file: CSV with 'Regulation Title', 'Year', 'Regulatory Body',
                'Description', 'Impact' and one Yes/No column per mechanism.
            regression_dir: Directory containing the 'panel_*' result folders.

        Returns:
            One dict per matched law with the keys 'title', 'year', 'body',
            'description', 'impact', 'mechanisms' and 'panel_name'.
        """
        df = pd.read_csv(csv_file)

        # Names of everything matching 'panel_*' in the regression folder
        existing_panels = {
            os.path.basename(path)
            for path in glob.glob(os.path.join(regression_dir, 'panel_*'))
        }
        print(f"Found {len(existing_panels)} regression analysis folders")

        # Law part of each folder name: drop the 'panel_' prefix and the last
        # two underscore-separated parts (the mechanism)
        law_names_in_panels = {
            '_'.join(folder.replace('panel_', '').split('_')[:-2])
            for folder in existing_panels
        }
        print(f"Unique law names found: {len(law_names_in_panels)}")

        # Known differences between cleaned CSV titles and folder names
        csv_to_panel_mapping = {
            'AssetBacked_Securities_Reform': 'Asset_Backed_Securities_Reform',
            'Interactive_Data_for_Financial_Reporting': 'Interactive_Datafor_Financial_Reporting',
            'Internet_Availability_of_Proxy_Materials': 'Internet_Availabilityof_Proxy_Materials',
            'Jumpstart_Our_Business_Startups_JOBS_Act': 'Jumpstart_Our_Business_Startups_JOBSAct',
            'Political_Contributions_by_Investment_Advisers': 'Political_Contributionsby_Investment_Advisers',
            'Proxy_Voting_by_Investment_Advisers': 'Proxy_Votingby_Investment_Advisers',
            'Regulation_AB_AssetBacked_Securities': 'Regulation_ABAsset_Backed_Securities',
            'Regulation_BTR_Blackout_Trading_Restriction': 'Regulation_BTRBlackout_Trading_Restriction',
            'Regulation_R_Bank_Securities_Activities': 'Regulation_RBank_Securities_Activities',
            'Regulation_SBSR_SecurityBased_Swap_Reporting': 'Regulation_SBSRSecurity_Based_Swap_Reporting',
            'Regulation_SFPS_Securities_Financing_Transaction_Reporting': 'Regulation_SFPSSecurities_Financing_Transaction_Reporting',
            'Standards_for_Publicly_Traded_Companies_Audit_Committees': 'Standardsfor_Publicly_Traded_Companies_Audit_Committees'
        }

        # Economic mechanism columns expected in the CSV
        mechanisms = [
            'Litigation Risk',
            'Corporate Governance',
            'Proprietary Costs',
            'Information Asymmetry',
            'Unsophisticated Investors',
            'Equity Issuance',
            'Reputation Risk'
        ]

        # Spaces become underscores; '/', '-', '(' and ')' are removed
        title_cleanup = str.maketrans(' ', '_', '/-()')

        laws = []
        for _, row in df.iterrows():
            clean_title = row['Regulation Title'].translate(title_cleanup)

            # Direct mapping first, otherwise the cleaned title itself
            panel_name_to_check = csv_to_panel_mapping.get(clean_title, clean_title)

            # No exact hit: look for the folder name with the best word overlap
            if panel_name_to_check not in law_names_in_panels:
                title_words = set(clean_title.lower().split('_'))
                best_match, best_score = None, 0

                for candidate in law_names_in_panels:
                    candidate_words = set(candidate.lower().split('_'))
                    union_size = len(title_words | candidate_words)
                    shared = len(title_words & candidate_words)
                    score = shared / union_size if union_size > 0 else 0

                    # Needs more than 70% overlap and must beat the best so far
                    if score > 0.7 and score > best_score:
                        best_match, best_score = candidate, score

                if best_match:
                    panel_name_to_check = best_match
                    print(f"DEBUG: Fuzzy match '{clean_title}' -> '{best_match}' (score: {best_score:.2f})")

            if panel_name_to_check not in law_names_in_panels:
                print(f"✗ Skipping law: {row['Regulation Title']} (CSV: {clean_title}, Looking for: {panel_name_to_check})")
                continue

            # Keep the law together with the mechanisms flagged 'Yes'
            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                'mechanisms': [mech for mech in mechanisms if row[mech] == 'Yes'],
                'panel_name': panel_name_to_check
            })
            print(f"✓ Including law: {row['Regulation Title']} (Panel: {panel_name_to_check})")

        print(f"\nSummary:")
        print(f"Total laws in CSV: {len(df)}")
        print(f"Laws with regression results: {len(laws)}")
        print(f"Laws to generate introductions for: {len(laws)}")

        return laws

    def clean_markdown_formatting(self, content: str) -> str:
        """Strip markdown headers, emphasis asterisks and surplus blank lines.

        The substitutions run in the order listed, each on the result of the
        previous one.
        """
        substitutions = (
            # Leading '#' header markers at the start of any line
            (r'^#+\s*', '', re.MULTILINE),
            # Bold/italic wrappers such as **text**, ***text***, ****text****
            (r'\*{1,4}([^*]*?)\*{1,4}', r'\1', 0),
            # Any asterisks that are still left
            (r'\*+', '', 0),
            # Three or more line breaks (with optional whitespace) become two
            (r'\n\s*\n\s*\n', '\n\n', 0),
        )
        for pattern, replacement, flags in substitutions:
            content = re.sub(pattern, replacement, content, flags=flags)
        return content
    
    def read_regression_results(self, base_dir: str) -> dict:
        """Load 'regression_results.json' from every panel sub-folder.

        Args:
            base_dir: Directory searched for sub-folders matching 'panel_*_*'.

        Returns:
            Dict mapping folder name to the parsed JSON content. Folders
            without a results file are reported and left out.
        """
        collected = {}

        # The pattern requires a second underscore, i.e. a name that also carries a mechanism
        for candidate in glob.glob(os.path.join(base_dir, 'panel_*_*')):
            if not os.path.isdir(candidate):
                continue

            folder_name = os.path.basename(candidate)
            json_path = os.path.join(candidate, 'regression_results.json')

            if not os.path.exists(json_path):
                print(f"No results file found in {folder_name}")
                continue

            print(f"Reading results from {folder_name}")
            with open(json_path, 'r') as handle:
                collected[folder_name] = json.load(handle)

        return collected

    def format_regression_results(self, results: dict) -> str:
        """Format regression results for a specific panel

        Args:
            results: Mapping of specification name to a dict with the keys
                'coefficients', 't_stats', 'pvalues' (each keyed by variable
                name, including 'treatment_effect'), 'r_squared' and
                'controls' (list of control variable names).

        Returns:
            A text block with one section per specification, or
            "No regression results available." when ``results`` is empty.
            The treatment-effect t-statistic is shown as an absolute value;
            control-variable t-statistics keep their sign.

        A specification with a missing key is reported on stdout and left
        out completely, so the returned text never contains a half-written
        section.
        """
        if not results:
            return "No regression results available."

        sections = ["\nRegression Results:\n\n"]

        for spec_name, spec_results in results.items():
            # Build the section in a local buffer; it is added to the output
            # only once every required key has been read successfully.
            try:
                spec_lines = [
                    f"\nSpecification {spec_name}:\n",
                    f"Treatment Effect: {spec_results['coefficients']['treatment_effect']:.4f}\n",
                    f"T-statistic: {abs(spec_results['t_stats']['treatment_effect']):.2f}\n",
                    f"P-value: {spec_results['pvalues']['treatment_effect']:.4f}\n",
                    f"R-squared: {spec_results['r_squared']:.4f}\n",
                ]

                if spec_results['controls']:
                    spec_lines.append("\nControl Variables:\n")
                    for control in spec_results['controls']:
                        coef = spec_results['coefficients'][control]
                        tstat = spec_results['t_stats'][control]
                        pvalue = spec_results['pvalues'][control]
                        spec_lines.append(f"{control}: coef={coef:.4f}, t={tstat:.2f}, p={pvalue:.4f}\n")

                spec_lines.append("\n" + "-"*50 + "\n")
            except KeyError as e:
                print(f"Missing key in regression results: {e}")
                continue

            sections.extend(spec_lines)

        return "".join(sections)

    def get_comprehensive_introduction(self, law: dict, mechanism: str, regression_results: dict) -> str:
        """Get comprehensive introduction for a law and specific mechanism"""
        regression_text = self.format_regression_results(regression_results)
        print(f"\nFormatted regression results for {law['title']} - {mechanism}:")
        print(regression_text)
        
        prompt = f"""As an accounting academic, please write a comprehensive introduction section examining {law['title']} 
        and its impact on voluntary disclosure in the U.S. through the {mechanism} channel.

Law Details:
Title: {law['title']} ({law['year']})
Regulatory Body: {law['body']}
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Empirical Results:
{regression_text}

Please structure the introduction as follows:

1. Motivation (2 paragraphs, ~200 words):
   - Begin with the importance of {law['title']}
   - Open with a broad statement about {law['title']}
   - Focus specifically on how it relates to {mechanism}
   - Explain its relevance to voluntary disclosure in the U.S. through this mechanism
   - Identify the specific gap or puzzle in the literature
   - Identify specific research questions

2. Hypothesis Development (3 paragraphs, ~300 words):
   - Present the economic mechanism linking the regulation to voluntary disclosure in the U.S. 
   - Explain how {mechanism} affects voluntary disclosure
   - Discuss theoretical underpinnings
   - Build on established theoretical frameworks
   - Develop clear, testable predictions
   - Build logical arguments step by step
   - Support each claim with citations to foundational papers
   - Support arguments with citations

3. Results Summary (3 paragraphs, ~300 words):
   - Lead with strongest statistical findings
   - Present the treatment effect coefficient of {regression_text}
   - Summarize the key findings of the analysis, 
     discussing the significance of the variable in terms of predictive power: {regression_text}
   - Discuss significance of variables and their predictive power
   - Present results in order of importance
   - Include economic significance
   - Use precise statistical language
   - Connect findings back to the {mechanism} channel

4. Contribution (2 paragraphs, ~200 words):
   - Position relative to 3-4 most closely related papers
   - Highlight novel findings about {mechanism}
   - Discuss broader implications for theory and practice
   - Emphasize contributions to understanding this specific economic channel

Guidelines:
- Do not include headers in the write up
- Do not include extra text or explanations
    -Example of what not to include: "Here's a comprehensive introduction section following your guidelines" or 
    "Here's a comprehensive introduction section examining Resource Extraction Disclosure Rules and its impact on voluntary disclosure through the Corporate Governance channel"
- Use active voice (e.g., "We find" instead of "It is found")
- Maintain formal academic tone
- Include 2-3 citations per paragraph 
- Use present tense for established findings
- Use past tense for your specific results
- Make clear distinctions between correlation and causation
- Avoid speculation beyond the data
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Accounting Studies"""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.5,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            print(f"Error getting introduction: {str(e)}")
            return f"Error in analysis: {str(e)}"

    def analyze_and_save_introductions(self, base_dir: str, csv_file: str, output_dir: str):
        """Write one introduction text file per law/mechanism combination.

        Files are saved to ``<output_dir>/introduction`` as
        ``<panel_name>_<mechanism>_introduction.txt``. A pair whose request
        failed is reported and not written, so API error messages are never
        stored (or counted) as introductions.

        Args:
            base_dir: Folder holding the ``panel_*_*`` regression subfolders.
            csv_file: Laws CSV handed to ``get_laws_with_regression_results``.
            output_dir: Folder in which the ``introduction`` directory is created.
        """
        # Sentinel texts returned by get_comprehensive_introduction on failure
        error_prefixes = ("Error in analysis:", "Error: No content in response")

        # Create introduction directory
        intro_dir = os.path.join(output_dir, 'introduction')
        os.makedirs(intro_dir, exist_ok=True)

        # Get filtered laws (only those with regression results) and regression results
        laws = self.get_laws_with_regression_results(csv_file, base_dir)
        regression_results = self.read_regression_results(base_dir)

        total_introductions = 0
        total_mechanisms = 0

        # Generate introduction for each law and each mechanism
        for law in laws:
            print(f"\nProcessing law: {law['title']}")
            print(f"Mechanisms for this law: {law['mechanisms']}")

            if not law['mechanisms']:
                print(f"WARNING: No active mechanisms found for {law['title']}")
                continue

            # Generate separate introduction for each mechanism
            for mechanism in law['mechanisms']:
                total_mechanisms += 1
                print(f"Writing introduction for mechanism: {mechanism}")

                # Folder names use underscores where the mechanism has spaces
                clean_mechanism = mechanism.replace(' ', '_')

                # Use the panel name from the law object
                clean_title = law['panel_name']

                # Look for matching panel in regression results
                law_results = {}
                for panel_name, results in regression_results.items():
                    if clean_title in panel_name and clean_mechanism in panel_name:
                        law_results = results
                        print(f"Found matching regression results: {panel_name}")
                        break

                if not law_results:
                    print(f"Warning: No regression results found for {clean_title}_{clean_mechanism}")

                try:
                    # Generate introduction
                    intro = self.get_comprehensive_introduction(law, mechanism, law_results)

                    # A failed request comes back as an error string; do not
                    # save it as if it were a generated introduction.
                    if intro.startswith(error_prefixes):
                        print(f"✗ ERROR saving introduction for {law['title']} - {mechanism}: {intro}")
                        continue

                    intro = self.clean_markdown_formatting(intro)

                    # Create filename using panel naming convention
                    filename = f"{law['panel_name']}_{clean_mechanism}_introduction.txt"

                    # Save introduction
                    with open(os.path.join(intro_dir, filename), 'w', encoding='utf-8') as f:
                        f.write(intro)

                    total_introductions += 1
                    print(f"✓ Saved introduction for {law['title']} - {mechanism}")
                except Exception as e:
                    print(f"✗ ERROR saving introduction for {law['title']} - {mechanism}: {str(e)}")

        print("\n" + "="*50)
        print("FINAL SUMMARY:")
        print(f"Laws processed: {len(laws)}")
        print(f"Total mechanisms found: {total_mechanisms}")
        print(f"Introductions successfully saved: {total_introductions}")
        print("="*50)

def main():
    """Run the introduction-generation step with the settings below.

    Replace the placeholder values before running. Any exception raised while
    creating the analyzer or generating the introductions is caught and
    reported on stdout instead of propagating.
    """
    # Configuration
    API_KEY = "enter API here"
    BASE_DIR = r"enter folder path here"
    CSV_FILE = "enter file path here"
    OUTPUT_DIR = r"enter folder path here"
    
    try:
        analyzer = ComprehensiveAnalyzer(API_KEY)
        
        # Run the analysis with fixed naming logic
        analyzer.analyze_and_save_introductions(BASE_DIR, CSV_FILE, OUTPUT_DIR)
        print("Analysis complete!")
    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()
    
# 11. Ask Claude to write the model specification section of a paper 

import pandas as pd
import json
import os
from anthropic import Anthropic
import glob
import re

class ComprehensiveAnalyzer:
    """Write the research-design (model specification) section for every panel.

    For each ``panel_*_*`` folder that holds a ``regression_results.json``
    file, the analyzer builds a prompt from the law details and regression
    output, sends it to the Claude API and stores the cleaned reply as text.
    """

    # Economic mechanisms; each one is a column of the laws CSV
    MECHANISMS = (
        'Litigation Risk',
        'Corporate Governance',
        'Proprietary Costs',
        'Information Asymmetry',
        'Unsophisticated Investors',
        'Equity Issuance',
        'Reputation Risk',
    )

    # Prefixes of the sentinel strings get_model_specification returns on failure
    _ERROR_PREFIXES = ("Error in analysis:", "Error: No content in response")

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    @classmethod
    def _mechanism_from_panel_name(cls, panel_name: str) -> str:
        """Return the mechanism encoded at the end of a panel folder name.

        Mechanism names may span several underscore-separated words (for
        example ``Litigation_Risk``), so the known mechanisms are matched as a
        suffix first. Only when none matches is the last underscore-separated
        token used.
        """
        stem = panel_name[len('panel_'):] if panel_name.startswith('panel_') else panel_name
        lowered = stem.lower()
        for mechanism in cls.MECHANISMS:
            suffix = '_' + mechanism.replace(' ', '_').lower()
            if lowered.endswith(suffix):
                return mechanism
        return stem.rpartition('_')[2]

    def clean_markdown_formatting(self, content: str) -> str:
        """Remove markdown headers and asterisk emphasis from ``content``.

        Args:
            content: Text as returned by the model.

        Returns:
            The text without leading ``#`` markers and without asterisks, with
            runs of blank lines collapsed to a single blank line.
        """
        # Remove headers (##, ###, etc.)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)

        # Remove bold/italic formatting - this handles **text**, ***text***, ****text****
        content = re.sub(r'\*{1,4}([^*]*?)\*{1,4}', r'\1', content)

        # Clean up any remaining asterisks
        content = re.sub(r'\*+', '', content)

        # Clean up any extra whitespace that might be left
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

        return content

    def read_regression_results(self, base_dir: str) -> dict:
        """Read ``regression_results.json`` from every panel subfolder.

        Args:
            base_dir: Folder containing the ``panel_*_*`` directories.

        Returns:
            Mapping of panel folder name to parsed JSON. Panels without a
            results file are reported on stdout and omitted.
        """
        all_results = {}

        for panel_dir in glob.glob(os.path.join(base_dir, 'panel_*_*')):
            if not os.path.isdir(panel_dir):
                continue

            panel_name = os.path.basename(panel_dir)
            results_file = os.path.join(panel_dir, 'regression_results.json')

            if os.path.exists(results_file):
                print(f"Reading results from {panel_name}")
                with open(results_file, 'r', encoding='utf-8') as f:
                    all_results[panel_name] = json.load(f)
            else:
                print(f"No results file found in {panel_name}")

        return all_results

    def format_regression_results(self, results: dict) -> str:
        """Format the regression results of one panel as prompt text.

        Args:
            results: Mapping of specification label to a dict with
                ``coefficients``, ``t_stats``, ``pvalues``, ``r_squared`` and
                ``controls``.

        Returns:
            A text block with one section per specification; underscores in
            control names are shown as spaces.

        Raises:
            KeyError: If a specification lacks one of the expected entries.
        """
        pieces = ["\nRegression Results:\n\n"]

        for spec_name, spec_results in results.items():
            pieces.append(f"\nSpecification {spec_name}:\n")
            pieces.append(f"Treatment Effect: {spec_results['coefficients']['treatment_effect']:.4f}\n")
            # The treatment t-statistic is reported as a magnitude
            pieces.append(f"T-statistic: {abs(spec_results['t_stats']['treatment_effect']):.2f}\n")
            pieces.append(f"P-value: {spec_results['pvalues']['treatment_effect']:.4f}\n")
            pieces.append(f"R-squared: {spec_results['r_squared']:.4f}\n")

            if spec_results['controls']:
                pieces.append("\nControl Variables:\n")
                for control in spec_results['controls']:
                    coef = spec_results['coefficients'][control]
                    tstat = spec_results['t_stats'][control]
                    pvalue = spec_results['pvalues'][control]
                    pieces.append(f"{control.replace('_', ' ')}: coef={coef:.4f}, t={tstat:.2f}, p={pvalue:.4f}\n")

            pieces.append("\n" + "-"*50 + "\n")

        return "".join(pieces)

    def get_laws_analysis(self, csv_file: str) -> list:
        """Read the laws CSV and list the active mechanisms of each law.

        Args:
            csv_file: Path to a CSV with the columns ``Regulation Title``,
                ``Year``, ``Regulatory Body``, ``Description``, ``Impact`` and
                one column per entry of ``MECHANISMS``.

        Returns:
            One dict per row with the keys ``title``, ``year``, ``body``,
            ``description``, ``impact`` and ``mechanisms`` (the mechanisms
            whose column value is ``'Yes'``).
        """
        df = pd.read_csv(csv_file)
        laws = []

        for _, row in df.iterrows():
            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                # Get active mechanisms (where value is 'Yes')
                'mechanisms': [mech for mech in self.MECHANISMS if row[mech] == 'Yes'],
            })

        return laws

    def get_model_specification(self, law: dict, mechanism: str, regression_results: dict) -> str:
        """Request the research-design section for one law/mechanism pair.

        Args:
            law: Law record; the keys ``title``, ``year``, ``body``,
                ``description`` and ``impact`` are read.
            mechanism: Name of the economic mechanism to focus on.
            regression_results: Regression output of the panel; may be empty.

        Returns:
            The text of the first content item of the reply. If the reply has
            no ``content`` attribute, ``"Error: No content in response"``; if
            the request raises, ``"Error in analysis: <message>"``.
        """
        regression_text = self.format_regression_results(regression_results) if regression_results else "No regression results available."

        # The observation count is taken from specification '(3)' when present
        n_obs = None
        if regression_results and '(3)' in regression_results:
            n_obs = regression_results['(3)'].get('n_obs', 'Not available')

        prompt = f"""You are an accounting academic writing a research paper examining {law['title']} and its impact 
        on voluntary disclosure in the U.S.  through the {mechanism} channel. 
        Please write the research design section for an academic journal in accounting.

Law Details:
Title: {law['title']} ({law['year']})
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Regression Information:
{regression_text}

IMPORTANT: This study examines all firms in the Compustat universe, not just firms directly subject to {law['title']}. 
This is a pre/post research design.

Please follow these detailed guidelines:

1. Sample selection and post-law indicator:
    - Explain that the sample includes all firms in the Compustat universe, in the U.S., during the sample period 
    - Describe the regulatory authority that is responsible for the law {law['body']} 
    - Clarify that while {law['title']} may directly target specific firms/industries, 
      the analysis examines all firms in the Compustat universe
    - Explain that the treatment variable affects all firms
    
2. Model Explanation (2-3 paragraphs, ~300 words total):
    - Explain the regression model used to examine the relationship between {law['title']} 
      and voluntary disclosure in the U.S. through the {mechanism} channel
          -The model is: FreqMF = β₀ + β₁Treatment Effect + γControls + ε
    - Only discuss the control variables that appear in the regression results {regression_text}
      These variables are based on prior literature and are: Institutional Ownership, Firm Size, Book-to-Market,
      ROA, Stock Return, Earnings volatility, Loss, Class action litigation risk
    - Support model choices with citations to foundational papers
    - Explain potential endogeneity concerns and how the research design addresses them
    - Use clear, academic language
    - Avoid using underscores in variable names

3. Mathematical Model:
    - Present the complete regression equation in proper mathematical notation {regression_text}
        - Label the equation as follows: FreqMF = β₀ + β₁Treatment Effect + γControls + ε
            - Label the dependent variable "FreqMF"
            - Label the variable of interest as "Treatment Effect"
            - Label the control variables in the regression equation as "Controls"
    - Do no include the subscripts i and t in the regression 
    - Format the equation professionally

4. Variable Definitions (2-3 paragraphs, ~300 words total):
    - Define the dependent variable (FreqMF - management forecast frequency)
    - Define the "Treatment Effect" variable as an indicator variable for the post-{law['title']} period (affecting all firms)
    - Define each control variable used in the model as they appear in {regression_text}
      These variables are based on prior literature and are: Institutional Ownership, Firm Size, Book-to-Market,
      ROA, Stock Return, Earnings volatility, Loss, Class action litigation risk
        -Cite the appropriate paper for these variables from the Journal of Accounting Research
    - Do no include the subscripts i and t in the variable definition
    - For each control variable, provide detailed explanations about their expected relationships with voluntary disclosure
    - Explain how variables relate to the {mechanism} channel
    

5. Sample Construction (2-3 paragraphs, ~300 words total):
    - Describe the event window around {law['year']}
        -Always clarify that the post-regulation period includes the regulation year by writing "from {law['year']} onwards"
        -The time window for this analysis is 2 years before and 2 years after the regulation is implemented. Therefore,
         The total number of years of the sample period is 5 years.
    - Describe the source of the data from Compustat, I/B/E/S, Audit Analytics, and CRSP
    - Describe the sample construction process based on the number of observations: {n_obs if n_obs else 'Not available'}
    - Note any sample restrictions

Writing Guidelines:
-Recall that the data is for U.S. firms
- Provide only the write up, no extra text or explanations 
    -Example of what not to include: "Here's a comprehensive model specifaction section following your guidelines"
- Use active voice (e.g., "We find" instead of "It is found")
- Maintain formal academic tone
- Include 2-3 citations per paragraph
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Accounting Studies
- Use precise statistical language
- Make clear connections between variables and theoretical predictions
- Do not include Latex format"""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            print(f"Error getting model specification: {str(e)}")
            return f"Error in analysis: {str(e)}"

    def create_model_specifications(self, base_dir: str, csv_file: str, output_dir: str):
        """Generate and save the model specification section of every panel.

        Output goes to ``<output_dir>/model_specification`` as
        ``<panel_name>_model_specification.txt``. A panel whose processing
        raises, or whose API request fails, is reported on stdout and skipped.

        Args:
            base_dir: Folder containing the ``panel_*_*`` directories, each
                with ``regression_results.json`` and
                ``filtered_data_with_trends.csv``.
            csv_file: Not read by this method; kept so the call signature
                stays unchanged.
            output_dir: Folder in which the output directory is created.
        """
        # Create main directory
        main_dir = os.path.join(output_dir, 'model_specification')
        os.makedirs(main_dir, exist_ok=True)

        # Get regression results for existing panels
        regression_results = self.read_regression_results(base_dir)
        total_panels = len(regression_results)

        print(f"\nFound {total_panels} panels in regression folder")

        # Process each panel that exists
        for processed, (panel_name, results) in enumerate(regression_results.items(), start=1):
            print(f"\n{'='*80}")
            print(f"Processing panel {processed} of {total_panels}: {panel_name}")

            try:
                mechanism = self._mechanism_from_panel_name(panel_name)

                # Read the original panel file to get law details
                panel_file = os.path.join(base_dir, panel_name, "filtered_data_with_trends.csv")
                df = pd.read_csv(panel_file)

                law = {
                    'title': df['Regulation Title'].iloc[0],
                    'year': df['Year'].iloc[0],
                    'body': df['Regulatory Body'].iloc[0] if 'Regulatory Body' in df.columns else 'Unknown',
                    'description': df['Description'].iloc[0] if 'Description' in df.columns else 'Not available',
                    'impact': df['Impact'].iloc[0] if 'Impact' in df.columns else 'Not available',
                    'mechanisms': [mechanism]
                }

                # Generate model specification
                content = self.get_model_specification(law, mechanism, results)

                if content.startswith(self._ERROR_PREFIXES):
                    # A failed request must not be stored as a paper section
                    print(f"Error processing panel {panel_name}: {content}")
                else:
                    # Clean markdown formatting
                    clean_content = self.clean_markdown_formatting(content)

                    # Save model specification
                    file_path = os.path.join(main_dir, f"{panel_name}_model_specification.txt")
                    with open(file_path, 'w', encoding='utf-8') as f:
                        f.write(clean_content)
                    print(f"Saved model specification for {panel_name}")

            except Exception as e:
                print(f"Error processing panel {panel_name}: {str(e)}")

            print(f"\nProgress: {processed}/{total_panels} ({(processed/total_panels)*100:.1f}%)")

        print("\nModel specification generation complete!")
    
def main():
    """Run the model-specification step with the settings below.

    Replace the placeholder values before running. Any exception raised
    during the run is caught and reported on stdout.
    """
    # User-supplied settings
    api_key = "enter API here"
    base_dir = r"enter folder path here"
    csv_file = "enter file path here"
    output_dir = r"enter folder path here"

    try:
        ComprehensiveAnalyzer(api_key).create_model_specifications(
            base_dir, csv_file, output_dir
        )
        print("\nModel specification sections complete!")
    except Exception as err:
        print(f"Error in main execution: {str(err)}")


if __name__ == "__main__":
    main()

# 12. Ask Claude to write a conclusion 

import pandas as pd
import json
import os
from anthropic import Anthropic
import glob
import re

class ComprehensiveAnalyzer:
    """Write the conclusion section for every panel.

    For each ``panel_*_*`` folder that holds a ``regression_results.json``
    file, the analyzer builds a prompt from the law details and regression
    output, sends it to the Claude API and stores the cleaned reply as text.
    """

    # Economic mechanisms; each one is a column of the laws CSV
    MECHANISMS = (
        'Litigation Risk',
        'Corporate Governance',
        'Proprietary Costs',
        'Information Asymmetry',
        'Unsophisticated Investors',
        'Equity Issuance',
        'Reputation Risk',
    )

    # Prefixes of the sentinel strings get_conclusion returns on failure
    _ERROR_PREFIXES = ("Error in analysis:", "Error: No content in response")

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    @classmethod
    def _mechanism_from_panel_name(cls, panel_name: str) -> str:
        """Return the mechanism encoded at the end of a panel folder name.

        Mechanism names may span several underscore-separated words (for
        example ``Litigation_Risk``), so the known mechanisms are matched as a
        suffix first. Only when none matches is the last underscore-separated
        token used.
        """
        stem = panel_name[len('panel_'):] if panel_name.startswith('panel_') else panel_name
        lowered = stem.lower()
        for mechanism in cls.MECHANISMS:
            suffix = '_' + mechanism.replace(' ', '_').lower()
            if lowered.endswith(suffix):
                return mechanism
        return stem.rpartition('_')[2]

    def clean_markdown_formatting(self, content: str) -> str:
        """Remove markdown headers and asterisk emphasis from ``content``.

        Args:
            content: Text as returned by the model.

        Returns:
            The text without leading ``#`` markers and without asterisks, with
            runs of blank lines collapsed to a single blank line.
        """
        # Remove headers (##, ###, etc.)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)

        # Remove bold/italic formatting - this handles **text**, ***text***, ****text****
        content = re.sub(r'\*{1,4}([^*]*?)\*{1,4}', r'\1', content)

        # Clean up any remaining asterisks
        content = re.sub(r'\*+', '', content)

        # Clean up any extra whitespace that might be left
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)

        return content

    def read_regression_results(self, base_dir: str) -> dict:
        """Read ``regression_results.json`` from every panel subfolder.

        Args:
            base_dir: Folder containing the ``panel_*_*`` directories.

        Returns:
            Mapping of panel folder name to parsed JSON. Panels without a
            results file are reported on stdout and omitted.
        """
        all_results = {}

        for panel_dir in glob.glob(os.path.join(base_dir, 'panel_*_*')):
            if not os.path.isdir(panel_dir):
                continue

            panel_name = os.path.basename(panel_dir)
            results_file = os.path.join(panel_dir, 'regression_results.json')

            if os.path.exists(results_file):
                print(f"Reading results from {panel_name}")
                with open(results_file, 'r', encoding='utf-8') as f:
                    all_results[panel_name] = json.load(f)
            else:
                print(f"No results file found in {panel_name}")

        return all_results

    def format_regression_results(self, results: dict) -> str:
        """Format the regression results of one panel as prompt text.

        Args:
            results: Mapping of specification label to a dict with
                ``coefficients``, ``t_stats``, ``pvalues``, ``r_squared`` and
                ``controls``.

        Returns:
            A text block with one section per specification; underscores in
            control names are shown as spaces.

        Raises:
            KeyError: If a specification lacks one of the expected entries.
        """
        pieces = ["\nRegression Results:\n\n"]

        for spec_name, spec_results in results.items():
            pieces.append(f"\nSpecification {spec_name}:\n")
            pieces.append(f"Treatment Effect: {spec_results['coefficients']['treatment_effect']:.4f}\n")
            # The treatment t-statistic is reported as a magnitude
            pieces.append(f"T-statistic: {abs(spec_results['t_stats']['treatment_effect']):.2f}\n")
            pieces.append(f"P-value: {spec_results['pvalues']['treatment_effect']:.4f}\n")
            pieces.append(f"R-squared: {spec_results['r_squared']:.4f}\n")

            if spec_results['controls']:
                pieces.append("\nControl Variables:\n")
                for control in spec_results['controls']:
                    coef = spec_results['coefficients'][control]
                    tstat = spec_results['t_stats'][control]
                    pvalue = spec_results['pvalues'][control]
                    pieces.append(f"{control.replace('_', ' ')}: coef={coef:.4f}, t={tstat:.2f}, p={pvalue:.4f}\n")

            pieces.append("\n" + "-"*50 + "\n")

        return "".join(pieces)

    def get_laws_analysis(self, csv_file: str) -> list:
        """Read the laws CSV and list the active mechanisms of each law.

        Args:
            csv_file: Path to a CSV with the columns ``Regulation Title``,
                ``Year``, ``Regulatory Body``, ``Description``, ``Impact`` and
                one column per entry of ``MECHANISMS``.

        Returns:
            One dict per row with the keys ``title``, ``year``, ``body``,
            ``description``, ``impact`` and ``mechanisms`` (the mechanisms
            whose column value is ``'Yes'``).
        """
        df = pd.read_csv(csv_file)
        laws = []

        for _, row in df.iterrows():
            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                # Get active mechanisms (where value is 'Yes')
                'mechanisms': [mech for mech in self.MECHANISMS if row[mech] == 'Yes'],
            })

        return laws

    def get_conclusion(self, law: dict, mechanism: str, regression_results: dict) -> str:
        """Request the conclusion section for one law/mechanism pair.

        Args:
            law: Law record; the keys ``title``, ``year``, ``description`` and
                ``impact`` are read.
            mechanism: Name of the economic mechanism to focus on.
            regression_results: Regression output of the panel; may be empty.

        Returns:
            The text of the first content item of the reply. If the reply has
            no ``content`` attribute, ``"Error: No content in response"``; if
            the request raises, ``"Error in analysis: <message>"``.
        """
        regression_text = self.format_regression_results(regression_results) if regression_results else "No regression results available."

        prompt = f"""You are an accounting academic writing a research paper examining {law['title']} and its 
        impact on voluntary disclosure in the U.S. through the {mechanism} channel. 
        Please write a conclusion section for an academic journal in accounting.

Law Details:
Title: {law['title']} ({law['year']})
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Empirical Results:
{regression_text}

Please write a comprehensive conclusion following these guidelines:

1. Summary of Main Findings (2-3 paragraphs):
    - Restate the research question, focusing on the {mechanism} channel
    - Summarize key empirical findings
    - Discuss statistical and economic significance
    - Interpret the results in the context of {law['title']} and {mechanism}

2. Implications (1-2 paragraphs):
    - Discuss implications for regulators
    - Discuss implications for managers
    - Discuss implications for investors
    - Connect findings to broader literature on {mechanism}

3. Limitations and Future Research (1-2 paragraphs):
    - Acknowledge key limitations
    - Suggest promising avenues for future research
    - Discuss potential extensions, particularly related to {mechanism}
    
Writing Guidelines:
- Use active voice (e.g., "We find" instead of "It is found")
- Maintain formal academic tone
- Use past tense for your specific results
- Use present tense for implications
- Make clear distinctions between correlation and causation
- Focus on the practical significance of the findings
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Accounting Studies
- Do not include section headers
- Do not include journal names after the citation. For example, for these citations: (Christensen et al., 2013,
  Journal of Accounting and Economics; Shroff et al., 2013, The Accounting Review). 
  The Journal of Accounting and Economics and The Accounting Review names
  should not be included.
- Length: approximately 750 words"""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            print(f"Error getting conclusion: {str(e)}")
            return f"Error in analysis: {str(e)}"

    def _write_conclusion(self, base_dir: str, main_dir: str, panel_name: str, results: dict) -> None:
        """Generate and store the conclusion of a single panel."""
        file_path = os.path.join(main_dir, f"{panel_name}_conclusion.txt")

        # Check for an existing file first so no API request is spent on a
        # panel that would be skipped anyway.
        if os.path.exists(file_path):
            print(f"Skipping {panel_name}: File already exists")
            return

        # Read the original panel file to get law details
        panel_file = os.path.join(base_dir, panel_name, "filtered_data_with_trends.csv")
        if not os.path.exists(panel_file):
            print(f"No filtered data file found for {panel_name}")
            return

        mechanism = self._mechanism_from_panel_name(panel_name)
        df = pd.read_csv(panel_file)

        law = {
            'title': df['Regulation Title'].iloc[0] if 'Regulation Title' in df.columns else 'Unknown',
            'year': df['Year'].iloc[0] if 'Year' in df.columns else 'Unknown',
            'body': df['Regulatory Body'].iloc[0] if 'Regulatory Body' in df.columns else 'Unknown',
            'description': df['Description'].iloc[0] if 'Description' in df.columns else 'Not available',
            'impact': df['Impact'].iloc[0] if 'Impact' in df.columns else 'Not available',
            'mechanisms': [mechanism]
        }

        # Generate conclusion for that specific law-mechanism combination
        content = self.get_conclusion(law, mechanism, results)

        # A stored error text would block regeneration on the next run,
        # because existing files are skipped.
        if content.startswith(self._ERROR_PREFIXES):
            print(f"Error processing panel {panel_name}: {content}")
            return

        # Clean markdown formatting and save conclusion
        clean_conclusion = self.clean_markdown_formatting(content)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(clean_conclusion)
        print(f"Saved conclusion for {panel_name}")

    def create_conclusions(self, base_dir: str, csv_file: str, output_dir: str):
        """Generate and save the conclusion section of every panel.

        Output goes to ``<output_dir>/conclusion`` as
        ``<panel_name>_conclusion.txt``. Panels that already have an output
        file are skipped without contacting the API; panels that raise or
        whose request fails are reported on stdout and skipped.

        Args:
            base_dir: Folder containing the ``panel_*_*`` directories, each
                with ``regression_results.json`` and
                ``filtered_data_with_trends.csv``.
            csv_file: Not read by this method; kept so the call signature
                stays unchanged.
            output_dir: Folder in which the output directory is created.
        """
        # Create main directory
        main_dir = os.path.join(output_dir, 'conclusion')
        os.makedirs(main_dir, exist_ok=True)

        # Get regression results for existing panels
        regression_results = self.read_regression_results(base_dir)
        total_panels = len(regression_results)

        print(f"\nFound {total_panels} panels in regression folder")

        # Process each panel that exists; the position counter advances for
        # skipped panels too, so the progress output stays accurate.
        for position, (panel_name, results) in enumerate(regression_results.items(), start=1):
            print(f"\n{'='*80}")
            print(f"Processing panel {position} of {total_panels}: {panel_name}")

            try:
                self._write_conclusion(base_dir, main_dir, panel_name, results)
            except Exception as e:
                print(f"Error processing panel {panel_name}: {str(e)}")

            print(f"\nProgress: {position}/{total_panels} ({(position/total_panels)*100:.1f}%)")

        print("\nConclusion generation complete!")
    
def main():
    """Run the conclusion-generation step with the placeholder settings below.

    The four placeholder values must be replaced before running. Any exception
    raised while building the analyzer or generating conclusions is reported
    on stdout instead of propagating.
    """
    # Configuration
    API_KEY = "enter API here"
    BASE_DIR = r"enter folder path here"
    CSV_FILE = "enter file path here"
    OUTPUT_DIR = r"enter folder path here"

    run_arguments = (BASE_DIR, CSV_FILE, OUTPUT_DIR)

    try:
        ComprehensiveAnalyzer(API_KEY).create_conclusions(*run_arguments)
        print("\nConclusion sections complete!")
    except Exception as err:
        print(f"Error in main execution: {str(err)}")


if __name__ == "__main__":
    main()
    
# 13. Ask Claude to write an abstract
import os
import json
import pandas as pd
from anthropic import Anthropic
import re

class AbstractGenerator:
    """Generate one-paragraph abstracts from saved introduction files using Claude.

    For every ``*_introduction.txt`` file in an input directory, the text is
    sent to the Anthropic Messages API with a conversion prompt, markdown is
    stripped from the reply, and the result is written to a matching
    ``*_abstract.txt`` file.
    """

    # Leading text of the placeholder strings generate_abstract returns when the
    # request fails. process_introductions uses these to avoid saving a failure
    # message as if it were an abstract.
    _FAILURE_PREFIXES = ("Error in analysis:", "Error: No content in response")

    def __init__(self, api_key: str):
        """Initialize abstract generator with Claude API key"""
        self.client = Anthropic(api_key=api_key)
        
    def clean_markdown_formatting(self, content: str) -> str:
        """Remove markdown headers, emphasis markers and surplus blank lines.

        Args:
            content: Text that may contain markdown markup.

        Returns:
            The text with leading ``#`` header markers and all asterisks
            removed, and runs of three line breaks collapsed to two.
        """
        # Remove headers (##, ###, etc.)
        content = re.sub(r'^#+\s*', '', content, flags=re.MULTILINE)
        
        # Remove bold/italic formatting - this handles **text**, ***text***, ****text****
        content = re.sub(r'\*{1,4}([^*]*?)\*{1,4}', r'\1', content)
        
        # Clean up any remaining asterisks
        content = re.sub(r'\*+', '', content)
        
        # Clean up any extra whitespace that might be left
        content = re.sub(r'\n\s*\n\s*\n', '\n\n', content)
        
        return content
    
    def read_laws_data(self, csv_file: str) -> pd.DataFrame:
        """Read the laws data CSV file"""
        return pd.read_csv(csv_file)
    
    def generate_abstract(self, introduction_content: str) -> str:
        """Ask Claude to condense an introduction into an abstract.

        Args:
            introduction_content: Full text of the introduction section.

        Returns:
            The text of the first content block of the reply. If the request
            raises, the exception is printed and a string starting with
            "Error in analysis:" is returned instead of raising.
        """
        prompt = f"""As an accounting academic, please convert the following introduction into a concise academic abstract.

Guidelines:
- Maintain the key points from the introduction
- Condense the content to 150-250 words
- Include background, research objective, methodology, key findings, and contribution
- Use a formal academic tone
- Avoid adding new information not present in the original text
- Use present tense for established findings
- Use past tense for specific results
- Do not include citations in the abstract
- Do not use the label "Abstract"
- Write in one paragraph

Introduction to Convert:
{introduction_content}

Please provide a structured abstract that captures the essence of the original introduction."""

        try:
            response = self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=4000,
                temperature=0.5,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            print(f"Error generating abstract: {str(e)}")
            return f"Error in analysis: {str(e)}"
    
    def process_introductions(self, input_dir: str, output_dir: str):
        """Generate and save an abstract for every introduction file.

        Files whose abstract already exists in ``output_dir`` are skipped. When
        generation fails, nothing is written, so the file is retried on the
        next run rather than being permanently skipped with an error message
        stored as its abstract.

        Args:
            input_dir: Directory containing ``*_introduction.txt`` files.
            output_dir: Directory for the ``*_abstract.txt`` files; created
                if it does not exist.
        """
        intro_suffix = '_introduction.txt'

        # Create abstracts directory
        os.makedirs(output_dir, exist_ok=True)
    
        # Find all introduction files (sorted so runs proceed in a stable order)
        introduction_files = sorted(f for f in os.listdir(input_dir) if f.endswith(intro_suffix))
    
        # Process each introduction file
        for intro_file in introduction_files:
            try:
                # Swap only the trailing suffix, leaving the rest of the name untouched
                abstract_filename = intro_file[:-len(intro_suffix)] + '_abstract.txt'
                abstract_path = os.path.join(output_dir, abstract_filename)
            
                # CHECK FOR EXISTING FILES
                if os.path.exists(abstract_path):
                    print(f"Skipping {intro_file}: Abstract already exists")
                    continue
            
                # Read introduction content
                with open(os.path.join(input_dir, intro_file), 'r', encoding='utf-8') as f:
                    introduction_content = f.read()
            
                # Generate abstract
                abstract = self.generate_abstract(introduction_content)

                # A failure placeholder must not be saved: the existence check
                # above would then treat the file as done on every later run.
                if abstract.startswith(self._FAILURE_PREFIXES):
                    print(f"Skipping {intro_file}: abstract generation failed, nothing saved")
                    continue
            
                # Clean markdown formatting
                clean_abstract = self.clean_markdown_formatting(abstract)
            
                # Save abstract
                with open(abstract_path, 'w', encoding='utf-8') as f:
                    f.write(clean_abstract)
            
                # PROGRESS FEEDBACK
                print(f"Generated abstract for {intro_file}")
        
            except Exception as e:
                print(f"Error processing {intro_file}: {str(e)}")
            
def main():
    """Run abstract generation with the placeholder settings below.

    The API key and both directory placeholders must be replaced before
    running. Exceptions from constructing the generator or processing the
    introductions are printed rather than raised.
    """
    # Configuration
    API_KEY = "enter API here"  # Replace with your actual API key

    # Directories
    INPUT_DIR = r"enter folder path here"
    OUTPUT_DIR = r"enter folder path here"

    try:
        # Build the generator and process every introduction file
        AbstractGenerator(API_KEY).process_introductions(INPUT_DIR, OUTPUT_DIR)
        print("Abstract generation complete!")
    except Exception as err:
        print(f"Error in main execution: {str(err)}")


if __name__ == "__main__":
    main()
    
# 14. Combine AI-generated content from the .txt files
import os
import pandas as pd

def find_background_file(back_hypo_dir: str, law_name: str, clean_mechanism: str) -> str:
    """Locate the background/hypothesis text file for a law-mechanism pair.

    Lookup proceeds in three stages and stops at the first hit:

    1. the standard name ``{law_name}_{clean_mechanism}_background_hypothesis.txt``;
    2. the same pattern with each spelling variant of ``law_name`` produced by
       ``_background_name_variants`` (shortened place names, shortened
       acronyms, truncated words, underscores removed);
    3. a scored comparison (``_background_match_score``) against every
       ``*_background_hypothesis.txt`` file in the directory, accepting the
       highest score provided it is at least 40.

    Args:
        back_hypo_dir: Directory holding the background files.
        law_name: Law identifier as used in file names (underscore separated).
        clean_mechanism: Mechanism name with underscores instead of spaces.

    Returns:
        Full path of the matching file, or None when the directory does not
        exist or no file matches.
    """
    # Try standard pattern first
    standard_file = os.path.join(back_hypo_dir, f"{law_name}_{clean_mechanism}_background_hypothesis.txt")
    if os.path.exists(standard_file):
        return standard_file

    if not os.path.isdir(back_hypo_dir):
        return None

    # Sorted so that ties in the fuzzy stage resolve the same way on every run
    # (os.listdir returns entries in arbitrary order).
    all_bg_files = sorted(f for f in os.listdir(back_hypo_dir) if f.endswith('_background_hypothesis.txt'))

    # Debug: Show relevant files for specific problematic cases
    if law_name in ['Alternative_Investment_Fund_Managers_Directive_AIFMD_European_Union',
                    'National_Instrument_31103_Registration_Requirements_Canada']:
        print(f"\n=== DEBUG for {law_name} ===")
        print(f"Looking for mechanism: {clean_mechanism}")
        relevant_files = [f for f in all_bg_files if any(word in f.lower() for word in
                         ['aifmd', 'alternative', 'national', '31103', 'instrument'])]
        print(f"Relevant files found ({len(relevant_files)}):")
        for f in relevant_files[:10]:  # Show first 10 relevant files
            print(f"  - {f}")
        print("=== END DEBUG ===\n")

    # Get all variants to search for
    search_variants = _background_name_variants(law_name)

    print(f"Searching for background file for: {law_name}")
    print(f"Looking for mechanism: {clean_mechanism}")
    print(f"Search variants: {search_variants[:3]}...")  # Show first 3

    # Try exact matches with variants
    for variant in search_variants:
        variant_file = f"{variant}_{clean_mechanism}_background_hypothesis.txt"
        full_path = os.path.join(back_hypo_dir, variant_file)
        if os.path.exists(full_path):
            print(f"Found exact match: {variant_file}")
            return full_path

    # Score all files and find best match
    best_match = None
    best_score = 0

    print(f"Trying fuzzy matching for {len(all_bg_files)} files...")

    for bg_file in all_bg_files:
        score = _background_match_score(bg_file, law_name, clean_mechanism)

        if score > best_score and score >= 40:  # Minimum threshold
            best_score = score
            best_match = bg_file
            print(f"New best match: {bg_file} (score: {score})")

    if best_match:
        print(f"Final match found: {best_match} (score: {best_score})")
        return os.path.join(back_hypo_dir, best_match)

    print("No suitable match found")
    return None


def _background_name_variants(name):
    """Return alternative spellings of a law name to try as file-name stems.

    The original name comes first; the remaining variants follow in sorted
    order so the search order is the same on every run.
    """
    variants = {name}

    # Jurisdiction names that appear shortened (or dropped) in file names
    place_variations = {
        'European_Union': ['European_Union', 'European', 'Europe', 'Eu', 'Eur', 'European_Uni', 'Europea', ''],
        'Canada': ['Canada', 'Can', 'CA', 'Cana', '']
    }

    for original, replacements in place_variations.items():
        if original not in name:
            continue
        for replacement in replacements:
            if replacement:
                variants.add(name.replace(original, replacement))
                continue
            # Dropping the place entirely: also drop one adjoining underscore
            # so no doubled or dangling underscore is left behind.
            new_name = name.replace('_' + original, '')
            if new_name == name:
                new_name = name.replace(original + '_', '')
            if new_name == name:
                new_name = name.replace(original, '')
            variants.add(new_name)

    # Handle acronym truncations
    acronym_variations = {
        'AIFMD': ['AIFMD', 'AIFM', 'AIF'],
        'EMIR': ['EMIR'],
        'MiFID': ['MiFID', 'MiFI']
    }

    for original, replacements in acronym_variations.items():
        if original in name:
            for replacement in replacements:
                variants.add(name.replace(original, replacement))

    # Special handling for known problematic cases
    if 'AIFMD' in name:
        variants.add(name.replace('AIFMD', 'AIFM'))

    if 'Canada' in name and 'National_Instrument_31103' in name:
        variants.add(name.replace('_Canada', ''))

    # Truncate one long word at a time to 6 and to 8 characters
    parts = name.split('_')
    for i, part in enumerate(parts):
        if len(part) > 6:
            for keep in (6, 8):
                variants.add('_'.join(parts[:i] + [part[:keep]] + parts[i + 1:]))

    # Remove underscores version
    variants.add(name.replace('_', ''))

    variants.discard(name)
    return [name] + sorted(variants)


def _background_match_score(file_name, target_law, target_mechanism):
    """Score how well a background file name matches a law and mechanism.

    Returns 0 when the file is for a different mechanism, 100 when the law
    part equals the target once underscores and case are ignored, and
    otherwise a sum of partial-match bonuses.
    """
    suffix = '_background_hypothesis.txt'

    # Remove the suffix to get the core name. The length is taken from the
    # suffix itself: a hard-coded slice that is too long cuts into the
    # mechanism name and makes the mechanism check below fail for every file.
    if file_name.endswith(suffix):
        core_name = file_name[:-len(suffix)]
    else:
        core_name = file_name

    # Check if this file is for the right mechanism
    mechanism_tail = f'_{target_mechanism}'
    if not core_name.endswith(mechanism_tail):
        return 0  # Wrong mechanism, skip

    # Get the law part (everything before the mechanism)
    law_part = core_name[:-len(mechanism_tail)]

    # Normalize both names for comparison
    file_normalized = law_part.replace('_', '').lower()
    target_normalized = target_law.replace('_', '').lower()

    # Exact match gets highest score
    if file_normalized == target_normalized:
        return 100

    score = 0

    def normalize_for_truncation(text):
        """Map common long words to the truncated forms seen in file names."""
        text = text.replace('europeanunion', 'europea')
        text = text.replace('european', 'europea')
        text = text.replace('infrastructure', 'infrastr')
        text = text.replace('regulation', 'regulat')
        text = text.replace('directive', 'direct')
        text = text.replace('instrument', 'instrum')
        text = text.replace('requirements', 'require')
        text = text.replace('registration', 'registr')
        text = text.replace('national', 'nation')
        text = text.replace('aifmd', 'aifm')
        text = text.replace('canada', '')
        return text

    file_truncated = normalize_for_truncation(file_normalized)
    target_truncated = normalize_for_truncation(target_normalized)

    # Check truncated versions
    if file_truncated == target_truncated:
        score += 80
    elif file_truncated in target_truncated or target_truncated in file_truncated:
        score += 60

    # Check for jurisdiction abbreviation matches (in either direction)
    place_mappings = {
        'europeanunion': ['european', 'europe', 'eu', 'eur', 'europea', 'europeanuni'],
        'canada': ['can', 'ca']
    }

    for full_form, abbreviations in place_mappings.items():
        for abbrev in abbreviations:
            if full_form in target_normalized and abbrev in file_normalized:
                score += 40
            if abbrev in target_normalized and full_form in file_normalized:
                score += 40

    # Check if one contains the other
    if target_normalized in file_normalized or file_normalized in target_normalized:
        score += 30

    # Key regulation identifiers as (full word, truncated form) pairs
    key_identifiers = [
        ('aifmd', 'aifmd'),
        ('aifmd', 'aifm'),
        ('aifm', 'aifm'),
        ('emir', 'emir'),
        ('mifid', 'mifid'),
        ('alternative', 'altern'),
        ('investment', 'invest'),
        ('fund', 'fund'),
        ('managers', 'manag'),
        ('market', 'market'),
        ('infrastructure', 'infrastr'),
        ('regulation', 'regulat'),
        ('markets', 'market'),
        ('financial', 'financ'),
        ('instruments', 'instrum'),
        ('directive', 'direct'),
        ('national', 'nation'),
        ('instrument', 'instrum'),
        ('31103', '31103'),
        ('registration', 'registr'),
        ('requirements', 'require')
    ]

    # Full word on both sides scores 15; full on one side and truncated on
    # the other scores 10.
    for full_identifier, partial_identifier in key_identifiers:
        if full_identifier in target_normalized and full_identifier in file_normalized:
            score += 15
        elif full_identifier in target_normalized and partial_identifier in file_normalized:
            score += 10
        elif partial_identifier in target_normalized and full_identifier in file_normalized:
            score += 10

    return score

def get_available_laws_from_files(base_dir: str) -> dict:
    """Discover law/mechanism pairs from the abstract files on disk.

    Every ``*_abstract.txt`` file in ``<base_dir>/abstracts`` whose name ends
    with one of the known mechanism names is split into a law name (the part
    before the mechanism) and the mechanism.

    Args:
        base_dir: Project directory containing the ``abstracts`` folder.

    Returns:
        Mapping of law name to the list of mechanisms (spaces instead of
        underscores) that have an abstract file. Empty when the ``abstracts``
        folder does not exist.
    """
    abs_dir = os.path.join(base_dir, 'abstracts')
    if not os.path.exists(abs_dir):
        return {}

    abstract_suffix = '_abstract.txt'
    mechanisms = ['Information_Asymmetry', 'Unsophisticated_Investors', 'Corporate_Governance',
                 'Proprietary_Costs', 'Litigation_Risk', 'Equity_Issuance', 'Reputation_Risk']

    law_mechanisms = {}
    # Sorted so the result does not depend on directory listing order.
    for file_name in sorted(os.listdir(abs_dir)):
        if not file_name.endswith(abstract_suffix):
            continue
        filename_base = file_name[:-len(abstract_suffix)]

        for mech in mechanisms:
            mech_tail = '_' + mech
            if not filename_base.endswith(mech_tail):
                continue
            # Strip only the trailing mechanism. A law whose own title contains
            # the mechanism words (e.g. "..._Corporate_Governance_Code") must
            # keep them, so str.replace on the whole name is not safe here.
            law_name = filename_base[:-len(mech_tail)]
            if law_name:
                law_mechanisms.setdefault(law_name, []).append(mech.replace('_', ' '))
            break

    return law_mechanisms

def combine_single_law_mechanism(base_dir: str, law_name: str, mechanism: str) -> str:
    """Assemble all generated sections for one law/mechanism into a single text file.

    The abstract, introduction, model specification, descriptive statistics,
    regression interpretation and conclusion files are all required; if any is
    missing, nothing is written. The background section is included only when
    ``find_background_file`` locates it.

    Args:
        base_dir: Project directory containing the per-section folders.
        law_name: Law identifier as used in file names.
        mechanism: Mechanism name with spaces (e.g. "Litigation Risk").

    Returns:
        Path of the combined file under ``<base_dir>/combined_sections``, or
        None when a required section is missing.
    """
    clean_mechanism = mechanism.replace(' ', '_')
    stem = f"{law_name}_{clean_mechanism}"
    panel_stem = f"panel_{stem}"

    def under(*parts):
        """Build a path below base_dir."""
        return os.path.join(base_dir, *parts)

    background_dir = under('background and hypothesis development')
    output_dir = under('combined_sections')

    print(f"\n=== PROCESSING {law_name} - {mechanism} ===")

    # Required inputs, in the order they are checked and reported
    files = {
        'abstract': under('abstracts', f"{stem}_abstract.txt"),
        'introduction': under('introduction', f"{stem}_introduction.txt"),
        'model': under('model_specification', f"{panel_stem}_model_specification.txt"),
        'conclusion': under('conclusion', f"{panel_stem}_conclusion.txt"),
        'descriptive_stats': under('descriptive_stats', panel_stem, 'descriptive_stats_analysis.txt'),
        'regression': under('regression_analyses', panel_stem, 'claude_interpretation.txt'),
    }

    # The background file is optional and may be stored under a variant name
    background_file = find_background_file(background_dir, law_name, clean_mechanism)
    if not background_file:
        print(f"Background missing for {law_name} - {mechanism}")
        # Continue processing anyway - you can change this behavior if needed
        print("Proceeding without background file...")
    else:
        files['background'] = background_file
        print(f"Background found: {background_file}")

    # 'background' is only in the mapping when it was found, so every entry
    # checked here is one that must exist.
    missing_files = []
    for section, filepath in files.items():
        if os.path.exists(filepath):
            print(f"FOUND: {section} - {filepath}")
        else:
            missing_files.append(section)
            print(f"MISSING: {section} - {filepath}")

    if missing_files:
        print(f"Skipping {law_name} - {mechanism}: Missing {', '.join(missing_files)}")
        return None

    print(f"All required sections found. Proceeding with combination...")

    os.makedirs(output_dir, exist_ok=True)

    rule = '=' * 50
    section_headers = [
        ('abstract', 'Abstract: '),
        ('introduction', 'INTRODUCTION\n' + rule + '\n\n'),
        ('background', 'BACKGROUND AND HYPOTHESIS DEVELOPMENT\n' + rule + '\n\n'),
        ('model', 'RESEARCH DESIGN\n' + rule + '\n\n'),
        ('descriptive_stats', 'DESCRIPTIVE STATISTICS\n' + rule + '\n\n'),
        ('regression', 'RESULTS\n' + rule + '\n\n'),
        ('conclusion', 'CONCLUSION\n' + rule + '\n\n'),
    ]

    pieces = [f"Analysis of {law_name} through {mechanism} channel\n\n"]

    for section_name, header in section_headers:
        if section_name == 'background' and not background_file:
            continue  # Skip background section if file not found

        filepath = files.get(section_name)
        if not (filepath and os.path.exists(filepath)):
            continue

        print(f"Adding {section_name} from {filepath}")
        with open(filepath, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()

        if section_name == 'abstract':
            # The abstract is trimmed and followed by a form feed (page break)
            pieces.append(header + content.strip() + "\n\n" + "\f")
        else:
            pieces.append(header + content + "\n\n")

    # Save combined file
    output_file = os.path.join(output_dir, f"{stem}_combined.txt")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(''.join(pieces))

    print(f"✓ Successfully saved: {output_file}")
    return output_file

def create_law_mechanism_dict(csv_file: str, available_laws: dict) -> dict:
    """Decide which law/mechanism combinations to process.

    For each CSV row whose title (spaces replaced by underscores) is a key of
    ``available_laws``, the mechanisms marked 'Yes' in the CSV are kept if
    they also appear in that law's available list. Laws in ``available_laws``
    that end up with no entry from the CSV are then added with all of their
    available mechanisms, and a note is printed.

    Args:
        csv_file: Path to a CSV with a 'Regulation Title' column and one
            column per mechanism.
        available_laws: Mapping of law name to mechanisms that have files.

    Returns:
        Mapping of law name to the list of mechanisms to process.
    """
    df = pd.read_csv(csv_file)
    csv_law_mechanisms = {}

    mechanisms = [
        'Litigation Risk', 'Corporate Governance', 'Proprietary Costs',
        'Information Asymmetry', 'Unsophisticated Investors', 'Equity Issuance', 'Reputation Risk'
    ]

    # Get laws from CSV that have files
    for _, row in df.iterrows():
        title = row['Regulation Title']
        # A blank title cell is read as NaN (a float), which has no .replace();
        # skip such rows instead of aborting the whole run.
        if not isinstance(title, str):
            continue

        law_title = title.replace(' ', '_')
        if law_title not in available_laws:
            continue

        available_mechanisms_for_law = available_laws[law_title]
        filtered_mechanisms = [
            mech for mech in mechanisms
            if row[mech] == 'Yes' and mech in available_mechanisms_for_law
        ]
        if filtered_mechanisms:
            csv_law_mechanisms[law_title] = filtered_mechanisms

    # Add laws that have files but got no entry from the CSV
    final_law_mechanisms = csv_law_mechanisms.copy()
    for law_name, available_mechs in available_laws.items():
        if law_name not in final_law_mechanisms:
            final_law_mechanisms[law_name] = list(available_mechs)
            print(f"Note: {law_name} has files but not in CSV. Including all mechanisms: {available_mechs}")

    return final_law_mechanisms

def main():
    """Combine the generated sections for every law/mechanism pair and print a summary.

    The folder and file placeholders below must be replaced before running.
    A pair counts as failed when combining it raises or returns nothing.
    """
    print("Script starting...")

    BASE_DIR = r"enter folder path here"
    CSV_FILE = os.path.join(BASE_DIR, "enter file path here")

    # Report what is available on disk
    available_laws = get_available_laws_from_files(BASE_DIR)
    print(f"Found {len(available_laws)} laws with available files:")
    for law in available_laws:
        print(f"  - {law}: {available_laws[law]}")
    print()

    # Flatten the law -> mechanisms mapping into the list of pairs to process
    law_mechanisms = create_law_mechanism_dict(CSV_FILE, available_laws)
    pairs = [
        (law_name, mechanism)
        for law_name, mechanisms in law_mechanisms.items()
        for mechanism in mechanisms
    ]
    total_combinations = len(pairs)
    print(f"\nProcessing {total_combinations} law-mechanism combinations")

    successful = 0
    failed = 0

    for law_name, mechanism in pairs:
        try:
            saved_path = combine_single_law_mechanism(BASE_DIR, law_name, mechanism)
        except Exception as e:
            print(f"✗ Error processing {law_name} - {mechanism}: {str(e)}")
            saved_path = None

        if saved_path:
            successful += 1
        else:
            failed += 1

    divider = '=' * 60
    print(f"\n{divider}")
    print("FINAL SUMMARY:")
    print(f"Total combinations: {total_combinations}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"{divider}")


if __name__ == "__main__":
    main()

# 15. Ask Claude to create a reference list
import os
import re
import time
import anthropic
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT

def create_reference_pdf(references, output_path):
    """
    Write a "References" page to a PDF with one hanging-indent paragraph per entry.

    Args:
        references (list or str): Individual reference strings, or one block of
            text that is first split with clean_references
        output_path (str): Path where the PDF will be saved
    """
    one_inch = 72
    doc = SimpleDocTemplate(
        output_path,
        pagesize=letter,
        rightMargin=one_inch,
        leftMargin=one_inch,
        topMargin=one_inch,
        bottomMargin=one_inch
    )

    base_style = getSampleStyleSheet()['Normal']

    # Entry style: a negative first-line indent equal to the left indent
    # produces the hanging indent.
    hanging = 36
    ref_style = ParagraphStyle(
        'Reference',
        parent=base_style,
        fontName='Times-Roman',
        fontSize=12,
        leading=14,
        leftIndent=hanging,
        firstLineIndent=-hanging,
        alignment=TA_LEFT,
        spaceAfter=12
    )

    # Bold heading style
    header_style = ParagraphStyle(
        'Header',
        parent=base_style,
        fontName='Times-Bold',
        fontSize=12,
        spaceBefore=0,
        spaceAfter=20,
        alignment=TA_LEFT
    )

    # A raw string is split into entries; a list is used as given
    entries = clean_references(references) if isinstance(references, str) else references

    # Heading first, then every non-blank entry after per-reference cleaning
    story = [Paragraph("References", header_style)]
    for entry in entries:
        if not entry.strip():
            continue
        story.append(Paragraph(clean_reference(entry), ref_style))

    doc.build(story)

def clean_reference(ref):
    """
    Normalise the punctuation and spacing of one reference string.

    Args:
        ref (str): Reference string to clean

    Returns:
        str: The cleaned reference, always ending in exactly one period
    """
    # Leftovers from stringified API response objects go first
    for residue_pattern in (
        r'TextBlock\s*\([^)]*\)',
        r'citations=None,?\s*text=',
        r'type=\'text\',?\s*',
    ):
        ref = re.sub(residue_pattern, '', ref)

    # Collapse line breaks and runs of whitespace into single spaces
    ref = ' '.join(ref.split())

    # Applied strictly in this order; later rules rely on earlier ones
    ordered_rules = (
        (r'\'|\\\n|\\n', ''),                  # apostrophes and escaped newlines
        (r'\.\s*([A-Z])', r'. \1'),            # one space between a period and a capital
        (r'\s*&\s*', ' & '),                   # one space either side of an ampersand
        (r'\s+', ' '),                         # squeeze repeated whitespace
        (r'\s*\*([^*]+)\*', r' \1'),           # drop *emphasis* asterisks
        (r',\s*', ', '),                       # one space after each comma
        (r'\s*\(\s*', ' ('),                   # one space before "(", none after
        (r'\s*\)', ')'),                       # no space before ")"
        (r'\([^)]*citations[^)]*\)', ''),      # parenthesised "citations" artifacts
    )
    for pattern, replacement in ordered_rules:
        ref = re.sub(pattern, replacement, ref)

    # Exactly one trailing period
    return (ref.rstrip('.') + '.').strip()

def clean_references(text):
    """
    Split a block of reference text into individual, cleaned, de-duplicated entries.

    Args:
        text (str): Full text containing references

    Returns:
        list: Cleaned references in first-seen order, each longer than
            10 characters
    """
    # Strip leftovers from stringified API response objects
    for residue_pattern in (
        r'TextBlock\s*\([^)]*\)\s*',
        r'citations=None,?\s*text=',
        r'type=\'text\',?\s*',
    ):
        text = re.sub(residue_pattern, '', text)

    # Drop quote characters and turn escaped newlines into real ones
    text = re.sub(r'[\'\"]', '', text).replace('\\n', '\n')

    # Group lines into references: a line starting with a capital letter opens
    # a new reference once one is already in progress; any other line is
    # treated as a continuation.
    grouped = []
    pending = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        is_noise = (not line) or line.lower() == 'references' or 'textblock' in line.lower()
        if is_noise:
            continue

        if pending and re.match(r'^[A-Z]', line):
            grouped.append(' '.join(pending))
            pending = [line]
        else:
            pending.append(line)

    if pending:
        grouped.append(' '.join(pending))

    # Clean each candidate and discard fragments too short to be a reference
    cleaned_refs = []
    for candidate in grouped:
        cleaned = clean_reference(candidate)
        if cleaned and not cleaned.isspace() and len(cleaned) > 10:
            cleaned_refs.append(cleaned)

    # dict preserves insertion order, so this keeps the first occurrence of each
    return list(dict.fromkeys(cleaned_refs))

def get_formatted_references(prompt, max_retries=3):
    """
    Send a prompt to Claude and return the reply text, retrying when the call raises.

    Args:
        prompt (str): The prompt to send to Claude
        max_retries (int): Maximum number of attempts

    Returns:
        str: Text of the reply, or None when every attempt raised, the reply
            had no content, or the client could not be created
    """
    # Environment variable first, placeholder key as the fallback
    api_key = os.getenv('ANTHROPIC_API_KEY') or "enter API here"

    try:
        client = anthropic.Anthropic(api_key=api_key)

        attempt = 0
        while attempt < max_retries:
            try:
                message = client.messages.create(
                    model="claude-sonnet-4-20250514",
                    max_tokens=4000,
                    temperature=0.5,
                    system="You are a helpful research assistant with expertise in academic citations. Format references in proper APA style with full journal names, volumes, and page numbers.",
                    messages=[{"role": "user", "content": prompt}]
                )

                # A reply without content is final; it is not retried
                if not (message and hasattr(message, 'content')):
                    return None

                content = message.content
                if isinstance(content, list):
                    # Join the text of each block; blocks without .text are stringified
                    content = '\n'.join(
                        item.text if hasattr(item, 'text') else str(item)
                        for item in content
                    )
                elif hasattr(content, 'text'):
                    content = content.text
                return content

            except Exception as e:
                print(f"API Error on attempt {attempt + 1}: {e}")
                if attempt >= max_retries - 1:
                    return None
                time.sleep(5)  # pause before the next attempt

            attempt += 1

        return None

    except Exception as e:
        print(f"Client initialization error: {e}")
        return None

def batch_process_files(input_dir, output_dir, start_from=0, delay_seconds=2):
    """
    Build one APA reference PDF for every ``*_combined.txt`` file in a folder.

    Files are handled in sorted name order. A file is skipped when its index is
    below ``start_from`` or when its output PDF already exists. Per-file
    failures are printed and counted; they do not stop the batch.

    Args:
        input_dir (str): Folder containing the ``*_combined.txt`` input files
        output_dir (str): Folder receiving the ``*_references.pdf`` files
            (created if missing)
        start_from (int): Index of the first file to process (for resuming)
        delay_seconds (int): Pause after each processed file, except the last
            one in the list, to stay under API rate limits
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that indices (and therefore start_from) are stable between runs
    queue = sorted(name for name in os.listdir(input_dir) if name.endswith('_combined.txt'))
    total = len(queue)

    n_done = 0
    n_failed = 0
    n_skipped = 0

    print(f"Found {total} files to process")
    print(f"Starting from file index {start_from}")

    for position, filename in enumerate(queue):
        # Resume support: everything before start_from counts as skipped
        if position < start_from:
            n_skipped += 1
            continue

        try:
            source_path = os.path.join(input_dir, filename)
            target_name = filename.replace('_combined.txt', '_references.pdf')
            target_path = os.path.join(output_dir, target_name)

            # An existing PDF means this file was finished in an earlier run
            if os.path.exists(target_path):
                print(f"Skipping {filename} - output already exists")
                n_skipped += 1
                continue

            print(f"Processing {position+1}/{total}: {filename}")

            with open(source_path, 'r', encoding='utf-8') as handle:
                text = handle.read()

            # Prompt sent to Claude; the manuscript text is embedded verbatim
            prompt = f"""Based on the following text, generate a reference list in APA format. 
            Format each reference exactly like these examples:

            Leuz, C., & Verrecchia, R. E. (2000). The economic consequences of increased disclosure. Journal of Accounting Research, 91-124.

            Bourveau, T., She, G., & Zaldokas, A. (2020). Corporate disclosure as a tacit coordination mechanism: Evidence from cartel enforcement regulations. Journal of Accounting Research, 58(2), 295-332.

            Text for analysis:
            {text}

            Please format each reference following the exact style above, including:
            1. Remove any asterisks, TextBlock tags, or other formatting markers 
            2. Author names with initials
            3. Full title in sentence case
            4. Journal name in italics (use *journal name* for italics)
            5. Volume, issue, and page numbers where applicable
            6. Year in parentheses
            7. One reference per line 
            8. Subsequent references should be followed by a space after the previous reference
            9. Sort alphabetically by author's last name
            10. Provide only the references, no extra text or explanations"""

            formatted_refs = get_formatted_references(prompt)

            if not formatted_refs:
                n_failed += 1
                print(f"✗ Error getting references for: {filename}")
            else:
                create_reference_pdf(formatted_refs, target_path)
                n_done += 1
                print(f"✓ Successfully processed: {filename}")

            # Throttle between API calls; no need to wait after the final file
            is_last = position >= total - 1
            if delay_seconds > 0 and not is_last:
                time.sleep(delay_seconds)

        except Exception as e:
            n_failed += 1
            print(f"✗ Error processing {filename}: {str(e)}")

    # Final tally
    divider = '=' * 50
    print(f"\n{divider}")
    print("PROCESSING COMPLETE!")
    print(f"Total files found: {total}")
    print(f"Skipped: {n_skipped} files")
    print(f"Successfully processed: {n_done} files")
    print(f"Errors: {n_failed} files")
    print(f"{divider}")

if __name__ == "__main__":
    # Folder with the *_combined.txt inputs and folder that receives the reference PDFs
    input_directory = r"enter folder path here"
    output_directory = r"enter folder path here"

    # Run the whole batch from the first file, pausing 2 seconds between API calls
    batch_process_files(
        input_dir=input_directory,
        output_dir=output_directory,
        start_from=0,
        delay_seconds=2,
    )
    
# 16. Combine manuscript files with pdf table files for descriptive statistics and regression analysis

import os
import re
from PyPDF2 import PdfMerger
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, KeepTogether, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT, TA_CENTER
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

def clean_mechanism_name(mechanism):
    """Return the mechanism label as an underscore-separated token.

    Surrounding whitespace is trimmed, each run of whitespace and/or hyphens
    collapses to a single underscore, and trailing underscores are dropped.
    """
    return re.sub(r'[\s\-]+', '_', mechanism.strip()).rstrip('_')

def format_title_name(name):
    """Turn an underscore-joined law name into a display title.

    A handful of names have words fused together (e.g. ``Votingby``); those are
    mapped to hand-written titles. Every other name simply has its underscores
    replaced by spaces.
    """
    title_fixes = {
        'Interactive_Datafor_Financial_Reporting': 'Interactive Data for Financial Reporting',
        'Internet_Availabilityof_Proxy_Materials': 'Internet Availability of Proxy Materials',
        'Political_Contributionsby_Investment_Advisers': 'Political Contributions by Investment Advisers',
        'Proxy_Votingby_Investment_Advisers': 'Proxy Voting by Investment Advisers',
        'Regulation_ABAsset_Backed_Securities': 'Regulation AB Asset Backed Securities',
        'Regulation_RBank_Securities_Activities': 'Regulation R Bank Securities Activities',
        'Regulation_SBSRSecurity_Based_Swap_Reporting': 'Regulation SBSR Security Based Swap Reporting',
        'Regulation_SFPSSecurities_Financing_Transaction_Reporting': 'Regulation SFPS Securities Financing Transaction Reporting',
        'Regulation_BTRBlackout_Trading_Restriction': 'Regulation BTR Blackout Trading Restriction'
    }

    try:
        # Hand-written title takes priority when one exists
        return title_fixes[name]
    except KeyError:
        # Generic case: underscores become spaces
        return name.replace('_', ' ')

def get_actual_law_mechanism_pairs(base_dir):
    """Get actual law-mechanism pairs from existing combined files.

    Scans ``<base_dir>/combined_sections`` for files named
    ``<law>_<mechanism>_combined.txt`` and splits each name into its law and
    mechanism parts.

    The mechanism is matched as the *suffix* of the name (after removing
    ``_combined.txt``), so that ``f"{law}_{mechanism}_combined.txt"`` always
    reproduces the original filename, even when the law name itself contains a
    mechanism phrase (e.g. a "Corporate Governance Code" law).

    Args:
        base_dir: Project folder that contains ``combined_sections``.

    Returns:
        List of ``(law_name, mechanism)`` tuples in sorted filename order;
        an empty list when the ``combined_sections`` folder is missing.
        Files that cannot be parsed are reported with a warning and omitted.
    """
    combined_dir = os.path.join(base_dir, 'combined_sections')
    if not os.path.exists(combined_dir):
        print(f"Combined sections directory not found: {combined_dir}")
        return []

    # Define known mechanisms to help with parsing
    known_mechanisms = [
        'Information_Asymmetry',
        'Corporate_Governance',
        'Unsophisticated_Investors',
        'Litigation_Risk',
        'Reputation_Risk',
        'Proprietary_Costs',
        'Equity_Issuance'
    ]

    file_suffix = '_combined.txt'
    pairs = []
    # Sorted so the processing order does not depend on the filesystem
    for filename in sorted(os.listdir(combined_dir)):
        if not filename.endswith(file_suffix):
            continue

        # Strip only the trailing suffix, never an occurrence inside the name
        base_name = filename[:-len(file_suffix)]

        mechanism_found = None
        law_name = None
        for mechanism in known_mechanisms:
            tail = f'_{mechanism}'
            if base_name.endswith(tail):
                mechanism_found = mechanism
                # The law name is everything before the trailing mechanism
                law_name = base_name[:-len(tail)]
                break

        if mechanism_found and law_name:
            pairs.append((law_name, mechanism_found))
        else:
            print(f"Warning: Could not parse filename: {filename}")

    return pairs

def get_descriptive_stats(desc_dir: str, law_name: str, mechanism: str) -> str:
    """Locate the descriptive-statistics PDF for one law/mechanism panel.

    Looks for ``descriptive_stats_table.pdf`` inside the
    ``panel_<law_name>_<mechanism>`` subfolder of ``desc_dir``.

    Returns:
        The path to the PDF when it exists, otherwise an empty string.
    """
    panel_dir = os.path.join(desc_dir, f"panel_{law_name}_{mechanism}")
    desc_file = os.path.join(panel_dir, 'descriptive_stats_table.pdf')

    # Guard clause: report the miss and hand back the "not found" marker
    if not os.path.exists(desc_file):
        print(f"Warning: No descriptive statistics table found in {panel_dir}")
        return ""

    print(f"Found descriptive statistics table: {desc_file}")
    return desc_file

def get_regression_analyses(reg_dir: str, law_name: str, mechanism: str) -> str:
    """Locate the regression-table PDF for one law/mechanism panel.

    Looks for ``regression_table.pdf`` inside the
    ``panel_<law_name>_<mechanism>`` subfolder of ``reg_dir``.

    Returns:
        The path to the PDF when it exists, otherwise an empty string.
    """
    panel_dir = os.path.join(reg_dir, f"panel_{law_name}_{mechanism}")
    reg_file = os.path.join(panel_dir, 'regression_table.pdf')

    # Guard clause: report the miss and hand back the "not found" marker
    if not os.path.exists(reg_file):
        print(f"Warning: No regression table found in {panel_dir}")
        return ""

    print(f"Found regression table: {reg_file}")
    return reg_file

# Main section headers recognised in the manuscript text (matched upper-cased,
# with any '=' decoration removed).
_MAIN_HEADERS = (
    'INTRODUCTION', 'BACKGROUND AND HYPOTHESIS DEVELOPMENT',
    'RESEARCH DESIGN', 'DESCRIPTIVE STATISTICS', 'RESULTS', 'CONCLUSION',
)

# Subsection headers recognised in the manuscript text (matched
# case-insensitively against the whole line).
_SUBHEADERS = frozenset(title.lower() for title in (
    'Background', 'Theoretical Framework', 'Hypothesis Development',
    'Model Explanation', 'Mathematical Model', 'Regression Analysis',
    'Variable Definitions', 'Empirical Model', 'Sample Selection',
    'Sample Construction',
    'Sample Description and Descriptive Statistics',
    'Sample Selection and Regulatory Context',
    'Sample Selection and Data Sources',
    'Sample Construction and Regulatory Setting', 'Model Specification',
    'Sample Selection and Regulatory Framework',
    'Model Development and Theoretical Framework',
    'Mathematical Specification', 'Sample Construction and Data Sources',
    'Variable Definitions and Measurement',
    'Model Development and Theoretical Foundation', 'Model Development',
    'Regression Specification',
))


def _register_times_fonts():
    """Register the Times New Roman TTF faces and return (regular, bold) font names."""
    try:
        pdfmetrics.registerFont(TTFont('Times New Roman', 'times.ttf'))
        pdfmetrics.registerFont(TTFont('Times New Roman Bold', 'timesbd.ttf'))
    except Exception:
        # Deliberately broad: whatever the reason the TTF files cannot be
        # loaded, the document should still build. Fall back to the built-in
        # Times faces so the styles never name an unregistered font.
        print("Warning: Times New Roman font not found, using default font")
        return 'Times-Roman', 'Times-Bold'
    return 'Times New Roman', 'Times New Roman Bold'


def _is_separator_line(text):
    """Return True for a non-empty line made up only of '=' characters."""
    return bool(text) and all(c == '=' for c in text)


def _previous_content_line(lines, index):
    """Return the nearest earlier line (stripped) that is neither blank nor a separator."""
    for j in range(index - 1, -1, -1):
        candidate = lines[j].strip()
        if candidate and not _is_separator_line(candidate):
            return candidate
    return None


def _build_manuscript_story(manuscript_text, title, regular_font, bold_font):
    """Convert raw manuscript text into the list of flowables for the manuscript PDF."""
    custom_title = ParagraphStyle(
        name='CustomTitle',
        fontName=bold_font,
        fontSize=16,
        spaceAfter=16,
        spaceBefore=24,
        firstLineIndent=0,
        alignment=TA_CENTER,
        leading=24
    )
    subtitle_style = ParagraphStyle(
        name='CustomSubtitle',
        fontName=regular_font,
        fontSize=12,
        spaceAfter=24,
        spaceBefore=12,
        firstLineIndent=0,
        alignment=TA_CENTER,
        leading=24
    )
    regular_style = ParagraphStyle(
        name='CustomRegular',
        fontName=regular_font,
        fontSize=12,
        spaceAfter=12,
        firstLineIndent=36,
        leading=24,
        alignment=TA_JUSTIFY
    )
    heading_style = ParagraphStyle(
        name='CustomHeading',
        fontName=regular_font,
        fontSize=12,
        spaceAfter=6,
        spaceBefore=18,
        firstLineIndent=0,
        alignment=TA_LEFT,
        leading=24
    )
    subheading_style = ParagraphStyle(
        name='CustomSubheading',
        fontName=regular_font,
        fontSize=12,
        spaceAfter=6,
        spaceBefore=12,
        firstLineIndent=0,
        alignment=TA_LEFT,
        leading=24
    )

    # Title block
    story = [
        Paragraph(title, custom_title),
        Paragraph("Artemis Intelligencia", subtitle_style),
        Paragraph("September 10, 2025", subtitle_style),
        Spacer(1, 24),
    ]

    lines = manuscript_text.split('\n')
    pending = []  # lines of the body paragraph currently being accumulated

    def flush_paragraph():
        # NOTE(review): the text is handed to Paragraph unescaped. Paragraph
        # treats its text as markup, so a raw '<' or '&' in a manuscript may
        # need escaping - confirm whether inputs can contain them.
        if pending:
            story.append(Paragraph(' '.join(pending), regular_style))
            pending.clear()

    for i, line in enumerate(lines):
        text = line.strip()

        # Separator lines ("=====") carry no content
        if _is_separator_line(text):
            continue

        # Drop a "Research Design" line that merely repeats the
        # "RESEARCH DESIGN" main header immediately above it
        if text == "Research Design" and _previous_content_line(lines, i) == "RESEARCH DESIGN":
            continue

        # A blank line ends the current paragraph
        if not text:
            flush_paragraph()
            continue

        # Skip the manuscript's own title near the top; ours was added above
        if i < 5 and ('Voluntary Disclosure' in text or 'Analysis of' in text) and len(text) < 150:
            continue

        # Main header: exact match once upper-cased and stripped of '='
        normalized = text.upper().replace('=', '').strip()
        if normalized in _MAIN_HEADERS:
            flush_paragraph()
            story.append(Paragraph(normalized, heading_style))
            continue

        # Subheader: short line that exactly matches a known subsection title
        if len(text) < 50 and text.lower() in _SUBHEADERS:
            flush_paragraph()
            story.append(Paragraph(text, subheading_style))
            continue

        # Anything else belongs to the current body paragraph
        pending.append(text)

    # Don't forget the last paragraph if there is one
    flush_paragraph()
    return story


def _remove_quietly(path):
    """Delete a file if it exists; warn instead of raising when deletion fails."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"Warning: could not remove temporary file {path}: {e}")


def merge_pdf_files(base_dir: str, law_name: str, mechanism: str):
    """Merge manuscript PDF with regression results, descriptive statistics, and reference PDFs.

    The manuscript text ``combined_sections/<law>_<mechanism>_combined.txt`` is
    typeset into a temporary PDF, then concatenated (in this order) with the
    references PDF, the descriptive-statistics table, the correlation table and
    the regression table, each only if present. The result is written to
    ``final_manuscripts/<Title> and Voluntary Disclosure_<mechanism>_final.pdf``
    and the temporary PDF is removed whether or not merging succeeded.

    Args:
        base_dir: Project folder containing the ``combined_sections``,
            ``references``, ``descriptive_stats``, ``correlations`` and
            ``regression_analyses`` subfolders.
        law_name: Underscore-joined law name used in the file names.
        mechanism: Underscore-joined mechanism name used in the file names.

    Returns:
        True when the final PDF was written; False when the manuscript file is
        missing or the merge step failed (the error is printed).

    Raises:
        Errors from reading the manuscript or building the temporary PDF are
        not caught here and propagate to the caller.
    """
    regular_font, bold_font = _register_times_fonts()

    # Define file paths
    combined_sections_dir = os.path.join(base_dir, 'combined_sections')
    reg_dir = os.path.join(base_dir, 'regression_analyses')
    desc_dir = os.path.join(base_dir, 'descriptive_stats')
    corr_dir = os.path.join(base_dir, 'correlations')
    ref_dir = os.path.join(base_dir, 'references')
    output_dir = os.path.join(base_dir, 'final_manuscripts')
    os.makedirs(output_dir, exist_ok=True)

    # Check if manuscript exists (mechanism already has underscores)
    manuscript_file = os.path.join(combined_sections_dir, f"{law_name}_{mechanism}_combined.txt")
    if not os.path.exists(manuscript_file):
        print(f"Manuscript file not found: {manuscript_file}")
        return False

    with open(manuscript_file, 'r', encoding='utf-8') as f:
        manuscript_text = f.read()

    # Display form of the law name, used for both the title and the file name
    formatted_law_name = format_title_name(law_name)
    story = _build_manuscript_story(
        manuscript_text,
        f"{formatted_law_name} and Voluntary Disclosure",
        regular_font,
        bold_font,
    )

    # Intermediate PDF holding the typeset manuscript
    temp_pdf_path = os.path.join(output_dir, f'temp_{law_name}_{mechanism}.pdf')
    temp_pdf = SimpleDocTemplate(
        temp_pdf_path,
        pagesize=letter,
        rightMargin=72,
        leftMargin=72,
        topMargin=72,
        bottomMargin=72
    )

    try:
        temp_pdf.build(story)

        merger = PdfMerger()
        try:
            # Add formatted manuscript
            merger.append(temp_pdf_path)

            # Add references (mechanism already has underscores)
            ref_file = os.path.join(ref_dir, f"{law_name}_{mechanism}_references.pdf")
            if os.path.exists(ref_file):
                merger.append(ref_file)
                print(f"Added references from: {ref_file}")
            else:
                print(f"No references file found at: {ref_file}")

            # Add descriptive statistics table
            desc_stats_path = get_descriptive_stats(desc_dir, law_name, mechanism)
            if desc_stats_path:
                merger.append(desc_stats_path)
            else:
                print(f"No descriptive statistics table found for {law_name}_{mechanism}")

            # Add correlations table (mechanism already has underscores)
            corr_file = os.path.join(corr_dir, f"{law_name}_{mechanism}_correlation_table.pdf")
            if os.path.exists(corr_file):
                merger.append(corr_file)
                print(f"Added correlation table from: {corr_file}")
            else:
                print(f"No correlation file found at: {corr_file}")

            # Add regression table
            reg_table_path = get_regression_analyses(reg_dir, law_name, mechanism)
            if reg_table_path:
                merger.append(reg_table_path)
            else:
                print(f"No regression table found for {law_name}_{mechanism}")

            # The mechanism keeps its underscores in the filename
            output_file = os.path.join(
                output_dir,
                f"{formatted_law_name} and Voluntary Disclosure_{mechanism}_final.pdf")
            merger.write(output_file)
        except Exception as e:
            print(f"Error creating PDF: {str(e)}")
            return False
        finally:
            # Close before the temp file is deleted below
            merger.close()
    finally:
        # Clean up the temporary file on success and on failure alike
        _remove_quietly(temp_pdf_path)

    print(f"✓ Successfully created formatted PDF for {law_name} - {mechanism}")
    print(f"Saved to: {output_file}")
    return True

# Batch processing of multiple laws and mechanisms
def batch_merge_pdfs(base_dir):
    """Process all available law-mechanism combinations.

    Discovers the law/mechanism pairs from the files in
    ``<base_dir>/combined_sections`` and builds one merged PDF per pair via
    ``merge_pdf_files``. Failures for individual pairs are printed and counted
    without stopping the batch; a summary is printed at the end.

    Args:
        base_dir: Project folder containing ``combined_sections`` and the
            table/reference subfolders.

    Returns:
        None. Returns early (after printing a message) when ``base_dir`` is
        not an existing directory or when no pairs are found.
    """
    print(f"Current working directory: {os.getcwd()}")
    print(f"Script BASE_DIR: {base_dir}")

    # Bail out with a message instead of crashing on a missing or placeholder path
    if not os.path.isdir(base_dir):
        print(f"Base directory not found: {base_dir}")
        return

    print(f"Contents of BASE_DIR: {os.listdir(base_dir)}")

    # Get actual pairs from existing files
    laws_mechanisms = get_actual_law_mechanism_pairs(base_dir)

    if not laws_mechanisms:
        print("No law-mechanism pairs found in combined_sections directory")
        return

    print(f"Found {len(laws_mechanisms)} law-mechanism combinations to process:")
    for law, mechanism in laws_mechanisms[:5]:  # Show first 5 as example
        print(f"  - {law} - {mechanism}")
    if len(laws_mechanisms) > 5:
        print(f"  ... and {len(laws_mechanisms) - 5} more")
    print()

    successful = 0
    failed = 0

    for law, mechanism in laws_mechanisms:
        try:
            if merge_pdf_files(base_dir, law, mechanism):
                successful += 1
            else:
                failed += 1
        except Exception as e:
            # One bad pair must not abort the remaining ones
            print(f"✗ Error processing {law} - {mechanism}: {str(e)}")
            failed += 1

    print(f"\n{'='*50}")
    print("PDF MERGING COMPLETE!")
    print(f"Total combinations: {len(laws_mechanisms)}")
    print(f"Successful: {successful}")
    print(f"Failed: {failed}")
    print(f"{'='*50}")
    
    
# Usage
if __name__ == "__main__":
    # Project folder that holds combined_sections and the table/reference subfolders
    BASE_DIR = r"enter folder path here"

    batch_merge_pdfs(base_dir=BASE_DIR)
    
# 17. Add page numbers 
import os
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import io

def add_page_numbers(input_path, output_path):
    """Copy a PDF to ``output_path`` with a page number stamped on each page.

    The first page is left unnumbered. Every later page is stamped with its
    zero-based index (so the second page shows "1"), drawn in 12 pt
    Times-Roman near the bottom centre of the page.
    """
    source = PdfReader(input_path)
    result = PdfWriter()

    for index, page in enumerate(source.pages):
        page_width = page.mediabox.width
        page_height = page.mediabox.height

        # Draw the number on an in-memory, single-page overlay of the same size
        overlay_buffer = io.BytesIO()
        overlay = canvas.Canvas(overlay_buffer, pagesize=(page_width, page_height))

        if index > 0:
            overlay.setFont('Times-Roman', 12)
            overlay.drawString(page_width/2 - 6, 40, str(index))

        overlay.save()
        overlay_buffer.seek(0)
        stamp_reader = PdfReader(overlay_buffer)

        # Stamp the overlay onto the original page, then keep the page
        if len(stamp_reader.pages) > 0:
            page.merge_page(stamp_reader.pages[0])
        result.add_page(page)

    with open(output_path, "wb") as output_file:
        result.write(output_file)

def batch_process_pdfs(input_dir, output_dir):
    """Write a page-numbered copy of every ``.pdf`` in ``input_dir`` to ``output_dir``.

    Output files are named ``numbered_<original name>``. When the original
    name (without ``.pdf``) exceeds 100 characters it is shortened to its
    first 50 and last 45 characters joined by ``...``. Failures on individual
    files are printed and do not stop the batch.
    """
    os.makedirs(output_dir, exist_ok=True)
    pdf_names = [entry for entry in os.listdir(input_dir) if entry.endswith('.pdf')]
    total = len(pdf_names)

    for position, filename in enumerate(pdf_names, 1):
        print(f"\nProcessing {position}/{total}: {filename}")

        # Keep very long names within a manageable length
        stem = filename[:-4]  # Remove .pdf
        if len(stem) > 100:
            shortened = stem[:50] + "..." + stem[-45:]
            output_filename = f"numbered_{shortened}.pdf"
        else:
            output_filename = f"numbered_{filename}"

        source_path = os.path.join(input_dir, filename)
        target_path = os.path.join(output_dir, output_filename)

        try:
            add_page_numbers(source_path, target_path)
            print(f"Successfully processed: {filename}")
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")
# Usage
if __name__ == "__main__":
    # Guarded like the other sections' entry points so that importing this
    # file does not start the batch job.
    # Folder with the PDFs to number, and folder that receives the numbered copies
    input_dir = r"enter folder path here"
    output_dir = r"enter folder path here"
    batch_process_pdfs(input_dir, output_dir)