# In [None]:
# Code written with assistance from Claude 3.5 Sonnet.
# Claude was used to help improve the documentation and the error handling.
# Note: the code runs best when each section is executed separately.
# Required inputs: CSV files with securities law data, and panel data with "GVKEY" and "FYEAR" as identifiers.

# 1. Ask Claude to identify federal securities laws
import os
import anthropic
import pandas as pd
import re
from typing import List, Dict

def get_securities_laws(conversation_history=None):
    """Send one request to the Anthropic Messages API and return the reply.

    When ``conversation_history`` is empty or ``None`` the conversation is
    opened with the built-in data-collection prompt; otherwise the given
    history is sent as-is.

    Returns:
        A ``(reply_text, history)`` tuple. On success ``history`` is the sent
        messages plus the assistant reply. On any API error the error is
        printed and ``(None, messages)`` is returned, where ``messages`` is
        what was sent.
    """
    # Client for the Anthropic API (key is a placeholder to be filled in)
    client = anthropic.Anthropic(api_key="enter API here")

    # Opening prompt used when no conversation exists yet
    opening_prompt = """Your task is to identify and compile a comprehensive database of at least 100 federal securities 
    laws. Securities regulation is the field of U.S. law that covers transactions and other dealings with securities. 
    Securities laws aim at ensuring that investors receive accurate and necessary information regarding the type and value
    of the interest under consideration for purchase.

The goal is to create a dataset that captures the following key details for each law. 

Please follow these guidelines:

Data Fields to Collect:
• Date: The announcement or implementation date of the law (use YYYY-MM-DD format).
• Regulation Title or Name: The official name or designation of the regulatory change.
• Regulatory Body/Authority: The government entity responsible for the law.
• Description: A brief overview of the law, including key provisions and the rationale behind it.
• Impact: The potential or observed effects on industries, markets, or stakeholders.
•Litigation Risk: Is this law related to the risk of litigation against managers? By risk of litigation we mean the probability that a manager will be sued or face legal action because of this law. Answer this question with Yes or No. If yes, label the entry "Litigation Risk".
•Corporate Governance: Is this law related to corporate governance of firms? Corporate governance refers to the internal monitoring system charged with overseeing managers and commonly focuses on matters such as board independence or insider trading policy. Answer this question with Yes or No.If yes, label the entry "Corporate Governance".
•Proprietary Costs: Is this law related to proprietary costs of firms? By proprietary costs, we mean costs that result from the disclosure of information to competitors which could harm a firm’s competitive position. Answer this question with Yes or No.If yes, label the entry "Proprietary Costs".
•Information Asymmetry: Is this law related to information asymmetry between owners and managers? By information asymmetry we mean that one party has more or better information than the other party. Answer this question with Yes or No. If yes, label the entry "Information Asymmetry".
•Unsophisticated Investors: Is the law related to protecting unsophisticated investors? By unsophisticated investors, we mean investors that are either new to investing or are not well informed. Answer this question with Yes or No. If yes, label the entry "Unsophisticated Investors".
•Equity Issuance in Public vs. Private Markets: Is this law related to the costs and benefits of issuing equity in public versus private markets? Answer this question with Yes or No. If yes, label the entry "Equity Issuance in Public vs. Private Markets".
•Reputation Risk: Is this law related to the reputation of firm managers? By of firm manager, we mean the career prospects and prestige of an individual manager. Answer this question with Yes or No. If yes, label the entry "Reputation Risk".

• References: Links to official documents or credible news sources.

Requirements:
• Scope: Cover as many laws as possible that were announced or implemented in the last 25 years.
• Consistency: Ensure uniform formatting for all entries in the dataset.
• Dates must be in YYYY-MM-DD format (e.g., 2002-07-30).

Output:
Provide data in a tabular format with rows for each law and columns for the data fields listed above. 
Use credible, authoritative sources such as government websites, legal databases, academic journals, or credible news sources.
"""

    try:
        # Reuse the supplied history, or start a fresh one-message conversation
        messages = conversation_history or [
            {"role": "user", "content": opening_prompt}
        ]

        response = client.messages.create(
            max_tokens=8192,
            model="claude-3-5-sonnet-20241022",
            messages=messages
        )
        reply_text = response.content[0].text
        assistant_turn = {"role": "assistant", "content": reply_text}
        return reply_text, messages + [assistant_turn]
    except Exception as e:
        print(f"Error making API call: {e}")
        return None, messages

def add_follow_up_prompt(conversation_history, follow_up_prompt):
    """Return a new history with ``follow_up_prompt`` appended as a user turn.

    The list passed in is left untouched; concatenation builds a new list.
    """
    user_turn = {"role": "user", "content": follow_up_prompt}
    return conversation_history + [user_turn]

def standardize_date(date_str):
    """Attempt to standardize a date to YYYY-MM-DD format.

    Args:
        date_str: Date text (or anything ``pd.to_datetime`` accepts).

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``date_str`` unchanged when
        it cannot be parsed or formatted.
    """
    try:
        return pd.to_datetime(date_str).strftime('%Y-%m-%d')
    except Exception:
        # Best-effort conversion: keep the raw value on any parsing/formatting
        # failure. `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        return date_str

def parse_response_to_dataframe(response_text: str) -> pd.DataFrame:
    """Parse the response text into a pandas DataFrame.

    The text is read line by line. A line starting with ``<number>.`` opens a
    new entry (the rest of that line is ignored). Any other line of the form
    ``Key: value`` whose key is one of the recognised field names is stored in
    the current entry. ``Date`` values go through ``standardize_date``.

    Fields that appear before the first numbered line are collected into an
    entry of their own instead of raising a ``TypeError``.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        A DataFrame with one row per entry and a fixed column order,
        de-duplicated on (Date, Regulation Title). An empty DataFrame with no
        columns is returned when no entry was found.
    """
    print("\nParsing response...")

    # Response field name -> output column name (built once, not per line)
    key_mapping = {
        'Date': 'Date',
        'Title': 'Regulation Title',
        'Authority': 'Regulatory Body',
        'Description': 'Description',
        'Impact': 'Impact',
        'Litigation Risk': 'Litigation Risk',
        'Corporate Governance': 'Corporate Governance',
        'Proprietary Costs': 'Proprietary Costs',
        'Information Asymmetry': 'Information Asymmetry',
        'Unsophisticated Investors': 'Unsophisticated Investors',
        'Equity Issuance': 'Equity Issuance',
        'Reputation Risk': 'Reputation Risk',
        'References': 'References'
    }
    required_columns = ['Date', 'Regulation Title', 'Regulatory Body', 'Description', 'Impact',
                        'Litigation Risk', 'Corporate Governance', 'Proprietary Costs',
                        'Information Asymmetry', 'Unsophisticated Investors', 'Equity Issuance',
                        'Reputation Risk', 'References']

    data = []

    def store_entry(entry, number):
        """Append a non-empty entry to ``data``, adding a fallback title if needed."""
        if not entry:
            return
        if 'Regulation Title' not in entry and number:
            entry['Regulation Title'] = f"Law {number}"
        data.append(entry)

    current_entry = None
    entry_number = None

    for raw_line in response_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # A numbered line (e.g. "1." or "101.") starts a new entry
        number_match = re.match(r'^(\d+)\.', line)
        if number_match:
            store_entry(current_entry, entry_number)
            current_entry = {}
            entry_number = number_match.group(1)
            continue

        # Everything else is only interesting as a "Key: value" pair
        if ':' not in line:
            continue
        key, value = (part.strip() for part in line.split(':', 1))
        column_name = key_mapping.get(key)
        if column_name is None:
            continue

        if current_entry is None:
            # Field seen before any numbered line: open an unnumbered entry
            # rather than failing on item assignment to None.
            current_entry = {}

        if column_name == 'Date':
            current_entry[column_name] = standardize_date(value)
        else:
            current_entry[column_name] = value

    # Add the last entry
    store_entry(current_entry, entry_number)

    print(f"\nFound {len(data)} entries")

    if not data:
        print("No valid data to create DataFrame")
        return pd.DataFrame()

    df = pd.DataFrame(data)
    for col in required_columns:
        if col not in df.columns:
            print(f"Adding missing column: {col}")
            df[col] = None

    # Clean up titles: fill gaps and collapse runs of whitespace
    df['Regulation Title'] = df['Regulation Title'].fillna('Unknown')
    df['Regulation Title'] = df['Regulation Title'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

    # De-duplicate on a composite (date, title) key
    df['dedup_key'] = df.apply(lambda row: f"{row['Date']}_{row['Regulation Title']}", axis=1)
    df = df.drop_duplicates(subset=['dedup_key'], keep='first')
    df = df.drop('dedup_key', axis=1)

    # Fixed column order
    df = df[required_columns]
    print(f"Created DataFrame with {len(df)} rows")
    return df.copy()
                

def compile_all_responses() -> pd.DataFrame:
    """Compile multiple API responses into a single DataFrame.

    Sends the initial request through ``get_securities_laws`` and then a fixed
    series of follow-up prompts in the same conversation. Every response is
    parsed once with ``parse_response_to_dataframe``; the running entry count
    fills the ``{last_num}`` placeholder of the next prompt.

    The parsed frames are concatenated, de-duplicated on date, lower-cased
    title and the first 50 characters of the lower-cased description, and
    sorted from newest to oldest. Rows whose date cannot be parsed are
    dropped during sorting.

    Returns:
        The combined DataFrame, or an empty DataFrame when nothing usable was
        collected.
    """
    # NOTE: each prompt is its own list element. Two commas were missing here
    # previously, which silently merged neighbouring prompts into one string.
    follow_up_prompts = [
        """Starting with number {last_num}, list 20 more federal securities laws using this exact format for each:
Date: YYYY-MM-DD
Title: [title]
Authority: [body]
Description: [brief]
Impact: [impact]
Litigation Risk: Yes/No
Corporate Governance: Yes/No
Proprietary Costs: Yes/No
Information Asymmetry: Yes/No
Unsophisticated Investors: Yes/No
Equity Issuance: Yes/No
Reputation Risk: Yes/No
References: [link]""",

        "Continue from number {last_num}. Provide 20 more laws using the exact same format.",

        "List 20 more laws starting at number {last_num}. Use the same format.",

        "Add 20 more laws beginning with number {last_num}. Keep the same format.",

        "Provide 20 more laws from number {last_num}. Same format.",

        "Continue with 20 more laws from {last_num}. Same format.",

        "Add 20 more laws starting at {last_num}. Same format.",

        "List 20 more laws from number {last_num}. Same format.",

        "Provide 20 more laws starting at {last_num}. Same format.",

        "Add final set of laws starting at {last_num}. Same format.",

        "List 20 more laws from number {last_num}. Same format.",

        "Provide 20 more laws starting at {last_num}. Same format.",

        "Add final set of laws starting at {last_num}. Same format.",

        "List 20 more laws from number {last_num}. Same format.",

        "Provide 20 more laws starting at {last_num}. Same format.",

        "Add final set of laws starting at {last_num}. Same format.",

        "Recall that you have to identify at least 100 federal securities laws. Recall securities regulation is the field of U.S. law that covers transactions and other dealings with securities. Securities laws aim at ensuring that investors receive accurate and necessary information regarding the type and value of the interest under consideration for purchase.",
    ]

    # One parsed frame per successful response (parsed once, reused below)
    parsed_frames = []

    # Get initial response
    initial_response, conversation_history = get_securities_laws()
    if initial_response:
        print("\nInitial response:")
        print(initial_response)
        initial_df = parse_response_to_dataframe(initial_response)
        parsed_frames.append(initial_df)
        last_num = len(initial_df) + 1

        for i, prompt_template in enumerate(follow_up_prompts, 1):
            prompt = prompt_template.format(last_num=last_num)
            conversation_history = add_follow_up_prompt(conversation_history, prompt)
            response, conversation_history = get_securities_laws(conversation_history)

            if response:
                print(f"\nFollow-up response {i}:")
                print(response)
                response_df = parse_response_to_dataframe(response)
                parsed_frames.append(response_df)
                last_num += len(response_df)

    dfs = [frame for frame in parsed_frames if not frame.empty]
    if not dfs:
        print("No valid data frames were created!")
        return pd.DataFrame()

    # Concatenate all DataFrames
    final_df = pd.concat(dfs, ignore_index=True)

    # Remove duplicates using multiple fields to better identify unique laws
    final_df['Title_clean'] = final_df['Regulation Title'].fillna('').str.lower().str.strip()
    final_df['Description_clean'] = final_df['Description'].fillna('').str.lower().str.strip()

    # Create composite key for deduplication
    final_df['dedup_key'] = final_df.apply(
        lambda row: f"{row['Date']}_{row['Title_clean']}_{row['Description_clean'][:50]}",
        axis=1
    )

    # Remove duplicates and cleanup
    final_df = final_df.drop_duplicates(subset=['dedup_key'], keep='first')
    final_df = final_df.drop(['Title_clean', 'Description_clean', 'dedup_key'], axis=1)

    # Sort by date (newest first); rows with unparseable dates are dropped
    try:
        final_df['DateSort'] = pd.to_datetime(final_df['Date'], errors='coerce')
        final_df = final_df.dropna(subset=['DateSort'])
        final_df = final_df.sort_values('DateSort', ascending=False)
        final_df = final_df.drop('DateSort', axis=1)
    except Exception as e:
        print(f"Warning: Could not sort by date due to: {e}")
        print("Problematic dates:")
        print(final_df['Date'].value_counts())

    # Return the final DataFrame
    return final_df

if __name__ == "__main__":
    # Collect every response and merge them into one table
    df = compile_all_responses()

    if not df.empty:
        # Short summary of what was collected
        print(f"\nTotal number of unique laws: {len(df)}")
        print("\nMost recent laws:")
        print(df.head().to_string())

        # Persist the table as CSV
        output_path = "enter file path here"
        df.to_csv(output_path, index=False)
        print(f"\nDatabase saved to: {output_path}")
    else:
        print("\nError: No data was collected!")

# 2. Add column for Year 

import pandas as pd

# Load the law table written in section 1
df = pd.read_csv("enter file path here")

# Derive the calendar year of each law from its date
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year

# Keep only laws dated 2002-2017 (inclusive). Years before 2002 are excluded
# because no forecast data is used for them; 2018 onwards is excluded because
# two post-law years are needed and forecast data ends in 2019.
# A range test is used instead of an explicit list of years so that laws dated
# before 1986 or after 2023 are excluded as well.
filtered_df = df[df['Year'].between(2002, 2017)]

# Drop rows that have no regulatory body recorded
filtered_df_with_titles = filtered_df.dropna(subset=["Regulatory Body"])

# index=False matches the other CSV exports in this file and avoids writing a
# spurious unnamed index column
filtered_df_with_titles.to_csv("enter file path here", index=False)

# 3. Create Panel Datasets for Each Law and Each Channel
import pandas as pd
import os

def create_channel_panels(laws_file: str, panel_file: str, output_dir: str) -> None:
    """Create one panel CSV per law and per channel marked "Yes" for that law.

    For each law, the full firm-year panel is copied, the law's fields are
    added as constant columns, and three indicators are appended:
    ``post_law`` (1 when ``FYEAR`` >= the law's ``Year``), ``treated``
    (always 1) and ``treatment_effect`` (their product). The same panel is
    written once per active channel as
    ``panel_<sanitised title>_<channel>.csv``.

    Args:
        laws_file: CSV of laws; must contain the channel columns, the law
            description columns and ``Year``.
        panel_file: CSV of panel data; must contain ``FYEAR``.
        output_dir: Directory for the output files (created if missing).

    Errors while processing a law are printed and that law is skipped.
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read the input files
    laws_df = pd.read_csv(laws_file)
    panel_df = pd.read_csv(panel_file)

    # Make sure the year columns are numeric so they can be compared
    laws_df['Year'] = pd.to_numeric(laws_df['Year'])
    panel_df['FYEAR'] = pd.to_numeric(panel_df['FYEAR'])

    # Channels to check for "Yes"
    channels = [
        'Litigation Risk',
        'Corporate Governance',
        'Proprietary Costs',
        'Information Asymmetry',
        'Unsophisticated Investors',
        'Equity Issuance',
        'Reputation Risk'
    ]

    # Columns copied from the laws dataset onto every panel row
    law_columns = [
        'Date', 'Regulation Title', 'Regulatory Body', 'Description',
        'Impact', 'Litigation Risk', 'Corporate Governance',
        'Proprietary Costs', 'Information Asymmetry',
        'Unsophisticated Investors', 'Equity Issuance',
        'Reputation Risk', 'References', 'Year'
    ]

    for _, law in laws_df.iterrows():
        try:
            # Channels flagged "yes" (case- and whitespace-insensitive)
            active_channels = [
                channel for channel in channels
                if str(law[channel]).strip().lower() == "yes"
            ]
            if not active_channels:
                continue

            # The panel content does not depend on the channel, so build it
            # once per law instead of once per (law, channel) pair.
            law_panel = panel_df.copy()
            for col in law_columns:
                law_panel[col] = law[col]

            # Treatment indicators
            law_panel['post_law'] = (law_panel['FYEAR'] >= law['Year']).astype(int)
            law_panel['treated'] = 1
            law_panel['treatment_effect'] = law_panel['post_law'] * law_panel['treated']

            # File-system-safe version of the title: slashes become
            # underscores, then anything but letters, digits, "_" and "-"
            # (including spaces) is removed.
            safe_title = law['Regulation Title'].replace('/', '_').replace('\\', '_')
            safe_title = ''.join(c for c in safe_title if c.isalnum() or c in ('_', '-'))

            for channel in active_channels:
                safe_channel = channel.replace(' ', '_')
                output_file = os.path.join(output_dir, f"panel_{safe_title}_{safe_channel}.csv")
                law_panel.to_csv(output_file, index=False)

                print(f"Created panel dataset for: {law['Regulation Title']} - {channel}")

        except Exception as e:
            # Best effort: report the problem and move on to the next law
            print(f"Error processing law {law['Regulation Title']}: {str(e)}")

if __name__ == "__main__":
    # Input and output locations (placeholders to be filled in)
    laws_file = "enter file path here"
    panel_file = "enter folder path here"
    output_dir = "enter folder path here"

    # Build one panel file per law and active channel
    create_channel_panels(
        laws_file=laws_file,
        panel_file=panel_file,
        output_dir=output_dir,
    )

    print("\nPanel creation complete!")

# 4. Run Regression analyses and save regression tables
import pandas as pd
import numpy as np
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
import os
import json
import glob
from fpdf import FPDF
import traceback


class RegressionAnalyzer:
    """Run event-window OLS regressions on a law panel and export the results.

    Workflow (see ``analyze_panel``): read a panel CSV, keep the firm-years
    within two years of the law's year, estimate two specifications of
    ``freqMF`` on ``treatment_effect`` (without and with controls), then save
    the filtered data, a JSON dump of the estimates and a PDF table.
    """

    # Specification labels, in the column order of the regression table
    SPEC_NAMES = ('(1)', '(2)')

    def __init__(self):
        """Initialize regression analyzer (no state is kept)."""
        pass

    def _get_significance_stars(self, pvalue: float) -> str:
        """Return "***", "**", "*" or "" for p < 0.01, < 0.05, < 0.1, otherwise."""
        if pvalue < 0.01:
            return "***"
        elif pvalue < 0.05:
            return "**"
        elif pvalue < 0.1:
            return "*"
        return ""

    def _format_coefficient_cell(self, results: dict, spec: str, var: str) -> str:
        """Return the table text for one coefficient: ``coef`` + stars + ``(|t|)``.

        An empty string is returned when the specification is missing from
        ``results`` (it failed to estimate) or does not include ``var``.
        """
        spec_results = results.get(spec)
        if spec_results is None or var not in spec_results['coefficients']:
            return ""
        coef = spec_results['coefficients'][var]
        tstat = abs(spec_results['t_stats'][var])
        stars = self._get_significance_stars(spec_results['pvalues'][var])
        return f"{coef:.4f}{stars} ({tstat:.2f})"

    def filter_event_window(self, df: pd.DataFrame) -> pd.DataFrame:
        """Filter data to ±2 years around the regulation year.

        The regulation year is read from the first row of the ``Year``
        column; rows are kept when ``FYEAR`` lies in [year - 2, year + 2].

        Raises:
            KeyError: If a required column is missing (logged, then re-raised).
            Exception: Any other failure is logged and re-raised.
        """
        try:
            regulation_year = int(df['Year'].iloc[0])
            return df[
                (df['FYEAR'] >= regulation_year - 2) &
                (df['FYEAR'] <= regulation_year + 2)
            ]
        except KeyError as e:
            print(f"Missing column in DataFrame: {e}")
            raise
        except Exception as e:
            print(f"Error during filtering: {e}")
            raise

    def run_regressions(self, df: pd.DataFrame) -> dict:
        """Run multiple regression specifications without fixed effects.

        Specification (1) regresses ``freqMF`` on ``treatment_effect`` only;
        specification (2) adds the control variables. Rows with missing or
        infinite values in the required columns are dropped first. Standard
        errors are heteroskedasticity-robust (HC0).

        Returns:
            Dict keyed by specification label with coefficients, p-values,
            t-statistics, R-squared and sample sizes. A specification that
            fails is logged and left out of the dict.

        Raises:
            ValueError: If no specification could be estimated.
        """
        results_dict = {}
        specifications = {
            '(1)': {
                'dep_var': 'freqMF',
                'controls': []
            },
            '(2)': {
                'dep_var': 'freqMF',
                'controls': ['linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss', 'lcalrisk']
            }
        }

        for spec_name, spec in specifications.items():
            print(f"\nRunning regression for specification {spec_name}")
            try:
                dep_var = spec['dep_var']
                controls = spec.get('controls', [])
                variables = controls + ['treatment_effect']

                # Check that every column the specification needs is present
                print("Getting required columns...")
                required_columns = variables + [dep_var, 'GVKEY']
                missing_cols = [col for col in required_columns if col not in df.columns]
                if missing_cols:
                    print(f"Missing columns: {missing_cols}")
                    raise ValueError(f"Missing required columns: {missing_cols}")

                # Clean data: treat +/-inf as missing, then drop incomplete rows
                reg_data = df[required_columns].copy()
                reg_data = reg_data.replace([np.inf, -np.inf], np.nan)
                reg_data = reg_data.dropna()

                n_firms = len(reg_data['GVKEY'].unique())
                print(f"Observations: {len(reg_data)}")
                print(f"Number of unique firms: {n_firms}")

                # Simple matrix construction without fixed effects
                X = add_constant(reg_data[variables])
                y = reg_data[dep_var]

                # Fit model with heteroskedasticity-robust standard errors
                model = OLS(y, X)
                results = model.fit(cov_type='HC0')

                # Store results
                results_dict[spec_name] = {
                    'coefficients': results.params.to_dict(),
                    'pvalues': results.pvalues.to_dict(),
                    't_stats': (results.params / results.bse).to_dict(),
                    'r_squared': results.rsquared,
                    'n_obs': int(results.nobs),
                    'n_firms': n_firms,
                    'controls': controls,
                    'fixed_effects': {
                        'firm': False,
                        'industry_year': False
                    }
                }
                print(f"Successfully completed regression for specification {spec_name}")

            except Exception as e:
                # One failing specification must not prevent the others
                print(f"Error in specification {spec_name}: {str(e)}")
                traceback.print_exc()
                continue

        if not results_dict:
            raise ValueError("No successful regressions completed")

        return results_dict

    def save_regression_table_as_pdf(self, results: dict, regulation_title: str, output_path: str):
        """Save the regression table as a PDF.

        The table has one column per specification in ``SPEC_NAMES``. A
        specification missing from ``results`` gets blank cells instead of
        aborting the export with a ``KeyError``.

        Any error while building or writing the PDF is printed with a
        traceback and not re-raised.
        """
        try:
            pdf = FPDF(format='A4', orientation='P')  # Portrait A4
            pdf.set_margins(30, 20, 30)
            pdf.add_page()
            pdf.set_font('Times', size=12)

            # Title font: bold Times New Roman if a font file can be found
            try:
                # First try the Windows standard folder for Times New Roman
                pdf.add_font('Times New Roman', '', r'C:\Windows\Fonts\times.ttf', uni=True)
                pdf.add_font('Times New Roman', 'B', r'C:\Windows\Fonts\timesbd.ttf', uni=True)
                pdf.set_font('Times New Roman', 'B', 12)
            except Exception:
                try:
                    # Then try font files relative to the working directory
                    pdf.add_font('Times New Roman', '', 'times.ttf', uni=True)
                    pdf.add_font('Times New Roman', 'B', 'timesbd.ttf', uni=True)
                    pdf.set_font('Times New Roman', 'B', 12)
                except Exception:
                    print("Times New Roman font not found, using Arial Bold")
                    pdf.set_font('Arial', 'B', 12)

            # Both table number and title in bold
            pdf.cell(0, 10, "Table 3", ln=True, align='C')
            pdf.cell(0, 10, f"The Impact of {regulation_title} on Management Forecast Frequency", ln=True, align='C')
            pdf.ln(5)

            # Switch back to regular font for table content
            try:
                pdf.set_font('Times New Roman', '', 12)
            except Exception:
                pdf.set_font('Arial', '', 12)

            # Column widths.
            # NOTE(review): the label column plus two data columns of this
            # width appear to be wider than the page width minus the two
            # 30-unit margins - confirm the intended layout.
            col_width = (pdf.w - 60) / 3
            first_col_width = 60

            # Table header
            pdf.cell(first_col_width, 8, "", 1)
            for spec in self.SPEC_NAMES:
                pdf.cell(col_width, 8, spec, 1, align='C')
            pdf.ln()

            # Treatment Effect
            pdf.cell(first_col_width, 8, "Treatment Effect", 1)
            for spec in self.SPEC_NAMES:
                text = self._format_coefficient_cell(results, spec, 'treatment_effect')
                pdf.cell(col_width, 8, text, 1, align='C')
            pdf.ln()

            # Control variables (blank where a specification omits them)
            control_labels = {
                'linstown': 'Institutional ownership',
                'lsize': 'Firm size',
                'lbtm': 'Book-to-market',
                'lroa': 'ROA',
                'lsaret12': 'Stock return',
                'levol': 'Earnings volatility',
                'lloss': 'Loss',
                'lcalrisk': 'Class action litigation risk'
            }

            for var, label in control_labels.items():
                pdf.cell(first_col_width, 8, label, 1)
                for spec in self.SPEC_NAMES:
                    text = self._format_coefficient_cell(results, spec, var)
                    pdf.cell(col_width, 8, text, 1, align='C')
                pdf.ln()

            # N and R²
            for stat in ['N', 'R²']:
                pdf.cell(first_col_width, 8, stat, 1)
                for spec in self.SPEC_NAMES:
                    if spec not in results:
                        text = ""
                    elif stat == 'N':
                        text = f"{results[spec]['n_obs']:,}"
                    else:
                        text = f"{results[spec]['r_squared']:.4f}"
                    pdf.cell(col_width, 8, text, 1, align='C')
                pdf.ln()

            # Notes
            pdf.ln(10)
            pdf.set_font('Times', size=10)
            notes = "Notes: t-statistics in parentheses. *, **, and *** represent significance at the 10%, 5%, and 1% level, respectively."
            pdf.multi_cell(0, 5, notes)

            pdf.output(output_path)
            print(f"PDF saved at {output_path}")

        except Exception as e:
            print(f"Error saving PDF: {e}")
            traceback.print_exc()

    def analyze_panel(self, panel_file: str, output_dir: str):
        """Analyze a single panel dataset and write its outputs to ``output_dir``.

        Writes ``filtered_data.csv``, ``regression_results.json`` and
        ``regression_table.pdf``.

        Raises:
            Exception: Any failure while reading, filtering, estimating or
                saving is logged with a traceback and then re-raised, so the
                caller can record the panel as failed.
        """
        try:
            print(f"\nStarting analysis of {os.path.basename(panel_file)}...")
            print("Reading data...")
            df = pd.read_csv(panel_file)

            print("Filtering event window...")
            df_filtered = self.filter_event_window(df)

            print("Running regressions...")
            results = self.run_regressions(df_filtered)

            print("Saving results...")
            os.makedirs(output_dir, exist_ok=True)

            # Save filtered data and results
            print("Saving filtered data...")
            df_filtered.to_csv(os.path.join(output_dir, 'filtered_data.csv'), index=False)
            with open(os.path.join(output_dir, 'regression_results.json'), 'w') as f:
                json.dump(results, f, indent=4)

            # Save regression table
            table_path = os.path.join(output_dir, 'regression_table.pdf')
            self.save_regression_table_as_pdf(
                results,
                df['Regulation Title'].iloc[0],
                table_path
            )
            print(f"Saved regression table to {table_path}")

        except Exception as e:
            print(f"Error analyzing panel: {str(e)}")
            traceback.print_exc()
            # Propagate so that callers do not count this panel as a success
            raise


def analyze_all_panels(input_dir: str, output_dir: str):
    """Run the regression analysis on every ``*.csv`` panel in ``input_dir``.

    Each panel gets its own sub-directory of ``output_dir``, named after the
    panel file with ``.csv`` removed. A progress summary is printed after
    every panel.
    """
    analyzer = RegressionAnalyzer()
    panel_files = glob.glob(os.path.join(input_dir, "*.csv"))
    total_files = len(panel_files)
    print(f"\nFound {total_files} panel files to analyze")

    successful_runs = 0
    failed_runs = 0
    separator = '=' * 80

    for i, panel_file in enumerate(panel_files, 1):
        file_name = os.path.basename(panel_file)
        print(f"\n{separator}")
        print(f"Processing panel {i} of {total_files}: {file_name}")
        print(f"{separator}")

        try:
            # Output folder carries the panel's file name without ".csv"
            panel_output_dir = os.path.join(output_dir, file_name.replace('.csv', ''))
            analyzer.analyze_panel(panel_file, panel_output_dir)
            successful_runs += 1
            print(f"Successfully processed panel {i}")
        except Exception as e:
            failed_runs += 1
            print(f"Failed to process panel {i}: {str(e)}")

        # Running totals after each panel
        percent_done = (i / total_files) * 100
        print("\nProgress Summary:")
        print(f"Processed: {i}/{total_files} ({percent_done:.1f}%)")
        print(f"Successful: {successful_runs}")
        print(f"Failed: {failed_runs}")


if __name__ == "__main__":
    # Folder holding the panel CSVs, and folder receiving the results
    INPUT_DIR = "enter folder path here"
    OUTPUT_DIR = "enter folder path here"

    # Analyze every panel found in INPUT_DIR
    analyze_all_panels(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR)

# 5. Check and keep significant results 
import os
import json
import glob
import shutil

def process_significance(base_dir, delete_nonsig=False):
    """Keep significant panel results and move or delete the rest.

    Every ``panel_*`` entry in ``base_dir`` is checked through its
    ``regression_results.json``: the result is significant when the absolute
    t-statistic of ``treatment_effect`` in specification (2) is >= 1.96.
    Panels that are not significant, or that have no specification (2), are
    removed from ``base_dir``.

    Args:
        base_dir: Directory containing the ``panel_*`` result directories.
        delete_nonsig: If True, delete non-significant panels; otherwise move
            them to a ``nonsignificant_results`` directory located next to
            ``base_dir``.

    Panels whose results cannot be read or processed are reported and left
    in place; they are not included in the summary counts.
    """
    # Create directory for non-significant results if not deleting
    nonsig_dir = None
    if not delete_nonsig:
        nonsig_dir = os.path.join(os.path.dirname(base_dir), 'nonsignificant_results')
        os.makedirs(nonsig_dir, exist_ok=True)

    # Find all panel directories
    panel_dirs = glob.glob(os.path.join(base_dir, 'panel_*'))
    print(f"Found {len(panel_dirs)} panel directories")

    action = 'deleting' if delete_nonsig else 'moving'

    def discard(panel_dir, panel_name):
        """Delete the panel directory or move it to the non-significant folder."""
        if delete_nonsig:
            shutil.rmtree(panel_dir)
        else:
            shutil.move(panel_dir, os.path.join(nonsig_dir, panel_name))

    # Track results
    significant_count = 0
    not_significant_count = 0

    # Process each panel
    for panel_dir in panel_dirs:
        panel_name = os.path.basename(panel_dir)
        json_file = os.path.join(panel_dir, 'regression_results.json')

        try:
            # Read regression results
            with open(json_file, 'r') as f:
                results = json.load(f)

            if '(2)' not in results:
                print(f"{panel_name}: No specification (2) found - {action}")
                discard(panel_dir, panel_name)
                not_significant_count += 1
                continue

            t_stat = abs(results['(2)']['t_stats']['treatment_effect'])
            if t_stat >= 1.96:
                print(f"{panel_name}: t-stat = {t_stat:.2f} (significant - keeping)")
                significant_count += 1
            else:
                print(f"{panel_name}: t-stat = {t_stat:.2f} (not significant - {action})")
                discard(panel_dir, panel_name)
                not_significant_count += 1

        except Exception as e:
            print(f"Error processing {panel_name}: {str(e)}")

    # Print summary
    total_processed = significant_count + not_significant_count

    def share(count):
        """Percentage of processed panels; 0.0 when nothing was processed."""
        return (count / total_processed) * 100 if total_processed else 0.0

    print("\nSummary:")
    print(f"Total panels processed: {total_processed}")
    print(f"Significant results: {significant_count} ({share(significant_count):.1f}%)")
    print(f"Not significant results: {not_significant_count} ({share(not_significant_count):.1f}%)")
    if not delete_nonsig:
        print(f"\nNon-significant results moved to: {nonsig_dir}")
    else:
        print("\nNon-significant results deleted")


# Folder that contains the per-panel result directories
base_dir = "enter folder path here"

# False: move non-significant results aside; True: delete them
delete_nonsig = False
process_significance(base_dir, delete_nonsig=delete_nonsig)

# 6. Ask Claude to write a background, theoretical framework, and hypothesis development section
import pandas as pd
import json
import os
from anthropic import Anthropic
import glob

class ComprehensiveAnalyzer:
    """Generate background/hypothesis write-ups for each law and mechanism.

    Reads a CSV of securities laws, asks Claude for one write-up per
    (law, active economic mechanism) pair and stores each write-up as a
    text file under ``<output_dir>/background and hypothesis development``.
    """

    # Prefixes of the sentinel strings get_background_hypothesis returns when
    # no usable text came back. Such strings must never be saved, otherwise the
    # "file already exists" check would skip the pair on every later run.
    _ERROR_PREFIXES = ("Error in analysis:", "Error: No content in response")

    # Characters that cannot appear in a file name on common platforms.
    _UNSAFE_FILENAME_CHARS = '<>:"/\\|?*'

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    def get_laws_analysis(self, csv_file: str) -> list:
        """Read the laws CSV and return one dict per row.

        Args:
            csv_file: Path to a CSV with the columns 'Regulation Title',
                'Year', 'Regulatory Body', 'Description', 'Impact' and one
                column per economic mechanism listed below.

        Returns:
            A list of dicts with keys 'title', 'year', 'body', 'description',
            'impact' and 'mechanisms' (the mechanism names flagged "Yes").

        Raises:
            KeyError: If a required column is missing from the CSV.
        """
        df = pd.read_csv(csv_file)
        laws = []

        # Define economic mechanisms
        mechanisms = [
            'Litigation Risk',
            'Corporate Governance',
            'Proprietary Costs',
            'Information Asymmetry',
            'Unsophisticated Investors',
            'Equity Issuance',
            'Reputation Risk'
        ]

        for _, row in df.iterrows():
            # A mechanism is active when its cell says "Yes"; surrounding
            # whitespace and letter case are ignored so " yes" still counts.
            active_mechanisms = [
                mech for mech in mechanisms
                if str(row[mech]).strip().casefold() == 'yes'
            ]

            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                'mechanisms': active_mechanisms
            })

        return laws

    def get_background_hypothesis(self, law: dict, mechanism: str) -> str:
        """Ask Claude for the background, framework and hypothesis sections.

        Args:
            law: One entry as produced by get_laws_analysis.
            mechanism: Name of the economic mechanism to focus on.

        Returns:
            The generated text, or a string starting with "Error" when the
            request failed or the response carried no content. API errors are
            reported through that string rather than raised.
        """
        prompt = f"""You are an accounting academic writing a research paper examining {law['title']} and its impact on 
        voluntary disclosure through the {mechanism} channel.
Please write the background, theoretical framework, and hypothesis development section following these guidelines:

Law Details:
Title: {law['title']} ({law['year']})
Regulatory Body: {law['body']}
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Please structure your response as follows:

1. Background (3 paragraphs, ~400 words total):
    - Label this subsection "Background"
    - Describe the relevant U.S. federal securities law in  {law['title']}
    -Include the date that the change is effective ({law['year']}), which firms are affected, and why the change was instituted.  
    -Discuss the effective date ({law['year']}) and implementation details
    -Please also discuss whether there were other contemporaneous securities law adoptions. 
    -Support each claim with citations to foundational papers 

2. Theoretical Framework
    - Begin with a brief introduction connecting the law to the relevant theoretical perspective {mechanism}
    - Explain core concepts of {mechanism}
    - Connect to voluntary disclosure decisions
    - Link to the specific {mechanism} being studied
    - Support with 2-3 seminal citations

3. Hypothesis Development (3 paragraphs, ~800 words total):
    - Label this subsection "Hypothesis Development"
    - Present economic mechanisms linking {law['title']} to voluntary disclosure decisions through the {mechanism} channel
    - Draw on established theoretical frameworks specifically related to {mechanism}
    - Propose a theoretically supported hypothesis about the relationship between the U.S. federal 
    securities law from file {law['title']} and voluntary disclosure for the specific {mechanism} channel
    - Build logical arguments step by step think through whether prior literature suggests competing theoretical 
    predictions or if the literature suggests only one direction for the relationship. 
    - Present the formal hypothesis statement on its own line, clearly labeled "H1:"
    - Support each claim with citations to foundational papers 

Writing Guidelines:
- Use active voice (e.g., "We examine" instead of "This paper examines")
- Maintain formal academic tone suitable for a top journal
- Include 2-3 citations per paragraph 
- Use present tense for established findings
- Make clear distinctions between correlation and causation
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Financial Studies"""

        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            print(f"Error getting background and hypothesis: {str(e)}")
            return f"Error in analysis: {str(e)}"

    @classmethod
    def _safe_filename_part(cls, text) -> str:
        """Return text with spaces and path-unsafe characters replaced by '_'."""
        table = str.maketrans({ch: '_' for ch in cls._UNSAFE_FILENAME_CHARS + ' '})
        return str(text).translate(table)

    def create_background_hypothesis_files(self, csv_file: str, output_dir: str):
        """Generate and save a write-up for every (law, mechanism) pair.

        Pairs whose output file already exists are skipped, so the method can
        be re-run to resume an interrupted batch. A failed request is reported
        and NOT written to disk, so the pair is retried on the next run.

        Args:
            csv_file: Path to the laws CSV (see get_laws_analysis).
            output_dir: Folder in which the
                'background and hypothesis development' directory is created.
        """
        # Create main directory
        main_dir = os.path.join(output_dir, 'background and hypothesis development')
        os.makedirs(main_dir, exist_ok=True)

        # Get laws
        laws = self.get_laws_analysis(csv_file)

        # Process each law and mechanism
        for law in laws:
            print(f"\nProcessing law: {law['title']}")

            # Generate separate background and hypothesis for each mechanism
            for mechanism in law['mechanisms']:
                print(f"Processing mechanism: {mechanism}")

                # File name carries both law and mechanism. Titles without
                # path-unsafe characters yield the same name as before.
                clean_title = self._safe_filename_part(law['title'])
                clean_mechanism = mechanism.replace(' ', '_')
                filename = f"{clean_title}_{clean_mechanism}_background_hypothesis.txt"
                file_path = os.path.join(main_dir, filename)

                # Check if file already exists
                if os.path.exists(file_path):
                    print(f"Skipping {law['title']} - {mechanism}: File already exists")
                    continue

                # Get background and hypothesis content
                content = self.get_background_hypothesis(law, mechanism)

                # Never persist an error sentinel: it would be mistaken for a
                # finished write-up and skipped on all later runs.
                if content.startswith(self._ERROR_PREFIXES):
                    print(f"Not saving {law['title']} - {mechanism}: {content}")
                    continue

                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f"Saved background and hypothesis for {law['title']} - {mechanism}")

def main():
    """Run section 6: write background/hypothesis files for every law.

    Fill in the three placeholder values below before running. Any exception
    raised while building the analyzer or generating the files is reported on
    stdout instead of propagating.
    """
    # Configuration
    api_key = "enter API here"
    csv_file = "enter file path here"
    output_dir = "enter folder path here"

    try:
        ComprehensiveAnalyzer(api_key).create_background_hypothesis_files(csv_file, output_dir)
    except Exception as e:
        print(f"Error in main execution: {str(e)}")
    else:
        print("\nBackground and hypothesis development sections complete!")

if __name__ == "__main__":
    main()

# 7. Send regression results to Claude for interpretation
import json
import os
import glob
from typing import Dict, List
from anthropic import Anthropic

class RegressionInterpreter:
    """Turn stored regression results into a written interpretation via Claude.

    For each panel CSV in ``input_dir`` the interpreter loads
    ``<output_dir>/<panel>/regression_results.json``, pairs it with the
    matching hypothesis text and saves Claude's interpretation next to the
    results as ``claude_interpretation.txt``.
    """

    def __init__(self, input_dir: str, output_dir: str, api_key: str):
        """Initialize interpreter with input and output directories"""
        self.input_dir = input_dir
        self.output_dir = output_dir
        self.client = Anthropic(api_key=api_key)

    def _get_significance_stars(self, pvalue: float) -> str:
        """Get significance stars based on p-value."""
        if pvalue < 0.01:
            return "***"
        elif pvalue < 0.05:
            return "**"
        elif pvalue < 0.1:
            return "*"
        return ""

    def _get_significance_level(self, pvalue: float) -> str:
        """Convert p-value to significance level description"""
        if pvalue < 0.01:
            return "at the 1% level"
        elif pvalue < 0.05:
            return "at the 5% level"
        elif pvalue < 0.1:
            return "at the 10% level"
        return "not statistically significant"

    def read_regression_results(self, regulation_name: str) -> Dict:
        """Read regression results JSON file for a specific regulation.

        Returns:
            The parsed JSON, or an empty dict when the file is missing or
            cannot be decoded (a message is printed in either case).
        """
        results_path = os.path.join(self.output_dir, regulation_name, 'regression_results.json')

        try:
            with open(results_path, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"No results file found for {regulation_name}")
            return {}
        except json.JSONDecodeError:
            print(f"Error reading results file for {regulation_name}")
            return {}

    def read_hypothesis(self, regulation_name: str) -> str:
        """
        Read hypothesis file with flexible matching across various filename formats.

        The hypothesis directory is the sibling folder
        'background and hypothesis development' next to ``self.output_dir``.

        Args:
            regulation_name (str): Name of the regulation to find a hypothesis for

        Returns:
            str: Hypothesis text or empty string if no file found
        """
        # Remove 'panel_' prefix if present
        if regulation_name.startswith('panel_'):
            regulation_name = regulation_name[6:]  # Remove 'panel_' prefix

        # Set up hypothesis directory
        hypothesis_dir = os.path.join(os.path.dirname(self.output_dir),
                                'background and hypothesis development')

        # Print debugging information
        print(f"Searching for hypothesis file for: {regulation_name}")
        print(f"Hypothesis directory: {hypothesis_dir}")

        # Prepare search terms
        search_terms = [
            # Original input
            regulation_name,
            # Remove underscores
            regulation_name.replace('_', ''),
            # Replace underscores with spaces
            regulation_name.replace('_', ' '),
        ]

        # Try to list files for debugging
        try:
            all_files = os.listdir(hypothesis_dir)
            print("Files in directory:")
            for file in all_files:
                print(file)
        except Exception as e:
            print(f"Error listing directory: {e}")
            return ""

        # Find matching files
        matching_files = [
            file for file in all_files
            if (any(term.lower() in file.lower() for term in search_terms) and
                file.lower().endswith('_background_hypothesis.txt'))
        ]

        # If no matches found, try more relaxed matching
        if not matching_files:
            matching_files = [
                file for file in all_files
                if any(
                    term.lower() in file.lower().replace('_', '').replace(' ', '')
                    for term in search_terms
                ) and file.lower().endswith('_background_hypothesis.txt')
            ]

        # Return the section from the first matching file that has one
        for filename in matching_files:
            hypothesis_file = os.path.join(hypothesis_dir, filename)

            print(f"Found matching file: {filename}")
            print(f"Full path: {hypothesis_file}")

            try:
                with open(hypothesis_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                # Extract hypothesis development section
                if "Hypothesis Development" in content:
                    # partition() cuts at the FIRST occurrence only, so later
                    # mentions of the phrase in the body do not truncate the
                    # extracted section.
                    hypothesis_section = content.partition("Hypothesis Development")[2]

                    # If there's a formal H1 statement, keep the development
                    # text and the complete statement (cut at the first "H1:").
                    if "H1:" in hypothesis_section:
                        development, _, h1_body = hypothesis_section.partition("H1:")
                        hypothesis_development = development.strip()
                        h1_statement = "H1:" + h1_body.strip()
                        return f"Hypothesis Development:\n\n{hypothesis_development}\n\n{h1_statement}"
                    else:
                        return f"Hypothesis Development:\n\n{hypothesis_section.strip()}"
                else:
                    print(f"No Hypothesis Development section found in {filename}")

            except Exception as e:
                print(f"Error reading hypothesis file {filename}: {str(e)}")

        print(f"No hypothesis file found for {regulation_name}")
        return ""

    def format_results_text(self, regulation_title: str, regulation_year: int, results: Dict) -> str:
        """Format regression results into text for the academic prompt.

        Args:
            regulation_title: Title shown in the heading.
            regulation_year: Year shown in the heading; callers in this class
                pass the string "N/A" when the year is unknown.
            results: Mapping of specification name to a dict with the keys
                'coefficients', 't_stats', 'pvalues', 'r_squared', 'n_obs',
                'n_firms', 'controls' and 'fixed_effects'.
        """
        results_text = f"Regression Analysis for {regulation_title} (Year: {regulation_year})\n\n"

        for spec_name, res in results.items():
            results_text += f"\nSpecification {spec_name}:\n"
            results_text += f"Treatment Effect: {res['coefficients']['treatment_effect']:.4f}\n"
            results_text += f"T-statistic: {res['t_stats']['treatment_effect']:.2f}\n"
            results_text += f"P-value: {res['pvalues']['treatment_effect']:.4f}\n"
            results_text += f"R-squared: {res['r_squared']:.4f}\n"
            results_text += f"Number of observations: {int(res['n_obs'])}\n"
            results_text += f"Number of firms: {res['n_firms']}\n"

            if res['controls']:
                results_text += "\nControl Variables:\n"
                for control in res['controls']:
                    coef = res['coefficients'][control]
                    tstat = res['t_stats'][control]
                    pvalue = res['pvalues'][control]
                    stars = self._get_significance_stars(pvalue)
                    results_text += f"{control}: {coef:.4f}{stars} (t={tstat:.2f}, p={pvalue:.4f})\n"

            results_text += "\nFixed Effects:\n"
            for fe, included in res['fixed_effects'].items():
                results_text += f"{fe}: {'Yes' if included else 'No'}\n"

            results_text += "-" * 50 + "\n"

        return results_text

    def generate_claude_interpretation(self, regulation_title: str, regulation_year: int, results_text: str, hypothesis_text: str) -> str:
        """Generate interpretation using Claude API.

        Returns:
            Claude's text, or a string starting with "Error in Claude analysis:"
            when the request fails (the exception is printed, not raised).
        """
        prompt = f"""You are an accounting academic with a PhD in accounting. 
        You should use active voice (e.g. "We find" instead of "It is found"). 
        Use present tense for all established findings. 
        Distinguish between correlation and causation. 
        Write the results description for this analysis as if you were writing an academic paper for an accounting journal, 
        you are studying the association between a change in mandatory disclosure and voluntary disclosure. 
        
        Here is the hypothesis that was developed:
        {hypothesis_text}
        
        Please provide a detailed academic analysis of these regression results:

{results_text}

Please structure your analysis as follows (3 paragraphs, ~600 words total):
1. Label this section Regression Analysis
2. Main finding (treatment effect interpretation)
3. Statistical significance and economic magnitude
4. Model specification comparison
5. Control variable effects
   Describe whether the relationship is consistent with prior literature
6. Explain whether the results support the hypothesis stated in the Hypothesis section above

Write in an academic style suitable for a top accounting journal."""

        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text
        except Exception as e:
            print(f"Error getting Claude interpretation: {str(e)}")
            return f"Error in Claude analysis: {str(e)}"

    def interpret_regulation_impact(self, regulation_name: str) -> str:
        """Generate and save the interpretation for one regulation's results.

        Args:
            regulation_name: Panel name (file stem of the panel CSV and name of
                the results sub-folder).

        Returns:
            The interpretation text, or a short notice when no regression
            results could be loaded (in which case Claude is not called and
            nothing is written).
        """
        results = self.read_regression_results(regulation_name)

        # Without numbers there is nothing to interpret; sending the prompt
        # anyway would invite an analysis of results that do not exist.
        if not results:
            return f"No regression results available for {regulation_name}; skipping interpretation"

        # Read the original panel file to get regulation title
        panel_file = os.path.join(self.input_dir, f"{regulation_name}.csv")
        try:
            import pandas as pd
            df = pd.read_csv(panel_file)
            regulation_title = df['Regulation Title'].iloc[0]
            regulation_year = df['Year'].iloc[0]
        except Exception:
            # Best effort: fall back to the panel name when the panel file is
            # missing, empty or lacks the expected columns.
            regulation_title = regulation_name
            regulation_year = "N/A"

        # Format results text
        results_text = self.format_results_text(regulation_title, regulation_year, results)

        # Read hypothesis text
        hypothesis_text = self.read_hypothesis(regulation_name)

        # Generate interpretation using Claude
        interpretation = self.generate_claude_interpretation(
            regulation_title,
            regulation_year,
            results_text,
            hypothesis_text
        )

        # Create subfolder if it doesn't exist
        regulation_dir = os.path.join(self.output_dir, regulation_name)
        os.makedirs(regulation_dir, exist_ok=True)

        # Save interpretation to file
        claude_path = os.path.join(regulation_dir, 'claude_interpretation.txt')
        try:
            with open(claude_path, 'w', encoding='utf-8') as f:
                f.write(interpretation)
            print(f"Saved interpretation to {claude_path}")
        except Exception as e:
            print(f"Error saving interpretation to file: {str(e)}")

        return interpretation

    def analyze_all_regulations(self) -> None:
        """Analyze results for every ``panel_*_*.csv`` file in the input directory.

        A failure on one panel is printed and does not stop the remaining ones.
        """
        panel_files = glob.glob(os.path.join(self.input_dir, "panel_*_*.csv"))

        for panel_file in panel_files:
            regulation_name = os.path.splitext(os.path.basename(panel_file))[0]
            try:
                print("\n" + "="*80)
                print(self.interpret_regulation_impact(regulation_name))
                print("="*80 + "\n")
            except Exception as e:
                print(f"Error analyzing {regulation_name}: {str(e)}")

def main():
    """Run section 7: interpret the regression results of every panel.

    Fill in the placeholder key and base folder before running. Panels are
    read from ``<base>/law_panels`` and results from
    ``<base>/regression_analyses``.
    """
    # Configuration
    api_key = "enter API here"
    base_dir = "enter folder path here"
    panels_dir = os.path.join(base_dir, "law_panels")
    analyses_dir = os.path.join(base_dir, "regression_analyses")

    RegressionInterpreter(panels_dir, analyses_dir, api_key).analyze_all_regulations()

if __name__ == "__main__":
    main()

## 8. Create Correlation tables
import os
import pandas as pd
import numpy as np
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_LEFT 
from scipy import stats
from reportlab.lib.pagesizes import letter, landscape

def create_correlation_table(data_path, output_dir):
    """
    Creates a clean correlation table PDF in the style of academic papers.

    Correlations come from ``DataFrame.corr`` and the p-values used for the
    bold highlighting are computed on the same pairwise-complete observations,
    so a coefficient and its significance always describe the same rows.

    Args:
        data_path (str): Path to the panel data CSV. The name of its parent
            folder is used as the panel name in the title and file name.
        output_dir (str): Path to save output files

    Returns:
        str: Path of the PDF that was written.

    Raises:
        KeyError: If the CSV lacks one of the expected variable columns.
    """
    # Read the CSV file
    df = pd.read_csv(data_path)

    # Select numerical variables for correlation
    numeric_vars = ['treatment_effect','freqMF','linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss',
                    'lcalrisk']

    # Create shorter variable names for the table
    var_mapping = {
        'treatment_effect': 'Treatment Effect',
        'freqMF': 'FreqMF',
        'linstown': 'Institutional ownership',
        'lsize': 'Firm size',
        'lbtm': 'Book-to-market',
        'lroa': 'ROA',
        'lsaret12': 'Stock return',
        'levol': 'Earnings volatility',
        'lloss': 'Loss',
        'lcalrisk': 'Class action litigation risk'

    }

    # Calculate correlation matrix
    corr_matrix = df[numeric_vars].corr()

    # Calculate p-values for significance testing
    def calculate_pvalue(x, y):
        # Keep only rows where BOTH variables are observed. Dropping missing
        # values from each series separately would pair up unrelated rows (or
        # fail outright when the two series end up with different lengths).
        both_present = x.notna() & y.notna()
        if both_present.sum() < 2:
            # pearsonr needs at least two paired observations
            return np.nan
        return stats.pearsonr(x[both_present], y[both_present])[1]

    p_values = pd.DataFrame(index=numeric_vars, columns=numeric_vars, dtype=float)
    for i in numeric_vars:
        for j in numeric_vars:
            p_values.loc[i,j] = calculate_pvalue(df[i], df[j])

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get panel name from path
    panel_name = os.path.basename(os.path.dirname(data_path))

    # Create PDF
    clean_name = panel_name.replace('panel_', '')
    pdf_path = os.path.join(output_dir, f'{clean_name}_correlation_table.pdf')
    doc = SimpleDocTemplate(pdf_path, pagesize=landscape(letter), rightMargin=30, leftMargin=30, topMargin=50, bottomMargin=50)

    # Header row: empty corner cell followed by the display names
    table_data = [[''] + [var_mapping[var] for var in numeric_vars]]

    # Add rows
    for var1 in numeric_vars:
        row = [var_mapping[var1]]  # Row header
        for var2 in numeric_vars:
            if var1 == var2:
                row.append('1.00')
            else:
                # Format to 2 decimal places
                row.append(f'{corr_matrix.loc[var1, var2]:.2f}')
        table_data.append(row)

    # Create table style
    style = [
        ('FONTNAME', (0,0), (-1,-1), 'Times-Roman'),
        ('FONTSIZE', (0,0), (-1,-1), 8),
        ('ALIGN', (0,0), (-1,-1), 'CENTER'),
        ('TOPPADDING', (0,0), (-1,-1), 3),
        ('BOTTOMPADDING', (0,0), (-1,-1), 3),
        ('GRID', (0,0), (-1,-1), 0.25, colors.black),  # Lighter grid lines
        ('BOX', (0,0), (-1,-1), 0.25, colors.black),
        # Make column headers and row headers bold
        ('FONTNAME', (0,0), (-1,0), 'Times-Bold'),
        ('FONTNAME', (0,0), (0,-1), 'Times-Bold'),
    ]

    # Add bold style for significant correlations. Table cell (col, row)
    # indices are offset by one because of the header row and column.
    for i, var1 in enumerate(numeric_vars, 1):
        for j, var2 in enumerate(numeric_vars, 1):
            if i != j:  # Skip diagonal
                # A NaN p-value compares False and is therefore never bolded
                if p_values.loc[var1,var2] < 0.05:  # 5% significance level
                    style.append(('FONTNAME', (j,i), (j,i), 'Times-Bold'))

    # Create table
    table = Table(table_data)
    table.setStyle(TableStyle(style))

    # Create title
    styles = getSampleStyleSheet()
    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Normal'],
        fontSize=12,
        alignment=TA_CENTER,
        spaceBefore=12,
        spaceAfter=20,
        fontName='Times-Bold'
    )


    # Create Panel title in smaller text if needed
    panel_title = ""
    if panel_name:
        clean_panel_name = panel_name.replace('panel_', '').replace('_', ' ')
        # Format the law name with proper spacing
        if 'NominatingCommitteeRequirements' in clean_panel_name:
            law_name = 'Nominating Committee Requirements'
        elif 'ResourceExtractionDisclosureRules' in clean_panel_name:
            law_name = 'Resource Extraction Disclosure Rules'
        elif 'PayRatioDisclosureRule' in clean_panel_name:
            law_name = 'Pay Ratio Disclosure Rule'
        else:
            law_name = clean_panel_name

        panel_title = f"<br/>{law_name}"

    title = Paragraph(f"Table 2<br/>Pearson Correlations{panel_title}", title_style)

    # Add footnote
    footnote_style = ParagraphStyle(
        'Footnote',
        parent=styles['Normal'],
        fontSize=8,
        alignment=TA_LEFT,
        fontName='Times-Roman',
        spaceBefore=6,
        leading=10  # Controls line spacing
    )
    footnote = Paragraph("This table shows the Pearson correlations for the sample. "
                        "Correlations that are significant at the 0.05 level or better are highlighted in bold. ", footnote_style)

    # Build PDF
    doc.build([title, table, Spacer(1, 12), footnote])

    print(f"Created correlation table PDF for {panel_name}")
    return pdf_path

def batch_process_panels(base_dir,output_base_dir):
    """
    Process all panel folders and create correlation tables, skipping existing ones.

    Tables are written to ``<output_base_dir>/correlations``. A panel whose
    PDF already exists there is skipped; a panel without ``filtered_data.csv``
    or whose table creation fails is counted as an error and the batch
    continues.

    Args:
        base_dir (str): Base directory containing panel folders
        output_base_dir (str): Base directory where correlations folder should be created
    """
    print(f"Starting to process panels in: {base_dir}")

    # Create output directory under the caller-supplied base directory
    output_dir = os.path.join(output_base_dir, "correlations")
    os.makedirs(output_dir, exist_ok=True)
    print(f"Created output directory: {output_dir}")

    # Count total panels; only directories qualify, so stray files whose name
    # happens to start with 'panel_' are not reported as failed panels.
    panel_folders = [
        f for f in os.listdir(base_dir)
        if f.startswith('panel_') and os.path.isdir(os.path.join(base_dir, f))
    ]
    total_panels = len(panel_folders)
    processed = 0
    skipped = 0
    errors = 0

    print(f"\nFound {total_panels} panel folders to process")

    # Process each panel folder
    for i, panel_folder in enumerate(panel_folders, 1):
        print(f"\n{'='*80}")
        print(f"Processing panel {i} of {total_panels}: {panel_folder}")
        print(f"{'='*80}")

        panel_path = os.path.join(base_dir, panel_folder)

        # Check if correlation table already exists
        clean_name = panel_folder.replace('panel_', '')
        existing_table = os.path.join(output_dir, f'{clean_name}_correlation_table.pdf')

        if os.path.exists(existing_table):
            print(f"Skipping {panel_folder}: Correlation table already exists")
            skipped += 1
            continue

        # Look for the data file
        data_file = 'filtered_data.csv'
        data_path = os.path.join(panel_path, data_file)

        if os.path.exists(data_path):
            try:
                table_path = create_correlation_table(data_path, output_dir)
                print(f"Created correlation table for {panel_folder}")
                print(f"Table saved to: {table_path}")
                processed += 1
            except Exception as e:
                # Keep going: one broken panel must not abort the batch
                print(f"Error processing {panel_folder}: {str(e)}")
                errors += 1
        else:
            print(f"No data file found in {panel_folder}")
            errors += 1

        # Print progress summary
        print(f"\nProgress Summary:")
        print(f"Processed: {i}/{total_panels} ({(i/total_panels)*100:.1f}%)")
        print(f"Successfully created: {processed}")
        print(f"Skipped (already exist): {skipped}")
        print(f"Errors: {errors}")

# Section 8 entry point: runs only when the file is executed as a script.
if __name__ == "__main__":
    # Base directory containing panel folders (placeholder: replace before running)
    BASE_DIR = "enter folder path here"
    # Passed to batch_process_panels as its output_base_dir argument
    # (placeholder: replace before running)
    OUTPUT_BASE_DIR = "enter folder path here"
    
    # Process all panels
    batch_process_panels(BASE_DIR, OUTPUT_BASE_DIR)

# 9. Send sample and descriptive statistics results to Claude for interpretation

import pandas as pd
import numpy as np
import os
import glob
from typing import Dict, List
import json
from anthropic import Anthropic
import traceback
from fpdf import FPDF

class DescriptiveStatsAnalyzer:
    """Compute descriptive statistics for panel datasets and ask Claude to write them up.

    For each panel folder the analyzer reads ``filtered_data.csv``, computes
    per-variable statistics, stores them as JSON, renders a PDF table and
    saves Claude's narrative description of the sample.
    """

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    def calculate_descriptive_stats(self, df: pd.DataFrame) -> Dict:
        """Calculate descriptive statistics for the dataset.

        Args:
            df: Panel data. Must contain the columns 'GVKEY', 'FYEAR' and
                'sic3', which feed the sample summary.

        Returns:
            A dict mapping every numeric column (identifier columns excluded)
            to its n/mean/median/std/p25/p75/min/max, plus a 'summary' entry
            describing the sample. All numbers are built-in Python types so
            the result can be passed straight to ``json.dump``.

        Raises:
            KeyError: If 'GVKEY', 'FYEAR' or 'sic3' is missing from ``df``.
        """
        # Identifier-like columns are numeric but are not analysis variables
        identifier_cols = {'GVKEY', 'FYEAR', 'sic3', 'Year'}
        numeric_cols = [
            col for col in df.select_dtypes(include=[np.number]).columns
            if col not in identifier_cols
        ]

        # Known variables come first, in this order; any other column keeps
        # its original relative position after them (sorted() is stable).
        preferred_order = [
            'linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss', 'lcalrisk'
        ]
        sorted_cols = sorted(
            numeric_cols,
            key=lambda x: preferred_order.index(x) if x in preferred_order else float('inf')
        )

        stats = {}
        for col in sorted_cols:
            series = df[col]
            # pandas hands back numpy scalars; for integer columns min/max are
            # numpy.int64, which the json module refuses to serialize. Convert
            # everything to built-in numbers here.
            stats[col] = {
                'n': len(series.dropna()),
                'mean': float(series.mean()),
                'median': float(series.median()),
                'std': float(series.std()),
                'p25': float(series.quantile(0.25)),
                'p75': float(series.quantile(0.75)),
                'min': float(series.min()),
                'max': float(series.max())
            }

        # Add additional summary statistics
        stats['summary'] = {
            'total_observations': len(df),
            'unique_firms': len(df['GVKEY'].unique()),
            'year_range': f"{df['FYEAR'].min()} to {df['FYEAR'].max()}",
            'industries': len(df['sic3'].unique())
        }

        return stats

    def get_claude_interpretation(self, stats: Dict, regulation_title: str) -> str:
        """Get Claude's interpretation of descriptive statistics.

        Args:
            stats: Output of ``calculate_descriptive_stats``.
            regulation_title: Label used in the heading of the statistics text.

        Returns:
            Claude's write-up, or a string starting with
            "Error in Claude analysis:" when the API call fails.
        """
        # Format statistics for Claude: sample summary first, then one
        # paragraph of figures per variable.
        summary = stats['summary']
        pieces = [
            f"Descriptive Statistics for {regulation_title}\n\n",
            "Sample Characteristics:\n",
            f"Total observations: {summary['total_observations']:,}\n",
            f"Number of unique firms: {summary['unique_firms']:,}\n",
            f"Sample period: {summary['year_range']}\n",
            f"Number of industries: {summary['industries']}\n\n",
            "Variable Statistics:\n",
        ]
        for var, var_stats in stats.items():
            if var == 'summary':
                continue
            pieces.extend([
                f"\n{var}:\n",
                f"N: {var_stats['n']:,}\n",
                f"Mean: {var_stats['mean']:.3f}\n",
                f"Median: {var_stats['median']:.3f}\n",
                f"Std Dev: {var_stats['std']:.3f}\n",
                f"25th percentile: {var_stats['p25']:.3f}\n",
                f"75th percentile: {var_stats['p75']:.3f}\n",
                f"Min: {var_stats['min']:.3f}\n",
                f"Max: {var_stats['max']:.3f}\n",
            ])
        stats_text = "".join(pieces)

        # Create prompt for Claude
        prompt = f"""You are an accounting academic with a PhD in accounting. 
        You should use active voice (e.g. "We find" instead of "It is found"). 
        Use present tense for all established findings. Write the descriptive statistics section for this analysis as if 
        you were writing an academic paper for an accounting journal. Here are the descriptive statistics:

{stats_text}

Please structure your analysis as follows (400 words):
1. Label this section "Sample Description and Descriptive Statistics"
2. Describe the sample characteristics (number of firms, time period, industries)
3. Describe the key variables' distributions
4. Highlight any notable patterns or potential outliers
5. Compare statistics to relevant benchmarks from prior literature where applicable

Write in an academic style suitable for a top accounting journal."""

        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text
        except Exception as e:
            # Best effort: the caller stores whatever text comes back, so a
            # failed request is reported in-band rather than raised.
            print(f"Error getting Claude interpretation: {str(e)}")
            return f"Error in Claude analysis: {str(e)}"

    def create_descriptive_stats_table(self, stats: Dict, output_path: str, regulation_title: str):
        """Create a PDF table of descriptive statistics in academic paper format.

        Only variables listed in ``var_display_names`` below are rendered; any
        other entry of ``stats`` is left out of the table. If rendering fails,
        a one-page PDF carrying the error message is written instead.

        Args:
            stats: Output of ``calculate_descriptive_stats``.
            output_path: Destination path of the PDF.
            regulation_title: Currently unused; kept for interface stability.
        """
        try:
            pdf = FPDF(format='A4', orientation='L')
            pdf.add_page()

            # Prefer the TrueType Times New Roman files when they are present;
            # otherwise fall back to the built-in 'Times' font.
            try:
                pdf.add_font('Times', '', 'times.ttf', uni=True)
                pdf.add_font('Times', 'B', 'timesbd.ttf', uni=True)
                pdf.set_font('Times', size=11)
            except Exception:
                pdf.set_font('Times', size=11)

            pdf.set_margins(20, 20, 20)

            # Title
            pdf.set_font('Times', 'B', 14)
            pdf.cell(0, 10, 'Table 1', align='C', ln=True)
            pdf.set_font('Times', '', 12)
            pdf.cell(0, 10, 'Descriptive Statistics', align='C', ln=True)
            pdf.ln(5)

            # Column widths: one wide label column, six numeric columns
            var_width = 70
            num_width = 30

            # Table headers
            pdf.set_font('Times', 'B')
            headers = ['Variables', 'N', 'Mean', 'Std. Dev.', 'P25', 'Median', 'P75']
            pdf.cell(var_width, 8, headers[0], border=1)
            for header in headers[1:]:
                pdf.cell(num_width, 8, header, border=1, align='C')
            pdf.ln()

            # Variable name mappings with ordered display
            var_display_names = {
                'freqMF': 'FreqMF',
                'treatment_effect': 'Treatment Effect',
                'linstown': 'Institutional ownership',
                'lsize': 'Firm size',
                'lbtm': 'Book-to-market',
                'lroa': 'ROA',
                'lsaret12': 'Stock return',
                'levol': 'Earnings volatility',
                'lloss': 'Loss',
                'lcalrisk': 'Class action litigation risk'
            }

            # Excluded variables
            excluded_vars = {'sic4', 'permno', 'post-law', 'treated'}

            variables = {k: v for k, v in stats.items()
                         if k != 'summary' and k not in excluded_vars}

            # Display order: FreqMF and the treatment effect first, then the
            # remaining known variables in mapping order.
            display_order = ['freqMF', 'treatment_effect'] + [
                k for k in var_display_names.keys()
                if k not in ['freqMF', 'treatment_effect']
            ]

            pdf.set_font('Times', '')
            for var_name in display_order:
                if var_name in variables:
                    var_stats = variables[var_name]
                    display_name = var_display_names.get(var_name, var_name)
                    pdf.cell(var_width, 8, display_name, border=1)

                    pdf.cell(num_width, 8, f"{var_stats['n']:,}", border=1, align='C')
                    pdf.cell(num_width, 8, f"{var_stats['mean']:.4f}", border=1, align='C')
                    pdf.cell(num_width, 8, f"{var_stats['std']:.4f}", border=1, align='C')
                    pdf.cell(num_width, 8, f"{var_stats['p25']:.4f}", border=1, align='C')
                    pdf.cell(num_width, 8, f"{var_stats['median']:.4f}", border=1, align='C')
                    pdf.cell(num_width, 8, f"{var_stats['p75']:.4f}", border=1, align='C')
                    pdf.ln()

            # Footnote
            pdf.ln(10)
            pdf.set_font('Times', '', 10)
            footnote = "This table shows the descriptive statistics. All continuous variables are winsorized at the 1st and 99th percentiles."
            pdf.multi_cell(0, 5, footnote)

            pdf.output(output_path)
            print(f"Successfully saved descriptive statistics table to {output_path}")

        except Exception as e:
            print(f"Error creating descriptive statistics table: {str(e)}")
            print(f"Traceback: {traceback.format_exc()}")

            # Leave a minimal PDF behind so the failure is visible in the output folder
            try:
                pdf = FPDF()
                pdf.add_page()
                pdf.set_font("Times", size=12)
                pdf.cell(0, 10, "Error occurred while creating descriptive statistics table")
                pdf.ln()
                pdf.cell(0, 10, f"Error: {str(e)}")
                pdf.output(output_path)
            except Exception as e2:
                print(f"Emergency PDF save also failed: {str(e2)}")

    def analyze_panel(self, panel_dir: str, output_dir: str) -> None:
        """Analyze descriptive statistics for a single panel dataset.

        Reads ``filtered_data.csv`` from ``panel_dir`` and writes three files
        to ``<output_dir>/<panel folder name>``: ``descriptive_stats.json``,
        ``descriptive_stats_table.pdf`` and ``descriptive_stats_analysis.txt``.
        Any error is printed with its traceback and not re-raised, so a batch
        run continues with the next panel.

        Args:
            panel_dir: Folder holding the panel's ``filtered_data.csv``.
            output_dir: Root folder for the generated files.
        """
        try:
            # Get panel name from directory name
            panel_name = os.path.basename(panel_dir)
            print(f"\nAnalyzing {panel_name}...")

            # Read filtered data
            data_file = os.path.join(panel_dir, 'filtered_data.csv')
            if not os.path.exists(data_file):
                print(f"No filtered_data.csv found in {panel_dir}")
                return

            print(f"Reading data from {data_file}")
            df = pd.read_csv(data_file)

            # Create output directory
            panel_output_dir = os.path.join(output_dir, panel_name)
            os.makedirs(panel_output_dir, exist_ok=True)
            print(f"Created output directory: {panel_output_dir}")

            # Calculate descriptive statistics
            print("Calculating descriptive statistics...")
            stats = self.calculate_descriptive_stats(df)

            # Save descriptive statistics to JSON
            stats_path = os.path.join(panel_output_dir, 'descriptive_stats.json')
            with open(stats_path, 'w', encoding='utf-8') as f:
                json.dump(stats, f, indent=4)
            print(f"Saved descriptive statistics to {stats_path}")

            # Create and save descriptive statistics table
            table_path = os.path.join(panel_output_dir, 'descriptive_stats_table.pdf')
            print(f"Attempting to create PDF table at {table_path}")
            self.create_descriptive_stats_table(stats, table_path, panel_name)

            # Get Claude's interpretation
            print("Getting Claude's interpretation...")
            interpretation = self.get_claude_interpretation(stats, panel_name)

            # Save Claude's interpretation. The text may contain non-ASCII
            # characters, so do not rely on the platform default encoding.
            interpretation_path = os.path.join(panel_output_dir, 'descriptive_stats_analysis.txt')
            with open(interpretation_path, 'w', encoding='utf-8') as f:
                f.write(interpretation)
            print(f"Saved Claude's analysis to {interpretation_path}")

        except Exception as e:
            print(f"Error analyzing {panel_dir}: {str(e)}")
            print(f"Traceback: {traceback.format_exc()}")

def analyze_all_panels(base_dir: str, output_dir: str, api_key: str):
    """Run the descriptive-statistics analysis on every panel folder in ``base_dir``.

    Args:
        base_dir: Folder whose 'panel_*_*' subfolders are analyzed.
        output_dir: Root folder for the results; created if it does not exist.
        api_key: Claude API key handed to ``DescriptiveStatsAnalyzer``.
    """
    analyzer = DescriptiveStatsAnalyzer(api_key)
    os.makedirs(output_dir, exist_ok=True)

    # Only directories count as panels; plain files matching the pattern are ignored
    pattern = os.path.join(base_dir, 'panel_*_*')
    panel_dirs = list(filter(os.path.isdir, glob.glob(pattern)))
    total = len(panel_dirs)
    print(f"Found {total} panel directories to analyze")

    for position, panel_dir in enumerate(panel_dirs, start=1):
        print(f"\nProcessing panel {position} of {total}: {panel_dir}")
        analyzer.analyze_panel(panel_dir, output_dir)

if __name__ == "__main__":
    # Configuration: all three values are placeholders and must be replaced
    API_KEY = "enter API here"
    # NOTE: these are ordinary string literals, not raw strings. When entering
    # Windows paths, write them as raw strings (r"C:\...") or use forward
    # slashes so that backslashes are not interpreted as escape sequences.
    BASE_DIR = "enter folder path here"
    OUTPUT_DIR ="enter folder path here"
    
    # Run analysis on all panels
    analyze_all_panels(BASE_DIR, OUTPUT_DIR, API_KEY)

# 10. Ask Claude to write introduction

import pandas as pd
import json
import os
from anthropic import Anthropic
import glob

class ComprehensiveAnalyzer:
    """Generate paper introductions for each law/mechanism pair via Claude.

    Combines the law descriptions from a CSV file with the regression results
    stored in the panel folders and asks Claude for one introduction per
    law and economic mechanism.
    """

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    def read_regression_results(self, base_dir: str) -> dict:
        """Read regression results from all panel subfolders.

        Args:
            base_dir: Folder containing the 'panel_*_*' subfolders.

        Returns:
            Mapping of panel folder name to the parsed content of its
            ``regression_results.json``. Panels without that file are
            reported on stdout and left out.
        """
        all_results = {}

        # Look for panel directories that include mechanism
        panel_dirs = [d for d in glob.glob(os.path.join(base_dir, 'panel_*_*')) if os.path.isdir(d)]

        for panel_dir in panel_dirs:
            panel_name = os.path.basename(panel_dir)
            results_file = os.path.join(panel_dir, 'regression_results.json')

            if not os.path.exists(results_file):
                print(f"No results file found in {panel_name}")
                continue

            print(f"Reading results from {panel_name}")
            with open(results_file, 'r', encoding='utf-8') as f:
                all_results[panel_name] = json.load(f)

        return all_results

    def get_laws_analysis(self, csv_file: str) -> list:
        """Read and analyze laws from CSV file, return list of law dictionaries.

        Each dictionary has the keys 'title', 'year', 'body', 'description',
        'impact' and 'mechanisms'; the latter lists the mechanism columns
        whose value in that row is exactly 'Yes'.

        Raises:
            KeyError: If the CSV lacks one of the expected columns.
        """
        df = pd.read_csv(csv_file)
        laws = []

        # Define economic mechanisms (each one is a Yes/No column in the CSV)
        mechanisms = [
            'Litigation Risk',
            'Corporate Governance',
            'Proprietary Costs',
            'Information Asymmetry',
            'Unsophisticated Investors',
            'Equity Issuance',
            'Reputation Risk'
        ]

        for _, row in df.iterrows():
            # Get active mechanisms (where value is 'Yes')
            active_mechanisms = [mech for mech in mechanisms if row[mech] == 'Yes']

            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                'mechanisms': active_mechanisms
            })

        return laws

    def format_regression_results(self, results: dict) -> str:
        """Format regression results for a specific panel.

        Args:
            results: Mapping of specification name to a dict with the keys
                'coefficients', 't_stats', 'pvalues', 'r_squared' and,
                optionally, 'controls'.

        Returns:
            A plain-text summary with one entry per specification, or
            "No regression results available." when ``results`` is empty.
            If a required key is missing, the entry stops at that point and
            the problem is reported on stdout.
        """
        if not results:
            return "No regression results available."

        formatted_text = "\nRegression Results:\n\n"

        for spec_name, spec_results in results.items():
            formatted_text += f"\nSpecification {spec_name}:\n"
            try:
                formatted_text += f"Treatment Effect: {spec_results['coefficients']['treatment_effect']:.4f}\n"
                # The t-statistic of the treatment effect is reported in absolute value
                formatted_text += f"T-statistic: {abs(spec_results['t_stats']['treatment_effect']):.2f}\n"
                formatted_text += f"P-value: {spec_results['pvalues']['treatment_effect']:.4f}\n"
                formatted_text += f"R-squared: {spec_results['r_squared']:.4f}\n"

                # A specification without controls may not store the key at
                # all; treat that the same as an empty list instead of
                # cutting the entry short.
                controls = spec_results.get('controls')
                if controls:
                    formatted_text += "\nControl Variables:\n"
                    for control in controls:
                        coef = spec_results['coefficients'][control]
                        tstat = spec_results['t_stats'][control]
                        pvalue = spec_results['pvalues'][control]
                        formatted_text += f"{control}: coef={coef:.4f}, t={tstat:.2f}, p={pvalue:.4f}\n"

                formatted_text += "\n" + "-"*50 + "\n"
            except KeyError as e:
                print(f"Missing key in regression results: {e}")
                continue

        return formatted_text

    def get_comprehensive_introduction(self, law: dict, mechanism: str, regression_results: dict) -> str:
        """Get comprehensive introduction for a law and specific mechanism.

        Args:
            law: Law dictionary as produced by ``get_laws_analysis``.
            mechanism: Economic mechanism the introduction focuses on.
            regression_results: Specification results of the matching panel;
                may be empty.

        Returns:
            Claude's text, or a string starting with "Error in analysis:"
            when the API call fails.
        """
        regression_text = self.format_regression_results(regression_results)
        print(f"\nFormatted regression results for {law['title']} - {mechanism}:")
        print(regression_text)

        prompt = f"""As an accounting academic, please write a comprehensive introduction section examining {law['title']} 
        and its impact on voluntary disclosure through the {mechanism} channel.

Law Details:
Title: {law['title']} ({law['year']})
Regulatory Body: {law['body']}
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Empirical Results:
{regression_text}

Please structure the introduction as follows:

1. Motivation (2 paragraphs, ~200 words):
   - Begin with the importance of {law['title']}
   - Open with a broad statement about {law['title']}
   - Focus specifically on how it relates to {mechanism}
   - Explain its relevance to voluntary disclosure through this mechanism
   - Identify the specific gap or puzzle in the literature
   - Identify specific research questions

2. Hypothesis Development (3 paragraphs, ~300 words):
   - Present the economic mechanism linking the regulation to voluntary disclosure
   - Explain how {mechanism} affects voluntary disclosure
   - Discuss theoretical underpinnings
   - Build on established theoretical frameworks
   - Develop clear, testable predictions
   - Build logical arguments step by step
   - Support each claim with citations to foundational papers
   - Support arguments with citations

3. Results Summary (3 paragraphs, ~300 words):
   - Lead with strongest statistical findings
   - Present the treatment effect coefficient of {regression_text}
   - Summarize the key findings of the analysis, 
     discussing the significance of the variable in terms of predictive power: {regression_text}
   - Discuss significance of variables and their predictive power
   - Present results in order of importance
   - Include economic significance
   - Use precise statistical language
   - Connect findings back to the {mechanism} channel

4. Contribution (2 paragraphs, ~200 words):
   - Position relative to 3-4 most closely related papers
   - Highlight novel findings about {mechanism}
   - Discuss broader implications for theory and practice
   - Emphasize contributions to understanding this specific economic channel

Guidelines:
- Do not include headers in the write up
- Do not include extra text or explanations
    -Example of what not to include: "Here's a comprehensive introduction section following your guidelines" or 
    "Here's a comprehensive introduction section examining Resource Extraction Disclosure Rules and its impact on voluntary disclosure through the Corporate Governance channel"
- Use active voice (e.g., "We find" instead of "It is found")
- Maintain formal academic tone
- Include 2-3 citations per paragraph 
- Use present tense for established findings
- Use past tense for your specific results
- Make clear distinctions between correlation and causation
- Avoid speculation beyond the data
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Financial Studies"""

        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=8000,
                temperature=0.5,
                messages=[{"role": "user", "content": prompt}]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            # Best effort: the caller saves whatever text comes back, so a
            # failed request is reported in-band rather than raised.
            print(f"Error getting introduction: {str(e)}")
            return f"Error in analysis: {str(e)}"

    def analyze_and_save_introductions(self, base_dir: str, csv_file: str, output_dir: str):
        """Generate and save comprehensive introductions.

        Writes one ``<law>_<mechanism>_introduction.txt`` file per law and
        active mechanism into ``<output_dir>/introduction``.

        Args:
            base_dir: Folder containing the panel subfolders with regression results.
            csv_file: CSV file describing the laws and their mechanisms.
            output_dir: Root folder for the generated introductions.
        """
        # Create introduction directory
        intro_dir = os.path.join(output_dir, 'introduction')
        os.makedirs(intro_dir, exist_ok=True)

        # Get laws and regression results
        laws = self.get_laws_analysis(csv_file)
        regression_results = self.read_regression_results(base_dir)

        # Generate introduction for each law and each mechanism
        for law in laws:
            print(f"\nProcessing law: {law['title']}")

            # Generate separate introduction for each mechanism
            for mechanism in law['mechanisms']:
                print(f"Writing introduction for mechanism: {mechanism}")

                # Panel folders are named panel_<title without spaces>_<mechanism
                # with underscores>; look up the matching results if available.
                clean_mechanism = mechanism.replace(' ', '_')
                panel_name = f"panel_{law['title'].replace(' ', '')}_{clean_mechanism}"
                law_results = regression_results.get(panel_name, {})

                if not law_results:
                    print(f"Warning: No regression results found for {panel_name}")

                # Generate introduction
                intro = self.get_comprehensive_introduction(law, mechanism, law_results)

                # Create filename with both law and mechanism
                filename = f"{law['title'].replace(' ', '_')}_{clean_mechanism}_introduction.txt"

                # Save introduction
                with open(os.path.join(intro_dir, filename), 'w', encoding='utf-8') as f:
                    f.write(intro)

                print(f"Saved introduction for {law['title']} - {mechanism}")

def main():
    """Write the introductions for all laws using the settings below."""
    # Configuration (placeholders: fill in before running)
    api_key = "enter API here"
    base_dir = "enter folder path here"
    csv_file = "enter file path here"
    output_dir = "enter folder path here"

    try:
        ComprehensiveAnalyzer(api_key).analyze_and_save_introductions(
            base_dir, csv_file, output_dir
        )
    except Exception as e:
        print(f"Error in main execution: {str(e)}")
    else:
        print("Analysis complete!")

if __name__ == "__main__":
    main()

# 11. Ask Claude to write the model specification section of a paper 

import pandas as pd
import json
import os
from anthropic import Anthropic
import glob

class ComprehensiveAnalyzer:
    """Generate model-specification sections from regression results via Claude.

    Every panel folder that holds a ``regression_results.json`` yields one
    text file containing Claude's research-design write-up.
    """

    # Economic mechanisms: Yes/No columns in the laws CSV and, with spaces
    # replaced by underscores, the suffix of the panel folder names.
    MECHANISMS = (
        'Litigation Risk',
        'Corporate Governance',
        'Proprietary Costs',
        'Information Asymmetry',
        'Unsophisticated Investors',
        'Equity Issuance',
        'Reputation Risk',
    )

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    def read_regression_results(self, base_dir: str) -> dict:
        """Read regression results from all panel subfolders.

        Args:
            base_dir: Folder containing the 'panel_*_*' subfolders.

        Returns:
            Mapping of panel folder name to the parsed content of its
            ``regression_results.json``. Panels without that file are
            reported on stdout and left out.
        """
        all_results = {}

        panel_dirs = [d for d in glob.glob(os.path.join(base_dir, 'panel_*_*')) if os.path.isdir(d)]

        for panel_dir in panel_dirs:
            panel_name = os.path.basename(panel_dir)
            results_file = os.path.join(panel_dir, 'regression_results.json')

            if not os.path.exists(results_file):
                print(f"No results file found in {panel_name}")
                continue

            print(f"Reading results from {panel_name}")
            with open(results_file, 'r', encoding='utf-8') as f:
                all_results[panel_name] = json.load(f)

        return all_results

    def format_regression_results(self, results: dict) -> str:
        """Format regression results for a specific panel.

        Args:
            results: Mapping of specification name to a dict with the keys
                'coefficients', 't_stats', 'pvalues', 'r_squared' and,
                optionally, 'controls'.

        Returns:
            A plain-text summary with one entry per specification.

        Raises:
            KeyError: If a specification lacks one of the required keys.
        """
        formatted_text = "\nRegression Results:\n\n"

        for spec_name, spec_results in results.items():
            formatted_text += f"\nSpecification {spec_name}:\n"
            formatted_text += f"Treatment Effect: {spec_results['coefficients']['treatment_effect']:.4f}\n"
            # The t-statistic of the treatment effect is reported in absolute value
            formatted_text += f"T-statistic: {abs(spec_results['t_stats']['treatment_effect']):.2f}\n"
            formatted_text += f"P-value: {spec_results['pvalues']['treatment_effect']:.4f}\n"
            formatted_text += f"R-squared: {spec_results['r_squared']:.4f}\n"

            # 'controls' is optional: a specification without controls may
            # not store the key at all.
            controls = spec_results.get('controls')
            if controls:
                formatted_text += "\nControl Variables:\n"
                for control in controls:
                    coef = spec_results['coefficients'][control]
                    tstat = spec_results['t_stats'][control]
                    pvalue = spec_results['pvalues'][control]
                    # Underscores are stripped so variable names read naturally in the prompt
                    formatted_text += f"{control.replace('_', ' ')}: coef={coef:.4f}, t={tstat:.2f}, p={pvalue:.4f}\n"

            formatted_text += "\n" + "-"*50 + "\n"

        return formatted_text

    def get_laws_analysis(self, csv_file: str) -> list:
        """Read and analyze laws from CSV file.

        Returns:
            One dictionary per CSV row with the keys 'title', 'year', 'body',
            'description', 'impact' and 'mechanisms'; the latter lists the
            mechanism columns whose value in that row is exactly 'Yes'.

        Raises:
            KeyError: If the CSV lacks one of the expected columns.
        """
        df = pd.read_csv(csv_file)
        laws = []

        for _, row in df.iterrows():
            # Get active mechanisms (where value is 'Yes')
            active_mechanisms = [mech for mech in self.MECHANISMS if row[mech] == 'Yes']

            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                'mechanisms': active_mechanisms
            })

        return laws

    def get_model_specification(self, law: dict, mechanism: str, regression_results: dict) -> str:
        """Get model specification section for a law and specific mechanism with regression results.

        Args:
            law: Dict with the keys 'title', 'year', 'body', 'description'
                and 'impact'.
            mechanism: Economic mechanism the section focuses on.
            regression_results: Specification results of the panel; may be empty.

        Returns:
            Claude's text, or a string starting with "Error in analysis:"
            when the API call fails.
        """
        regression_text = self.format_regression_results(regression_results) if regression_results else "No regression results available."

        # The observation count is taken from specification '(3)' when present
        n_obs = None
        if regression_results and '(3)' in regression_results:
            n_obs = regression_results['(3)'].get('n_obs', 'Not available')

        prompt = f"""You are an accounting academic writing a research paper examining {law['title']} and its impact 
        on voluntary disclosure through the {mechanism} channel. 
        Please write the model specification section for an academic journal in accounting.

Law Details:
Title: {law['title']} ({law['year']})
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Regression Information:
{regression_text}

Please follow these detailed guidelines:

1. Label this section "Research Design" or "Model Specification"

2. Identifying Firms Affected by {law['title']} 
    - Explain the step by step process to identify firms affected by {law['title']} 
    - Describe the regulatory authority that is responsible for the law {law['body']} 

3. Model Explanation (2-3 paragraphs, ~300 words total):
    - Explain the regression model used to examine the relationship between {law['title']} 
      and voluntary disclosure through the {mechanism} channel
          -The model is: FreqMF = β₀ + β₁Treatment Effect + γControls + ε
    - Only discuss the control variables that appear in the regression results {regression_text}
      These variables are based on prior literature and are: Institutional Ownership, Firm Size, Book-to-Market,
      ROA, Stock Return, Earnings volatility, Loss, Class action litigation risk
    - Support model choices with citations to foundational papers
    - Explain potential endogeneity concerns and how the research design addresses them
    - Use clear, academic language
    - Avoid using underscores in variable names

4. Mathematical Model:
    - Present the complete regression equation in proper mathematical notation {regression_text}
        - Label the equation as follows: FreqMF = β₀ + β₁Treatment Effect + γControls + ε
            - Label the dependent variable "FreqMF"
            - Label the variable of interest as "Treatment Effect"
            - Label the control variables in the regression equation as "Controls"
    - Do no include the subscripts i and t in the regression 
    - Format the equation professionally

5. Variable Definitions (2-3 paragraphs, ~300 words total):
    - Define the dependent variable (FreqMF - management forecast frequency)
    - Define the "Treatment Effect" variable
    - Define each control variable used in the model as they appear in {regression_text}
      These variables are based on prior literature and are: Institutional Ownership, Firm Size, Book-to-Market,
      ROA, Stock Return, Earnings volatility, Loss, Class action litigation risk
        -Cite the appropriate paper for these variables from the Journal of Accounting Research
    - Do no include the subscripts i and t in the variable definition
    - For each control variable, provide detailed explanations about their expected relationships with voluntary disclosure
    - Explain how variables relate to the {mechanism} channel
    

6. Sample Construction (2-3 paragraphs, ~300 words total):
    - Describe the event window around {law['year']}
        -The time window for this analysis is 2 years before and 2 years after the regulation is implemented. Therefore,
         The total number of years of the sample period is 5 years.
    - Describe the source of the data from Compustat, I/B/E/S, Audit Analytics, and CRSP
    - Describe the sample construction process based on the number of observations: {n_obs if n_obs else 'Not available'}
    - Explain the treatment and control groups
    - Note any sample restrictions

Writing Guidelines:
- - Provide only the write up, no extra text or explanations 
    -Example of what not to include: "Here's a comprehensive model specifaction section following your guidelines"
- Use active voice (e.g., "We find" instead of "It is found")
- Maintain formal academic tone
- Include 2-3 citations per paragraph
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Financial Studies
- Use precise statistical language
- Make clear connections between variables and theoretical predictions
- Do not include Latex format"""

        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            # Best effort: the caller saves whatever text comes back, so a
            # failed request is reported in-band rather than raised.
            print(f"Error getting model specification: {str(e)}")
            return f"Error in analysis: {str(e)}"

    @classmethod
    def _mechanism_from_panel_name(cls, panel_name: str) -> str:
        """Return the economic mechanism encoded at the end of a panel folder name."""
        name = panel_name[len('panel_'):] if panel_name.startswith('panel_') else panel_name
        # Mechanism names contain spaces that become underscores in folder
        # names, so the mechanism is not simply the last '_' token.
        for mechanism in cls.MECHANISMS:
            if name.endswith('_' + mechanism.replace(' ', '_')):
                return mechanism
        # Unknown suffix: fall back to the last underscore-separated token
        return name.split('_')[-1]

    @staticmethod
    def _first_value(df, column: str, default):
        """Return the first value of ``column``, or ``default`` if the column is absent."""
        if column in df.columns:
            return df[column].iloc[0]
        return default

    def create_model_specifications(self, base_dir: str, csv_file: str, output_dir: str):
        """Generate and save model specification sections for all laws.

        For every panel with regression results, the law details are read
        from the first row of the panel's ``filtered_data.csv`` and the text
        is written to
        ``<output_dir>/model_specification/<panel>_model_specification.txt``.
        A failing panel is reported on stdout and skipped.

        Args:
            base_dir: Folder containing the panel subfolders.
            csv_file: Not used by this method; kept for interface stability.
            output_dir: Root folder for the generated files.
        """
        # Create main directory
        main_dir = os.path.join(output_dir, 'model_specification')
        os.makedirs(main_dir, exist_ok=True)

        # Get regression results for existing panels
        regression_results = self.read_regression_results(base_dir)
        total_panels = len(regression_results)
        processed = 0

        print(f"\nFound {total_panels} panels in regression folder")

        # Process each panel that exists
        for panel_name, results in regression_results.items():
            print(f"\n{'='*80}")
            print(f"Processing panel {processed + 1} of {total_panels}: {panel_name}")

            try:
                mechanism = self._mechanism_from_panel_name(panel_name)

                # Read the original panel file to get law details
                panel_file = os.path.join(base_dir, panel_name, "filtered_data.csv")
                df = pd.read_csv(panel_file)

                # Title and year are required; the descriptive columns are
                # optional and replaced by a placeholder when absent.
                law = {
                    'title': df['Regulation Title'].iloc[0],
                    'year': df['Year'].iloc[0],
                    'body': self._first_value(df, 'Regulatory Body', 'Unknown'),
                    'description': self._first_value(df, 'Description', 'Not available'),
                    'impact': self._first_value(df, 'Impact', 'Not available'),
                    'mechanisms': [mechanism]
                }

                # Generate model specification
                content = self.get_model_specification(law, mechanism, results)

                # Create filename
                filename = f"{panel_name}_model_specification.txt"
                file_path = os.path.join(main_dir, filename)

                # Save model specification
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f"Saved model specification for {panel_name}")

            except Exception as e:
                print(f"Error processing panel {panel_name}: {str(e)}")

            processed += 1
            print(f"\nProgress: {processed}/{total_panels} ({(processed/total_panels)*100:.1f}%)")

        print("\nModel specification generation complete!")
    
def main():
    """Run the model-specification step using the placeholder settings below.

    Builds a ``ComprehensiveAnalyzer`` and asks it to create the model
    specifications. Any exception raised along the way is caught and
    reported on stdout rather than propagated.
    """
    # Placeholders -- fill these in before running this section.
    api_key = "enter API here"
    panels_root = "enter folder path here"
    laws_csv = "enter file path here"
    results_root = "enter folder path here"

    try:
        ComprehensiveAnalyzer(api_key).create_model_specifications(
            panels_root, laws_csv, results_root
        )
        print("\nModel specification sections complete!")
    except Exception as exc:
        print(f"Error in main execution: {str(exc)}")


if __name__ == "__main__":
    main()

# 12. Ask Claude to write a conclusion 

import pandas as pd
import json
import os
from anthropic import Anthropic
import glob

class ComprehensiveAnalyzer:
    """Write conclusion sections for each law/mechanism pair using Claude.

    Laws are read from a CSV file, regression output is read from
    ``panel_*_*`` subfolders of a base directory, and one conclusion text
    file is written per law and active mechanism.
    """

    def __init__(self, api_key: str):
        """Initialize analyzer with Claude API key"""
        self.client = Anthropic(api_key=api_key)

    @staticmethod
    def _panel_name(law_title: str, mechanism: str) -> str:
        """Return the panel folder name for a law title and mechanism.

        Spaces are removed from the title and replaced by underscores in the
        mechanism, e.g.
        ``panel_ResourceExtractionDisclosureRules_Unsophisticated_Investors``.
        """
        return f"panel_{law_title.replace(' ', '')}_{mechanism.replace(' ', '_')}"

    def read_regression_results(self, base_dir: str) -> dict:
        """Read regression results from all panel subfolders.

        Args:
            base_dir: Directory containing ``panel_*_*`` subfolders.

        Returns:
            Dict mapping each panel folder name to the parsed content of its
            ``regression_results.json``. Folders without that file are
            reported on stdout and left out.
        """
        all_results = {}

        panel_dirs = [d for d in glob.glob(os.path.join(base_dir, 'panel_*_*')) if os.path.isdir(d)]

        for panel_dir in panel_dirs:
            panel_name = os.path.basename(panel_dir)
            results_file = os.path.join(panel_dir, 'regression_results.json')

            if os.path.exists(results_file):
                print(f"Reading results from {panel_name}")
                # JSON is UTF-8 by specification; do not rely on the locale default.
                with open(results_file, 'r', encoding='utf-8') as f:
                    all_results[panel_name] = json.load(f)
            else:
                print(f"No results file found in {panel_name}")

        return all_results

    def format_regression_results(self, results: dict) -> str:
        """Format regression results for a specific panel.

        Args:
            results: Dict of specification name -> result dict. Each result
                dict must provide ``coefficients``, ``t_stats`` and
                ``pvalues`` (each with a ``treatment_effect`` entry) and
                ``r_squared``; ``controls`` is an optional list of control
                variable names that are also keys of those three dicts.

        Returns:
            Plain-text summary, one section per specification.
        """
        pieces = ["\nRegression Results:\n\n"]

        for spec_name, spec_results in results.items():
            pieces.append(f"\nSpecification {spec_name}:\n")
            pieces.append(f"Treatment Effect: {spec_results['coefficients']['treatment_effect']:.4f}\n")
            # The treatment t-statistic is reported as an absolute value.
            pieces.append(f"T-statistic: {abs(spec_results['t_stats']['treatment_effect']):.2f}\n")
            pieces.append(f"P-value: {spec_results['pvalues']['treatment_effect']:.4f}\n")
            pieces.append(f"R-squared: {spec_results['r_squared']:.4f}\n")

            controls = spec_results.get('controls') or []
            if controls:
                pieces.append("\nControl Variables:\n")
                for control in controls:
                    coef = spec_results['coefficients'][control]
                    tstat = spec_results['t_stats'][control]
                    pvalue = spec_results['pvalues'][control]
                    pieces.append(f"{control.replace('_', ' ')}: coef={coef:.4f}, t={tstat:.2f}, p={pvalue:.4f}\n")

            pieces.append("\n" + "-"*50 + "\n")

        return "".join(pieces)

    def get_laws_analysis(self, csv_file: str) -> list:
        """Read laws from the CSV file.

        Args:
            csv_file: CSV with the columns ``Regulation Title``, ``Year``,
                ``Regulatory Body``, ``Description``, ``Impact`` and one
                column per economic mechanism.

        Returns:
            List of dicts with the keys ``title``, ``year``, ``body``,
            ``description``, ``impact`` and ``mechanisms`` (the mechanism
            columns whose value is exactly ``'Yes'``).
        """
        df = pd.read_csv(csv_file)
        laws = []

        # Define economic mechanisms
        mechanisms = [
            'Litigation Risk',
            'Corporate Governance',
            'Proprietary Costs',
            'Information Asymmetry',
            'Unsophisticated Investors',
            'Equity Issuance',
            'Reputation Risk'
        ]

        for _, row in df.iterrows():
            # Get active mechanisms (where value is 'Yes')
            active_mechanisms = [mech for mech in mechanisms if row[mech] == 'Yes']

            laws.append({
                'title': row['Regulation Title'],
                'year': row['Year'],
                'body': row['Regulatory Body'],
                'description': row['Description'],
                'impact': row['Impact'],
                'mechanisms': active_mechanisms
            })

        return laws

    def get_conclusion(self, law: dict, mechanism: str, regression_results: dict) -> str:
        """Get conclusion section for a law and specific mechanism with regression results.

        Args:
            law: Law dict as produced by ``get_laws_analysis``.
            mechanism: Economic mechanism the conclusion should focus on.
            regression_results: Results for the matching panel; when empty,
                the prompt states that no regression results are available.

        Returns:
            The text of the first content block of the response. If the API
            call fails, the error is printed and an ``"Error in analysis: ..."``
            string is returned instead of raising.
        """
        regression_text = self.format_regression_results(regression_results) if regression_results else "No regression results available."
        
        prompt = f"""You are an accounting academic writing a research paper examining {law['title']} and its 
        impact on voluntary disclosure through the {mechanism} channel. 
        Please write a conclusion section for an academic journal in accounting.

Law Details:
Title: {law['title']} ({law['year']})
Description: {law['description']}
Impact: {law['impact']}
Economic Mechanism: {mechanism}

Empirical Results:
{regression_text}

Please write a comprehensive conclusion following these guidelines:

1. Summary of Main Findings (2-3 paragraphs):
    - Restate the research question, focusing on the {mechanism} channel
    - Summarize key empirical findings
    - Discuss statistical and economic significance
    - Interpret the results in the context of {law['title']} and {mechanism}

2. Implications (1-2 paragraphs):
    - Discuss implications for regulators
    - Discuss implications for managers
    - Discuss implications for investors
    - Connect findings to broader literature on {mechanism}

3. Limitations and Future Research (1-2 paragraphs):
    - Acknowledge key limitations
    - Suggest promising avenues for future research
    - Discuss potential extensions, particularly related to {mechanism}
    
Writing Guidelines:
- Use active voice (e.g., "We find" instead of "It is found")
- Maintain formal academic tone
- Use past tense for your specific results
- Use present tense for implications
- Make clear distinctions between correlation and causation
- Focus on the practical significance of the findings
- Cite papers from top accounting and finance journals such as:
    The Accounting Review, Journal of Accounting Research, 
    Journal of Accounting and Economics, Contemporary Accounting Research, Accounting, Organizations, and Society,
    and Review of Financial Studies
- Do not include section headers
- Length: approximately 750 words"""

        try:
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                max_tokens=8000,
                temperature=0.5,
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return response.content[0].text if hasattr(response, 'content') else "Error: No content in response"
        except Exception as e:
            print(f"Error getting conclusion: {str(e)}")
            return f"Error in analysis: {str(e)}"

    def create_conclusions(self, base_dir: str, csv_file: str, output_dir: str):
        """Generate and save conclusion sections for all laws.

        One file named ``<Title>_<Mechanism>_conclusion.txt`` (spaces replaced
        by underscores) is written to ``<output_dir>/conclusion`` for every
        law and each of its active mechanisms.

        Args:
            base_dir: Directory holding the ``panel_*_*`` result folders.
            csv_file: Laws CSV, see ``get_laws_analysis``.
            output_dir: Directory under which ``conclusion`` is created.
        """
        # Create main directory
        main_dir = os.path.join(output_dir, 'conclusion')
        os.makedirs(main_dir, exist_ok=True)

        # Get laws and regression results
        laws = self.get_laws_analysis(csv_file)
        regression_results = self.read_regression_results(base_dir)

        # Process each law and mechanism
        for law in laws:
            print(f"\nProcessing law: {law['title']}")

            # Generate separate conclusion for each mechanism
            for mechanism in law['mechanisms']:
                print(f"Processing mechanism: {mechanism}")

                # Panel folders are named per law AND mechanism, so the
                # regression results must be looked up inside this loop; a
                # key without the mechanism suffix never matches a folder.
                panel_name = self._panel_name(law['title'], mechanism)
                law_results = regression_results.get(panel_name, {})
                if not law_results:
                    print(f"No regression results found for {panel_name}")

                # Generate conclusion
                content = self.get_conclusion(law, mechanism, law_results)

                # Create filename with both law and mechanism
                clean_mechanism = mechanism.replace(' ', '_')
                filename = f"{law['title'].replace(' ', '_')}_{clean_mechanism}_conclusion.txt"
                file_path = os.path.join(main_dir, filename)

                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(content)
                print(f"Saved conclusion for {law['title']} - {mechanism}")

def main():
    """Run the conclusion-writing step using the placeholder settings below.

    Builds a ``ComprehensiveAnalyzer`` and asks it to create the conclusion
    files. Any exception raised along the way is caught and reported on
    stdout rather than propagated.
    """
    # Placeholders -- fill these in before running this section.
    api_key = "enter API here"
    panels_root = "enter folder path here"
    laws_csv = "enter file path here"
    results_root = "enter folder path here"

    try:
        ComprehensiveAnalyzer(api_key).create_conclusions(
            panels_root, laws_csv, results_root
        )
        print("\nConclusion sections complete!")
    except Exception as exc:
        print(f"Error in main execution: {str(exc)}")


if __name__ == "__main__":
    main()

# 13. Ask Claude to write an abstract
import os
import json
import pandas as pd
from anthropic import Anthropic

class AbstractGenerator:
    """Turn saved introduction text files into abstracts via the Claude API."""

    def __init__(self, api_key: str):
        """Create the API client used for all abstract requests."""
        self.client = Anthropic(api_key=api_key)

    def read_laws_data(self, csv_file: str) -> pd.DataFrame:
        """Load the laws CSV file and return it as a DataFrame."""
        laws_frame = pd.read_csv(csv_file)
        return laws_frame

    def generate_abstract(self, introduction_content: str) -> str:
        """Ask Claude to condense an introduction into an abstract.

        Returns the text of the first content block of the response. If the
        request fails, the error is printed and an ``"Error in analysis: ..."``
        string is returned instead of raising.
        """
        prompt = f"""As an accounting academic, please convert the following introduction into a concise academic abstract.

Guidelines:
- Maintain the key points from the introduction
- Condense the content to 150-250 words
- Include background, research objective, methodology, key findings, and contribution
- Use a formal academic tone
- Avoid adding new information not present in the original text
- Use present tense for established findings
- Use past tense for specific results
- Do not include citations in the abstract
- Do not use the label "Abstract"

Introduction to Convert:
{introduction_content}

Please provide a structured abstract that captures the essence of the original introduction."""

        request = {
            "model": "claude-3-5-sonnet-20241022",
            "max_tokens": 3000,
            "temperature": 0.5,
            "messages": [{"role": "user", "content": prompt}],
        }
        try:
            reply = self.client.messages.create(**request)
            if hasattr(reply, 'content'):
                return reply.content[0].text
            return "Error: No content in response"
        except Exception as exc:
            reason = str(exc)
            print(f"Error generating abstract: {reason}")
            return f"Error in analysis: {reason}"

    def process_introductions(self, input_dir: str, output_dir: str):
        """Write one abstract file for every ``*_introduction.txt`` in *input_dir*.

        The output directory is created if needed. Each abstract is saved
        under the introduction's file name with ``_introduction.txt`` replaced
        by ``_abstract.txt``. A failure on one file is printed and does not
        stop the remaining files from being processed.
        """
        os.makedirs(output_dir, exist_ok=True)

        suffix = '_introduction.txt'
        pending = [name for name in os.listdir(input_dir) if name.endswith(suffix)]

        for intro_file in pending:
            try:
                source_path = os.path.join(input_dir, intro_file)
                with open(source_path, 'r', encoding='utf-8') as reader:
                    abstract_text = self.generate_abstract(reader.read())

                target_path = os.path.join(
                    output_dir, intro_file.replace(suffix, '_abstract.txt')
                )
                with open(target_path, 'w', encoding='utf-8') as writer:
                    writer.write(abstract_text)

                print(f"Generated abstract for {intro_file}")

            except Exception as exc:
                print(f"Error processing {intro_file}: {str(exc)}")

def main():
    """Run the abstract-generation step using the placeholder settings below.

    Builds an ``AbstractGenerator`` and processes every introduction file in
    the input directory. Any exception raised along the way is caught and
    reported on stdout rather than propagated.
    """
    # Placeholders -- fill these in before running this section.
    api_key = "enter API here"
    introductions_dir = "enter folder path here"
    abstracts_dir = "enter folder path here"

    try:
        AbstractGenerator(api_key).process_introductions(introductions_dir, abstracts_dir)
        print("Abstract generation complete!")
    except Exception as exc:
        print(f"Error in main execution: {str(exc)}")


if __name__ == "__main__":
    main()

# 14. Combine AI-generated content from .txt files
import os
import traceback
import pandas as pd

def create_law_mechanism_dict(csv_file: str) -> dict:
    """Create dictionary of laws and their active mechanisms from CSV.

    Args:
        csv_file: CSV with a ``Regulation Title`` column and one column per
            economic mechanism.

    Returns:
        Dict mapping each law title (spaces replaced by underscores) to the
        list of mechanisms whose column value is exactly ``'Yes'``. Laws with
        no active mechanism, and rows without a title, are left out.
    """
    df = pd.read_csv(csv_file)

    # Define all possible mechanisms
    mechanisms = [
        'Litigation Risk',
        'Corporate Governance',
        'Proprietary Costs',
        'Information Asymmetry',
        'Unsophisticated Investors',
        'Equity Issuance',
        'Reputation Risk'
    ]

    law_mechanisms = {}

    for _, row in df.iterrows():
        title = row['Regulation Title']
        # pandas reads a blank cell as NaN (a float); calling .replace on it
        # would raise and abort the whole run, so skip rows without a title.
        if not isinstance(title, str):
            print("Warning: skipping row without a regulation title")
            continue

        # Clean the law title for use as a key (replace spaces with underscores)
        law_title = title.replace(' ', '_')

        # Get active mechanisms for this law (where value is 'Yes')
        active_mechanisms = [mech for mech in mechanisms if row[mech] == 'Yes']

        # Add to dictionary if there are active mechanisms
        if active_mechanisms:
            law_mechanisms[law_title] = active_mechanisms

    return law_mechanisms

def get_descriptive_stats(desc_dir: str, law_name: str, mechanism: str) -> str:
    """Return the descriptive-statistics write-up for one law/mechanism panel.

    The panel folder is ``panel_<law without underscores>_<mechanism with
    underscores>`` inside *desc_dir* (e.g.
    ``panel_ResourceExtractionDisclosureRules_Unsophisticated_Investors``).
    Returns the content of its ``descriptive_stats_analysis.txt``, or an
    empty string (after printing a warning) when that file does not exist.
    """
    folder = "panel_{}_{}".format(law_name.replace('_', ''), mechanism.replace(' ', '_'))
    stats_path = os.path.join(desc_dir, folder, 'descriptive_stats_analysis.txt')

    if not os.path.exists(stats_path):
        print(f"Warning: No descriptive statistics found for {folder}")
        return ""

    with open(stats_path, 'r', encoding='utf-8') as handle:
        return handle.read()

def get_regression_analyses(desc_dir: str, law_name: str, mechanism: str) -> str:
    """Return the regression interpretation text for one law/mechanism panel.

    The panel folder is ``panel_<law without underscores>_<mechanism with
    underscores>`` inside *desc_dir*. Returns the content of its
    ``claude_interpretation.txt``, or an empty string (after printing a
    warning) when that file does not exist.
    """
    folder = "panel_{}_{}".format(law_name.replace('_', ''), mechanism.replace(' ', '_'))
    interpretation_path = os.path.join(desc_dir, folder, 'claude_interpretation.txt')

    if not os.path.exists(interpretation_path):
        print(f"Warning: No regressions analyses found for {folder}")
        return ""

    with open(interpretation_path, 'r', encoding='utf-8') as handle:
        return handle.read()

def combine_law_mechanism_sections(base_dir: str, law_name: str, mechanism: str) -> str:
    """Assemble all text sections of one law/mechanism paper into a single file.

    Sections are added in this order: abstract, introduction, background and
    hypothesis development, model specification, descriptive statistics,
    results, conclusion. A form feed follows the abstract as a page-break
    marker.

    Args:
        base_dir: Root folder holding one subfolder per section type.
        law_name: Law title with underscores between words.
        mechanism: Mechanism name with spaces between words.

    Returns:
        Path of the written ``<Law>_<Mechanism>_combined.txt`` file inside
        ``<base_dir>/combined_sections``, or ``None`` when at least one
        section file is missing (nothing is written in that case).
    """
    mech_slug = mechanism.replace(' ', '_')
    law_slug = law_name.replace('_', '')  # panel folders and output file use the title without underscores
    stem = f"{law_name}_{mech_slug}"
    output_filename = f"{law_slug}_{mech_slug}_combined.txt"

    # Folder layout under base_dir
    abs_dir = os.path.join(base_dir, 'abstracts')
    intro_dir = os.path.join(base_dir, 'introduction')
    back_hypo_dir = os.path.join(base_dir, 'background and hypothesis development')
    model_specification_dir = os.path.join(base_dir, 'model_specification')
    desc_dir = os.path.join(base_dir, 'descriptive_stats')
    reg_dir = os.path.join(base_dir, 'regression_analyses')
    conc_dir = os.path.join(base_dir, 'conclusion')

    # Per-section input files
    abs_file = os.path.join(abs_dir, f"{stem}_abstract.txt")
    intro_file = os.path.join(intro_dir, f"{stem}_introduction.txt")
    back_hypo_file = os.path.join(back_hypo_dir, f"{stem}_background_hypothesis.txt")
    model_specification_file = os.path.join(model_specification_dir, f"{stem}_model_specification.txt")
    conc_file = os.path.join(conc_dir, f"{stem}_conclusion.txt")

    panel_name = f"panel_{law_slug}_{mech_slug}"
    expected = (
        ('abstract', abs_file),
        ('introduction', intro_file),
        ('background', back_hypo_file),
        ('model', model_specification_file),
        ('conclusion', conc_file),
        ('descriptive_stats', os.path.join(desc_dir, panel_name, 'descriptive_stats_analysis.txt')),
        ('regression', os.path.join(reg_dir, panel_name, 'claude_interpretation.txt')),
    )

    # Only build a paper when every section is present
    missing_sections = [label for label, path in expected if not os.path.exists(path)]
    if missing_sections:
        print(f"\nSkipping {law_name} - {mechanism}: Missing sections: {', '.join(missing_sections)}")
        return None

    print(f"\nAll sections found for {law_name} - {mechanism}. Proceeding with combination...")

    output_dir = os.path.join(base_dir, 'combined_sections')
    os.makedirs(output_dir, exist_ok=True)

    divider = "=" * 50
    chunks = [f"Analysis of {law_name} through {mechanism} channel\n\n"]

    def add_file_section(notice, heading, path):
        """Append a headed section read from *path* if the file is still present."""
        if os.path.exists(path):
            print(notice)
            with open(path, 'r', encoding='utf-8') as fh:
                chunks.append(heading + divider + "\n\n")
                chunks.append(fh.read() + "\n\n")

    # 1. Abstract (inline label, surrounding whitespace trimmed)
    if os.path.exists(abs_file):
        print(f"Adding abstract from {abs_file}")
        with open(abs_file, 'r', encoding='utf-8') as fh:
            chunks.append("Abstract: " + fh.read().strip() + "\n\n")

    # Form feed marks the page break after the abstract
    chunks.append("\f")

    # 2. Introduction -- a read failure is reported and the section is left out
    try:
        if os.path.exists(intro_file):
            print(f"Attempting to read introduction from {intro_file}")
            print(f"File size: {os.path.getsize(intro_file)} bytes")
            with open(intro_file, 'r', encoding='utf-8') as fh:
                intro_text = fh.read()
            chunks.append("INTRODUCTION\n" + divider + "\n\n")
            chunks.append(intro_text + "\n\n")
    except Exception as e:
        print(f"Error reading introduction file {intro_file}: {e}")
        print(traceback.format_exc())

    # 3. Background and hypotheses
    add_file_section(
        f"Adding background and hypotheses from {back_hypo_file}",
        "BACKGROUND AND HYPOTHESIS DEVELOPMENT\n",
        back_hypo_file,
    )

    # 4. Model specification
    add_file_section(
        f"Adding model specification from {model_specification_file}",
        "MODEL SPECIFICATION\n",
        model_specification_file,
    )

    # 5. Descriptive statistics from the panel subfolder
    desc_text = get_descriptive_stats(desc_dir, law_name, mechanism)
    if desc_text:
        print(f"Adding descriptive statistics for {law_name} - {mechanism}")
        chunks.append("DESCRIPTIVE STATISTICS\n" + divider + "\n\n")
        chunks.append(desc_text + "\n\n")

    # 6. Regression analyses from the panel subfolder
    reg_text = get_regression_analyses(reg_dir, law_name, mechanism)
    if reg_text:
        print(f"Adding regression results for {law_name} - {mechanism}")
        chunks.append("RESULTS\n" + divider + "\n\n")
        chunks.append(reg_text + "\n\n")

    # 7. Conclusion
    add_file_section(
        f"Adding conclusion from {conc_file}",
        "CONCLUSION\n",
        conc_file,
    )

    # Save combined text
    output_file = os.path.join(output_dir, output_filename)
    with open(output_file, 'w', encoding='utf-8') as fh:
        fh.write("".join(chunks))

    return output_file

def combine_all_laws(base_dir: str, csv_file: str):
    """Build a combined paper for every law/mechanism pair listed in the CSV.

    Each pair is handed to ``combine_law_mechanism_sections``. A pair counts
    as complete when that call returns an output path, and as incomplete when
    it returns nothing or raises. A summary is printed at the end.
    """
    law_mechanisms = create_law_mechanism_dict(csv_file)

    total_combinations = sum(map(len, law_mechanisms.values()))
    print(f"\nFound {total_combinations} law-mechanism combinations to process")

    complete_papers = 0
    incomplete_papers = 0

    pairs = (
        (law, mech)
        for law, active in law_mechanisms.items()
        for mech in active
    )
    for law_name, mechanism in pairs:
        try:
            output_file = combine_law_mechanism_sections(base_dir, law_name, mechanism)
            if not output_file:
                incomplete_papers += 1
                continue
            complete_papers += 1
            print(f"Successfully combined complete paper for {law_name} - {mechanism}")
            print(f"Saved to: {output_file}")
        except Exception as e:
            print(f"Error processing {law_name} - {mechanism}: {str(e)}")
            incomplete_papers += 1

    print("\nCombination Summary:")
    print(f"Total combinations: {total_combinations}")
    print(f"Complete papers: {complete_papers}")
    print(f"Incomplete papers: {incomplete_papers}")
                
# Script entry point for section 14: combine the per-section text files of
# every law/mechanism pair into one file per paper.
if __name__ == "__main__":
    print("Script starting...")  
    # Placeholder: root folder that holds the section subfolders.
    BASE_DIR = "enter folder path here"
    # The laws CSV path is resolved relative to BASE_DIR.
    CSV_FILE = os.path.join(BASE_DIR, "enter file path here")
    combine_all_laws(BASE_DIR, CSV_FILE)

# 15. Ask Claude to create a reference list
import os
import re
import anthropic
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_LEFT

def create_reference_pdf(references, output_path):
    """
    Render a reference list as a letter-size PDF using ReportLab.

    The document gets a bold "References" heading followed by one paragraph
    per non-blank reference, set in 12pt Times with a hanging indent.

    Args:
        references (list or str): List of references, or one string that is
            first split into references with ``clean_references``
        output_path (str): Path where the PDF will be saved
    """
    base_style = getSampleStyleSheet()['Normal']

    # Heading style for the "References" title
    header_style = ParagraphStyle(
        'Header',
        parent=base_style,
        fontName='Times-Bold',
        fontSize=12,
        spaceBefore=0,
        spaceAfter=20,
        alignment=TA_LEFT
    )

    # Entry style: the negative first-line indent cancels the left indent on
    # the first line only, which produces the hanging indent.
    ref_style = ParagraphStyle(
        'Reference',
        parent=base_style,
        fontName='Times-Roman',
        fontSize=12,
        leading=14,
        leftIndent=36,
        firstLineIndent=-36,
        alignment=TA_LEFT,
        spaceAfter=12
    )

    # One-inch (72pt) margins on every side
    margin = 72
    doc = SimpleDocTemplate(
        output_path,
        pagesize=letter,
        rightMargin=margin,
        leftMargin=margin,
        topMargin=margin,
        bottomMargin=margin
    )

    # A raw string is split into individual references first
    refs = clean_references(references) if isinstance(references, str) else references

    story = [Paragraph("References", header_style)]
    story.extend(
        Paragraph(clean_reference(entry), ref_style)
        for entry in refs
        if entry.strip()
    )

    doc.build(story)

def clean_reference(ref):
    """
    Cleans and formats a single reference.

    Collapses whitespace, strips ``TextBlock(...)`` wrapper residue and
    apostrophes, normalises spacing around periods, ampersands, commas and
    parentheses, removes ``*...*`` emphasis markers, and makes the reference
    end with exactly one period.

    Args:
        ref (str): Reference string to clean

    Returns:
        str: Cleaned reference (``"."`` for an empty input)
    """
    # Remove line breaks and excess whitespace
    ref = ' '.join(ref.split())

    # Remove TextBlock and other formatting markers. NOTE: this also drops
    # every apostrophe, including those inside names such as "O'Brien".
    ref = re.sub(r'TextBlock\(text=|type=\'text\'\)|\'|\\\n|\\n', '', ref)

    # Fix spacing around periods in author names
    ref = re.sub(r'\.\s*([A-Z])', r'. \1', ref)

    # Fix spacing around ampersands
    ref = re.sub(r'\s*&\s*', ' & ', ref)

    # Fix multiple spaces
    ref = re.sub(r'\s+', ' ', ref)

    # Remove the asterisks used to mark journal names; the text is kept but
    # the emphasis itself is not carried over.
    ref = re.sub(r'\s*\*([^*]+)\*', r' \1', ref)

    # Ensure proper spacing after commas
    ref = re.sub(r',\s*', ', ', ref)

    # Fix spacing around parentheses
    ref = re.sub(r'\s*\(\s*', ' (', ref)
    ref = re.sub(r'\s*\)', ')', ref)

    # The rule above also separates a volume from its issue ("58 (2)").
    # Re-attach a short numeric issue to the preceding volume number so the
    # result reads "58(2)"; four-digit years such as "(2020)" are untouched.
    ref = re.sub(r'(\d) \((\d{1,3}(?:[-–]\d{1,3})?)\)', r'\1(\2)', ref)

    # Ensure the reference ends with a period
    ref = ref.rstrip('.')
    ref += '.'

    return ref.strip()

def clean_references(text):
    """
    Split raw reference text into a de-duplicated list of cleaned references.

    Literal ``\\n`` sequences are turned into real newlines and ``TextBlock``
    wrapper residue and apostrophes are removed. Blank lines and a lone
    "references" heading are ignored. A line starting with a capital letter
    opens a new reference; any other line is treated as a continuation of the
    current one.

    Args:
        text (str): Full text containing references

    Returns:
        list: Cleaned references in first-seen order, without duplicates
    """
    normalised = text.replace('\\n', '\n')
    normalised = re.sub(r'TextBlock\(text=|type=\'text\'\)|\'', '', normalised)

    refs = []
    pending = []  # lines of the reference currently being assembled

    for raw_line in normalised.split('\n'):
        line = raw_line.strip()
        if not line or line.lower() == 'references':
            continue

        starts_new = bool(pending) and re.match(r'^[A-Z]', line)
        if starts_new:
            refs.append(' '.join(pending))
            pending = [line]
        else:
            pending.append(line)

    if pending:
        refs.append(' '.join(pending))

    cleaned_refs = [
        candidate
        for candidate in (clean_reference(item) for item in refs)
        if candidate and not candidate.isspace()
    ]

    # dict keys keep insertion order, so this drops repeats but keeps order
    return list(dict.fromkeys(cleaned_refs))

def extract_response_text(content):
    """Flatten the ``content`` of a Claude response into plain text.

    A list is treated as a sequence of content blocks: each block's ``text``
    attribute is used when it is a string (``str(block)`` otherwise) and the
    pieces are joined with newlines. Any other value is returned unchanged.
    """
    if not isinstance(content, list):
        return content

    parts = []
    for item in content:
        text = getattr(item, 'text', None)
        # str(item) would yield the block's repr ("TextBlock(text='...', ...)")
        # with escaped newlines and quoting instead of the reference text.
        parts.append(text if isinstance(text, str) else str(item))
    return '\n'.join(parts)


def get_formatted_references(prompt):
    """
    Gets formatted references using the Anthropic Claude API.

    Args:
        prompt (str): The prompt to send to Claude

    Returns:
        str: Formatted references from Claude as plain text, or None when
        the response has no content or the API call fails (the error is
        printed)
    """
    try:
        client = anthropic.Anthropic(
            api_key="enter API here"
        )

        # Make the API call
        message = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=4000,
            temperature=0,
            system="You are a helpful research assistant with expertise in academic citations. Format references in proper APA style with full journal names, volumes, and page numbers.",
            messages=[
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )

        # Extract the text of the response
        if message and hasattr(message, 'content'):
            return extract_response_text(message.content)

        return None

    except Exception as e:
        print(f"API Error: {e}")
        return None

def batch_process_files(input_dir, output_dir):
    """
    Create a reference-list PDF for every ``*_combined.txt`` file in a folder.

    For each matching file, the text is sent to Claude with a formatting
    prompt and the returned references are rendered to
    ``<name>_references.pdf`` in the output folder. A file counts as an error
    when no references come back or when any step raises; processing then
    continues with the next file. A summary is printed at the end.

    Args:
        input_dir (str): Path to directory containing input text files
        output_dir (str): Path to directory where PDFs will be saved
    """
    os.makedirs(output_dir, exist_ok=True)

    suffix = '_combined.txt'
    processed = 0
    errors = 0

    for filename in os.listdir(input_dir):
        if not filename.endswith(suffix):
            continue

        try:
            output_path = os.path.join(
                output_dir, filename.replace(suffix, '_references.pdf')
            )

            with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as source:
                text = source.read()

            # Prompt for Claude (sent verbatim, including its indentation)
            prompt = f"""Based on the following text, generate a reference list in APA format. 
                Format each reference exactly like these examples:

                Leuz, C., & Verrecchia, R. E. (2000). The economic consequences of increased disclosure. Journal of Accounting Research, 91-124.

                Bourveau, T., She, G., & Zaldokas, A. (2020). Corporate disclosure as a tacit coordination mechanism: Evidence from cartel enforcement regulations. Journal of Accounting Research, 58(2), 295-332.

                Text for analysis:
                {text}

                Please format each reference following the exact style above, including:
                1. Remove any asterisks, TextBlock tags, or other formatting markers 
                2. Author names with initials
                3. Full title in sentence case
                4. Journal name in italics (use *journal name* for italics)
                5. Volume, issue, and page numbers where applicable
                6. Year in parentheses
                7. One reference per line 
                8. Subsequent references should be followed by a space after the previous reference
                9. Sort alphabetically by author's last name
                10. Provide only the references, no extra text or explanations"""

            formatted_refs = get_formatted_references(prompt)

            if not formatted_refs:
                errors += 1
                print(f"Error getting references for: {filename}")
            else:
                create_reference_pdf(formatted_refs, output_path)
                processed += 1
                print(f"Successfully processed: {filename}")

        except Exception as e:
            errors += 1
            print(f"Error processing {filename}: {str(e)}")

    # Summary
    print("\nProcessing complete!")
    print(f"Successfully processed: {processed} files")
    print(f"Errors: {errors} files")

# Script entry point for section 15: build one reference-list PDF for every
# "*_combined.txt" file found in the input directory.
if __name__ == "__main__":
    # Set your input and output directories (placeholders to be replaced)
    input_directory = "enter folder path here"
    output_directory = "enter folder path here"
    
    # Process all files
    batch_process_files(input_directory, output_directory)

# 16. Combine manuscript files with table files for descriptive statistics and regression analyses

import os
import re
from PyPDF2 import PdfMerger
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT, TA_CENTER
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

def get_descriptive_stats(desc_dir: str, law_name: str, mechanism: str) -> str:
    """Locate the descriptive statistics PDF for one law/mechanism panel.

    The panel folder is named ``panel_<law>_<mechanism>`` where underscores
    are stripped from ``law_name`` and spaces in ``mechanism`` become
    underscores. The table is expected at
    ``<desc_dir>/<panel folder>/descriptive_stats_table.pdf``.

    Args:
        desc_dir: Folder that holds the per-panel subfolders.
        law_name: Law identifier; underscores are removed for the folder name.
        mechanism: Mechanism label; spaces are replaced by underscores.

    Returns:
        The path of the table PDF when it exists, otherwise an empty string.
        A message describing the outcome is printed in both cases.
    """
    folder_name = "panel_" + law_name.replace('_', '') + "_" + mechanism.replace(' ', '_')
    panel_path = os.path.join(desc_dir, folder_name)
    table_path = os.path.join(panel_path, 'descriptive_stats_table.pdf')

    # Guard clause: report the missing table and signal it with "".
    if not os.path.exists(table_path):
        print(f"Warning: No descriptive statistics table found in {panel_path}")
        return ""

    print(f"Found descriptive statistics table: {table_path}")
    return table_path

def get_regression_analyses(reg_dir: str, law_name: str, mechanism: str) -> str:
    """Locate the regression table PDF for one law/mechanism panel.

    Looks for ``regression_table.pdf`` inside
    ``<reg_dir>/panel_<law>_<mechanism>``, where underscores are removed from
    ``law_name`` and spaces in ``mechanism`` are turned into underscores.

    Args:
        reg_dir: Folder that holds the per-panel subfolders.
        law_name: Law identifier; underscores are removed for the folder name.
        mechanism: Mechanism label; spaces are replaced by underscores.

    Returns:
        The path of the regression table when it exists, otherwise an empty
        string. The outcome is also printed.
    """
    law_token = law_name.replace('_', '')
    mechanism_token = mechanism.replace(' ', '_')
    panel_dir = os.path.join(reg_dir, f"panel_{law_token}_{mechanism_token}")
    candidate = os.path.join(panel_dir, 'regression_table.pdf')

    found = os.path.exists(candidate)
    if found:
        print(f"Found regression table: {candidate}")
    else:
        print(f"Warning: No regression table found in {panel_dir}")
    return candidate if found else ""

def _register_manuscript_fonts():
    """Register the Times New Roman TrueType fonts and pick usable font names.

    Returns:
        A ``(regular_font, bold_font)`` tuple of names that can be passed as
        ``fontName``. For each TrueType file that cannot be registered, the
        built-in PDF font (``Times-Roman`` / ``Times-Bold``) is returned
        instead, so no style ever refers to an unregistered font.
    """
    font_specs = (
        ('Times New Roman', 'times.ttf', 'Times-Roman'),
        ('Times New Roman Bold', 'timesbd.ttf', 'Times-Bold'),
    )
    resolved = []
    for font_name, font_file, builtin_name in font_specs:
        try:
            pdfmetrics.registerFont(TTFont(font_name, font_file))
        except Exception as exc:
            # Best-effort: a missing/unreadable font file must not abort the
            # merge, but the style has to point at a font that really exists.
            print(
                f"Warning: Times New Roman font not found ({font_file}: {exc}), "
                f"using default font {builtin_name}"
            )
            resolved.append(builtin_name)
        else:
            resolved.append(font_name)
    return resolved[0], resolved[1]


def _build_manuscript_styles(regular_font, bold_font):
    """Create the title, subtitle, body and heading paragraph styles."""
    return {
        'title': ParagraphStyle(
            name='CustomTitle',
            fontName=bold_font,
            fontSize=16,
            spaceAfter=16,
            spaceBefore=24,
            firstLineIndent=0,
            alignment=TA_CENTER,
            leading=24,
        ),
        'subtitle': ParagraphStyle(
            name='CustomSubtitle',
            fontName=regular_font,
            fontSize=12,
            spaceAfter=24,
            spaceBefore=12,
            firstLineIndent=0,
            alignment=TA_CENTER,
            leading=24,
        ),
        'body': ParagraphStyle(
            name='CustomRegular',
            fontName=regular_font,
            fontSize=12,
            spaceAfter=12,
            firstLineIndent=36,
            leading=24,
            alignment=TA_JUSTIFY,
        ),
        'heading': ParagraphStyle(
            name='CustomHeading',
            fontName=regular_font,
            fontSize=12,
            spaceAfter=24,
            spaceBefore=24,
            firstLineIndent=0,
            alignment=TA_LEFT,
            leading=24,
        ),
    }


def _build_manuscript_story(manuscript_text, law_name, styles):
    """Turn the combined manuscript text into a list of reportlab flowables."""
    # Split a CamelCase law name at each capital letter; fall back to the raw
    # name when it contains no capitals so the title never starts empty.
    formatted_law_name = " ".join(re.findall(r'[A-Z][^A-Z]*', law_name)) or law_name

    story = [
        Paragraph(f"{formatted_law_name} and Voluntary Disclosure", styles['title']),
        Paragraph("Artemis Intelligencia", styles['subtitle']),
        Paragraph("February 1, 2025", styles['subtitle']),
        Spacer(1, 24),
    ]

    # Blocks are separated by blank lines; the first block is skipped because
    # it contains the old title.
    # NOTE(review): text is handed to Paragraph unescaped, so markup-like
    # characters in the manuscript are interpreted by reportlab - confirm the
    # inputs are clean.
    for section in manuscript_text.split('\n\n')[1:]:
        if not section.strip():
            continue
        if '=' in section:
            # Any block containing '=' is treated as a section heading and
            # only its first line is kept.
            # NOTE(review): a body paragraph that contains '=' would also be
            # reduced to its first line - confirm the heading marker format.
            story.append(Paragraph(section.split('\n')[0], styles['heading']))
        else:
            for paragraph in section.split('\n'):
                if paragraph.strip():
                    story.append(Paragraph(paragraph, styles['body']))
    return story


def merge_pdf_files(base_dir: str, law_name: str, mechanism: str):
    """Merge manuscript PDF with regression results, descriptive statistics, and reference PDFs

    The manuscript text is read from
    ``<base_dir>/combined_sections/<law>_<mechanism>_combined.txt``, rendered
    to a temporary PDF, and then merged (in this order) with the references
    PDF, the descriptive statistics table, the correlation table and the
    regression table. Each of those four parts is optional: a missing file is
    reported and skipped. The result is written to
    ``<base_dir>/final_manuscripts``.

    Args:
        base_dir: Root folder containing the ``combined_sections``,
            ``regression_analyses``, ``descriptive_stats``, ``correlations``
            and ``references`` subfolders.
        law_name: Law identifier used in file names and (split at capital
            letters) in the title.
        mechanism: Mechanism label; spaces become underscores in file names.

    Raises:
        OSError: If the combined manuscript text file cannot be read.
        Exception: Errors raised while building the intermediate PDF
            propagate to the caller. Errors during the merge itself are
            printed and swallowed.
    """
    regular_font, bold_font = _register_manuscript_fonts()

    # Clean mechanism name for filenames
    clean_mechanism = mechanism.replace(' ', '_')

    # Define file paths
    combined_sections_dir = os.path.join(base_dir, 'combined_sections')
    reg_dir = os.path.join(base_dir, 'regression_analyses')
    desc_dir = os.path.join(base_dir, 'descriptive_stats')
    corr_dir = os.path.join(base_dir, 'correlations')
    ref_dir = os.path.join(base_dir, 'references')
    output_dir = os.path.join(base_dir, 'final_manuscripts')
    os.makedirs(output_dir, exist_ok=True)

    # Intermediate PDF holding the formatted manuscript (1 inch margins)
    temp_pdf_path = os.path.join(output_dir, f'temp_{law_name}_{clean_mechanism}.pdf')
    temp_pdf = SimpleDocTemplate(
        temp_pdf_path,
        pagesize=letter,
        rightMargin=72,
        leftMargin=72,
        topMargin=72,
        bottomMargin=72
    )

    # Read manuscript
    manuscript_file = os.path.join(combined_sections_dir, f"{law_name}_{clean_mechanism}_combined.txt")
    with open(manuscript_file, 'r', encoding='utf-8') as f:
        manuscript_text = f.read()

    styles = _build_manuscript_styles(regular_font, bold_font)
    story = _build_manuscript_story(manuscript_text, law_name, styles)

    try:
        # Create the intermediate PDF (failures here propagate to the caller)
        temp_pdf.build(story)

        merger = PdfMerger()
        try:
            # Add formatted manuscript
            merger.append(temp_pdf_path)

            # Add references
            ref_file = os.path.join(ref_dir, f"{law_name}_{clean_mechanism}_references.pdf")
            if os.path.exists(ref_file):
                merger.append(ref_file)
                print(f"Added references from: {ref_file}")
            else:
                print(f"No references file found at: {ref_file}")

            # Add descriptive statistics table
            desc_stats_path = get_descriptive_stats(desc_dir, law_name, mechanism)
            if desc_stats_path:
                merger.append(desc_stats_path)
            else:
                print(f"No descriptive statistics table found for {law_name}_{mechanism}")

            # Add correlations table
            corr_file = os.path.join(corr_dir, f"{law_name}_{clean_mechanism}_correlation_table.pdf")
            if os.path.exists(corr_file):
                merger.append(corr_file)
                print(f"Added correlation table from: {corr_file}")
            else:
                print(f"No correlation file found at: {corr_file}")

            # Add regression table
            reg_table_path = get_regression_analyses(reg_dir, law_name, mechanism)
            if reg_table_path:
                merger.append(reg_table_path)
            else:
                print(f"No regression table found for {law_name}_{mechanism}")

            # Save final merged PDF
            output_file = os.path.join(
                output_dir,
                f"{law_name} and Voluntary Disclosure_{clean_mechanism}_final.pdf"
            )
            merger.write(output_file)

            print(f"Successfully created formatted PDF for {law_name} - {mechanism}")
            print(f"Saved to: {output_file}")

        except Exception as e:
            print(f"Error creating PDF: {str(e)}")
        finally:
            # Close exactly once, before the temporary file is deleted.
            merger.close()
    finally:
        # Clean up the temporary file on success and on failure alike.
        if os.path.exists(temp_pdf_path):
            try:
                os.remove(temp_pdf_path)
            except OSError as e:
                print(f"Warning: could not remove temporary file {temp_pdf_path}: {e}")

# Batch processing of multiple laws and mechanisms
def batch_merge_pdfs(base_dir):
    """Run merge_pdf_files for every configured (law, mechanism) pair.

    A failure for one pair is printed and does not stop the remaining pairs.

    Args:
        base_dir: Root folder passed through to merge_pdf_files.
    """
    # (law, mechanism) pairs to process; the entries below are placeholders
    # to be replaced with the real law and mechanism names.
    laws_mechanisms = [
        ("Law 1", "Mechanism 1"),
        ("Law 1", "Mechanism 2"),
        ("Law 2", "Mechanism 1"),
        ("Other laws", "Other mechanisms"),
    ]

    def _merge_one(law, mechanism):
        # Isolate each pair so one bad manuscript cannot abort the batch.
        try:
            merge_pdf_files(base_dir, law, mechanism)
        except Exception as exc:
            print(f"Error processing {law} - {mechanism}: {str(exc)}")

    for pair in laws_mechanisms:
        _merge_one(*pair)


# Root folder holding the section-16 input subfolders (placeholder: replace
# with a real path before running).
BASE_DIR = "enter folder path here"

# Guarded like the earlier sections so importing this file does not start
# the batch merge; running it as a script or notebook cell still does.
if __name__ == "__main__":
    batch_merge_pdfs(BASE_DIR)


# 17. Add page numbers 
import os
from PyPDF2 import PdfReader, PdfWriter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import io

def add_page_numbers(input_path, output_path):
    """Write a copy of a PDF with a centred page number at the bottom of each page.

    The first page (index 0) is left unnumbered. Every later page is
    labelled with its zero-based index, so the second page shows "1". The
    number is drawn in 12 pt Times-Roman, 40 points above the bottom edge and
    horizontally centred on the page.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the numbered PDF is written to.
    """
    reader = PdfReader(input_path)
    writer = PdfWriter()

    for index, page in enumerate(reader.pages):
        if index > 0:
            # Coerce to float: mediabox dimensions may not be plain floats
            # depending on the PyPDF2 version.
            width = float(page.mediabox.width)
            height = float(page.mediabox.height)

            # Render the number onto an in-memory single-page PDF of the same
            # size, then stamp that page over the original.
            packet = io.BytesIO()
            can = canvas.Canvas(packet, pagesize=(width, height))
            can.setFont('Times-Roman', 12)
            # Centre on the actual text width so one-, two- and three-digit
            # numbers all sit in the middle of the page.
            can.drawCentredString(width / 2, 40, str(index))
            can.save()
            packet.seek(0)

            overlay = PdfReader(packet)
            if len(overlay.pages) > 0:
                page.merge_page(overlay.pages[0])

        writer.add_page(page)

    with open(output_path, "wb") as output_file:
        writer.write(output_file)

def batch_process_pdfs(input_dir, output_dir):
    """Add page numbers to every ``.pdf`` file found directly in a folder.

    Each result is written to ``output_dir`` as ``numbered_<original name>``.
    The output folder is created when missing, and a failure on one file is
    printed without stopping the rest of the batch.

    Args:
        input_dir: Folder scanned (non-recursively) for files ending in '.pdf'.
        output_dir: Folder that receives the numbered copies.
    """
    os.makedirs(output_dir, exist_ok=True)

    pdf_names = []
    for entry in os.listdir(input_dir):
        if entry.endswith('.pdf'):
            pdf_names.append(entry)
    total = len(pdf_names)

    position = 0
    for filename in pdf_names:
        position += 1
        print(f"\nProcessing {position}/{total}: {filename}")
        source_path = os.path.join(input_dir, filename)
        target_path = os.path.join(output_dir, f"numbered_{filename}")
        try:
            add_page_numbers(source_path, target_path)
        except Exception as exc:
            print(f"Error processing {filename}: {str(exc)}")
        else:
            print(f"Successfully processed: {filename}")


# Folders for the page-numbering step (placeholders: replace with real paths
# before running).
input_dir = "enter folder path here"
output_dir = "enter folder path here"

# Guarded like the earlier sections so importing this file does not start
# the batch; running it as a script or notebook cell still does.
if __name__ == "__main__":
    batch_process_pdfs(input_dir, output_dir)

