# In [None]:
# Code was developed in collaboration with Claude Sonnet 4.
# Claude was used for code generation, documentation, and error handling.
# Note: the code runs best when each numbered section is executed separately.

# 1. Use Claude to generate code for empirical analysis

import anthropic
import json
import os


# Prefer the ANTHROPIC_API_KEY environment variable so the secret does not have
# to be stored in this file. The placeholder stays as the fallback, so the key
# can still be pasted in here directly.
API_KEY = os.environ.get("ANTHROPIC_API_KEY", "enter API key here")

# Code-generation prompt for the empirical analysis.
# It describes the three regression specifications, the firm-level clustering,
# the column names of the input panels, and the JSON/PDF outputs to produce.
# The wording is left untouched because the string is sent to the API as-is.
prompt = """You are an accounting academic. Your task is to generate Python code to conduct empirical analysis.
Run multiple regression specifications with and without fixed effects.

IMPORTANT: These regressions will be run for multiple panels from a directory.
 
Follow these guidelines:
-Write Python code to generate a time trend variable.
-Write Python code to filter the data to ±2 years around regulation year.
-Write Python code to generate 3 regression specifications:
    For the first regression, this is baseline model, a univariate regression using ordinary least   squares (OLS) with no control variables
    For the second regression, this is a model with control variables (including the time trend variable you just created)
    For the third regression, this is a model with control variables and firm fixed effects
-Each regression should  include standard errors clustered at the firm level.
-Write Process all the CSV files in the directory.
-Write Python code to save the regression results as a JSON file and as a PDF file.

IMPORTANT: 
-The regulation year is stored in a column called 'Year' in the data.
-The data has firm and year identifiers: 'GVKEY' and 'FYEAR'.
-The independent variable is 'treatment_effect'.
-The control variables are 'linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12', 'levol', 'lloss', 'lcalrisk','time_trend'.
-The dependent variable is 'freqMF'.
-Include both coefficient estimates and t-stats in outputs. 
"""

def call_claude_api(prompt, api_key, model="claude-sonnet-4-20250514"):
    """
    Submit a single user-turn prompt to the Claude Messages API.

    Returns the text of the first content block of the reply. If anything
    raises along the way, the error is printed and None is returned instead.
    """
    conversation = [{"role": "user", "content": prompt}]
    request_options = {
        "model": model,
        "max_tokens": 8000,
        "temperature": 0.5,
        "messages": conversation,
    }

    try:
        reply = anthropic.Anthropic(api_key=api_key).messages.create(**request_options)
        first_block = reply.content[0]
        return first_block.text
    except Exception as e:
        print(f"Error calling Claude API: {e}")
        return None

def save_response(response, filename="claude_response.txt"):
    """
    Write Claude's reply to `filename` as UTF-8 text, replacing any existing
    file, then print a confirmation line.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.write(response)

    print(f"Response saved to {filename}")

if __name__ == "__main__":
    print("Sending prompt to Claude API...")
    response = call_claude_api(prompt, API_KEY)

    if not response:
        print("Failed to get response from Claude API")
    else:
        divider = "-" * 50
        print("Response received!")
        print(divider)
        print(response)
        print(divider)

        # Keep the full reply on disk
        save_response(response, "empirical_analysis_code.txt")

        # Pull out the first ```python fenced block, if it is properly closed
        _, opening_fence, after_fence = response.partition("```python")
        fenced_body, closing_fence, _ = after_fence.partition("```")
        if opening_fence and closing_fence:
            python_code = fenced_body.strip()
            with open("empirical_analysis.py", 'w', encoding='utf-8') as f:
                f.write(python_code)
            print("Python code extracted and saved to empirical_analysis.py")
        
# 2. Code below was generated by Claude (via the request in section 1)
# and copied and pasted into this cell.

import pandas as pd
import numpy as np
import os
import json
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import statsmodels.api as sm
from statsmodels.formula.api import ols
from linearmodels.panel import PanelOLS
import warnings
warnings.filterwarnings('ignore')

def generate_time_trend(df):
    """
    Return a copy of `df` with a 'time_trend' column: the number of years
    between each row's FYEAR and the earliest FYEAR in the frame.
    """
    fiscal_year = df['FYEAR']
    return df.assign(time_trend=fiscal_year - fiscal_year.min())

def filter_data_around_regulation(df, window=2):
    """
    Keep the rows whose FYEAR lies within `window` years (inclusive) of the
    regulation year stored in that row's 'Year' column.

    The input frame is left untouched. Without a 'Year' column nothing is
    filtered and a copy of the data is returned.
    """
    if 'Year' not in df.columns:
        return df.copy()

    earliest = df['Year'] - window
    latest = df['Year'] + window
    in_window = df['FYEAR'].between(earliest, latest)  # both bounds inclusive
    return df[in_window]

def _summarize_fit(fit, t_stats, std_errors, formula=None):
    """Flatten one fitted model into the JSON-serialisable dict stored per specification."""
    names = list(fit.params.index)
    summary = {} if formula is None else {'formula': formula}
    summary['n_obs'] = int(fit.nobs)
    summary['r_squared'] = float(fit.rsquared)
    summary['coefficients'] = {name: float(fit.params[name]) for name in names}
    summary['t_stats'] = {name: float(t_stats[name]) for name in names}
    summary['p_values'] = {name: float(fit.pvalues[name]) for name in names}
    summary['std_errors'] = {name: float(std_errors[name]) for name in names}
    return summary


def run_regression_specifications(df):
    """
    Run three regression specifications with standard errors clustered by firm.

    The dependent variable is 'freqMF' and the regressor of interest is
    'treatment_effect':
      - 'spec1_baseline':      univariate OLS
      - 'spec2_controls':      OLS plus whichever control columns are present
      - 'spec3_fixed_effects': PanelOLS with entity (GVKEY) effects and the
                               same regressors as specification 2

    Args:
        df: DataFrame containing 'freqMF', 'treatment_effect', 'GVKEY' and
            'FYEAR', plus any of the control columns.

    Returns:
        dict keyed by specification name. Each value holds 'n_obs',
        'r_squared', 'coefficients', 't_stats', 'p_values' and 'std_errors'
        (and 'formula' for the two OLS fits), or {'error': message} when that
        specification could not be estimated. The dict is empty when no row
        has all four required values.

    Raises:
        KeyError: if any of the four required columns is absent.
    """
    results = {}

    required_cols = ['freqMF', 'treatment_effect', 'GVKEY', 'FYEAR']
    control_vars = ['linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12',
                    'levol', 'lloss', 'lcalrisk', 'time_trend']

    missing_required = [col for col in required_cols if col not in df.columns]
    missing_controls = [col for col in control_vars if col not in df.columns]
    if missing_required or missing_controls:
        print(f"Warning: Missing columns: {missing_required + missing_controls}")
    if missing_required:
        # Nothing can be estimated without these; fail with a readable message
        # instead of the bare column list pandas would raise from dropna().
        raise KeyError(f"Missing required columns: {missing_required}")

    # Remove rows with missing values in key variables
    df_clean = df.dropna(subset=required_cols)
    if len(df_clean) == 0:
        print("No valid observations after cleaning")
        return results

    # Missing controls are tolerated: specifications 2 and 3 use what exists.
    available_controls = [var for var in control_vars if var in df_clean.columns]
    regressors = ['treatment_effect'] + available_controls
    no_obs_error = {'error': 'No observations after removing missing values'}

    # Each specification is best-effort: a failure is recorded under its own
    # key so the remaining specifications still run.
    try:
        # Specification 1: Baseline OLS (univariate)
        formula1 = 'freqMF ~ treatment_effect'
        model1 = ols(formula1, data=df_clean).fit(
            cov_type='cluster', cov_kwds={'groups': df_clean['GVKEY']})
        results['spec1_baseline'] = _summarize_fit(
            model1, model1.tvalues, model1.bse, formula1)
    except Exception as e:
        print(f"Error in Specification 1: {e}")
        results['spec1_baseline'] = {'error': str(e)}

    try:
        # Specification 2: OLS with control variables.
        # Joining the full regressor list keeps the formula valid even when
        # no control column is available.
        formula2 = 'freqMF ~ ' + ' + '.join(regressors)
        df_spec2 = df_clean.dropna(subset=['freqMF'] + regressors)

        if len(df_spec2) > 0:
            model2 = ols(formula2, data=df_spec2).fit(
                cov_type='cluster', cov_kwds={'groups': df_spec2['GVKEY']})
            results['spec2_controls'] = _summarize_fit(
                model2, model2.tvalues, model2.bse, formula2)
        else:
            results['spec2_controls'] = dict(no_obs_error)
    except Exception as e:
        print(f"Error in Specification 2: {e}")
        results['spec2_controls'] = {'error': str(e)}

    try:
        # Specification 3: Panel regression with firm fixed effects
        df_spec3 = df_clean.dropna(subset=['freqMF'] + regressors + ['GVKEY', 'FYEAR'])

        if len(df_spec3) > 0:
            # PanelOLS is given an (entity, time) MultiIndex
            df_spec3 = df_spec3.set_index(['GVKEY', 'FYEAR'])
            model3 = PanelOLS(df_spec3['freqMF'], df_spec3[regressors],
                              entity_effects=True).fit(cov_type='clustered',
                                                       cluster_entity=True)
            results['spec3_fixed_effects'] = _summarize_fit(
                model3, model3.tstats, model3.std_errors)
        else:
            results['spec3_fixed_effects'] = dict(no_obs_error)
    except Exception as e:
        print(f"Error in Specification 3: {e}")
        results['spec3_fixed_effects'] = {'error': str(e)}

    return results

def _significance_stars(pval):
    """Return the significance marker for a p-value ('***' <1%, '**' <5%, '*' <10%)."""
    if pval < 0.01:
        return '***'
    if pval < 0.05:
        return '**'
    if pval < 0.10:
        return '*'
    return ''


def _build_results_table_rows(results, headers):
    """Return the table rows (header row first) for one file's regression results."""
    spec_names = ['spec1_baseline', 'spec2_controls', 'spec3_fixed_effects']

    # Specifications that failed are stored as {'error': ...}; they carry no
    # coefficients and are rendered as blank columns.
    estimated = {name: results[name] for name in spec_names
                 if name in results and 'coefficients' in results[name]}

    variables = set()
    for spec in estimated.values():
        variables.update(spec['coefficients'].keys())
    variables.discard('Intercept')

    rows = [list(headers)]

    # One coefficient row and one t-statistic row per variable
    for var in sorted(variables):
        coef_row = [var]
        tstat_row = ['']
        for name in spec_names:
            spec = estimated.get(name)
            if spec is None or var not in spec['coefficients']:
                coef_row.append('')
                tstat_row.append('')
                continue
            stars = _significance_stars(spec['p_values'][var])
            coef_row.append(f"{spec['coefficients'][var]:.4f}{stars}")
            tstat_row.append(f"({spec['t_stats'][var]:.2f})")
        rows.append(coef_row)
        rows.append(tstat_row)

    # Spacer row. It must be as wide as the header row: ax.table() raises
    # "Each row in 'cellText' must have N columns" for ragged input.
    rows.append([''] * len(headers))

    obs_row = ['Observations']
    r2_row = ['R-squared']
    for name in spec_names:
        if name in results and 'n_obs' in results[name]:
            obs_row.append(str(results[name]['n_obs']))
            r2_row.append(f"{results[name]['r_squared']:.4f}")
        else:
            obs_row.append('')
            r2_row.append('')
    rows.append(obs_row)
    rows.append(r2_row)

    return rows


def create_results_table_pdf(all_results, output_path):
    """
    Create a PDF with one regression results table per input file.

    Args:
        all_results: dict mapping a file label to the per-specification
            results dict for that file (entries without coefficient data are
            drawn as empty columns).
        output_path: path of the PDF file to write; one page is added per
            entry of `all_results`.
    """
    headers = ['Variable', 'Spec 1: Baseline', 'Spec 2: Controls', 'Spec 3: Fixed Effects']

    with PdfPages(output_path) as pdf:
        for file_name, results in all_results.items():
            fig, ax = plt.subplots(figsize=(12, 8))
            try:
                ax.axis('tight')
                ax.axis('off')

                table_data = _build_results_table_rows(results, headers)

                # Create table
                table = ax.table(cellText=table_data, cellLoc='center', loc='center')
                table.auto_set_font_size(False)
                table.set_fontsize(8)
                table.scale(1.2, 1.5)

                # Style the header row
                for col in range(len(headers)):
                    table[(0, col)].set_facecolor('#40466e')
                    table[(0, col)].set_text_props(weight='bold', color='white')

                plt.title(f'Regression Results: {file_name}', fontsize=14, fontweight='bold', pad=20)
                pdf.savefig(fig, bbox_inches='tight')
            finally:
                # Close this specific figure even if rendering failed, so
                # figures do not accumulate across files.
                plt.close(fig)

def _analysis_succeeded(results):
    """Return True when a file produced results and no specification reported an error."""
    if not results or 'error' in results:
        return False
    return not any('error' in spec for spec in results.values())


def main(data_directory="enter file path here", output_dir="regression_results"):
    """
    Run the regression pipeline on every CSV file in a directory.

    For each file: load it, add the time trend, keep the window around the
    regulation year, and estimate the three specifications. All results are
    written to `regression_results.json` and `regression_results.pdf` inside
    `output_dir`.

    Args:
        data_directory: directory containing the input CSV files. An empty
            value means the current directory.
        output_dir: directory for the JSON and PDF outputs (created if
            missing).
    """
    if not data_directory:
        data_directory = "."  # Current directory if no input

    os.makedirs(output_dir, exist_ok=True)

    # Check where files are saved
    print(f"Results will be saved to: {os.path.abspath(output_dir)}")

    # Dictionary to store all results
    all_results = {}

    # glob() yields files in filesystem order; sort so the JSON keys and PDF
    # pages come out in the same order on every run.
    csv_files = sorted(Path(data_directory).glob("*.csv"))

    if not csv_files:
        print(f"No CSV files found in {data_directory}")
        return

    print(f"Found {len(csv_files)} CSV files to process")

    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")

        # A failure in one file is recorded and must not stop the others.
        try:
            df = pd.read_csv(csv_file)
            print(f"Loaded {len(df)} observations")

            # The time trend is built on the full file, before the window filter
            df = generate_time_trend(df)

            # Filter data to ±2 years around regulation year
            df_filtered = filter_data_around_regulation(df, window=2)
            print(f"After filtering: {len(df_filtered)} observations")

            if len(df_filtered) == 0:
                print("No observations after filtering")
                continue

            all_results[csv_file.stem] = run_regression_specifications(df_filtered)

            print(f"Completed analysis for {csv_file.name}")

        except Exception as e:
            print(f"Error processing {csv_file.name}: {e}")
            all_results[csv_file.stem] = {'error': str(e)}

    # Save results as JSON
    json_output = os.path.join(output_dir, "regression_results.json")
    with open(json_output, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)
    print(f"\nResults saved to: {json_output}")

    # Create PDF with results tables
    pdf_output = os.path.join(output_dir, "regression_results.pdf")
    create_results_table_pdf(all_results, pdf_output)
    print(f"PDF report saved to: {pdf_output}")

    # Print summary. A file counts as successful only if it produced results
    # and none of its specifications reported an error.
    successful = sum(1 for r in all_results.values() if _analysis_succeeded(r))
    print(f"\nSummary:")
    print(f"- Processed {len(csv_files)} files")
    print(f"- Successful analyses: {successful}")
    print(f"- Results saved in: {output_dir}/")

if __name__ == "__main__":
    main()

# 3. Using Claude to fix the error from the Python code it created 

import anthropic
import os


# Prefer the ANTHROPIC_API_KEY environment variable so the secret does not have
# to be stored in this file. The placeholder stays as the fallback, so the key
# can still be pasted in here directly.
API_KEY = os.environ.get("ANTHROPIC_API_KEY", "enter API key here")

# Error-fixing prompt: embeds the failing create_results_table_pdf function
# followed by the traceback it produced.
# The literal is a raw string because the traceback contains a Windows path
# (~\Anaconda3\...\table.py). In a normal string "\t" would be turned into a
# tab and the other backslash pairs are invalid escape sequences.
prompt = r"""I have Python code that runs empirical analysis and creates regression tables. The script successfully 
creates JSON output but fails when creating the PDF table with this error:

Here's the current `create_results_table_pdf` function that needs to be fixed:

def create_results_table_pdf(all_results, output_path):

    with PdfPages(output_path) as pdf:
        for file_name, results in all_results.items():
            fig, ax = plt.subplots(figsize=(12, 8))
            ax.axis('tight')
            ax.axis('off')
            
            # Create table data
            table_data = []
            headers = ['Variable', 'Spec 1: Baseline', 'Spec 2: Controls', 'Spec 3: Fixed Effects']
            table_data.append(headers)
            
            # Get all variables across specifications
            all_vars = set()
            for spec_name in ['spec1_baseline', 'spec2_controls', 'spec3_fixed_effects']:
                if spec_name in results and 'coefficients' in results[spec_name]:
                    all_vars.update(results[spec_name]['coefficients'].keys())
            
            # Add coefficient and t-stat rows for each variable
            for var in sorted(all_vars):
                if var == 'Intercept':
                    continue
                    
                coef_row = [var]
                tstat_row = ['']
                
                for spec_name in ['spec1_baseline', 'spec2_controls', 'spec3_fixed_effects']:
                    if (spec_name in results and 
                        'coefficients' in results[spec_name] and 
                        var in results[spec_name]['coefficients']):
                        
                        coef = results[spec_name]['coefficients'][var]
                        tstat = results[spec_name]['t_stats'][var]
                        pval = results[spec_name]['p_values'][var]
                        
                        # Add significance stars
                        stars = ''
                        if pval < 0.01:
                            stars = '***'
                        elif pval < 0.05:
                            stars = '**'
                        elif pval < 0.10:
                            stars = '*'
                        
                        coef_row.append(f'{coef:.4f}{stars}')
                        tstat_row.append(f'({tstat:.2f})')
                    else:
                        coef_row.append('')
                        tstat_row.append('')
                
                table_data.append(coef_row)
                table_data.append(tstat_row)
            
            # Add summary statistics
            table_data.append([''])
            obs_row = ['Observations']
            r2_row = ['R-squared']
            
            for spec_name in ['spec1_baseline', 'spec2_controls', 'spec3_fixed_effects']:
                if spec_name in results and 'n_obs' in results[spec_name]:
                    obs_row.append(str(results[spec_name]['n_obs']))
                    r2_row.append(f"{results[spec_name]['r_squared']:.4f}")
                else:
                    obs_row.append('')
                    r2_row.append('')
            
            table_data.append(obs_row)
            table_data.append(r2_row)
            
            # Create table
            table = ax.table(cellText=table_data, cellLoc='center', loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(8)
            table.scale(1.2, 1.5)
            
            # Style the table
            for i in range(len(headers)):
                table[(0, i)].set_facecolor('#40466e')
                table[(0, i)].set_text_props(weight='bold', color='white')
            
            plt.title(f'Regression Results: {file_name}', fontsize=14, fontweight='bold', pad=20)
            pdf.savefig(fig, bbox_inches='tight')
            plt.close()
```

Here is the Python error for this function:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[9], line 328
    325     print(f"- Results saved in: {output_dir}/")
    327 if __name__ == "__main__":
--> 328     main()

Cell In[9], line 318, in main()
    316 # Create PDF with results tables
    317 pdf_output = os.path.join(output_dir, "regression_results.pdf")
--> 318 create_results_table_pdf(all_results, pdf_output)
    319 print(f"PDF report saved to: {pdf_output}")
    321 # Print summary

Cell In[9], line 240, in create_results_table_pdf(all_results, output_path)
    237 table_data.append(r2_row)
    239 # Create table
--> 240 table = ax.table(cellText=table_data, cellLoc='center', loc='center')
    241 table.auto_set_font_size(False)
    242 table.set_fontsize(8)

File ~\Anaconda3\lib\site-packages\matplotlib\table.py:746, in table(ax, cellText, cellColours, cellLoc, colWidths, rowLabels, rowColours, rowLoc, colLabels, colColours, colLoc, loc, bbox, edges, **kwargs)
    744 for row in cellText:
    745     if len(row) != cols:
--> 746         raise ValueError("Each row in 'cellText' must have {} columns"
    747                          .format(cols))
    749 if cellColours is not None:
    750     if len(cellColours) != rows:

ValueError: Each row in 'cellText' must have 4 columns


PLEASE FIX THIS ERROR


"""

def call_claude_api(prompt, api_key, model="claude-sonnet-4-20250514"):
    """
    Submit a single user-turn prompt to the Claude Messages API.

    Returns the text of the first content block of the reply. If anything
    raises along the way, the error is printed and None is returned instead.
    """
    conversation = [{"role": "user", "content": prompt}]
    request_options = {
        "model": model,
        "max_tokens": 8000,
        "temperature": 0.50,
        "messages": conversation,
    }

    try:
        reply = anthropic.Anthropic(api_key=api_key).messages.create(**request_options)
        first_block = reply.content[0]
        return first_block.text
    except Exception as e:
        print(f"Error calling Claude API: {e}")
        return None

def save_response(response, filename="fixed_function.txt"):
    """
    Write Claude's reply to `filename` as UTF-8 text, replacing any existing
    file, then print a confirmation line.
    """
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.write(response)

    print(f"Fixed function saved to: {filename}")

if __name__ == "__main__":
    # Stop early if the key is still a placeholder. This file's placeholder is
    # "enter API key here"; the older "your-api-key-here" spelling is rejected
    # as well.
    if not API_KEY or API_KEY in ("enter API key here", "your-api-key-here"):
        print("Please replace 'enter API key here' with your actual API key")
        # Raise directly rather than calling exit(), which is a helper added by
        # the site module and is not guaranteed to exist.
        raise SystemExit(1)

    print("Sending error fix request to Claude API...")
    response = call_claude_api(prompt, API_KEY)

    if response:
        print("Response received!")
        print("-" * 50)
        print(response)
        print("-" * 50)

        # Save the full reply
        save_response(response, "fixed_pdf_function.txt")

        # Extract the first ```python fenced block, if it is properly closed
        if "```python" in response:
            code_start = response.find("```python") + len("```python")
            code_end = response.find("```", code_start)
            if code_end != -1:
                python_code = response[code_start:code_end].strip()
                with open("fixed_create_results_table_pdf.py", 'w', encoding='utf-8') as f:
                    f.write(python_code)
                print("Fixed function code saved to: fixed_create_results_table_pdf.py")
                print("\nYou can now copy this function back into your main script.")
    else:
        print("Failed to get response from Claude API")
        
        
# 4. Code below was generated by Claude (sections 1 and 3)
# and copied and pasted into this cell.
# It includes Claude's fix for create_results_table_pdf.

import pandas as pd
import numpy as np
import os
import json
from pathlib import Path
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import statsmodels.api as sm
from statsmodels.formula.api import ols
from linearmodels.panel import PanelOLS
import warnings
warnings.filterwarnings('ignore')

def generate_time_trend(df):
    """
    Return a copy of `df` with a 'time_trend' column: the number of years
    between each row's FYEAR and the earliest FYEAR in the frame.
    """
    fiscal_year = df['FYEAR']
    return df.assign(time_trend=fiscal_year - fiscal_year.min())

def filter_data_around_regulation(df, window=2):
    """
    Keep the rows whose FYEAR lies within `window` years (inclusive) of the
    regulation year stored in that row's 'Year' column.

    The input frame is left untouched. Without a 'Year' column nothing is
    filtered and a copy of the data is returned.
    """
    if 'Year' not in df.columns:
        return df.copy()

    earliest = df['Year'] - window
    latest = df['Year'] + window
    in_window = df['FYEAR'].between(earliest, latest)  # both bounds inclusive
    return df[in_window]

def _summarize_fit(fit, t_stats, std_errors, formula=None):
    """Flatten one fitted model into the JSON-serialisable dict stored per specification."""
    names = list(fit.params.index)
    summary = {} if formula is None else {'formula': formula}
    summary['n_obs'] = int(fit.nobs)
    summary['r_squared'] = float(fit.rsquared)
    summary['coefficients'] = {name: float(fit.params[name]) for name in names}
    summary['t_stats'] = {name: float(t_stats[name]) for name in names}
    summary['p_values'] = {name: float(fit.pvalues[name]) for name in names}
    summary['std_errors'] = {name: float(std_errors[name]) for name in names}
    return summary


def run_regression_specifications(df):
    """
    Run three regression specifications with standard errors clustered by firm.

    The dependent variable is 'freqMF' and the regressor of interest is
    'treatment_effect':
      - 'spec1_baseline':      univariate OLS
      - 'spec2_controls':      OLS plus whichever control columns are present
      - 'spec3_fixed_effects': PanelOLS with entity (GVKEY) effects and the
                               same regressors as specification 2

    Args:
        df: DataFrame containing 'freqMF', 'treatment_effect', 'GVKEY' and
            'FYEAR', plus any of the control columns.

    Returns:
        dict keyed by specification name. Each value holds 'n_obs',
        'r_squared', 'coefficients', 't_stats', 'p_values' and 'std_errors'
        (and 'formula' for the two OLS fits), or {'error': message} when that
        specification could not be estimated. The dict is empty when no row
        has all four required values.

    Raises:
        KeyError: if any of the four required columns is absent.
    """
    results = {}

    required_cols = ['freqMF', 'treatment_effect', 'GVKEY', 'FYEAR']
    control_vars = ['linstown', 'lsize', 'lbtm', 'lroa', 'lsaret12',
                    'levol', 'lloss', 'lcalrisk', 'time_trend']

    missing_required = [col for col in required_cols if col not in df.columns]
    missing_controls = [col for col in control_vars if col not in df.columns]
    if missing_required or missing_controls:
        print(f"Warning: Missing columns: {missing_required + missing_controls}")
    if missing_required:
        # Nothing can be estimated without these; fail with a readable message
        # instead of the bare column list pandas would raise from dropna().
        raise KeyError(f"Missing required columns: {missing_required}")

    # Remove rows with missing values in key variables
    df_clean = df.dropna(subset=required_cols)
    if len(df_clean) == 0:
        print("No valid observations after cleaning")
        return results

    # Missing controls are tolerated: specifications 2 and 3 use what exists.
    available_controls = [var for var in control_vars if var in df_clean.columns]
    regressors = ['treatment_effect'] + available_controls
    no_obs_error = {'error': 'No observations after removing missing values'}

    # Each specification is best-effort: a failure is recorded under its own
    # key so the remaining specifications still run.
    try:
        # Specification 1: Baseline OLS (univariate)
        formula1 = 'freqMF ~ treatment_effect'
        model1 = ols(formula1, data=df_clean).fit(
            cov_type='cluster', cov_kwds={'groups': df_clean['GVKEY']})
        results['spec1_baseline'] = _summarize_fit(
            model1, model1.tvalues, model1.bse, formula1)
    except Exception as e:
        print(f"Error in Specification 1: {e}")
        results['spec1_baseline'] = {'error': str(e)}

    try:
        # Specification 2: OLS with control variables.
        # Joining the full regressor list keeps the formula valid even when
        # no control column is available.
        formula2 = 'freqMF ~ ' + ' + '.join(regressors)
        df_spec2 = df_clean.dropna(subset=['freqMF'] + regressors)

        if len(df_spec2) > 0:
            model2 = ols(formula2, data=df_spec2).fit(
                cov_type='cluster', cov_kwds={'groups': df_spec2['GVKEY']})
            results['spec2_controls'] = _summarize_fit(
                model2, model2.tvalues, model2.bse, formula2)
        else:
            results['spec2_controls'] = dict(no_obs_error)
    except Exception as e:
        print(f"Error in Specification 2: {e}")
        results['spec2_controls'] = {'error': str(e)}

    try:
        # Specification 3: Panel regression with firm fixed effects
        df_spec3 = df_clean.dropna(subset=['freqMF'] + regressors + ['GVKEY', 'FYEAR'])

        if len(df_spec3) > 0:
            # PanelOLS is given an (entity, time) MultiIndex
            df_spec3 = df_spec3.set_index(['GVKEY', 'FYEAR'])
            model3 = PanelOLS(df_spec3['freqMF'], df_spec3[regressors],
                              entity_effects=True).fit(cov_type='clustered',
                                                       cluster_entity=True)
            results['spec3_fixed_effects'] = _summarize_fit(
                model3, model3.tstats, model3.std_errors)
        else:
            results['spec3_fixed_effects'] = dict(no_obs_error)
    except Exception as e:
        print(f"Error in Specification 3: {e}")
        results['spec3_fixed_effects'] = {'error': str(e)}

    return results

# Keys of the three regression specifications, in table-column order.
_SPEC_NAMES = ('spec1_baseline', 'spec2_controls', 'spec3_fixed_effects')


def _significance_stars(pval):
    """Return the star marker for a p-value: '***' <0.01, '**' <0.05, '*' <0.10, else ''."""
    if pval < 0.01:
        return '***'
    if pval < 0.05:
        return '**'
    if pval < 0.10:
        return '*'
    # Also reached for NaN p-values, since every comparison above is False.
    return ''


def _build_results_table_rows(results, headers):
    """Build the cell text (list of rows) for one file's regression table.

    Every row has exactly ``len(headers)`` cells: a label followed by one
    cell per specification, left blank when that specification has no value.
    """
    # A missing specification is treated like one without any results.
    specs = [results.get(name, {}) for name in _SPEC_NAMES]

    rows = [list(headers)]

    # Union of the variable names reported by any specification.
    variables = set()
    for spec in specs:
        variables.update(spec.get('coefficients', {}))

    # One coefficient row plus one t-stat row per variable, alphabetically.
    for var in sorted(variables):
        if var == 'Intercept':
            continue

        coef_row = [var]
        tstat_row = ['']
        for spec in specs:
            coefficients = spec.get('coefficients', {})
            if var in coefficients:
                stars = _significance_stars(spec['p_values'][var])
                coef_row.append(f'{coefficients[var]:.4f}{stars}')
                tstat_row.append(f"({spec['t_stats'][var]:.2f})")
            else:
                coef_row.append('')
                tstat_row.append('')

        rows.append(coef_row)
        rows.append(tstat_row)

    # Blank separator row, then the summary statistics.
    rows.append([''] * len(headers))

    obs_row = ['Observations']
    r2_row = ['R-squared']
    for spec in specs:
        if 'n_obs' in spec:
            obs_row.append(str(spec['n_obs']))
            r2_row.append(f"{spec['r_squared']:.4f}")
        else:
            obs_row.append('')
            r2_row.append('')

    rows.append(obs_row)
    rows.append(r2_row)
    return rows


def create_results_table_pdf(all_results, output_path):
    """Write one regression-results table per input file into a multi-page PDF.

    Args:
        all_results: Mapping of a file label to that file's results dict.
            Each results dict may hold the keys 'spec1_baseline',
            'spec2_controls' and 'spec3_fixed_effects'; a specification is
            tabulated when its dict has 'coefficients', 't_stats' and
            'p_values' (keyed by variable name) and, for the summary rows,
            'n_obs' and 'r_squared'. Specifications or files that only carry
            an 'error' entry are rendered as blank cells.
        output_path: Path of the PDF file to create.
    """
    headers = ['Variable', 'Spec 1: Baseline', 'Spec 2: Controls', 'Spec 3: Fixed Effects']

    with PdfPages(output_path) as pdf:
        for file_name, results in all_results.items():
            fig, ax = plt.subplots(figsize=(12, 8))
            try:
                ax.axis('tight')
                ax.axis('off')

                table = ax.table(
                    cellText=_build_results_table_rows(results, headers),
                    cellLoc='center',
                    loc='center',
                )
                table.auto_set_font_size(False)
                table.set_fontsize(8)
                table.scale(1.2, 1.5)

                # Highlight the header row.
                for col in range(len(headers)):
                    header_cell = table[(0, col)]
                    header_cell.set_facecolor('#40466e')
                    header_cell.set_text_props(weight='bold', color='white')

                ax.set_title(f'Regression Results: {file_name}',
                             fontsize=14, fontweight='bold', pad=20)
                pdf.savefig(fig, bbox_inches='tight')
            finally:
                # Close this figure even if rendering or saving failed, so
                # figures do not accumulate while looping over many files.
                plt.close(fig)

def main():
    """Run the regression pipeline on every CSV file in the data directory.

    For each CSV file: load it, add the time trend variable, restrict the
    sample to the window around the regulation year, and run the regression
    specifications. The combined results are written to
    ``regression_results/regression_results.json`` and
    ``regression_results/regression_results.pdf``. Files are processed in
    sorted path order so the JSON keys and PDF pages are reproducible.
    """
    # Set directory path containing CSV files
    data_directory = "enter folder path here"
    if not data_directory:
        data_directory = "."  # Current directory if no input

    # Output directory
    output_dir = "regression_results"
    os.makedirs(output_dir, exist_ok=True)

    # Check where files are saved
    print(f"Results will be saved to: {os.path.abspath(output_dir)}")

    # Results of every processed file, keyed by file name without extension
    all_results = {}

    # Path.glob yields matches in arbitrary order; sort for stable output
    csv_files = sorted(Path(data_directory).glob("*.csv"))

    if not csv_files:
        print(f"No CSV files found in {data_directory}")
        return

    print(f"Found {len(csv_files)} CSV files to process")

    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")

        try:
            # Read the data
            df = pd.read_csv(csv_file)
            print(f"Loaded {len(df)} observations")

            # Generate time trend variable
            df = generate_time_trend(df)

            # Filter data to ±2 years around regulation year
            df_filtered = filter_data_around_regulation(df, window=2)
            print(f"After filtering: {len(df_filtered)} observations")

            if len(df_filtered) == 0:
                # Skipped files get no entry in all_results
                print("No observations after filtering")
                continue

            # Run regression specifications and store them for this file
            all_results[csv_file.stem] = run_regression_specifications(df_filtered)

            print(f"Completed analysis for {csv_file.name}")

        except Exception as e:
            # Keep going with the remaining files; record the failure instead
            print(f"Error processing {csv_file.name}: {e}")
            all_results[csv_file.stem] = {'error': str(e)}

    # Save results as JSON
    json_output = os.path.join(output_dir, "regression_results.json")
    with open(json_output, 'w', encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)
    print(f"\nResults saved to: {json_output}")

    # Create PDF with results tables
    pdf_output = os.path.join(output_dir, "regression_results.pdf")
    create_results_table_pdf(all_results, pdf_output)
    print(f"PDF report saved to: {pdf_output}")

    # Print summary. Only file-level failures count as unsuccessful here;
    # errors of individual specifications are nested inside a file's results.
    successful = sum('error' not in r for r in all_results.values())
    print("\nSummary:")
    print(f"- Processed {len(csv_files)} files")
    print(f"- Successful analyses: {successful}")
    print(f"- Results saved in: {output_dir}/")

# Run the batch analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
