In [18]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from linearmodels.iv import IV2SLS
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from linearmodels.panel import PanelOLS
from statsmodels.sandbox.regression.gmm import IV2SLS

# Load the dataset
file_path = 'dataset.dta'
df = pd.read_stata(file_path)


# Assuming 'regionid' is your panel identifier and 'year' represents time
df = df.set_index(['regionid', 'year'])

# Defining control variables for each model
controls_1 = ['conc_weig_basicpdpd_c4']
controls_2 = controls_1 + [f'ttrend{i}' for i in range(1, 35)]
controls_3 = controls_1 + ['laghershf_firm_tot', 'lagunemploymentrate']
controls_4 = controls_3 + [f'ttrend{i}' for i in range(1, 35)]
controls_5 = controls_3 + ['mean_weig_reg_vintage', 'mean_weig_reg_pharmacies', 'mean_weig_reg_drugs']
controls_6 = controls_5 + [f'ttrend{i}' for i in range(1, 35)]

# Defining outcome variable
outcome = 'logmean_w_reg_bpr_ch'

# Running OLS regressions for each set of controls
for i, controls in enumerate([controls_1, controls_2, controls_3, controls_4, controls_5, controls_6], 1):
    X = sm.add_constant(df[controls])
    y = df[outcome]
    model = PanelOLS(y, X, entity_effects=True, time_effects=True, drop_absorbed=True)
    results = model.fit(cov_type='clustered', cluster_entity=True)
    print(f'Results for Model {i}:')
    print(results)
    



Results for Model 1:
                           PanelOLS Estimation Summary                            
Dep. Variable:     logmean_w_reg_bpr_ch   R-squared:                        0.0710
Estimator:                     PanelOLS   R-squared (Between):             -0.4908
No. Observations:                   170   R-squared (Within):              -0.4241
Date:                  Tue, Feb 27 2024   R-squared (Overall):             -0.4311
Time:                          21:06:46   Log-likelihood                    235.43
Cov. Estimator:               Clustered                                           
                                          F-statistic:                      10.007
Entities:                            34   P-value                           0.0019
Avg Obs:                         5.0000   Distribution:                   F(1,131)
Min Obs:                         5.0000                                           
Max Obs:                         5.0000   F-statistic (robust):   

Variables have been fully absorbed and have removed from the regression:

ttrend34

  results = model.fit(cov_type='clustered', cluster_entity=True)
Variables have been fully absorbed and have removed from the regression:

ttrend34

  results = model.fit(cov_type='clustered', cluster_entity=True)


Results for Model 6:
                           PanelOLS Estimation Summary                            
Dep. Variable:     logmean_w_reg_bpr_ch   R-squared:                        0.3910
Estimator:                     PanelOLS   R-squared (Between):             -27.230
No. Observations:                   170   R-squared (Within):              -0.1960
Date:                  Tue, Feb 27 2024   R-squared (Overall):             -3.0361
Time:                          21:06:46   Log-likelihood                    271.33
Cov. Estimator:               Clustered                                           
                                          F-statistic:                      1.5312
Entities:                            34   P-value                           0.0494
Avg Obs:                         5.0000   Distribution:                   F(39,93)
Min Obs:                         5.0000                                           
Max Obs:                         5.0000   F-statistic (robust):   

Variables have been fully absorbed and have removed from the regression:

ttrend34

  results = model.fit(cov_type='clustered', cluster_entity=True)


In [19]:
# Running 2SLS regressions for each set of controls

instrument_var = 'regio_2006share2'
for i, controls in enumerate([controls_1, controls_2, controls_3, controls_4, controls_5, controls_6], 1):
    X = df[controls]
    y = df[outcome]
    instrument = df[[instrument_var] + controls]
    model = IV2SLS(y, sm.add_constant(X), instrument).fit()
    print(f'2SLS Results for Model {i}:')
    print(model.summary())





2SLS Results for Model 1:
                           IV2SLS Regression Results                            
Dep. Variable:     logmean_w_reg_bpr_ch   R-squared:                      -0.593
Model:                           IV2SLS   Adj. R-squared:                 -0.603
Method:                       Two Stage   F-statistic:                     4.075
                          Least Squares   Prob (F-statistic):             0.0451
Date:                  Tue, 27 Feb 2024                                         
Time:                          21:06:46                                         
No. Observations:                   170                                         
Df Residuals:                       168                                         
Df Model:                             1                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------