In [1]:
import pandas as pd

# Location of the input CSV, given relative to the notebook's working
# directory so the notebook does not depend on an absolute local path.
data_path = './data/table4_dta.csv'

# Load the full dataset into a DataFrame; later cells sample from `df`.
df = pd.read_csv(filepath_or_buffer=data_path)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import importlib
import importlib.util
import subprocess
import sys

# Packages this notebook relies on. Only the ones that cannot be located in
# the kernel's environment are installed, so a re-run on a prepared
# environment does not spawn pip (and does not need network access).
required_packages = ['pandas', 'statsmodels']
missing_packages = [
    pkg for pkg in required_packages if importlib.util.find_spec(pkg) is None
]

if missing_packages:
    # Use the kernel's own interpreter so the packages land in the
    # environment this notebook is actually running in.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *missing_packages])
    # Packages installed after interpreter start-up may not be visible to the
    # import system until its finder caches are refreshed.
    importlib.invalidate_caches()

import statsmodels.formula.api as smf

Collecting statsmodels
  Downloading statsmodels-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.4 (from statsmodels)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading statsmodels-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 63.8 MB/s eta 0:00:00
Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.9/233.9 kB 7.2 MB/s eta 0:00:00
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.6 statsmodels-0.14.1


In [3]:
# Every column the regressions below touch: the outcome and regressors named
# in the formulas, the WLS weight ('ett'), and the cluster id ('kommun60').
required_cols = ['ett', 'experiment', 'female', 'dod', 'foddar', 'kommun60']

# Take a random sample of 3/10 of the dataset (fixed seed => reproducible)
sample_df = df.sample(frac=0.3, random_state=42)

# Drop rows with missing values in any column the models use. 'foddar' and
# 'kommun60' are included because the cluster groups are passed to .fit()
# separately from the formula: if the formula machinery dropped a row for a
# missing value there, the groups Series would no longer match the design
# matrix in length.
sample_df = sample_df.dropna(subset=required_cols)

# Define the regression formulas.
# formula1 is for the pooled sample and controls for 'female'.
# formula2 is for the single-sex subsamples, where 'female' is constant and
# therefore perfectly collinear with the intercept, so it is left out.
formula1 = 'dod ~ experiment + female + C(foddar) + C(kommun60):C(foddar)'
formula2 = 'dod ~ experiment + C(foddar) + C(kommun60):C(foddar)'


def fit_clustered_wls(formula, data):
    """Fit a weighted least-squares model with cluster-robust standard errors.

    Parameters
    ----------
    formula : str
        Patsy-style model formula evaluated against ``data``.
    data : pandas.DataFrame
        Estimation sample. Must contain the formula's variables plus the
        'ett' column (used as WLS weights) and the 'kommun60' column (used
        as the cluster identifier for the covariance estimator).

    Returns
    -------
    The fitted statsmodels results object.
    """
    return smf.wls(formula=formula, data=data, weights=data['ett']).fit(
        cov_type='cluster',
        cov_kwds={'groups': data['kommun60']},
    )


# Fit the models on the sampled data: pooled, then female == 0 and female == 1
model1 = fit_clustered_wls(formula1, sample_df)
model2 = fit_clustered_wls(formula2, sample_df[sample_df['female'] == 0])
model3 = fit_clustered_wls(formula2, sample_df[sample_df['female'] == 1])

# Print the regression results
for fitted_model in (model1, model2, model3):
    print(fitted_model.summary())

