In [1]:
#WLS Regression with 0.05 bandwith
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.linear_model import WLS

#load data
modified_data = pd.read_csv('modified_hansen_data.csv')

# Define the rectangular kernel weights
def rectangular_kernel_weights(x, center, bandwidth):
    return ((x >= center - bandwidth) & (x <= center + bandwidth)).astype(int)

# Apply the rectangular kernel weights
modified_data['rectangular_weights'] = rectangular_kernel_weights(
    modified_data['bac1'], center=0.08, bandwidth=0.05
)

# Subset the data to only include observations within the bandwidth
subset_data = modified_data[modified_data['rectangular_weights'] > 0]

# Prepare the variables for the regression
X_rectangular = subset_data[['dui', 'bac1', 'year', 'male', 'white', 'aged']]
X_rectangular = sm.add_constant(X_rectangular)
X_rectangular['dui_x_bac1'] = X_rectangular['dui'] * X_rectangular['bac1']

# The dependent variable
y_rectangular = subset_data['recidivism']

# Run the regression with robust standard errors
model_rectangular = sm.WLS(y_rectangular, X_rectangular, weights=subset_data['rectangular_weights']).fit(cov_type='HC3')


# Summary of the model
model_rectangular.summary()

0,1,2,3
Dep. Variable:,recidivism,R-squared:,0.004
Model:,WLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,57.12
Date:,"Sun, 28 Apr 2024",Prob (F-statistic):,4.04e-82
Time:,21:06:58,Log-Likelihood:,-21825.0
No. Observations:,89967,AIC:,43670.0
Df Residuals:,89959,BIC:,43740.0
Df Model:,7,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,5.4498,0.823,6.619,0.000,3.836,7.064
dui,-0.0582,0.015,-3.826,0.000,-0.088,-0.028
bac1,-0.0411,0.187,-0.220,0.826,-0.407,0.325
year,-0.0027,0.000,-6.485,0.000,-0.003,-0.002
male,0.0325,0.002,13.962,0.000,0.028,0.037
white,0.0145,0.003,5.153,0.000,0.009,0.020
aged,-0.0009,8.48e-05,-10.084,0.000,-0.001,-0.001
dui_x_bac1,0.4282,0.204,2.100,0.036,0.029,0.828

0,1,2,3
Omnibus:,43114.954,Durbin-Watson:,1.991
Prob(Omnibus):,0.0,Jarque-Bera (JB):,169454.67
Skew:,2.528,Prob(JB):,0.0
Kurtosis:,7.432,Cond. No.,1610000.0


In [3]:
#Regression with 0.025 bandwith


import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.linear_model import WLS

#load data
modified_data = pd.read_csv('modified_hansen_data.csv')

# Define the rectangular kernel weights
def rectangular_kernel_weights(x, center, bandwidth):
    return ((x >= center - bandwidth) & (x <= center + bandwidth)).astype(int)

# Apply the rectangular kernel weights
modified_data['rectangular_weights'] = rectangular_kernel_weights(
    modified_data['bac1'], center=0.08, bandwidth=0.025
)

# Subset the data to only include observations within the bandwidth
subset_data = modified_data[modified_data['rectangular_weights'] > 0]

# Prepare the variables for the regression
X_rectangular = subset_data[['dui', 'bac1', 'year', 'male', 'white', 'aged']]
X_rectangular = sm.add_constant(X_rectangular)
X_rectangular['dui_x_bac1'] = X_rectangular['dui'] * X_rectangular['bac1']

# The dependent variable
y_rectangular = subset_data['recidivism']

# Run the regression with robust standard errors
model_rectangular = sm.WLS(y_rectangular, X_rectangular, weights=subset_data['rectangular_weights']).fit(cov_type='HC3')

# Summary of the model
model_rectangular.summary()

0,1,2,3
Dep. Variable:,recidivism,R-squared:,0.005
Model:,WLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,33.09
Date:,"Sun, 28 Apr 2024",Prob (F-statistic):,2.94e-46
Time:,21:07:41,Log-Likelihood:,-11133.0
No. Observations:,47205,AIC:,22280.0
Df Residuals:,47197,BIC:,22350.0
Df Model:,7,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,5.9215,1.134,5.221,0.000,3.699,8.144
dui,-0.0683,0.034,-1.997,0.046,-0.135,-0.001
bac1,-0.2669,0.369,-0.723,0.470,-0.990,0.457
year,-0.0029,0.001,-5.119,0.000,-0.004,-0.002
male,0.0347,0.003,10.978,0.000,0.029,0.041
white,0.0156,0.004,4.060,0.000,0.008,0.023
aged,-0.0008,0.000,-6.706,0.000,-0.001,-0.001
dui_x_bac1,0.6045,0.438,1.380,0.167,-0.254,1.463

0,1,2,3
Omnibus:,22924.733,Durbin-Watson:,1.995
Prob(Omnibus):,0.0,Jarque-Bera (JB):,92241.149
Skew:,2.553,Prob(JB):,0.0
Kurtosis:,7.564,Cond. No.,1610000.0


In [4]:
#Regression with optimal bandwith
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.regression.linear_model import WLS

#load data
modified_data = pd.read_csv('modified_hansen_data.csv')

# Define the rectangular kernel weights
def rectangular_kernel_weights(x, center, bandwidth):
    return ((x >= center - bandwidth) & (x <= center + bandwidth)).astype(int)

# Apply the rectangular kernel weights
modified_data['rectangular_weights'] = rectangular_kernel_weights(
    modified_data['bac1'], center=0.08, bandwidth= 0.027222222222222224
)

# Subset the data to only include observations within the bandwidth
subset_data = modified_data[modified_data['rectangular_weights'] > 0]

# Prepare the variables for the regression
X_rectangular = subset_data[['dui', 'bac1', 'year', 'male', 'white', 'aged']]
X_rectangular = sm.add_constant(X_rectangular)
X_rectangular['dui_x_bac1'] = X_rectangular['dui'] * X_rectangular['bac1']

# The dependent variable
y_rectangular = subset_data['recidivism']

# Run the regression with robust standard errors
model_rectangular = sm.WLS(y_rectangular, X_rectangular, weights=subset_data['rectangular_weights']).fit(cov_type='HC3')

# Summary of the model
model_rectangular.summary()

0,1,2,3
Dep. Variable:,recidivism,R-squared:,0.005
Model:,WLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,35.44
Date:,"Sun, 28 Apr 2024",Prob (F-statistic):,9.6e-50
Time:,21:08:23,Log-Likelihood:,-11757.0
No. Observations:,50661,AIC:,23530.0
Df Residuals:,50653,BIC:,23600.0
Df Model:,7,,
Covariance Type:,HC3,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,6.0264,1.090,5.527,0.000,3.889,8.163
dui,-0.0421,0.031,-1.358,0.174,-0.103,0.019
bac1,-0.2521,0.341,-0.739,0.460,-0.921,0.417
year,-0.0029,0.001,-5.421,0.000,-0.004,-0.002
male,0.0346,0.003,11.377,0.000,0.029,0.041
white,0.0154,0.004,4.177,0.000,0.008,0.023
aged,-0.0008,0.000,-6.824,0.000,-0.001,-0.001
dui_x_bac1,0.3096,0.399,0.776,0.437,-0.472,1.091

0,1,2,3
Omnibus:,24792.629,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,101191.65
Skew:,2.568,Prob(JB):,0.0
Kurtosis:,7.643,Cond. No.,1610000.0


In [11]:
# Applying a bandwidth of 0.05 around the threshold for 'agg_dui'.


# Define 'agg_dui' within the specified bandwidth
data['aggDUI'] = ((data['bac1'] > 0.15) & (data['bac1'] <= 0.20)).astype(int)
data['bac1_agg_dui_bandwidth'] = data['bac1'] * data['aggDUI']

# Define the independent variables for the regression with the new 'agg_dui_bandwidth' definition
X_bandwidth_005 = data[['male', 'white', 'aged', 'acc', 'year', 'bac1', 'aggDUI']]
X_bandwidth_005 = sm.add_constant(X_bandwidth_005)

# Fit the regression model with the new 'agg_dui_bandwidth' definition
model_bandwidth_005 = sm.OLS(y, X_bandwidth_005).fit()

# Retrieve the regression results with the bandwidth
results_bandwidth_005 = model_bandwidth_005.summary()

results_bandwidth_005

0,1,2,3
Dep. Variable:,recidivism,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,116.9
Date:,"Sun, 28 Apr 2024",Prob (F-statistic):,4.03e-172
Time:,21:30:57,Log-Likelihood:,-61013.0
No. Observations:,214558,AIC:,122000.0
Df Residuals:,214550,BIC:,122100.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.3229,0.553,11.435,0.000,5.239,7.407
male,0.0281,0.002,16.495,0.000,0.025,0.031
white,0.0016,0.002,0.771,0.441,-0.002,0.006
aged,-0.0006,6.07e-05,-9.845,0.000,-0.001,-0.000
acc,-0.0055,0.002,-2.785,0.005,-0.009,-0.002
year,-0.0031,0.000,-11.292,0.000,-0.004,-0.003
bac1,0.2393,0.014,17.168,0.000,0.212,0.267
aggDUI,-0.0042,0.002,-2.569,0.010,-0.007,-0.001

0,1,2,3
Omnibus:,93717.984,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,315409.58
Skew:,2.36,Prob(JB):,0.0
Kurtosis:,6.606,Cond. No.,1600000.0


In [12]:
#Applying a bandwith of 0.025 around threshold for 'agg_dui'
# Define 'agg_dui' within the specified bandwidth
data['aggDUI'] = ((data['bac1'] > 0.15) & (data['bac1'] <= 0.175)).astype(int)
data['bac1_agg_dui_bandwidth'] = data['bac1'] * data['aggDUI']

# Define the independent variables for the regression with the new 'agg_dui_bandwidth' definition
X_bandwidth_025 = data[['male', 'white', 'aged', 'acc', 'year', 'bac1', 'aggDUI']]
X_bandwidth_025 = sm.add_constant(X_bandwidth_025)

# Fit the regression model with the new 'agg_dui_bandwidth' definition
model_bandwidth_025 = sm.OLS(y, X_bandwidth_025).fit()

# Retrieve the regression results with the bandwidth
results_bandwidth_025 = model_bandwidth_025.summary()

results_bandwidth_025

0,1,2,3
Dep. Variable:,recidivism,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,117.2
Date:,"Sun, 28 Apr 2024",Prob (F-statistic):,1.51e-172
Time:,21:32:25,Log-Likelihood:,-61012.0
No. Observations:,214558,AIC:,122000.0
Df Residuals:,214550,BIC:,122100.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.3161,0.553,11.422,0.000,5.232,7.400
male,0.0281,0.002,16.493,0.000,0.025,0.031
white,0.0016,0.002,0.774,0.439,-0.002,0.006
aged,-0.0006,6.07e-05,-9.842,0.000,-0.001,-0.000
acc,-0.0055,0.002,-2.805,0.005,-0.009,-0.002
year,-0.0031,0.000,-11.278,0.000,-0.004,-0.003
bac1,0.2331,0.013,17.630,0.000,0.207,0.259
aggDUI,-0.0055,0.002,-2.928,0.003,-0.009,-0.002

0,1,2,3
Omnibus:,93716.122,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,315395.844
Skew:,2.36,Prob(JB):,0.0
Kurtosis:,6.606,Cond. No.,1600000.0


In [13]:
#Applying the optimal bandwith around the threshold for 'agg_dui'

# Define 'agg_dui' within the specified bandwidth
optimal_bandwith = 0.027222222222222224

data['aggDUI'] = ((data['bac1'] > 0.15) & (data['bac1'] <= 0.15 + optimal_bandwith)).astype(int)
data['bac1_agg_dui_bandwidth'] = data['bac1'] * data['aggDUI']

# Define the independent variables for the regression with the new 'agg_dui_bandwidth' definition
X_bandwidth_optimal = data[['male', 'white', 'aged', 'acc', 'year', 'bac1', 'aggDUI']]
X_bandwidth_optimal = sm.add_constant(X_bandwidth_optimal)

# Fit the regression model with the new 'agg_dui_bandwidth' definition
model_bandwidth_optimal = sm.OLS(y, X_bandwidth_optimal).fit()

# Retrieve the regression results with the bandwidth
results_bandwidth_optimal = model_bandwidth_optimal.summary()

results_bandwidth_optimal

0,1,2,3
Dep. Variable:,recidivism,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,117.4
Date:,"Sun, 28 Apr 2024",Prob (F-statistic):,8.900000000000001e-173
Time:,21:40:15,Log-Likelihood:,-61012.0
No. Observations:,214558,AIC:,122000.0
Df Residuals:,214550,BIC:,122100.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,6.3166,0.553,11.423,0.000,5.233,7.400
male,0.0281,0.002,16.493,0.000,0.025,0.031
white,0.0016,0.002,0.776,0.438,-0.002,0.006
aged,-0.0006,6.07e-05,-9.846,0.000,-0.001,-0.000
acc,-0.0056,0.002,-2.807,0.005,-0.009,-0.002
year,-0.0031,0.000,-11.279,0.000,-0.004,-0.003
bac1,0.2341,0.013,17.660,0.000,0.208,0.260
aggDUI,-0.0057,0.002,-3.104,0.002,-0.009,-0.002

0,1,2,3
Omnibus:,93715.536,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,315392.18
Skew:,2.36,Prob(JB):,0.0
Kurtosis:,6.606,Cond. No.,1600000.0
