In [1]:
# import libraries
import numpy as np
import pandas as pd

In [2]:
# Read the survey CSV from the working directory and preview its first rows
data_set = pd.read_csv(filepath_or_buffer='njmin3.csv')

data_set.head(n=5)

Unnamed: 0,NJ,POST_APRIL92,NJ_POST_APRIL92,fte,bk,kfc,roys,wendys,co_owned,centralj,southj,pa1,pa2,demp
0,1,0,0,15.0,1,0,0,0,0,1,0,0,0,12.0
1,1,0,0,15.0,1,0,0,0,0,1,0,0,0,6.5
2,1,0,0,24.0,0,0,1,0,0,1,0,0,0,-1.0
3,1,0,0,19.25,0,0,1,0,1,0,0,0,0,2.25
4,1,0,0,21.5,1,0,0,0,0,0,0,0,0,13.0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# The frame is left as the cell's last expression so the notebook renders it as a
# table; print() would emit the wrapped, backslash-continued plain-text form.
description = data_set.describe()
description

               NJ  POST_APRIL92  NJ_POST_APRIL92         fte          bk  \
count  820.000000    820.000000       820.000000  794.000000  820.000000   
mean     0.807317      0.500000         0.403659   21.026511    0.417073   
std      0.394647      0.500305         0.490930    9.422746    0.493376   
min      0.000000      0.000000         0.000000    0.000000    0.000000   
25%      1.000000      0.000000         0.000000   14.500000    0.000000   
50%      1.000000      0.500000         0.000000   20.000000    0.000000   
75%      1.000000      1.000000         1.000000   25.500000    1.000000   
max      1.000000      1.000000         1.000000   85.000000    1.000000   

              kfc        roys      wendys    co_owned    centralj      southj  \
count  820.000000  820.000000  820.000000  820.000000  820.000000  820.000000   
mean     0.195122    0.241463    0.146341    0.343902    0.153659    0.226829   
std      0.396536    0.428232    0.353664    0.475299    0.360841    0.4

In [4]:
# Per column: True when at least one entry is missing (NaN), False otherwise
data_set.isna().any()

NJ                 False
POST_APRIL92       False
NJ_POST_APRIL92    False
fte                 True
bk                 False
kfc                False
roys               False
wendys             False
co_owned           False
centralj           False
southj             False
pa1                False
pa2                False
demp                True
dtype: bool

In [5]:
# Replace the missing entries in 'fte' and 'demp' with the mean of the respective
# column, using SimpleImputer from scikit-learn.
# NOTE(review): 'fte' is the outcome of the regressions further down; mean-imputing
# an outcome variable is a modelling choice worth confirming.
from sklearn.impute import SimpleImputer

cols_with_gaps = ['fte', 'demp']
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)

# `missing_values` holds whatever fit() returns (the fitted imputer), not the NaN entries
missing_values = imputer.fit(data_set[cols_with_gaps])
data_set[cols_with_gaps] = missing_values.transform(data_set[cols_with_gaps])



In [6]:
# Repeat the missing-value check after imputation
data_set.isna().any()

NJ                 False
POST_APRIL92       False
NJ_POST_APRIL92    False
fte                False
bk                 False
kfc                False
roys               False
wendys             False
co_owned           False
centralj           False
southj             False
pa1                False
pa2                False
demp               False
dtype: bool

In [11]:
# Isolate the independent and dependent variables for the first regression.
# Columns are selected by name instead of by position, so the cell keeps picking
# the intended fields even if the column order of the CSV changes, and it reads
# the same way as the later cells that already use labels.
# X: state indicator, period indicator and their interaction; Y: the 'fte' outcome.
X = data_set.loc[:, ['NJ', 'POST_APRIL92', 'NJ_POST_APRIL92']].values
Y = data_set['fte'].values


In [13]:
# First OLS model: 'fte' regressed on the three indicator columns held in X
import statsmodels.api as sm

# add the column of ones that lets the model estimate an intercept
X = sm.add_constant(X)
model1 = sm.OLS(Y, X).fit()

model1.summary(
    yname='FTE',
    xname=[
        'Intercept',
        'New Jersey',
        'After April 1992',
        'New Jersey and After April 1992',
    ],
)


0,1,2,3
Dep. Variable:,FTE,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,1.974
Date:,"Wed, 20 Dec 2023",Prob (F-statistic):,0.116
Time:,23:09:14,Log-Likelihood:,-2986.2
No. Observations:,820,AIC:,5980.0
Df Residuals:,816,BIC:,5999.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.2728,1.041,22.349,0.000,21.229,25.317
New Jersey,-2.8157,1.159,-2.430,0.015,-5.091,-0.541
After April 1992,-2.1108,1.473,-1.433,0.152,-5.001,0.780
New Jersey and After April 1992,2.6810,1.639,1.636,0.102,-0.536,5.898

0,1,2,3
Omnibus:,232.659,Durbin-Watson:,1.847
Prob(Omnibus):,0.0,Jarque-Bera (JB):,908.337
Skew:,1.289,Prob(JB):,5.7200000000000005e-198
Kurtosis:,7.465,Cond. No.,11.4


In [18]:
# Isolate the independent and dependent variables for the second regression.
# The regressors are picked by label with .loc (they are numeric indicator
# columns); the 'bk', 'kfc' and 'wendys' dummies are added to the three
# regressors of the first model.
X = data_set.loc[:, ['NJ', 'POST_APRIL92', 'NJ_POST_APRIL92',
                     'bk', 'kfc', 'wendys']].values
# the outcome is selected by name as well, rather than by column position
Y = data_set['fte'].values



In [22]:
# Second OLS model. 'roys' is deliberately not among the regressors, so it acts
# as the omitted reference category and the dummy-variable trap is avoided.
import statsmodels.api as sm

X = sm.add_constant(X)
model2 = sm.OLS(Y, X).fit()

model2.summary(
    yname='FTE',
    xname=[
        'Intercept',
        'New Jersey',
        'After April 1992',
        'New Jersey and After April 1992',
        'Burger King',
        'KFC',
        'Wendys',
    ],
)

0,1,2,3
Dep. Variable:,FTE,R-squared:,0.191
Model:,OLS,Adj. R-squared:,0.185
Method:,Least Squares,F-statistic:,31.95
Date:,"Wed, 20 Dec 2023",Prob (F-statistic):,1.3e-34
Time:,23:32:42,Log-Likelihood:,-2902.4
No. Observations:,820,AIC:,5819.0
Df Residuals:,813,BIC:,5852.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.4055,1.085,21.575,0.000,21.276,25.535
New Jersey,-2.2349,1.050,-2.129,0.034,-4.296,-0.174
After April 1992,-2.1108,1.332,-1.585,0.113,-4.725,0.504
New Jersey and After April 1992,2.6810,1.482,1.809,0.071,-0.229,5.591
Burger King,2.1620,0.748,2.891,0.004,0.694,3.630
KFC,-8.4912,0.890,-9.540,0.000,-10.238,-6.744
Wendys,1.0496,0.970,1.082,0.280,-0.855,2.954

0,1,2,3
Omnibus:,300.626,Durbin-Watson:,1.965
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1848.909
Skew:,1.53,Prob(JB):,0.0
Kurtosis:,9.69,Cond. No.,12.0


In [23]:
# Third regression model: start by isolating the independent and dependent variables.
# On top of the second model's regressors this adds 'co_owned', 'centralj' and 'southj'.
X = data_set.loc[:, ['NJ', 'POST_APRIL92', 'NJ_POST_APRIL92',
                     'bk', 'kfc', 'wendys',
                     'co_owned', 'centralj', 'southj']].values
# the outcome is selected by name rather than by column position
Y = data_set['fte'].values

In [25]:
# Third OLS model. The fit is stored as `model3` so that the result of the second
# regression, which lives in `model2`, is not overwritten and both stay available.
import statsmodels.api as sm

X = sm.add_constant(X)
model3 = sm.OLS(endog=Y, exog=X).fit()
model3.summary(
    yname='FTE',
    xname=[
        'Intercept',
        'New Jersey',
        'After April 1992',
        'New Jersey and After April 1992',
        'Burger King',
        'KFC',
        'Wendys',
        'Co_owned',
        'Central Jersey',
        'South Jersey',
    ],
)

0,1,2,3
Dep. Variable:,FTE,R-squared:,0.217
Model:,OLS,Adj. R-squared:,0.208
Method:,Least Squares,F-statistic:,24.89
Date:,"Thu, 21 Dec 2023",Prob (F-statistic):,6.45e-38
Time:,00:49:10,Log-Likelihood:,-2889.1
No. Observations:,820,AIC:,5798.0
Df Residuals:,810,BIC:,5845.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.9321,1.184,20.204,0.000,21.607,26.257
New Jersey,-1.3009,1.078,-1.207,0.228,-3.416,0.815
After April 1992,-2.1108,1.313,-1.608,0.108,-4.688,0.466
New Jersey and After April 1992,2.6810,1.461,1.835,0.067,-0.187,5.549
Burger King,1.6653,0.832,2.002,0.046,0.033,3.298
KFC,-8.2346,0.899,-9.161,0.000,-9.999,-6.470
Wendys,0.6218,1.017,0.612,0.541,-1.374,2.617
Co_owned,-0.7456,0.699,-1.066,0.287,-2.118,0.627
Central Jersey,0.0030,0.867,0.003,0.997,-1.699,1.705

0,1,2,3
Omnibus:,309.762,Durbin-Watson:,2.047
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1987.511
Skew:,1.57,Prob(JB):,0.0
Kurtosis:,9.951,Cond. No.,12.6


## Second Case Study

In [28]:
# Load the EITC data set from the working directory; it is stored as a Stata (.dta) file
data_set_eitc = pd.read_stata('eitc.dta')

# Summary statistics for the numeric columns. Left as the last expression so the
# notebook renders the whole table instead of print()'s wrapped plain-text form.
# Note: this rebinds `description`, which earlier held the summary of `data_set`.
description = data_set_eitc.describe()
description

              state          year         urate      children      nonwhite  \
count  13746.000000  13746.000000  13746.000000  13746.000000  13746.000000   
mean      54.524590   1993.347046      6.761734      1.192638      0.600684   
std       27.134893      1.703207      1.462464      1.382105      0.489776   
min       11.000000   1991.000000      2.600000      0.000000      0.000000   
25%       31.000000   1992.000000      5.700000      0.000000      0.000000   
50%       56.000000   1993.000000      6.800000      1.000000      1.000000   
75%       81.000000   1995.000000      7.700000      2.000000      1.000000   
max       95.000000   1996.000000     11.400000      9.000000      1.000000   

                finc           earn           age            ed          work  \
count   13746.000000   13746.000000  13746.000000  13746.000000  13746.000000   
mean    15255.319310   10432.475514     35.209661      8.806053      0.513022   
std     19444.249684   18200.758138     10.15

### Creating Dummy Variables

In [34]:
# Indicator (dummy) columns, stored as 0/1 integers:
#   post93    - 1 when 'year' is greater than 1993
#   mom       - 1 when 'children' is greater than 0
#   mompost93 - product of the two, i.e. 1 only when both are 1
data_set_eitc['post93'] = data_set_eitc['year'].gt(1993).astype(int)
data_set_eitc['mom'] = data_set_eitc['children'].gt(0).astype(int)
data_set_eitc['mompost93'] = data_set_eitc['post93'] * data_set_eitc['mom']
data_set_eitc.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93,mom,mompost93
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0,0,0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0,1,0
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194,0,1,0
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957,0,0,0
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0,0,1,0


#### Isolating X and Y variables 


In [36]:
# Regressors and outcome for the first logistic regression.
# 'work' is selected with a one-element list, so Y is a single-column 2-D array.
X = data_set_eitc[['post93', 'mom', 'mompost93']].to_numpy()
Y = data_set_eitc[['work']].to_numpy()

##### First Logistic Regression

In [38]:
# First logit model: 'work' on the period, mother and interaction indicators
import statsmodels.api as sm

# add the column of ones that lets the model estimate an intercept
X = sm.add_constant(X)
model_1_logit = sm.Logit(Y, X).fit()

model_1_logit.summary(
    yname='Work',
    xname=['Intercept', 'After 1993', 'Mother', 'Mother and After 1993'],
    title='Logistic Regression Results for EITC Data',
)

Optimization terminated successfully.
         Current function value: 0.686491
         Iterations 4


0,1,2,3
Dep. Variable:,Work,No. Observations:,13746.0
Model:,Logit,Df Residuals:,13742.0
Method:,MLE,Df Model:,3.0
Date:,"Fri, 22 Dec 2023",Pseudo R-squ.:,0.009118
Time:,00:52:56,Log-Likelihood:,-9436.5
converged:,True,LL-Null:,-9523.3
Covariance Type:,nonrobust,LLR p-value:,2.058e-37

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3042,0.036,8.443,0.000,0.234,0.375
After 1993,-0.0085,0.053,-0.161,0.872,-0.112,0.095
Mother,-0.5212,0.047,-10.985,0.000,-0.614,-0.428
Mother and After 1993,0.1885,0.070,2.708,0.007,0.052,0.325


In [41]:
# Second logit model: the first specification plus 'nonwhite' and 'ed'.
# Y is not rebuilt here; the array extracted from 'work' for the first logit is reused.
X = sm.add_constant(
    data_set_eitc[['post93', 'mom', 'mompost93', 'nonwhite', 'ed']].to_numpy()
)
model_2_logit = sm.Logit(Y, X).fit()

model_2_logit.summary(
    yname='Work',
    xname=[
        'Intercept',
        'After 1993',
        'Mother',
        'Mother and After 1993',
        'Hispanic or Black',
        'Years of Education',
    ],
    title='Logistic Regression Results for EITC Data',
)

Optimization terminated successfully.
         Current function value: 0.680664
         Iterations 4


0,1,2,3
Dep. Variable:,Work,No. Observations:,13746.0
Model:,Logit,Df Residuals:,13740.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 22 Dec 2023",Pseudo R-squ.:,0.01753
Time:,01:02:28,Log-Likelihood:,-9356.4
converged:,True,LL-Null:,-9523.3
Covariance Type:,nonrobust,LLR p-value:,5.205e-70

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.1687,0.071,-2.367,0.018,-0.308,-0.029
After 1993,-0.0046,0.053,-0.086,0.932,-0.108,0.099
Mother,-0.5287,0.048,-10.986,0.000,-0.623,-0.434
Mother and After 1993,0.1973,0.070,2.817,0.005,0.060,0.335
Hispanic or Black,-0.2199,0.036,-6.129,0.000,-0.290,-0.150
Years of Education,0.0687,0.007,10.270,0.000,0.056,0.082



### Setting up a Placebo Experiment 

In [43]:
# Dummy variables for the placebo test, using 1992 instead of 1993 as the cut-off year.
# The existing 'mom' column is reused, so only the period indicator and the
# interaction term are created here.
data_set_eitc['post92'] = data_set_eitc['year'].gt(1992).astype(int)
data_set_eitc['mompost92'] = data_set_eitc['post92'] * data_set_eitc['mom']
data_set_eitc.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93,mom,mompost93,post92,mompost92
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0,0,0,0,0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0,1,0,0,0
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194,0,1,0,0,0
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957,0,0,0,0,0
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0,0,1,0,0,0


In [45]:
# Prepare the placebo data set: keep only the rows whose 'year' is before 1994
data_set_eitc_placebo = data_set_eitc.loc[data_set_eitc['year'].lt(1994)]
data_set_eitc_placebo.head()

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93,mom,mompost93,post92,mompost92
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.0,0,0,0,0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0,1,0,0,0
2,13.0,1991.0,6.4,2,0,8178.193833,0.0,33,11,0,8.178194,0,1,0,0,0
3,14.0,1991.0,9.1,0,1,9369.570485,0.0,43,11,0,9.36957,0,0,0,0,0
4,15.0,1991.0,8.6,3,1,14706.60793,14706.60793,23,7,1,0.0,0,1,0,0,0


In [46]:
# Placebo regression: the logit specification with the 1992 cut-off indicators,
# estimated on the placebo subset.
Y_placebo = data_set_eitc_placebo.loc[:, ['work']].values
X_placebo = data_set_eitc_placebo.loc[:, ['post92', 'mom', 'mompost92']].values

# add the column of ones that lets the model estimate an intercept
X_placebo = sm.add_constant(X_placebo)

# Stored under its own name so the first logit fit kept in `model_1_logit`
# is not overwritten by the placebo fit.
model_placebo_logit = sm.Logit(endog=Y_placebo, exog=X_placebo).fit()
model_placebo_logit.summary(
    yname='Work',
    xname=['Intercept', 'After 1992', 'Mother', 'Mother and After 1992'],
    title='Logistic Regression Results for EITC Data - Placebo Test',
)

Optimization terminated successfully.
         Current function value: 0.684872
         Iterations 4


0,1,2,3
Dep. Variable:,Work,No. Observations:,7401.0
Model:,Logit,Df Residuals:,7397.0
Method:,MLE,Df Model:,3.0
Date:,"Fri, 22 Dec 2023",Pseudo R-squ.:,0.01193
Time:,01:13:16,Log-Likelihood:,-5068.7
converged:,True,LL-Null:,-5130.0
Covariance Type:,nonrobust,LLR p-value:,2.29e-26

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.3124,0.044,7.154,0.000,0.227,0.398
After 1992,-0.0259,0.077,-0.335,0.737,-0.177,0.126
Mother,-0.5138,0.057,-8.950,0.000,-0.626,-0.401
Mother and After 1992,-0.0239,0.102,-0.234,0.815,-0.224,0.176
