In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [127]:
import warnings; warnings.simplefilter('ignore')

In [128]:
%matplotlib inline
plt.rcParams['figure.dpi'] = 300
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams.update({'font.size': 16})

# Exercise 1

In [129]:
hourwork = pd.read_excel('hourwork.xlsx')

## a. Regress average hours worked during the year on the variables given in the table and interpret your regression

In [130]:
hourwork_y =  hourwork[['HRS']].copy()

In [131]:
hourwork_y=hourwork_y.drop([0],axis=0)

In [132]:
columns_to_drop = ['HRS','obs']
hourwork_x = hourwork.drop(columns_to_drop, axis=1)
hourwork_x1 = hourwork_x.assign(constant=1)

In [133]:
hourwork_x1=hourwork_x1.drop([0], axis=0)

In [134]:
model=sm.OLS(hourwork_y,hourwork_x1)
results=model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                    HRS   R-squared:                       0.826
Model:                            OLS   Adj. R-squared:                  0.772
Method:                 Least Squares   F-statistic:                     15.38
Date:                Sat, 28 Jan 2023   Prob (F-statistic):           4.57e-08
Time:                        19:28:37   Log-Likelihood:                -164.22
No. Observations:                  35   AIC:                             346.4
Df Residuals:                      26   BIC:                             360.4
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
RATE         -93.7526     47.145     -1.989      0.0

## Interpretation
Only school and Rate are the variables are significant, because their P value is lower than 0.05 which means we reject the null hypothesis that Beta is qual to 0. Rest of the variable are no signficant which means that they do not help predict the value of hourwork.

In [135]:
hourwork_y.head()

Unnamed: 0,HRS
1,2157.0
2,2174.0
3,2062.0
4,2111.0
5,2134.0


## b. Is there evidence of multicollinearity in the data? How do you know?

In [136]:
hourwork_x1.iloc[:,0:].corr()

Unnamed: 0,RATE,ERSP,ERNO,NEIN,ASSET,AGE,DEP,SCHOOL,constant
RATE,1.0,0.571693,0.058992,0.701787,0.778932,0.044173,-0.601358,0.881271,
ERSP,0.571693,1.0,-0.040994,0.234426,0.274094,-0.0153,-0.692881,0.549108,
ERNO,0.058992,-0.040994,1.0,0.359094,0.292243,0.775494,0.050212,-0.298555,
NEIN,0.701787,0.234426,0.359094,1.0,0.98751,0.502432,-0.520832,0.539173,
ASSET,0.778932,0.274094,0.292243,0.98751,1.0,0.417086,-0.513552,0.630899,
AGE,0.044173,-0.0153,0.775494,0.502432,0.417086,1.0,-0.04836,-0.331067,
DEP,-0.601358,-0.692881,0.050212,-0.520832,-0.513552,-0.04836,1.0,-0.602575,
SCHOOL,0.881271,0.549108,-0.298555,0.539173,0.630899,-0.331067,-0.602575,1.0,
constant,,,,,,,,,


This table shows that most of the variables does not have correlation of 0 with each other, which shows us that there is multicollinearity.

## c. Compute the variance inflation factors (VIF) and TOL measures for the various regressors.

In [137]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [138]:
hourwork_x=hourwork_x.drop([0],axis=0)

In [139]:
vif_data = pd.DataFrame()

vif_data["feature"] = hourwork_x.columns

vif_data["VIF"] = [vif(hourwork_x.values, i) \
                   for i in range(len(hourwork_x.columns))]
print(vif_data.round(1))

  feature     VIF
0    RATE   586.6
1    ERSP    64.9
2    ERNO    35.0
3    NEIN  1183.2
4   ASSET   956.1
5     AGE   305.4
6     DEP    63.6
7  SCHOOL   509.7


In [140]:
model1=smf.ols(formula='RATE~ERSP+ERNO+NEIN+ASSET+AGE+DEP+SCHOOL',data=hourwork_x)
model2=smf.ols(formula='ERSP~RATE+ERNO+NEIN+ASSET+AGE+DEP+SCHOOL',data=hourwork_x)
model3=smf.ols(formula='ERNO~ERSP+RATE+NEIN+ASSET+AGE+DEP+SCHOOL',data=hourwork_x)
model4=smf.ols(formula='NEIN~ERSP+ERNO+RATE+ASSET+AGE+DEP+SCHOOL',data=hourwork_x)
model5=smf.ols(formula='ASSET~ERSP+ERNO+NEIN+RATE+AGE+DEP+SCHOOL',data=hourwork_x)
model6=smf.ols(formula='AGE~ERSP+ERNO+NEIN+ASSET+RATE+DEP+SCHOOL',data=hourwork_x)
model7=smf.ols(formula='DEP~ERSP+ERNO+NEIN+ASSET+AGE+RATE+SCHOOL',data=hourwork_x)
model8=smf.ols(formula='SCHOOL~ERSP+ERNO+NEIN+ASSET+AGE+DEP+RATE',data=hourwork_x)

In [141]:
md1 = model1.fit()
md2 = model2.fit()
md3 = model3.fit()
md4 = model4.fit()
md5 = model5.fit()
md6 = model6.fit()
md7 = model7.fit()
md8 = model8.fit()


In [142]:
hourwork_aux = pd.DataFrame()

hourwork_aux["Dependent Variable"] = hourwork_x.columns

hourwork_aux["R2 value"] = [md1.rsquared, md2.rsquared, md3.rsquared, md4.rsquared, \
                           md5.rsquared, md6.rsquared, md7.rsquared, md8.rsquared]

hourwork_aux["Tolerance (TOL) = 1- R2"] = [1- md1.rsquared, 1- md2.rsquared, 1- md3.rsquared, \
                                  1- md4.rsquared, 1- md5.rsquared, 1- md6.rsquared, 1-md7.rsquared, 1-md8.rsquared ]

In [143]:
hourwork_aux.round(4)

Unnamed: 0,Dependent Variable,R2 value,Tolerance (TOL) = 1- R2
0,RATE,0.9415,0.0585
1,ERSP,0.7142,0.2858
2,ERNO,0.6813,0.3187
3,NEIN,0.9945,0.0055
4,ASSET,0.9948,0.0052
5,AGE,0.8971,0.1029
6,DEP,0.779,0.221
7,SCHOOL,0.9606,0.0394


## d. If there is the multicollinearity problem, what remedial action, if any, would you take?

One of the things that can be done is removing Rate, Nein, Asset, Age and School as indpendent variables, because they have very high VIF value, removing them might reduce the multicollinearity among the variables.
Also these variables R^2 value is higher than the model's R^2

## e. What does this study tell about the feasibility of a negative income tax?

Negative income is feasible for such nation where income level of people are low. Negative income tax means upto certain low level of income government does not charge tax and even give some money to raise the standard of living. And once such person's income increases, he/she starts paying taxes.
Such regime can help in income equality, because of its progressive nature. Also it will help those people who works for so many hours yet does not earn much money.

# Exercise 2

## a. Compute the multiple regression of per capita consumption of gasoline on per capita income, the price of gasoline, the other prices, and a time trend. Report all results. Do the signs of the estimates agree with your expectations?

In [144]:
gl=pd.read_csv('gasoline.csv')

In [145]:
gl.head()

Unnamed: 0,YEAR,GASEXP,POP,GASP,INCOME,PNC,PUC,PPT,PD,PN,PS
0,1953,7.4,159565,16.668,8883,47.2,26.7,16.8,37.7,29.7,19.4
1,1954,7.8,162391,17.029,8685,46.5,22.7,18.0,36.8,29.7,20.0
2,1955,8.6,165275,17.21,9137,44.8,21.5,18.5,36.1,29.5,20.4
3,1956,9.4,168221,17.729,9436,46.1,20.7,19.2,36.1,29.9,20.9
4,1957,10.2,171274,18.497,9534,48.5,23.2,19.9,37.2,30.9,21.8


In [152]:
gltr=gl['GASEXP']/(gl['GASP']*gl['POP'])

gl=gl.assign(gltr=gltr)

gl_y=gl[['gltr']].copy()

gl_x=gl.drop(['GASEXP','POP','gltr'],axis=1)
gl_x1=gl_x.assign(constant=1)

In [153]:
model = sm.OLS(gl_y, gl_x1)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                   gltr   R-squared:                       0.991
Model:                            OLS   Adj. R-squared:                  0.989
Method:                 Least Squares   F-statistic:                     530.8
Date:                Sat, 28 Jan 2023   Prob (F-statistic):           2.81e-40
Time:                        19:34:12   Log-Likelihood:                 765.45
No. Observations:                  52   AIC:                            -1511.
Df Residuals:                      42   BIC:                            -1491.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
YEAR         7.25e-08   1.42e-08      5.112      0.0

### Some of the esimates does meet the expectation like year coefficient is positive which means the per capita consumption increased with the number of years. 
### Also there is negative relation between per capita consumption and price of gasoline, which it got cheaper with more consmption.

## b. Test the hypothesis that at least in regard to demand for gasoline, consumers do not differentiate between changes in the prices of new and used cars.

In [154]:
import numpy as np
from scipy.stats import ttest_ind

data1 = gl[['PNC']].copy()
data2 = gl[['PUC']].copy()

null_hypothesis = "The means of the two samples are equal"
alternative_hypothesis = "The means of the two samples are not equal"

t_statistic, p_value = ttest_ind(data1, data2)

print("t-statistic: ", t_statistic)
print("p-value: ", p_value)

alpha = 0.05
if p_value < alpha:
    print("We reject the null hypothesis")
    print(alternative_hypothesis)
else:
    print("We fail to reject the null hypothesis")
    print(null_hypothesis)

t-statistic:  [1.11595741]
p-value:  [0.2670625]
We fail to reject the null hypothesis
The means of the two samples are equal


### So from above hypothesis testing we can conclude that in regard to demand for gasoline, consumers do not differentiate between changes in the prices for new and used cars.

## c. Compute the simple correlations of the price variables. Would you conclude that multicollinearity is a problem for the regression in part a?

In [156]:
gl.head()

Unnamed: 0,YEAR,GASEXP,POP,GASP,INCOME,PNC,PUC,PPT,PD,PN,PS,gltr
0,1953,7.4,159565,16.668,8883,47.2,26.7,16.8,37.7,29.7,19.4,3e-06
1,1954,7.8,162391,17.029,8685,46.5,22.7,18.0,36.8,29.7,20.0,3e-06
2,1955,8.6,165275,17.21,9137,44.8,21.5,18.5,36.1,29.5,20.4,3e-06
3,1956,9.4,168221,17.729,9436,46.1,20.7,19.2,36.1,29.9,20.9,3e-06
4,1957,10.2,171274,18.497,9534,48.5,23.2,19.9,37.2,30.9,21.8,3e-06


In [163]:
columns_drop=['YEAR','GASEXP','POP','INCOME','gltr']
gl_u=gl.drop(columns_drop,axis=1)

In [164]:
gl_u.iloc[:,0:].corr()

Unnamed: 0,GASP,PNC,PUC,PPT,PD,PN,PS
GASP,1.0,0.936053,0.922767,0.927014,0.938936,0.962672,0.939353
PNC,0.936053,1.0,0.993874,0.980736,0.993266,0.988529,0.97849
PUC,0.922767,0.993874,1.0,0.982421,0.987832,0.982195,0.976852
PPT,0.927014,0.980736,0.982421,1.0,0.958468,0.989863,0.997515
PD,0.938936,0.993266,0.987832,0.958468,1.0,0.977343,0.956326
PN,0.962672,0.988529,0.982195,0.989863,0.977343,1.0,0.993584
PS,0.939353,0.97849,0.976852,0.997515,0.956326,0.993584,1.0


### This correlation indicates that all the price variables are correlated which means there is multicollinearity in this data, which must have caused problem in the regression that we have done.