    Ben Christensen
    Fiscal Responsibility Index
    December 4, 2018
    Math 402

In [1]:
import pandas as pd
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from statsmodels.regression.linear_model import OLS
from itertools import combinations
import numpy as np

  from pandas.core import datetools


In [2]:
# Load the score table (one row per legislator, already joined with state
# region/division labels) and preview the first rows.
# NOTE(review): the "Unnamed: *" columns in the preview look like index columns
# written out by earlier to_csv calls -- confirm, and drop them at the source if so.
df = pd.read_csv("scores_w_regions")
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Name,Party,Position,Score,State,Tenure,YOB,Avg_score,State Code,Region,Division
0,0,0,"Abercrombie, Neil",D,Rep,-1047578000000.0,HI,7,1938,-149654100000.0,HI,West,Pacific
1,1,280,"Mink, Patsy Takemoto",D,Rep,-392687000000.0,HI,3,1927,-130895700000.0,HI,West,Pacific
2,2,499,"Case, Ed",D,Rep,-633750000000.0,HI,3,1952,-211250000000.0,HI,West,Pacific
3,3,673,"Hirono, Mazie",D,Rep,-46893000000.0,HI,3,1947,-15631000000.0,HI,West,Pacific
4,4,721,"Djou, Charles",R,Rep,-30000000.0,HI,1,1970,-30000000.0,HI,West,Pacific


In [3]:
#One-hot encoding for categorical variables
df["Democrat"] = 1*(df["Party"] == "D")
df["Republican"] = 1*(df["Party"] == "R")
df = pd.get_dummies(df, drop_first=True, columns=["Position", "Region", "Division"])
#Add a tenure squared variable
df["Tenure_sq"] = df["Tenure"] ** 2
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Name,Party,Score,State,Tenure,YOB,Avg_score,State Code,...,Region_West,Division_East South Central,Division_Middle Atlantic,Division_Mountain,Division_New England,Division_Pacific,Division_South Atlantic,Division_West North Central,Division_West South Central,Tenure_sq
0,0,0,"Abercrombie, Neil",D,-1047578000000.0,HI,7,1938,-149654100000.0,HI,...,1,0,0,0,0,1,0,0,0,49
1,1,280,"Mink, Patsy Takemoto",D,-392687000000.0,HI,3,1927,-130895700000.0,HI,...,1,0,0,0,0,1,0,0,0,9
2,2,499,"Case, Ed",D,-633750000000.0,HI,3,1952,-211250000000.0,HI,...,1,0,0,0,0,1,0,0,0,9
3,3,673,"Hirono, Mazie",D,-46893000000.0,HI,3,1947,-15631000000.0,HI,...,1,0,0,0,0,1,0,0,0,9
4,4,721,"Djou, Charles",R,-30000000.0,HI,1,1970,-30000000.0,HI,...,1,0,0,0,0,1,0,0,0,1


In [43]:
# List every column available after encoding, to pick regressors from.
# NOTE(review): this cell's execution count (43) is out of sequence with its
# neighbours, and its output lacks the 'Unnamed: 0.2' column shown in the previews
# above -- it appears to come from a different run; re-run top to bottom to confirm.
print(df.columns)


Index(['Unnamed: 0', 'Unnamed: 0.1', 'Name', 'Party', 'Score', 'State',
       'Tenure', 'YOB', 'Avg_score', 'State Code', 'Democrat', 'Republican',
       'Position_Sen', 'Region_Northeast', 'Region_South', 'Region_West',
       'Division_East South Central', 'Division_Middle Atlantic',
       'Division_Mountain', 'Division_New England', 'Division_Pacific',
       'Division_South Atlantic', 'Division_West North Central',
       'Division_West South Central', 'Tenure_sq'],
      dtype='object')


## (i)

OLS without regularization using all features

In [37]:
# Model (i): regress Score on every candidate predictor plus an intercept.
all_feature_names = [
    "Republican",
    "Democrat",
    "Division_East South Central",
    "Division_Middle Atlantic",
    "Division_Mountain",
    "Division_New England",
    "Division_Pacific",
    "Division_South Atlantic",
    "Division_West North Central",
    "Division_West South Central",
    "Tenure",
    "Tenure_sq",
    "YOB",
]
Y = df["Score"]
# statsmodels does not add an intercept on its own, so prepend the constant column.
X = sm.add_constant(df[all_feature_names])
regression = sm.OLS(Y, X).fit()
print(regression.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.456
Model:                            OLS   Adj. R-squared:                  0.450
Method:                 Least Squares   F-statistic:                     83.21
Date:                Tue, 04 Dec 2018   Prob (F-statistic):          3.27e-160
Time:                        19:37:43   Log-Likelihood:                -36905.
No. Observations:                1305   AIC:                         7.384e+04
Df Residuals:                    1291   BIC:                         7.391e+04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
const             

In [4]:
# Exhaustive best-subset selection: fit an OLS model (with intercept) on every
# non-empty subset of the candidate features and remember the subsets that give
# the lowest AIC and the lowest BIC.
# `features` stays a NumPy array because later cells index it with boolean masks.
features = np.array(["Republican", "Democrat", "Division_East South Central", "Division_Middle Atlantic", "Division_Mountain", "Division_New England", "Division_Pacific", "Division_South Atlantic", "Division_West North Central", "Division_West South Central", "Tenure", "Tenure_sq", "YOB"])

# The response is the same for every candidate model, so build it once.
Y = df["Score"]

# Start the running minima at +inf instead of values copied from a printed
# (rounded) regression summary: the search then finds the true minimum for any
# data set, and the full model is still evaluated as the last subset in the loop.
smallest_AIC = np.inf
smallest_BIC = np.inf
best_aic_subset = [str(name) for name in features]
best_bic_subset = [str(name) for name in features]

for num_features in range(1, len(features) + 1):
    for combo in combinations(features, num_features):
        # Plain `str` column names keep the printed result readable on every
        # NumPy version; a list (not a tuple) is what DataFrame indexing expects.
        subset = [str(name) for name in combo]
        X = sm.add_constant(df[subset])
        model = sm.OLS(Y, X).fit()
        if model.aic < smallest_AIC:
            smallest_AIC = model.aic
            best_aic_subset = subset
        if model.bic < smallest_BIC:
            smallest_BIC = model.bic
            best_bic_subset = subset

print("Features that give optimal AIC:")
print(best_aic_subset)
print("AIC:", smallest_AIC)
print("\nFeatures that give optimal BIC:")
print(best_bic_subset)
print("BIC:", smallest_BIC)

Features that give optimal AIC:
['Democrat', 'Division_Pacific', 'Division_West North Central', 'Tenure', 'Tenure_sq', 'YOB']
AIC: 73828.7091582

Features that give optimal BIC:
['Democrat', 'Tenure', 'Tenure_sq', 'YOB']
BIC: 73858.8675853


## (ii)

OLS without regularization using features that optimize BIC

In [5]:
# Model (ii): refit OLS with an intercept on only the four predictors that the
# subset search reported as BIC-optimal.
bic_feature_names = ['Democrat', 'Tenure', 'Tenure_sq', 'YOB']
Y = df["Score"]
# statsmodels does not add an intercept on its own, so prepend the constant column.
X = sm.add_constant(df[bic_feature_names])
regression = sm.OLS(Y, X).fit()
print(regression.summary())

                            OLS Regression Results                            
Dep. Variable:                  Score   R-squared:                       0.450
Model:                            OLS   Adj. R-squared:                  0.449
Method:                 Least Squares   F-statistic:                     266.4
Date:                Wed, 05 Dec 2018   Prob (F-statistic):          2.90e-167
Time:                        07:29:50   Log-Likelihood:                -36911.
No. Observations:                1305   AIC:                         7.383e+04
Df Residuals:                    1300   BIC:                         7.386e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const      -2.032e+13   2.12e+12     -9.601      0.0

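## 4.22

Compare the four models (OLS on all features, OLS on the BIC-selected features, ridge, and lasso) by their 7-fold cross-validated mean squared error.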

In [7]:
def cv_mse(model, X, Y, folds=7):
    """Return the mean squared error of `model` averaged over `folds` CV folds.

    scikit-learn's "neg_mean_squared_error" scorer reports the MSE negated (so that
    larger is better); the sign is flipped back to give an ordinary, non-negative MSE.
    """
    scores = cross_val_score(model, X, Y, cv=folds, scoring="neg_mean_squared_error")
    return -np.mean(scores)


def tune_penalty(model_class, X, Y, penalties):
    """Choose the regularization strength with the lowest cross-validated MSE.

    Parameters
    ----------
    model_class : estimator class accepting an `alpha` keyword (Ridge, Lasso).
    X, Y : predictors and response passed to cross-validation and the final fit.
    penalties : iterable of candidate `alpha` values, tried in order; on a tie the
        earliest candidate wins.

    Returns
    -------
    (best_mse, coef) : the winning cross-validated MSE and the coefficients of the
        winning model refit on all of X, Y.

    Raises
    ------
    ValueError : if no candidate yields a comparable (non-NaN) MSE.
    """
    best_mse, best_model = np.inf, None
    for penalty in penalties:
        candidate = model_class(alpha=penalty)
        mse = cv_mse(candidate, X, Y)
        if mse < best_mse:
            best_mse, best_model = mse, candidate
    if best_model is None:
        raise ValueError("no penalty produced a usable cross-validated MSE")
    # Refit once, on the full data, only for the winning penalty.
    return best_mse, best_model.fit(X, Y).coef_


# Set the response here instead of relying on whichever earlier cell last defined Y.
Y = df["Score"]
# Single definition of the BIC-selected predictors, used for both fitting and reporting.
bic_features = ['Democrat', 'Tenure', 'Tenure_sq', 'YOB']
# Candidate penalties: powers of ten from 1e-5 to 1e5.
penalties = [10**k for k in range(-5, 6)]

# (i) and (ii): plain least squares on all features and on the reduced set.
X = df[features]
ols_MSE = cv_mse(linear_model.LinearRegression(), X, Y)
X = df[bic_features]
few_MSE = cv_mse(linear_model.LinearRegression(), X, Y)

# (iii) and (iv): penalized regressions on all features; the reported "features"
# are those whose fitted coefficient is non-zero.
X = df[features]
ridge_MSE, ridge_coef = tune_penalty(linear_model.Ridge, X, Y, penalties)
ridge_features = features[ridge_coef != 0]
lasso_MSE, lasso_coef = tune_penalty(linear_model.Lasso, X, Y, penalties)
lasso_features = features[lasso_coef != 0]

print('(i)')
print("OLS without regularization")
print("Features:", features)
print("MSE:", ols_MSE, '\n')
print('(ii)')
print("OLS with fewer features and without regularization")
print("Features:", bic_features)
print("MSE:", few_MSE, '\n')
print('(iii)')
print("Ridge Model")
print("Features:", ridge_features)
print("MSE:", ridge_MSE, '\n')
print('(iv)')
print("Lasso Model")
print("Features:", lasso_features)
print("MSE:", lasso_MSE)
    
    
    







(i)
OLS without regularization
Features: ['Republican' 'Democrat' 'Division_East South Central'
 'Division_Middle Atlantic' 'Division_Mountain' 'Division_New England'
 'Division_Pacific' 'Division_South Atlantic' 'Division_West North Central'
 'Division_West South Central' 'Tenure' 'Tenure_sq' 'YOB']
MSE: 2.24922843234e+23 

(ii)
OLS with fewer features and without regularization
Features: ['Democrat', 'Tenure', 'Tenure_sq', 'YOB']
MSE: 2.2054274241e+23 

(iii)
Ridge Model
Features: ['Republican' 'Democrat' 'Division_East South Central'
 'Division_Middle Atlantic' 'Division_Mountain' 'Division_New England'
 'Division_Pacific' 'Division_South Atlantic' 'Division_West North Central'
 'Division_West South Central' 'Tenure' 'Tenure_sq' 'YOB']
MSE: 2.21424641709e+23 

(iv)
Lasso Model
Features: ['Republican' 'Democrat' 'Division_East South Central'
 'Division_Middle Atlantic' 'Division_Mountain' 'Division_New England'
 'Division_Pacific' 'Division_South Atlantic' 'Division_West North Centra

