In [1]:
## Principal Component Regression (PCR) example
## Data-generating model: 2 random factors (plus a constant) and 4 explanatory variables
## To try: partial correlation for dropping (or adding) factors (variable importance)
###############
## To try: partial least squares (PLS) as an alternative to PCR
###############


import numpy as np
from numpy.testing import assert_array_almost_equal
import statsmodels.api as sm
from statsmodels.sandbox.tools import pca
from statsmodels.sandbox.tools.cross_val import LeaveOneOut


# Example: principal component regression
#
# Simulate data from a linear factor model:
#   f0      -- factor matrix: two standard-normal columns plus a column of ones
#   f2xcoef -- loadings mapping the 3 factor columns (rows) to 4 observed
#              variables (columns)
#   x0      -- observed regressors, f0 . f2xcoef plus normal noise (scale 0.1)
#   ytrue   -- sum of the three factor columns (all coefficients equal to 1)
#   y0      -- ytrue plus normal noise (scale 0.1)
nobs = 1000

# Fixed seed and a dedicated generator, so that re-running the notebook from a
# fresh kernel reproduces the same simulated data without touching NumPy's
# global random state.
SEED = 12345
rng = np.random.RandomState(SEED)

f0 = np.c_[rng.normal(size=(nobs,2)), np.ones((nobs,1))]

# Earlier loading matrices that were assigned and immediately overwritten in
# the original cell (kept here for reference only):
#   np.c_[np.repeat(np.eye(2),2,0),np.arange(4)[::-1]].T
#   [[ 1.,  1.,  0.,  0.],
#    [ 0.,  0.,  1.,  1.],
#    [ 3.,  2.,  1.,  0.]]
f2xcoef = np.array([[ 0.1,  3.,  1.,    0.],
                    [ 0.,  0.,  1.5,   0.1],
                    [ 3.,  2.,  1.,    0.]])
x0 = np.dot(f0, f2xcoef)
x0 += 0.1*rng.normal(size=x0.shape)
ytrue = np.dot(f0,[1., 1., 1.])
y0 = ytrue + 0.1*rng.normal(size=ytrue.shape)

# Run PCA on the simulated regressors; keepdim=0 is passed through unchanged
# (the printed factor rows below have as many columns as x0).
pca_out = pca(x0, keepdim=0)
xred, fact, eva, eve = pca_out

# Show the eigenvector matrix, then the first five rows of the estimated
# factors next to the first five rows of the true factors for comparison.
for preview in (eve, fact[:5], f0[:5]):
    print(preview)



[[-0.03219213  0.01492453  0.28226119 -0.95868115]
 [-0.92145084  0.38637527  0.00882484  0.03955522]
 [-0.38711382 -0.91994608 -0.05909019 -0.0187201 ]
 [-0.0059076  -0.06473493  0.95747537  0.28109677]]
[[-1.01455946  2.37093862 -0.02738503  0.01241164]
 [-2.88891627 -0.51719545 -0.03634978  0.00765704]
 [ 0.90129992  1.04239594  0.15590092 -0.1337181 ]
 [ 0.32287412 -0.77497077  0.0544092   0.05001902]
 [ 4.24145574  0.17370851  0.06281221 -0.1798406 ]]
[[ 0.65915556 -1.59922571  1.        ]
 [ 0.80641896  0.55457505  1.        ]
 [-0.11311905 -0.73543379  1.        ]
 [-0.18999328  0.47237091  1.        ]
 [-1.23079945 -0.41157448  1.        ]]


In [2]:
import statsmodels.api as sm

# Benchmark: regress y0 on all original regressors, with the constant
# appended as the last column (prepend=False).
exog_full = sm.add_constant(x0, prepend=False)
res = sm.OLS(y0, exog_full).fit()

print('OLS on original data')
# Coefficients, AIC and R-squared of the benchmark fit, one per line.
for stat in (res.params, res.aic, res.rsquared):
    print(stat)

#print 'OLS on Factors'
#for k in range(x0.shape[1]):
#    xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
#    fact_wconst = sm.add_constant(fact)
#    res = sm.OLS(y0, fact_wconst).fit()
#    print 'k =', k
#    print res.params
#    print 'aic:  ', res.aic
#    print 'bic:  ', res.bic
#    print 'llf:  ', res.llf
#    print 'R2    ', res.rsquared
#    print 'R2 adj', res.rsquared_adj

print('OLS on Factors')
# Compute all principal components once; the first k columns of `fact` are
# then used as regressors. Per the original note this is faster than calling
# pca(x0, keepdim=k, normalize=1) for every k and gives the same result.
xred, fact, eva, eve  = pca(x0, keepdim=0, normalize=1)

# One row per k: [k, AIC, BIC, adjusted R2, leave-one-out squared error sum]
results = []
for k in range(0, x0.shape[1]+1):
    # k = 0 selects no factor columns, i.e. a regression on the constant only.
    fact_wconst = sm.add_constant(fact[:,:k], prepend=False)
    res = sm.OLS(y0, fact_wconst).fit()

    # Leave-one-out cross-validation: refit on the in-sample index and
    # accumulate the squared prediction error on the held-out index.
    prederr2 = 0.
    for inidx, outidx in LeaveOneOut(len(y0)):
        resl1o = sm.OLS(y0[inidx], fact_wconst[inidx,:]).fit()
        l1o_resid = y0[outidx] - resl1o.predict(fact_wconst[outidx,:])
        # Reduce to a scalar before accumulating. Adding the raw squared
        # residual would turn prederr2 into an array and make the `results`
        # rows ragged, which np.array() no longer converts on recent NumPy.
        prederr2 += np.sum(l1o_resid**2.)
    results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

results = np.array(results)
print(results)
print('best result for k, by AIC, BIC, R2_adj, L1O')
# Row index equals k here because rows were appended for k = 0, 1, 2, ...
# AIC, BIC and the L1O error are minimized; adjusted R2 is maximized.
print(np.r_[(np.argmin(results[:,1:3],0), np.argmax(results[:,3],0),
             np.argmin(results[:,-1],0))])



OLS on original data
[-0.00618973  0.11242093  0.65876826  0.09660511  0.13152815]
-1433.74985251
0.993185715098
OLS on Factors
[[  0.00000000e+00   3.54698430e+03   3.55189205e+03   1.11022302e-16
    2.03217822e+03]
 [  1.00000000e+00   2.40839683e+03   2.41821234e+03   6.80048534e-01
    6.50874761e+02]
 [  2.00000000e+00  -1.43515612e+03  -1.42043285e+03   9.93154313e-01
    1.39372612e+01]
 [  3.00000000e+00  -1.43529438e+03  -1.41566336e+03   9.93162076e-01
    1.39357565e+01]
 [  4.00000000e+00  -1.43374985e+03  -1.40921108e+03   9.93158321e-01
    1.39579586e+01]]
best result for k, by AIC, BIC, R2_adj, L1O
[3 2 3 3]


In [3]:
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt,
                        default_latex_fmt, default_html_fmt)

# Column titles, matching the column order of `results`.
headers = ['k', 'AIC', 'BIC', 'R2_adj', 'L1O']
# Integer format for the k column, three decimals for the four statistics.
numformat = ['%6d'] + 4*['%10.3f'] #'%10.4f'
txt_fmt1 = {'data_fmts': numformat}
tabl = SimpleTable(results, headers, None, txt_fmt=txt_fmt1)

# Title lines, then the table itself, then the explanatory notes;
# each entry is printed on its own line in this order.
report_parts = (
    "PCA regression on simulated data,",
    "DGP: 2 factors and 4 explanatory variables",
    tabl,
    "Notes: k is number of components of PCA,",
    "       constant is added additionally",
    "       k=0 means regression on constant only",
    "       L1O: sum of squared prediction errors for leave-one-out",
)
for part in report_parts:
    print(part)

PCA regression on simulated data,
DGP: 2 factors and 4 explanatory variables
  k       AIC        BIC       R2_adj      L1O    
--------------------------------------------------
     0   3546.984   3551.892      0.000   2032.178
     1   2408.397   2418.212      0.680    650.875
     2  -1435.156  -1420.433      0.993     13.937
     3  -1435.294  -1415.663      0.993     13.936
     4  -1433.750  -1409.211      0.993     13.958
--------------------------------------------------
Notes: k is number of components of PCA,
       constant is added additionally
       k=0 means regression on constant only
       L1O: sum of squared prediction errors for leave-one-out
