In [1]:
import pandas as pd
import statsmodels.api as sm

  from pandas.core import datetools


In [2]:
# Read the CES 2013 CSV (path is relative to this notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/ces2013.csv')

In [3]:
# Keep only the rows whose FDHO value is strictly positive.
df = df.loc[df['FDHO'] > 0]

In [4]:
# Simple OLS of FDHO on EXP; add_constant supplies the intercept column.
results = sm.OLS(endog=df['FDHO'], exog=sm.add_constant(df['EXP'])).fit()

In [5]:
# Show the summary tables of the fitted model; left as the cell's last
# expression so the notebook renders it as rich output.
results.summary()

0,1,2,3
Dep. Variable:,FDHO,R-squared:,0.351
Model:,OLS,Adj. R-squared:,0.351
Method:,Least Squares,F-statistic:,3431.0
Date:,"Mon, 21 Aug 2017",Prob (F-statistic):,0.0
Time:,00:41:29,Log-Likelihood:,-48748.0
No. Observations:,6334,AIC:,97500.0
Df Residuals:,6332,BIC:,97510.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,369.4418,10.657,34.666,0.000,348.550,390.333
EXP,0.0627,0.001,58.575,0.000,0.061,0.065

0,1,2,3
Omnibus:,2386.301,Durbin-Watson:,1.927
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21677.738
Skew:,1.546,Prob(JB):,0.0
Kurtosis:,11.519,Cond. No.,15900.0


In [6]:
# Show the fitted coefficients: the intercept (`const`) and the slope on EXP.
results.params

const    369.441758
EXP        0.062710
dtype: float64

In [23]:
# Fit one simple OLS per column of `df`, regressing that column on EXP and
# using only the rows where the column is strictly positive.
#
# `coefficients` maps the fitted EXP slope -> (column name, number of
# observations used), so the ranking cell can sort by slope.
# NOTE: because the slope is the key, two columns with exactly the same slope
# would overwrite each other.
coefficients = dict()
for col in df.columns:
    positive_rows = df[df[col] > 0]
    # Loop-local name: keeps the FDHO fit stored in `results` intact, so the
    # earlier summary/params cells still show the same model if re-run.
    col_fit = sm.OLS(positive_rows[col], sm.add_constant(positive_rows.EXP)).fit()
    # Read the slope by label. Integer keys on a label-indexed Series are a
    # deprecated positional fallback in recent pandas.
    exp_slope = col_fit.params['EXP']
    print('{}: n={}, coeff={}, r^2={}'.format(col, col_fit.nobs, exp_slope, col_fit.rsquared))
    coefficients[exp_slope] = (col, col_fit.nobs)

ADM: n=2815.0, coeff=0.023456941175069156, r^2=0.2284350368237611
CLOT: n=4500.0, coeff=0.03162324030372991, r^2=0.17597590606343272
DOM: n=1661.0, coeff=0.04085154968381848, r^2=0.13393598222011516
EDUC: n=561.0, coeff=0.12018391618376151, r^2=0.24050598130835077
ELEC: n=5828.0, coeff=0.013099041903089234, r^2=0.17953421169561956
EXP: n=6334.0, coeff=0.9999999999999987, r^2=1.0
FDAW: n=5102.0, coeff=0.05270448193571886, r^2=0.35399334623355383
FDHO: n=6334.0, coeff=0.0627098936358519, r^2=0.35142962523074006
FOOT: n=1827.0, coeff=0.005753449964792445, r^2=0.08218288667208551
FURN: n=487.0, coeff=0.05223110804940814, r^2=0.10240778264530093
GASO: n=5710.0, coeff=0.0372745871299681, r^2=0.2780620570495127
HEAL: n=4802.0, coeff=0.0574458973248711, r^2=0.1743169806355802
HHTENURE: n=6334.0, coeff=-6.177345852678266e-05, r^2=0.08306945165298596
HOUS: n=6223.0, coeff=0.19758171369926336, r^2=0.4692379671856196
ID: n=6334.0, coeff=-0.766506437423469, r^2=0.01404213179166125
LIFE: n=1253.0, c

In [32]:
# Walk the EXP slopes from largest to smallest and print, for each column,
# the rounded slope and the percentage of rows in `df` used for its fit.
total_rows = len(df)
ranked_slopes = sorted(coefficients, reverse=True)
for slope in ranked_slopes:
    col, n_obs = coefficients[slope]
    pct_used = '{0: .1f}%'.format(n_obs / total_rows * 100)
    print(col, round(slope, 4), pct_used)

EXP 1.0  100.0%
HOUS 0.1976  98.2%
EDUC 0.1202  8.9%
FDHO 0.0627  100.0%
HEAL 0.0574  75.8%
FDAW 0.0527  80.5%
FURN 0.0522  7.7%
TRIP 0.0466  8.1%
DOM 0.0409  26.2%
GASO 0.0373  90.1%
MAPP 0.0329  6.3%
CLOT 0.0316  71.0%
ADM 0.0235  44.4%
LIFE 0.0193  19.8%
TOB 0.0165  18.2%
TELE 0.016  91.4%
TOYS 0.0145  39.5%
ELEC 0.0131  92.0%
PERS 0.0069  60.3%
LOCT 0.0068  10.9%
FOOT 0.0058  28.8%
READ 0.0048  36.1%
SAPP 0.0045  16.4%
TEXT 0.004  15.7%
REFEDUC 0.0001  99.6%
SIZE 0.0001  100.0%
SIZEAM 0.0  77.3%
SIZEJM 0.0  17.6%
SIZEAF 0.0  83.5%
SIZEIN 0.0  6.7%
REGION 0.0  99.4%
SIZEJF -0.0  17.6%
REFRACE -0.0  100.0%
URBAN -0.0  100.0%
REFSEX -0.0  100.0%
REFAGE -0.0  100.0%
HHTENURE -0.0001  100.0%
REFMS -0.0001  100.0%
ID -0.7665  100.0%


We sort the coefficients of each feature and see that
- When people's income increases, they spend more on housing and education
- Followed by food and healthcare
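- However, note that each regression only uses the observations with non-zero spending in that category: the housing regression is fitted on 98.2% of observations, whereas the education regression uses only 8.9%. In other words, only ~8.9% of people spend anything on education, but among those who do, education spending rises substantially with income (about 0.12 per additional unit). Housing, by contrast, is a spending category for 98.2% of people, so its coefficient reflects almost the whole sample.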