Overfitting examples in Python and statsmodels

In [6]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

In [7]:
# Fix NumPy's global random state so the noise drawn below is reproducible.
np.random.seed(seed=414)

In [8]:
# Toy data: 1000 evenly spaced X values on [0, 15]; y is a sine wave of
# amplitude 3 plus Gaussian draws centred on the line 1 + X (std dev 0.2).
X = np.linspace(0, 15, num=1000)
y = 3 * np.sin(X) + np.random.normal(loc=1 + X, scale=0.2, size=1000)

In [9]:
# Ordered (unshuffled) split at index 700: the first 700 points are used for
# training and the remaining 300 for testing. Because X is sorted ascending,
# the test set contains only the largest X values.
train_X, test_X = np.split(X, [700])
train_y, test_y = np.split(y, [700])

In [13]:
# Peek at the first seven grid points to check the spacing of X.
X[0:7]

array([ 0.        ,  0.01501502,  0.03003003,  0.04504505,  0.06006006,
        0.07507508,  0.09009009])

In [29]:
# Wrap each split in a DataFrame with an "X" column and a "y" column so the
# arrays can be referred to by name in the statsmodels formulas used below
# (e.g. 'y ~ 1 + X'). train_df is built from train_X/train_y and test_df from
# test_X/test_y.
train_df = pd.DataFrame({"X": train_X, "y": train_y})
test_df = pd.DataFrame({"X": test_X, "y": test_y})

In [45]:
# Linear fit: ordinary least squares of y on an intercept and X,
# estimated on the training split only.
poly_lin = (
    smf.ols(data=train_df, formula="y ~ 1 + X")
    .fit()
)
poly_lin.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.642
Model:,OLS,Adj. R-squared:,0.642
Method:,Least Squares,F-statistic:,1254.0
Date:,"Sun, 13 Sep 2015",Prob (F-statistic):,5.52e-158
Time:,17:57:05,Log-Likelihood:,-1483.4
No. Observations:,700,AIC:,2971.0
Df Residuals:,698,BIC:,2980.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.9959,0.152,13.104,0.000,1.697 2.295
X,0.8896,0.025,35.405,0.000,0.840 0.939

0,1,2,3
Omnibus:,701.108,Durbin-Watson:,0.02
Prob(Omnibus):,0.0,Jarque-Bera (JB):,52.98
Skew:,-0.259,Prob(JB):,3.13e-12
Kurtosis:,1.756,Cond. No.,12.4


In [40]:
# Quadratic fit: adds an X**2 term to the linear model. I(...) makes the
# formula parser treat X**2 as plain arithmetic rather than formula syntax.
poly_quad = (
    smf.ols(data=train_df, formula="y ~ 1 + X + I(X**2)")
    .fit()
)
poly_quad.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.666
Model:,OLS,Adj. R-squared:,0.665
Method:,Least Squares,F-statistic:,694.4
Date:,"Sun, 13 Sep 2015",Prob (F-statistic):,1.25e-166
Time:,17:49:56,Log-Likelihood:,-1459.6
No. Observations:,700,AIC:,2925.0
Df Residuals:,697,BIC:,2939.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,3.1458,0.221,14.261,0.000,2.713 3.579
X,0.2313,0.097,2.382,0.017,0.041 0.422
I(X ** 2),0.0627,0.009,7.004,0.000,0.045 0.080

0,1,2,3
Omnibus:,1210.467,Durbin-Watson:,0.022
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49.911
Skew:,-0.091,Prob(JB):,1.45e-11
Kurtosis:,1.705,Cond. No.,160.0


In [41]:
# Cubic fit: intercept, X, X**2 and X**3 terms, estimated on the training split.
poly_cubic = (
    smf.ols(data=train_df, formula="y ~ 1 + X + I(X**2) + I(X**3)")
    .fit()
)
poly_cubic.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.731
Model:,OLS,Adj. R-squared:,0.73
Method:,Least Squares,F-statistic:,631.2
Date:,"Sun, 13 Sep 2015",Prob (F-statistic):,4.7400000000000005e-198
Time:,17:49:57,Log-Likelihood:,-1383.4
No. Observations:,700,AIC:,2775.0
Df Residuals:,696,BIC:,2793.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,5.4049,0.263,20.527,0.000,4.888 5.922
X,-2.3609,0.217,-10.859,0.000,-2.788 -1.934
I(X ** 2),0.6806,0.048,14.133,0.000,0.586 0.775
I(X ** 3),-0.0392,0.003,-13.013,0.000,-0.045 -0.033

0,1,2,3
Omnibus:,235.432,Durbin-Watson:,0.027
Prob(Omnibus):,0.0,Jarque-Bera (JB):,38.208
Skew:,-0.179,Prob(JB):,5.05e-09
Kurtosis:,1.913,Cond. No.,2220.0


In [63]:
# Predict y for the held-out rows with the linear model. predict() only
# returns the predicted values; it does not report an out-of-sample R^2.
# TODO: compute the test-set R^2 from these predictions and test_df["y"].
y_pred = poly_lin.predict(exog=test_df)
y_pred[0:10]

array([ 11.34605278,  11.35941012,  11.37276745,  11.38612479,
        11.39948213,  11.41283947,  11.4261968 ,  11.43955414,
        11.45291148,  11.46626882])

In [64]:
# Show only the first ten held-out rows instead of dumping the whole frame;
# these are the rows to compare against the y_pred[:10] values shown above.
test_df.head(10)

Unnamed: 0,X,y
0,10.510511,9.113557
1,10.525526,8.798065
2,10.540541,8.931348
3,10.555556,9.047734
4,10.570571,8.659095
5,10.585586,9.115196
6,10.600601,8.966861
7,10.615616,8.947426
8,10.630631,8.761166
9,10.645646,8.570319
