In [1]:
import numpy as np
from families import Gaussian, Bernoulli, Poisson
from glm import GLM
from simulate import Simulation

import statsmodels.api as sm
import statsmodels

  from pandas.core import datetools


In [2]:
# Design matrix: an intercept column followed by two independent
# Uniform[0, 1) predictors (drawn in column order).
N = 10000
X = np.column_stack([
    np.ones(N),
    np.random.uniform(size=N),
    np.random.uniform(size=N),
])
# Linear predictor with generating coefficients (1, -2, 1); it is reused as
# the linear predictor for the Gaussian, logistic and Poisson examples.
nu = 1.0 - 2.0 * X[:, 1] + X[:, 2]

## Linear Model

In [3]:
# Gaussian response: the linear predictor plus standard-normal noise.
y = nu + np.random.normal(size=N)
model = GLM(family=Gaussian())
# fit() leaves the estimates on `model` itself (the following cells read
# coef_ etc. from it); the value it returns is what the cell echoes.
model.fit(X, y)

<glm.GLM at 0x114acfba8>

In [4]:
# Estimated coefficients; the data were generated with (1, -2, 1).
model.coef_

array([ 1.01238386, -2.02692086,  1.00498161])

In [5]:
# Estimated covariance matrix of the coefficients; the square roots of its
# diagonal are the standard errors displayed in the next cell.
model.coef_covariance_matrix_

array([[  7.29406396e-04,  -6.25511443e-04,  -6.28934787e-04],
       [ -6.25511443e-04,   1.26429000e-03,  -5.74527754e-06],
       [ -6.28934787e-04,  -5.74527754e-06,   1.26649767e-03]])

In [6]:
# Coefficient standard errors; compare with the "std err" column of the
# statsmodels OLS summary below.
model.coef_standard_error_

array([ 0.02700752,  0.03555686,  0.03558789])

In [7]:
# Cross-check against statsmodels OLS on the same (y, X). X already contains
# the intercept column, so no constant is added here.
mod = sm.OLS(y, X)
res = mod.fit()
# The coef / std err columns should agree with model.coef_ and
# model.coef_standard_error_ above.
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     2016.
Date:                Wed, 30 Aug 2017   Prob (F-statistic):               0.00
Time:                        20:47:58   Log-Likelihood:                -14421.
No. Observations:               10000   AIC:                         2.885e+04
Df Residuals:                    9997   BIC:                         2.887e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0124      0.027     37.485      0.0

## Run some simulations off the linear model.

In [8]:
# Wrap the fitted Gaussian model; the cells below call sample() and the two
# bootstrap methods on this object.
s = Simulation(model)

In [9]:
# Draw simulated values for the rows of X; the echoed result is a 2-D array.
# NOTE(review): presumably these are responses simulated from the fitted
# model -- confirm against Simulation.sample.
s.sample(X)

array([[  1.52380870e-02,  -1.43217420e+00,  -2.47216634e-01, ...,
         -1.24702957e+00,   1.44962731e+00,   4.80419269e-01],
       [ -1.51112035e+00,  -4.68894403e-01,   1.27018770e+00, ...,
          1.42758361e+00,   1.37558780e+00,  -3.65848322e-01],
       [ -4.93750695e-01,   1.75212205e-01,  -1.36323799e+00, ...,
          8.83007582e-01,   1.13984402e+00,   2.73590202e-01],
       ..., 
       [ -1.01897710e+00,  -1.20377889e+00,   1.25665156e+00, ...,
          2.00152570e+00,   2.04755658e+00,  -7.39501462e-01],
       [  4.57611140e-01,   3.61635944e-01,   5.58558758e-01, ...,
         -3.48736085e-03,   8.05166983e-01,  -8.36167764e-01],
       [  1.59743753e+00,  -2.16955668e+00,  -6.52000738e-01, ...,
          3.72824876e+00,   9.99491400e-01,  -1.18634680e+00]])

In [10]:
# Parametric bootstrap: ask the simulation for 10 refitted models and print
# each one's coefficients to see their spread around (1, -2, 1).
models = s.parametric_bootstrap(X, n_sim=10)
# The loop variable is deliberately not called `model`: that name holds the
# fitted model this notebook inspects and wraps in `Simulation(model)`, and
# rebinding it here would silently swap it for the last bootstrap refit.
for boot_model in models:
    print(boot_model.coef_)

[ 1.02525081 -2.01630202  0.99422129]
[ 1.05504776 -2.03712266  0.9590787 ]
[ 0.94436376 -1.97377019  1.06187187]
[ 1.05377584 -2.09266306  1.0103103 ]
[ 0.987111   -2.04210078  1.03700606]
[ 1.03787618 -2.07612538  1.01219514]
[ 1.06335037 -2.11171194  0.97541253]
[ 0.97449525 -2.01537208  1.03413366]
[ 1.00163023 -1.99427679  1.01124415]
[ 1.05123197 -2.06726162  0.99822628]


In [11]:
# Non-parametric bootstrap: 10 refits driven by the observed (X, y) pairs;
# print each refit's coefficients to see their spread around (1, -2, 1).
models = s.non_parametric_bootstrap(X, y, n_sim=10)
# The loop variable is deliberately not called `model`: that name holds the
# fitted model this notebook inspects and wraps in `Simulation(model)`, and
# rebinding it here would silently swap it for the last bootstrap refit.
for boot_model in models:
    print(boot_model.coef_)

[ 0.98654473 -2.00116224  1.04451165]
[ 1.02227717 -2.00770834  0.99704988]
[ 1.01737114 -2.00318522  0.94563499]
[ 1.03150124 -2.07023392  1.02932444]
[ 1.0103629  -2.02163142  1.00338946]
[ 1.01811914 -2.06185337  1.05102277]
[ 1.0365624  -2.08562508  0.97061296]
[ 1.01748239 -1.97761194  0.95211104]
[ 1.00125741 -2.04126103  1.06374633]
[ 1.0474316  -2.062493    0.98019981]


## Linear Model with Sample Weights

In [12]:
# One random weight per observation, drawn uniformly from [0, 2).
sample_weights = np.random.uniform(low=0, high=2, size=N)

In [13]:
# Refit the Gaussian model on the same (X, y), now with per-observation weights.
model = GLM(family=Gaussian())
# `model` is rebound to fit()'s return value; the next cell reads coef_ from it.
model = model.fit(X, y, sample_weights=sample_weights)

In [14]:
# Coefficients of the weighted fit; the generating values are still (1, -2, 1).
model.coef_

array([ 1.02894572, -2.04390337,  0.98124285])

## Logistic Model

In [15]:
# Success probabilities: the logistic (inverse-logit) transform of the
# linear predictor.
p = 1.0 / (1.0 + np.exp(-nu))
# One Bernoulli draw (a binomial with a single trial) per observation.
y_logistic = np.random.binomial(n=1, p=p, size=N)

In [16]:
# Logistic regression: a GLM with the Bernoulli family on the 0/1 responses.
model = GLM(family=Bernoulli())
model.fit(X, y_logistic)

<glm.GLM at 0x114b03f98>

In [17]:
# Estimated coefficients on the logit scale; the generating values are (1, -2, 1).
model.coef_

array([ 0.98686695, -1.99342588,  1.02711229])

In [18]:
# Dispersion of the fit; it is reported as 1.0 for the Bernoulli family.
model.dispersion_

array(1.0)

In [19]:
# Estimated covariance matrix of the logistic coefficients; the square roots
# of its diagonal are the standard errors displayed in the next cell.
model.coef_covariance_matrix_

array([[ 0.00325063, -0.00296448, -0.00251295],
       [-0.00296448,  0.0059812 , -0.00045033],
       [-0.00251295, -0.00045033,  0.00570695]])

In [20]:
# Coefficient standard errors; compare with the "std err" column of the
# statsmodels Logit summary below.
model.coef_standard_error_

array([ 0.05701431,  0.07733824,  0.07554435])

In [21]:
# Cross-check against statsmodels' Logit on the same data; X already contains
# the intercept column.
mod = sm.Logit(y_logistic, X)
# fit() also prints the optimizer's convergence message before the summary.
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.622692
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Wed, 30 Aug 2017   Pseudo R-squ.:                 0.06645
Time:                        20:47:59   Log-Likelihood:                -6226.9
converged:                       True   LL-Null:                       -6670.2
                                        LLR p-value:                3.137e-193
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9869      0.057     17.309      0.000       0.875       1.099
x1            -1.9934      0.

In [22]:
# Monte Carlo check of the reported standard errors: repeatedly redraw the
# Bernoulli response from the true probabilities `p`, refit, and measure the
# spread of the fitted coefficients across replicates.
n_sim = 1000
coefs = np.empty((n_sim, X.shape[1]))
for i in range(n_sim):
    # Replicate-local names keep `y_logistic` and `model` from the cells above
    # bound to the data and the fit whose results were displayed there.
    y_sim = np.random.binomial(1, p=p, size=N)
    sim_model = GLM(family=Bernoulli())
    sim_model.fit(X, y_sim)
    coefs[i, :] = sim_model.coef_

# Empirical standard deviation of each coefficient; compare with
# model.coef_standard_error_ from the logistic fit.
print(coefs.std(axis=0))

[ 0.05922469  0.07960627  0.07477743]


## Poisson Model

In [23]:
# Poisson means: the log link inverted on the linear predictor.
mu = np.exp(nu)
# One Poisson count per observation, each with its own mean.
y_poisson = np.random.poisson(mu, size=N)

In [24]:
# Poisson regression on the simulated counts. `Poisson` here is the family
# imported from `families`, not the statsmodels model class.
model = GLM(family=Poisson())
model.fit(X, y_poisson)

<glm.GLM at 0x116c05780>

In [25]:
# Estimated coefficients on the log scale; the generating values are (1, -2, 1).
model.coef_

array([ 0.97314694, -1.97317453,  1.00622402])

In [26]:
# Estimated covariance matrix of the Poisson coefficients; the square roots
# of its diagonal are the standard errors displayed in the next cell.
model.coef_covariance_matrix_

array([[  3.48275471e-04,  -2.48972705e-04,  -3.66373025e-04],
       [ -2.48972705e-04,   7.29866279e-04,  -3.54697398e-06],
       [ -3.66373025e-04,  -3.54697398e-06,   6.33602735e-04]])

In [27]:
# Coefficient standard errors; compare with the "std err" column of the
# statsmodels Poisson summary below.
model.coef_standard_error_

array([ 0.01866214,  0.02701604,  0.02517147])

In [28]:
# Cross-check against statsmodels' Poisson regression on the same data.
# The class is reached through `sm`, like sm.OLS and sm.Logit above, instead
# of the attribute path statsmodels.discrete.discrete_model, which only
# resolves when that submodule happens to have been imported already.
# (sm.Poisson is the statsmodels model; the bare name `Poisson` in this
# notebook is the GLM family imported from `families`.)
mod = sm.Poisson(y_poisson, X)
# fit() also prints the optimizer's convergence message before the summary.
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.588687
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Wed, 30 Aug 2017   Pseudo R-squ.:                  0.1902
Time:                        20:48:04   Log-Likelihood:                -15887.
converged:                       True   LL-Null:                       -19618.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9731      0.019     52.145      0.000       0.937       1.010
x1            -1.9732      0.

## Poisson with Exposures

In [29]:
# Per-unit-exposure Poisson rate from the linear predictor.
mu = np.exp(nu)
# Random exposure for each observation, uniform on [0, 10).
expos = np.random.uniform(low=0, high=10, size=N)
# Counts whose mean is the rate scaled by the exposure.
y_poisson = np.random.poisson(lam=mu * expos, size=N)

In [30]:
# Poisson regression with exposures: the counts were simulated with mean
# mu * expos, i.e. log-mean nu + log(expos), so log(expos) is passed as an offset.
model = GLM(family=Poisson())
model.fit(X, y_poisson, offset=np.log(expos))

<glm.GLM at 0x114b03470>

In [31]:
# Coefficients of the offset fit; they describe the per-unit-exposure log
# rate, which was generated with (1, -2, 1).
model.coef_

array([ 1.00615836, -2.00238427,  0.99512772])

## Linear Model with Correlated Predictors

In [32]:
# A much smaller design in which the second predictor is built from the first
# (half of it plus Uniform[-0.5, 0.5) noise), so the two are correlated.
# Note this rebinds N, X and nu for the rest of the notebook.
N = 100
X = np.column_stack([
    np.ones(N),
    np.random.uniform(size=N),
    np.random.uniform(-0.5, 0.5, size=N),
])
X[:, 2] += 0.5 * X[:, 1]
# Same generating coefficients (1, -2, 1) as before.
nu = 1.0 - 2.0 * X[:, 1] + X[:, 2]

In [33]:
# Gaussian response on the small, correlated design: linear predictor plus
# standard-normal noise. This rebinds `y` and `model`.
y = nu + np.random.normal(size=N)
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.GLM at 0x114b03358>

In [34]:
# Estimated coefficients; generated with (1, -2, 1), but with only 100 rows
# and correlated predictors expect them to be much noisier than before.
model.coef_

array([ 0.63541665, -1.6562498 ,  1.27005124])

In [35]:
# Estimated covariance matrix of the coefficients. Unlike the first linear
# fit, the entry linking the two predictors is no longer close to zero,
# reflecting the correlation built into X[:, 2].
model.coef_covariance_matrix_

array([[ 0.03694858, -0.05316921, -0.00696023],
       [-0.05316921,  0.12806291, -0.04196057],
       [-0.00696023, -0.04196057,  0.11609893]])

In [36]:
# Coefficient standard errors; compare with the "std err" column of the
# statsmodels OLS summary below.
model.coef_standard_error_

array([ 0.19222013,  0.35785879,  0.34073294])

In [37]:
# Cross-check the correlated-predictor fit against statsmodels OLS on the
# same (y, X); X already contains the intercept column.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.215
Model:                            OLS   Adj. R-squared:                  0.199
Method:                 Least Squares   F-statistic:                     13.29
Date:                Wed, 30 Aug 2017   Prob (F-statistic):           7.89e-06
Time:                        20:48:04   Log-Likelihood:                -135.47
No. Observations:                 100   AIC:                             276.9
Df Residuals:                      97   BIC:                             284.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.6354      0.192      3.306      0.0