In [1]:
import numpy as np
import pandas as pd

from glm.families import Gaussian, Bernoulli, Poisson, Gamma
from glm.glm import GLM
from glm.simulation import Simulation

import statsmodels.api as sm
import statsmodels

  mplDeprecation)
  from pandas.core import datetools


In [2]:
# Seed NumPy's global RNG once, before the first draw, so the simulated data
# (and every later np.random call in this notebook) is reproducible under
# Restart Kernel -> Run All.
# NOTE(review): whether glm.simulation also draws from the global NumPy RNG is
# not visible here -- confirm if bootstrap outputs must be reproducible too.
SEED = 42
np.random.seed(SEED)

# Design matrix: an intercept column plus two independent Uniform(-1, 1) predictors.
N = 10000
X = np.empty(shape=(N, 3))
X[:, 0] = 1.0
X[:, 1] = np.random.uniform(-1, 1, size=N)
X[:, 2] = np.random.uniform(-1, 1, size=N)
# Linear predictor with true coefficients (1, -2, 1); reused by every model below.
nu = 1 - 2*X[:, 1] + X[:, 2]

## Linear Model

In [3]:
# Simulate a Gaussian response: the linear predictor nu plus standard-normal noise.
y = nu + np.random.normal(size=N)
# Fit the project's GLM with the Gaussian family on the raw design matrix
# (column 0 of X is the constant 1.0 intercept column).
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.glm.GLM at 0x10c81e710>

In [4]:
model.coef_

array([ 0.9897016 , -2.00250176,  0.98881371])

In [5]:
model.coef_covariance_matrix_

array([[  9.86688747e-05,   2.01537919e-07,   1.97002375e-06],
       [  2.01537919e-07,   2.94237716e-04,  -2.35181225e-06],
       [  1.97002375e-06,  -2.35181225e-06,   2.93528106e-04]])

In [6]:
model.coef_standard_error_

array([ 0.00993322,  0.01715336,  0.01713266])

In [7]:
# Cross-check: fit the same X and y with statsmodels OLS and print its summary
# so coefficients and standard errors can be compared with the GLM results.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.628
Model:                            OLS   Adj. R-squared:                  0.628
Method:                 Least Squares   F-statistic:                     8426.
Date:                Mon, 14 May 2018   Prob (F-statistic):               0.00
Time:                        12:33:31   Log-Likelihood:                -14120.
No. Observations:               10000   AIC:                         2.825e+04
Df Residuals:                    9997   BIC:                         2.827e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9897      0.010     99.636      0.0

In [8]:
res.params

array([ 0.9897016 , -2.00250176,  0.98881371])

In [9]:
res.bse

array([ 0.00993322,  0.01715336,  0.01713266])

## Linear Regression With Formula

In [10]:
# Tabular copy of the data for the formula interface: the two non-intercept
# predictor columns of X, followed by the response y as a third column.
df = pd.DataFrame(
    np.column_stack([X[:, 1:], y]),
    columns=['x1', 'x2', 'y'])

In [11]:
df.shape

(10000, 3)

In [12]:
# Refit the Gaussian GLM through the formula interface: pass the DataFrame and
# a formula string instead of separate X / y arrays.
model = GLM(family=Gaussian())
model.fit(df, formula='y ~ x1 + x2')

(10000, 3) (10000,)


In [13]:
model.coef_

array([ 1.00184986, -2.01060173,  0.99048869])

## Run some simulations off the linear model.

In [42]:
# Wrap the fitted Gaussian model in a Simulation; `s` is used below for
# sampling and for the parametric / non-parametric bootstraps.
s = Simulation(model)

In [43]:
s.sample(X)

array([[-0.6385895 ,  2.29472678,  0.08176893, ...,  1.10601752,
         0.59920728,  0.09341794],
       [ 0.32499603,  1.73482618, -1.45293292, ...,  2.92558588,
         0.71620149,  0.94013495],
       [-0.74676549,  2.36181837, -1.190973  , ...,  2.61877519,
         0.55880989,  2.05884194],
       ..., 
       [-1.37889391,  4.63900321,  0.0298945 , ...,  3.55784638,
        -0.20323784,  0.07847411],
       [-0.15098367,  1.78856174, -0.73574306, ...,  1.44032413,
        -0.39243053,  0.61754173],
       [ 1.54129168,  3.3750924 ,  1.37896777, ...,  2.53257773,
        -1.1463541 ,  1.19458986]])

In [44]:
# Run 10 parametric-bootstrap simulations and print the coefficient vector of
# each returned model.
# The loop variable is deliberately not called `model`: iterating with that
# name would rebind the fitted `model` to the last bootstrap refit.
models = s.parametric_bootstrap(X, n_sim=10)
for boot_model in models:
    print(boot_model.coef_)

[ 0.9892057  -2.03295006  1.00690766]
[ 0.99506121 -1.98011593  1.00548968]
[ 0.9835295  -2.00289649  0.99072403]
[ 0.99675136 -2.02370061  1.02929154]
[ 1.00079786 -2.02529517  0.97845097]
[ 1.01135732 -2.02512804  0.99858418]
[ 1.01918269 -2.02625949  1.00793594]
[ 0.99916922 -2.00504491  0.96884728]
[ 1.00905667 -2.01561444  0.99489761]
[ 0.98805873 -2.0185236   0.98136945]


In [45]:
# Run 10 non-parametric-bootstrap simulations on (X, y) and print the
# coefficient vector of each returned model.
# The loop variable is deliberately not called `model`: iterating with that
# name would rebind the fitted `model` to the last bootstrap refit.
models = s.non_parametric_bootstrap(X, y, n_sim=10)
for boot_model in models:
    print(boot_model.coef_)

[ 1.01264719 -1.99610455  1.01048795]
[ 1.00911871 -2.04295315  1.01438511]
[ 0.99151282 -2.01223277  0.98905954]
[ 1.03501069 -1.98892677  0.97779124]
[ 0.9958797  -2.01847068  1.0048134 ]
[ 1.0054847  -2.02734174  0.99105912]
[ 1.00124734 -1.98809205  1.01905124]
[ 1.01597021 -2.00890686  1.00690941]
[ 0.99350914 -2.03620659  1.01136322]
[ 0.99948622 -2.01363437  0.99951741]


## Linear Model with Sample Weights

In [46]:
# One random weight per observation, drawn from Uniform(0, 2).
sample_weights = np.random.uniform(0, 2, size=N)

In [47]:
# Gaussian GLM fit with per-observation sample weights.
model = GLM(family=Gaussian())
# Unlike the other fit cells, this one rebinds `model` to the return value of
# `fit` (a GLM object, judging by the repr the other fit cells display).
model = model.fit(X, y, sample_weights=sample_weights)

In [48]:
model.coef_

array([ 0.99402166, -2.01612674,  0.99481641])

## Logistic Model

In [49]:
# Inverse-logit of the linear predictor gives a success probability per row.
p = 1 / (1 + np.exp(-nu))
# One Bernoulli (binomial with a single trial) outcome per observation.
y_logistic = np.random.binomial(1, p=p, size=N)

In [50]:
# Logistic regression: fit the project's GLM with the Bernoulli family to the
# simulated 0/1 outcomes.
model = GLM(family=Bernoulli())
model.fit(X, y_logistic)

<glm.glm.GLM at 0x114425908>

In [51]:
model.coef_

array([ 0.95704048, -1.96055446,  1.02600515])

In [52]:
model.dispersion_

array(1.0)

In [53]:
model.coef_covariance_matrix_

array([[ 0.00069246, -0.00045444,  0.00023481],
       [-0.00045444,  0.00238459, -0.00048793],
       [ 0.00023481, -0.00048793,  0.00196631]])

In [54]:
model.coef_standard_error_

array([ 0.02631471,  0.04883231,  0.0443431 ])

In [55]:
# Cross-check: fit the same data with statsmodels Logit and print its summary
# for comparison with the Bernoulli GLM coefficients and standard errors.
mod = sm.Logit(y_logistic, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.506791
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                10000
Model:                          Logit   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Sun, 17 Sep 2017   Pseudo R-squ.:                  0.1990
Time:                        16:09:50   Log-Likelihood:                -5067.9
converged:                       True   LL-Null:                       -6326.8
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9570      0.026     36.369      0.000       0.905       1.009
x1            -1.9606      0.

In [56]:
# Wrap the fitted Bernoulli model in a Simulation; `s` is used below for
# sampling and for the parametric / non-parametric bootstraps.
s = Simulation(model)

In [57]:
s.sample(X, n_sim=10)

array([[ 0.,  1.,  0., ...,  1.,  0.,  1.],
       [ 1.,  0.,  0., ...,  1.,  1.,  0.],
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       ..., 
       [ 0.,  1.,  1., ...,  1.,  0.,  0.],
       [ 0.,  1.,  0., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  0.,  0.]])

In [58]:
# Print the coefficient vector of each of 10 parametric-bootstrap models.
# The loop variable is deliberately not called `model`: iterating with that
# name would rebind the fitted `model` to the last bootstrap refit.
for boot_model in s.parametric_bootstrap(X, n_sim=10):
    print(boot_model.coef_)

[ 0.99422083 -1.96298402  1.05235721]
[ 0.96443709 -1.99466032  1.03640602]
[ 1.00035789 -2.06225649  1.00416288]
[ 0.95647016 -1.9472292   1.09357851]
[ 0.93641543 -1.96767443  1.01963135]
[ 0.96543839 -1.95048238  1.0720573 ]
[ 0.97230804 -1.96945012  1.01792291]
[ 0.94768126 -2.01506523  1.12307292]
[ 0.98593321 -2.01224627  1.03240051]
[ 0.95445028 -1.95445635  1.01371649]


In [59]:
# Print the coefficient vector of each of 10 non-parametric-bootstrap models
# built from (X, y_logistic).
# The loop variable is deliberately not called `model`: iterating with that
# name would rebind the fitted `model` to the last bootstrap refit.
for boot_model in s.non_parametric_bootstrap(X, y_logistic, n_sim=10):
    print(boot_model.coef_)

[ 0.97689309 -1.97413283  1.10627714]
[ 0.95892487 -1.99413312  0.98665457]
[ 0.94129474 -1.94450942  1.00859525]
[ 0.93336539 -1.95264506  0.99312847]
[ 0.93940596 -1.91603988  0.99230598]
[ 0.95260326 -1.95432899  0.96347066]
[ 0.97404336 -1.95443378  1.07337741]
[ 0.93040456 -1.85128555  0.98612584]
[ 0.94250477 -1.9621798   1.02531186]
[ 0.9412844  -1.94538671  1.06160179]


## Poisson Model

In [60]:
# Log link: the Poisson mean is the exponential of the linear predictor.
mu = np.exp(nu)
# One Poisson count per observation with rate mu.
y_poisson = np.random.poisson(lam=mu, size=N)

In [61]:
# Poisson regression: fit the project's GLM with the Poisson family to the
# simulated counts.
model = GLM(family=Poisson())
model.fit(X, y_poisson)

<glm.glm.GLM at 0x114430978>

In [62]:
model.coef_

array([ 0.9993738 , -2.01557039,  0.98708617])

In [63]:
model.coef_covariance_matrix_

array([[  5.19786341e-05,   5.41592437e-05,  -1.88130624e-05],
       [  5.41592437e-05,   1.00811556e-04,  -9.71528570e-07],
       [ -1.88130624e-05,  -9.71528570e-07,   6.18642911e-05]])

In [64]:
model.coef_standard_error_

array([ 0.00720962,  0.0100405 ,  0.00786539])

In [65]:
# Cross-check: fit the same counts with statsmodels' Poisson model and print
# its summary for comparison with the GLM coefficients and standard errors.
# Use the public `sm.Poisson` alias (consistent with `sm.OLS` / `sm.Logit`)
# instead of reaching into `statsmodels.discrete.discrete_model`, which is only
# reachable from the bare package if something else already imported it.
mod = sm.Poisson(y_poisson, X)
res = mod.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 1.857216
         Iterations 19
                          Poisson Regression Results                          
Dep. Variable:                      y   No. Observations:                10000
Model:                        Poisson   Df Residuals:                     9997
Method:                           MLE   Df Model:                            2
Date:                Sun, 17 Sep 2017   Pseudo R-squ.:                  0.6578
Time:                        16:10:00   Log-Likelihood:                -18572.
converged:                       True   LL-Null:                       -54276.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.9994      0.007    138.617      0.000       0.985       1.014
x1            -2.0156      0

In [66]:
# Wrap the fitted Poisson model in a Simulation; `s` is used below for
# sampling and for the parametric / non-parametric bootstraps.
s = Simulation(model)

In [67]:
s.sample(X, n_sim=10)

array([[  1.,   7.,   0., ...,   5.,   2.,   2.],
       [  3.,   8.,   1., ...,   5.,   0.,   5.],
       [  1.,   6.,   0., ...,   7.,   0.,   2.],
       ..., 
       [  0.,  10.,   1., ...,  10.,   1.,   6.],
       [  2.,   4.,   0., ...,   4.,   1.,   2.],
       [  3.,   5.,   0., ...,   3.,   1.,   3.]])

In [68]:
# Print the coefficient vector of each of 10 parametric-bootstrap models.
# The loop variable is deliberately not called `model`: iterating with that
# name would rebind the fitted `model` to the last bootstrap refit.
for boot_model in s.parametric_bootstrap(X, n_sim=10):
    print(boot_model.coef_)

[ 0.9962561  -2.02223879  0.98520933]
[ 1.00174874 -2.016179    0.98438619]
[ 1.01444728 -2.00367046  0.97880814]
[ 0.99162838 -2.01637286  1.00481727]
[ 1.00802374 -2.01063381  0.96973567]
[ 1.00579857 -2.01884458  0.9866297 ]
[ 0.9960399  -2.01891399  0.99412272]
[ 0.99357372 -2.0331467   0.99903938]
[ 0.99830196 -2.01604988  0.98445944]
[ 1.00074698 -2.02003424  0.98114143]


In [36]:
# Print the coefficient vector of each of 10 non-parametric-bootstrap models
# built from (X, y_poisson).
# The loop variable is deliberately not called `model`: iterating with that
# name would rebind the fitted `model` to the last bootstrap refit.
for boot_model in s.non_parametric_bootstrap(X, y_poisson, n_sim=10):
    print(boot_model.coef_)

[ 0.99944037 -1.95435021  0.97540803]
[ 1.03624533 -1.96567739  0.91936164]
[ 0.98336717 -1.96622823  1.00794745]
[ 1.02681072 -1.9644957   0.92060739]
[ 1.01231914 -1.94929026  0.96059324]
[ 1.01857949 -1.96132365  0.95859381]
[ 0.99512153 -1.97303933  0.99006156]
[ 1.01719772 -1.93460717  0.9529537 ]
[ 0.99912808 -1.95763946  0.97810085]
[ 1.00010555 -1.94437559  0.97117927]


## Poisson with Exposures

In [37]:
# Per-unit-exposure rate from the log-linear predictor.
mu = np.exp(nu)
# Random exposure for each observation, drawn from Uniform(0, 10).
expos = np.random.uniform(0, 10, size=N)
# Counts with rate scaled by exposure; this rebinds `y_poisson` from the
# previous (exposure-free) section.
y_poisson = np.random.poisson(lam=(mu*expos), size=N)

In [38]:
# Poisson GLM with the log of the exposure passed as an offset, matching the
# rate mu * expos used to simulate the counts.
model = GLM(family=Poisson())
model.fit(X, y_poisson, offset=np.log(expos))

<glm.glm.GLM at 0x118155438>

In [39]:
model.coef_

array([ 1.00325959, -1.99621054,  0.98932295])

In [40]:
model.coef_standard_error_

array([ 0.0084044 ,  0.01200987,  0.01120648])

## Gamma Regression

In [41]:
# Mean of the response under a log link.
mu = np.exp(nu)
# Gamma draws with shape 2 and scale mu / 2, so the mean (shape * scale) is mu.
y_gamma = np.random.gamma(shape=2.0, scale=(mu / 2.0), size=N)

In [42]:
# Gamma regression: fit the project's GLM with the Gamma family.
gamma_model = GLM(family=Gamma())
gamma_model.fit(X, y_gamma)

<glm.glm.GLM at 0x118155358>

In [43]:
gamma_model.coef_

array([ 1.04538732, -1.99544524,  0.92854162])

In [44]:
gamma_model.coef_standard_error_

array([ 0.01960164,  0.02551379,  0.02568215])

In [45]:
gamma_model.dispersion_

0.54319477864783849

In [46]:
# Cross-check the Gamma fit against statsmodels' GLM with a Gamma family and
# log link, and print its summary.
# The statsmodels model gets its own name so the fitted project GLM held in
# `gamma_model` is not overwritten and the cells that inspect it stay re-runnable.
# NOTE(review): the link is passed as a class, exactly as the original cell
# did; newer statsmodels releases may expect an instance -- confirm against
# the installed version.
sm_gamma_model = sm.GLM(y_gamma, X,
                        family=sm.families.Gamma(
                            link=sm.families.links.log))
res = sm_gamma_model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                   Gamma   Df Model:                            2
Link Function:                    log   Scale:                  0.499038505471
Method:                          IRLS   Log-Likelihood:                -13910.
Date:                Sun, 17 Sep 2017   Deviance:                       5430.3
Time:                        14:47:52   Pearson chi2:                 4.99e+03
No. Iterations:                     5                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0454      0.019     55.641      0.000       1.009       1.082
x1            -1.9954      0.024    -81.597      0.0

## Exponential Regression

In [47]:
# Mean of the response under a log link.
mu = np.exp(nu)
# Exponential draws with scale (i.e. mean) mu.
y_exponential = np.random.exponential(scale=mu, size=N)

In [48]:
# Fit the exponential data with the Gamma family (there is no separate
# exponential family among the imported families).
exponential_model = GLM(family=Gamma())
exponential_model.fit(X, y_exponential)

<glm.glm.GLM at 0x11814d4a8>

In [49]:
exponential_model.coef_

array([ 1.02547471, -2.03635258,  0.99990048])

In [50]:
exponential_model.coef_standard_error_

array([ 0.02824993,  0.03677053,  0.03701316])

In [51]:
exponential_model.dispersion_

1.1282497110609833

In [52]:
# Cross-check the exponential-data fit against statsmodels' GLM with a Gamma
# family and log link, and print its summary.
# The statsmodels model gets its own name so the fitted project GLM held in
# `exponential_model` is not overwritten and the cells that inspect it stay
# re-runnable.
# NOTE(review): the link is passed as a class, exactly as the original cell
# did; newer statsmodels releases may expect an instance -- confirm against
# the installed version.
sm_exponential_model = sm.GLM(y_exponential, X,
                              family=sm.families.Gamma(
                                  link=sm.families.links.log))
res = sm_exponential_model.fit()
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                10000
Model:                            GLM   Df Residuals:                     9997
Model Family:                   Gamma   Df Model:                            2
Link Function:                    log   Scale:                   0.98383654671
Method:                          IRLS   Log-Likelihood:                -15006.
Date:                Sun, 17 Sep 2017   Deviance:                       11279.
Time:                        14:47:52   Pearson chi2:                 9.84e+03
No. Iterations:                     6                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0255      0.026     38.873      0.000       0.974       1.077
x1            -2.0364      0.034    -59.305      0.0

## Linear Model with Correlated Predictors

In [53]:
# Smaller dataset with correlated predictors. This rebinds N, X and nu from
# the earlier sections.
N = 1000
# Start from all ones so column 0 is already the intercept column.
X = np.ones(shape=(N, 3))
X[:, 1] = np.random.uniform(size=N)
# Second predictor is half of the first plus Uniform(-0.5, 0.5) noise, which
# makes the two predictors correlated.
X[:, 2] = 0.5*X[:, 1] + np.random.uniform(-0.5, 0.5, size=N)
# Same true coefficients (1, -2, 1) as before.
nu = 1 - 2*X[:, 1] + X[:, 2]

In [54]:
# Gaussian response on the correlated design: linear predictor plus
# standard-normal noise. This rebinds `y` from the first section.
y = nu + np.random.normal(size=N)
model = GLM(family=Gaussian())
model.fit(X, y)

<glm.glm.GLM at 0x118155160>

In [55]:
model.coef_

array([ 1.04961873, -2.09917034,  0.92403124])

In [56]:
model.coef_covariance_matrix_

array([[ 0.00390364, -0.00599316,  0.00019188],
       [-0.00599316,  0.01504632, -0.00598741],
       [ 0.00019188, -0.00598741,  0.01107235]])

In [57]:
model.coef_standard_error_

array([ 0.06247909,  0.12266344,  0.10522523])

In [58]:
# Cross-check: fit the correlated-predictor data with statsmodels OLS and
# print its summary for comparison with the GLM results.
mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.228
Model:                            OLS   Adj. R-squared:                  0.226
Method:                 Least Squares   F-statistic:                     146.9
Date:                Sun, 17 Sep 2017   Prob (F-statistic):           1.23e-56
Time:                        14:47:52   Log-Likelihood:                -1398.6
No. Observations:                1000   AIC:                             2803.
Df Residuals:                     997   BIC:                             2818.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0496      0.062     16.800      0.0