## Lecture 04 - Generalized Linear Model

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import statsmodels.api as sm


## Generalized Linear Model: Gaussian Distribution
### Data Synthesize

In [29]:
X=np.random.randn(5000,3)
Y=.3*X[:,[0]]+X[:,[1]]+2*X[:,[2]]+2+np.random.randn(5000,1)

In [30]:
X_pd=pd.DataFrame(X)
X_pd = sm.add_constant(X_pd) # Create the [X,1] matrix
Y_pd=pd.DataFrame(Y)

### Build GLM in statsmodel

In [33]:
# Instantiate a gaussian family model with the default link function.
gaussian_model = sm.GLM(Y_pd, X_pd, family=sm.families.Gaussian())
gaussian_model_results = gaussian_model.fit()

In [34]:
print(gaussian_model_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      0   No. Observations:                 5000
Model:                            GLM   Df Residuals:                     4996
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                         0.96845
Method:                          IRLS   Log-Likelihood:                -7012.5
Date:                Wed, 18 Mar 2020   Deviance:                       4838.4
Time:                        21:02:35   Pearson chi2:                 4.84e+03
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0043      0.014    143.971      0.0

### Compare with LR in sklearn

In [49]:
from sklearn.linear_model import LinearRegression
X_1=np.column_stack([X, np.ones([5000,1])])
reg=LinearRegression().fit(X_1, Y)
print([reg.coef_, reg.intercept_])

[array([[0.30788169, 1.01981942, 2.02757221, 0.        ]]), array([2.0042798])]


## Generalized Linear Model: Binomial Distribution


### Data Synthesize

In [62]:
X1 = 2*np.random.randn(5000, 1)
X2 = 5*np.random.randn(5000, 1)
X3 = np.random.randn(5000, 1)
eta=0.5*X1+0.1*X2+1.56*X3-1
X=np.column_stack([X1,X2, X3])
p=1/(1+np.exp(-eta))
y=np.random.binomial(1, p).reshape(5000,1)

In [63]:
## X_pd=pd.DataFrame(X)
X_pd = pd.DataFrame(X)
X_pd = sm.add_constant(X_pd) # Create the [X,1] matrix
Y_pd = pd.DataFrame(y)

### Build GLM in statsmodel

In [64]:
# Instantiate a gaussian family model with the default link function.
binomial_model = sm.GLM(Y_pd, X_pd, family=sm.families.Binomial())
binomial_model_results = binomial_model.fit()

In [65]:
binomial_model_results.summary()

0,1,2,3
Dep. Variable:,0,No. Observations:,5000.0
Model:,GLM,Df Residuals:,4996.0
Model Family:,Binomial,Df Model:,3.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2195.8
Date:,"Wed, 18 Mar 2020",Deviance:,4391.5
Time:,21:15:06,Pearson chi2:,5330.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.0558,0.042,-25.137,0.000,-1.138,-0.973
0,0.5215,0.022,23.408,0.000,0.478,0.565
1,0.1107,0.008,14.013,0.000,0.095,0.126
2,1.5184,0.051,29.949,0.000,1.419,1.618


### Compare with logistic regression using sklearn

In [78]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(penalty='none', solver='newton-cg')
lr.fit(X,y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [79]:
print(lr.coef_, lr.intercept_)

[[0.5214664  0.11069083 1.51836008]] [-1.05578799]
