## Lecture 04 - Generalized Linear Model

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import statsmodels.api as sm


## Generalized Linear Model: Gaussian Distribution
### Data Synthesize

In [None]:
X=np.random.randn(5000,3)
Y=.3*X[:,[0]]+X[:,[1]]+2*X[:,[2]]+2+np.random.randn(5000,1)

In [None]:
X.shape

In [None]:
Y

### Build GLM in statsmodel

##### Understanding p-value
- A p-value less than 0.05 (typically ≤ 0.05) is statistically significant
- A p-value higher than 0.05 (> 0.05) is not statistically significant 

In [29]:
X_pd=pd.DataFrame(X)
X_pd = sm.add_constant(X_pd) # Create the matrix that will lead to beta0 coefficient
Y_pd=pd.DataFrame(Y)

In [30]:
# Instantiate a gaussian family model with the default link function.
gaussian_model = sm.GLM(Y_pd, X_pd, family=sm.families.Gaussian())
gaussian_model_results = gaussian_model.fit()

In [31]:
print(gaussian_model_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      0   No. Observations:                 5000
Model:                            GLM   Df Residuals:                     4996
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                          1.0185
Method:                          IRLS   Log-Likelihood:                -7138.4
Date:                Sun, 09 Aug 2020   Deviance:                       5088.2
Time:                        16:10:50   Pearson chi2:                 5.09e+03
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0150      0.014    141.178      0.0

### Compare with LR in sklearn

In [32]:
from sklearn.linear_model import LinearRegression
reg=LinearRegression().fit(X, Y)
print([reg.coef_, reg.intercept_])

[array([[0.29449822, 0.97206461, 2.00689848]]), array([2.01502453])]


## Generalized Linear Model: Binomial Distribution


### Data Synthesize

In [33]:
X1 = 2*np.random.randn(5000, 1)
X2 = 5*np.random.randn(5000, 1)
X3 = np.random.randn(5000, 1)
eta=0.5*X1+0.1*X2+1.56*X3-1
X=np.column_stack([X1,X2, X3])
p=1/(1+np.exp(-eta))
y=np.random.binomial(1, p).reshape(5000,1)

In [38]:
## X_pd=pd.DataFrame(X)
X_pd = pd.DataFrame(X)
X_pd = sm.add_constant(X_pd) # Create the [X,1] matrix
Y_pd = pd.DataFrame(y)

### Build GLM in statsmodel

In [41]:
# Instantiate a gaussian family model with the default link function.
binomial_model = sm.GLM(Y_pd, X_pd, family=sm.families.Binomial())
binomial_model_results = binomial_model.fit()

In [43]:
print(binomial_model_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      0   No. Observations:                 5000
Model:                            GLM   Df Residuals:                     4996
Model Family:                Binomial   Df Model:                            3
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2243.4
Date:                Sun, 09 Aug 2020   Deviance:                       4486.7
Time:                        16:18:45   Pearson chi2:                 5.24e+03
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.9606      0.041    -23.589      0.0

### Compare with logistic regression using sklearn

In [44]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression(penalty='none', solver='newton-cg')
lr.fit(X,y)

  return f(**kwargs)


LogisticRegression(penalty='none', solver='newton-cg')

In [45]:
print(lr.coef_, lr.intercept_)

[[0.51062385 0.09632258 1.58968119]] [-0.96056832]
