In [3]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

In [4]:
# List every name exposed by the formula API module: the uppercase model
# classes and the lowercase formula-based constructors such as `ols`.
dir(smf)

['GEE',
 'GLM',
 'GLS',
 'GLSAR',
 'Logit',
 'MNLogit',
 'MixedLM',
 'NegativeBinomial',
 'NominalGEE',
 'OLS',
 'OrdinalGEE',
 'PHReg',
 'Poisson',
 'Probit',
 'QuantReg',
 'RLM',
 'WLS',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'gee',
 'glm',
 'gls',
 'glsar',
 'logit',
 'mixedlm',
 'mnlogit',
 'negativebinomial',
 'nominal_gee',
 'ols',
 'ordinal_gee',
 'phreg',
 'poisson',
 'probit',
 'quantreg',
 'rlm',
 'wls']

Formula-compatible models share the following generic call signature: `(formula, data, subset=None, *args, **kwargs)`

In [5]:
# Load the Guerry dataset from the R "HistData" package, keep only the four
# columns used in this tutorial, and drop rows with missing values.
df = (
    sm.datasets.get_rdataset("Guerry", "HistData")
    .data[['Lottery', 'Literacy', 'Wealth', 'Region']]
    .dropna()
)
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Lottery,Literacy,Wealth,Region
0,41,37,73,E
1,38,51,22,N
2,66,13,61,C
3,80,46,76,E
4,79,69,83,E


Fit the model:

In [8]:
# Specify an OLS model with a formula: Lottery is regressed on Literacy,
# Wealth and Region, with columns looked up by name in `df`.
mod = smf.ols(formula='Lottery ~ Literacy + Wealth + Region', data=df)

# Estimate the model; `res` holds the fitted results.
res = mod.fit()

# Print the plain-text regression summary table.
print(res.summary())


                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     6.636
Date:                Sun, 29 Jul 2018   Prob (F-statistic):           1.07e-05
Time:                        16:06:26   Log-Likelihood:                -375.30
No. Observations:                  85   AIC:                             764.6
Df Residuals:                      78   BIC:                             781.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      38.6517      9.456      4.087      

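String-valued columns such as Region are treated as categorical automatically, which is why the fit above already has Df Model = 6 (Literacy, Wealth and four Region indicators). To mark a variable as categorical explicitly (for example an integer-coded category), wrap it in the uppercase `C()` operator. For Region this yields the same model: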

In [12]:
# Same regression as before, with Region explicitly wrapped in C() to mark it
# as categorical; fit immediately and keep only the results object.
res = smf.ols(formula='Lottery ~ Literacy + Wealth + C(Region)', data=df).fit()

# Show the estimated coefficients as a pandas Series.
print(res.params)

Intercept         38.651655
C(Region)[T.E]   -15.427785
C(Region)[T.N]   -10.016961
C(Region)[T.S]    -4.548257
C(Region)[T.W]   -10.091276
Literacy          -0.185819
Wealth             0.451475
dtype: float64


Terms can be removed from the model with the "-" operator. For example, "- 1" drops the intercept, so every Region level gets its own coefficient:

In [13]:
# "- 1" removes the intercept term from the formula, so each Region level
# receives its own coefficient instead of a contrast against a baseline.
res = smf.ols(formula='Lottery ~ Literacy + Wealth + C(Region) - 1', data=df).fit()

# Show the estimated coefficients.
print(res.params)

C(Region)[C]    38.651655
C(Region)[E]    23.223870
C(Region)[N]    28.634694
C(Region)[S]    34.103399
C(Region)[W]    28.560379
Literacy        -0.185819
Wealth           0.451475
dtype: float64


Multiplicative interactions
“:” adds a new column to the design matrix with the product of the other two columns. “*” will also include the individual columns that were multiplied together:

In [14]:
# ":" keeps only the product term Literacy:Wealth (intercept removed by "- 1").
res1 = smf.ols(formula='Lottery ~ Literacy : Wealth - 1', data=df).fit()
print(res1.params)

# "*" expands to both main effects plus their product term.
res2 = smf.ols(formula='Lottery ~ Literacy * Wealth - 1', data=df).fit()
print(res2.params)

Literacy:Wealth    0.018176
dtype: float64
Literacy           0.427386
Wealth             1.080987
Literacy:Wealth   -0.013609
dtype: float64


Functions:
You can apply vectorized functions to the variables in your model:

In [15]:
# Function calls are allowed inside the formula: here np.log is applied to the
# Literacy column before it enters the regression.
res = smf.ols(formula='Lottery ~ np.log(Literacy)', data=df).fit()

# Show the intercept and the coefficient on the log-transformed regressor.
print(res.params)

Intercept           115.609119
np.log(Literacy)    -20.393959
dtype: float64


Define a custom function:

In [16]:
def log_plus_1(x):
    """Return the natural logarithm of ``x`` shifted up by one.

    ``x`` is handed directly to ``np.log`` and 1 is added to the result, so
    scalars and array-like inputs are handled element-wise by NumPy.
    """
    logged = np.log(x)
    return logged + 1


In [17]:
# User-defined functions can be called inside a formula as well: log_plus_1
# (defined in the notebook) is applied to Literacy before fitting.
res = smf.ols(formula='Lottery ~ log_plus_1(Literacy)', data=df).fit()

# Show the estimated coefficients; the transformed term is labelled with the
# function call as written in the formula.
print(res.params)

Intercept               136.003079
log_plus_1(Literacy)    -20.393959
dtype: float64
