In [2]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
import pandas as pd

# Fetch the "Guerry" dataset from the R "HistData" package through
# statsmodels' Rdatasets helper (may need network access on first use),
# then keep only the DataFrame part of the returned dataset object.
guerry_dataset = sm.datasets.get_rdataset("Guerry", "HistData")
df = guerry_dataset.data

# Quick look at the first rows to confirm the load worked.
df.head(3)

Unnamed: 0,dept,Region,Department,Crime_pers,Crime_prop,Literacy,Donations,Infants,Suicides,MainCity,...,Crime_parents,Infanticide,Donation_clergy,Lottery,Desertion,Instruction,Prostitutes,Distance,Area,Pop1831
0,1,E,Ain,28870,15890,37,5098,33120,35039,2:Med,...,71,60,69,41,55,46,13,218.372,5762,346.03
1,2,N,Aisne,26226,5521,51,8901,14572,12831,2:Med,...,4,82,36,38,82,24,327,65.945,7369,513.0
2,3,C,Allier,26747,7925,13,10973,17044,114121,2:Med,...,46,42,76,66,16,85,34,161.927,7340,298.26


In [4]:
# Keep only the variables used by the models below and drop any row that
# has a missing value in one of them.
model_columns = ["Lottery", "Literacy", "Wealth", "Region"]
df = df[model_columns].dropna()

df.head(3)

Unnamed: 0,Lottery,Literacy,Wealth,Region
0,41,37,73,E
1,38,51,22,N
2,66,13,61,C


In [5]:
# Baseline model: Lottery explained by Literacy, Wealth and Region.
# Region holds string labels, and the summary shows it expanded into
# dummy terms (Region[T.E], Region[T.N], ...) without any extra markup.
baseline_formula = "Lottery ~ Literacy + Wealth + Region"
baseline_model = smf.ols(formula=baseline_formula, data=df)
res = baseline_model.fit()

res.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.338
Model:,OLS,Adj. R-squared:,0.287
Method:,Least Squares,F-statistic:,6.636
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,1.07e-05
Time:,14:49:15,Log-Likelihood:,-375.3
No. Observations:,85,AIC:,764.6
Df Residuals:,78,BIC:,781.7
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,38.6517,9.456,4.087,0.000,19.826,57.478
Region[T.E],-15.4278,9.727,-1.586,0.117,-34.793,3.938
Region[T.N],-10.0170,9.260,-1.082,0.283,-28.453,8.419
Region[T.S],-4.5483,7.279,-0.625,0.534,-19.039,9.943
Region[T.W],-10.0913,7.196,-1.402,0.165,-24.418,4.235
Literacy,-0.1858,0.210,-0.886,0.378,-0.603,0.232
Wealth,0.4515,0.103,4.390,0.000,0.247,0.656

0,1,2,3
Omnibus:,3.049,Durbin-Watson:,1.785
Prob(Omnibus):,0.218,Jarque-Bera (JB):,2.694
Skew:,-0.34,Prob(JB):,0.26
Kurtosis:,2.454,Cond. No.,371.0


In [6]:
# Explicitly mark Region as a categorical variable with C(...).
# The fit matches the model without C(); only the term labels change
# (C(Region)[T.E] instead of Region[T.E]).
categorical_formula = "Lottery ~ Literacy + Wealth + C(Region)"
categorical_model = smf.ols(formula=categorical_formula, data=df)
res_c = categorical_model.fit()

res_c.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.338
Model:,OLS,Adj. R-squared:,0.287
Method:,Least Squares,F-statistic:,6.636
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,1.07e-05
Time:,14:52:51,Log-Likelihood:,-375.3
No. Observations:,85,AIC:,764.6
Df Residuals:,78,BIC:,781.7
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,38.6517,9.456,4.087,0.000,19.826,57.478
C(Region)[T.E],-15.4278,9.727,-1.586,0.117,-34.793,3.938
C(Region)[T.N],-10.0170,9.260,-1.082,0.283,-28.453,8.419
C(Region)[T.S],-4.5483,7.279,-0.625,0.534,-19.039,9.943
C(Region)[T.W],-10.0913,7.196,-1.402,0.165,-24.418,4.235
Literacy,-0.1858,0.210,-0.886,0.378,-0.603,0.232
Wealth,0.4515,0.103,4.390,0.000,0.247,0.656

0,1,2,3
Omnibus:,3.049,Durbin-Watson:,1.785
Prob(Omnibus):,0.218,Jarque-Bera (JB):,2.694
Skew:,-0.34,Prob(JB):,0.26
Kurtosis:,2.454,Cond. No.,371.0


In [10]:
# Remove the constant (intercept) with "- 1".
# Region then gets one coefficient per level (C, E, N, S, W in the summary)
# instead of an intercept plus contrasts; R-squared and log-likelihood are
# the same as for the model with an intercept.
no_intercept_formula = "Lottery ~ Literacy + Wealth + C(Region) - 1"
no_intercept_model = smf.ols(formula=no_intercept_formula, data=df)
res_c = no_intercept_model.fit()

res_c.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.338
Model:,OLS,Adj. R-squared:,0.287
Method:,Least Squares,F-statistic:,6.636
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,1.07e-05
Time:,14:54:33,Log-Likelihood:,-375.3
No. Observations:,85,AIC:,764.6
Df Residuals:,78,BIC:,781.7
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
C(Region)[C],38.6517,9.456,4.087,0.000,19.826,57.478
C(Region)[E],23.2239,14.931,1.555,0.124,-6.501,52.949
C(Region)[N],28.6347,13.127,2.181,0.032,2.501,54.769
C(Region)[S],34.1034,10.370,3.289,0.002,13.459,54.748
C(Region)[W],28.5604,10.018,2.851,0.006,8.616,48.505
Literacy,-0.1858,0.210,-0.886,0.378,-0.603,0.232
Wealth,0.4515,0.103,4.390,0.000,0.247,0.656

0,1,2,3
Omnibus:,3.049,Durbin-Watson:,1.785
Prob(Omnibus):,0.218,Jarque-Bera (JB):,2.694
Skew:,-0.34,Prob(JB):,0.26
Kurtosis:,2.454,Cond. No.,653.0


In [11]:
# Interaction only: "Literacy:Wealth" adds just the product term.
# Neither Literacy nor Wealth enters the model on its own here.
interaction_only_formula = "Lottery ~ Literacy:Wealth + C(Region)"
interaction_only_model = smf.ols(formula=interaction_only_formula, data=df)
res_int1 = interaction_only_model.fit()

res_int1.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.229
Model:,OLS,Adj. R-squared:,0.181
Method:,Least Squares,F-statistic:,4.7
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,0.000827
Time:,14:55:17,Log-Likelihood:,-381.76
No. Observations:,85,AIC:,775.5
Df Residuals:,79,BIC:,790.2
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,47.8130,5.909,8.092,0.000,36.051,59.575
C(Region)[T.E],-31.8424,8.850,-3.598,0.001,-49.459,-14.226
C(Region)[T.N],-27.6259,7.707,-3.584,0.001,-42.967,-12.285
C(Region)[T.S],-8.8565,7.752,-1.143,0.257,-24.285,6.573
C(Region)[T.W],-11.0910,7.734,-1.434,0.156,-26.486,4.304
Literacy:Wealth,0.0070,0.002,2.989,0.004,0.002,0.012

0,1,2,3
Omnibus:,8.998,Durbin-Watson:,1.8
Prob(Omnibus):,0.011,Jarque-Bera (JB):,3.891
Skew:,-0.251,Prob(JB):,0.143
Kurtosis:,2.079,Cond. No.,11000.0


In [12]:
# Interaction with main effects, spelled out term by term:
# Literacy, Wealth, and their product Literacy:Wealth.
explicit_interaction_formula = (
    "Lottery ~ Literacy + Wealth + Literacy:Wealth + C(Region)"
)
explicit_interaction_model = smf.ols(
    formula=explicit_interaction_formula,
    data=df,
)
res_int = explicit_interaction_model.fit()

res_int.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.338
Model:,OLS,Adj. R-squared:,0.278
Method:,Least Squares,F-statistic:,5.615
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,2.96e-05
Time:,14:55:36,Log-Likelihood:,-375.3
No. Observations:,85,AIC:,766.6
Df Residuals:,77,BIC:,786.1
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,39.0993,17.470,2.238,0.028,4.312,73.887
C(Region)[T.E],-15.4451,9.807,-1.575,0.119,-34.973,4.082
C(Region)[T.N],-9.9728,9.432,-1.057,0.294,-28.753,8.808
C(Region)[T.S],-4.5754,7.380,-0.620,0.537,-19.270,10.119
C(Region)[T.W],-10.1122,7.275,-1.390,0.169,-24.598,4.374
Literacy,-0.1960,0.396,-0.495,0.622,-0.984,0.592
Wealth,0.4432,0.290,1.530,0.130,-0.133,1.020
Literacy:Wealth,0.0002,0.007,0.031,0.976,-0.013,0.013

0,1,2,3
Omnibus:,3.076,Durbin-Watson:,1.784
Prob(Omnibus):,0.215,Jarque-Bera (JB):,2.709
Skew:,-0.341,Prob(JB):,0.258
Kurtosis:,2.452,Cond. No.,15600.0


In [13]:
# Interaction shorthand: "Literacy*Wealth" yields the terms Literacy,
# Wealth and Literacy:Wealth, as the coefficient table of the summary shows.
shorthand_interaction_formula = "Lottery ~ Literacy*Wealth + C(Region)"
shorthand_interaction_model = smf.ols(
    formula=shorthand_interaction_formula,
    data=df,
)
res_int = shorthand_interaction_model.fit()

res_int.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.338
Model:,OLS,Adj. R-squared:,0.278
Method:,Least Squares,F-statistic:,5.615
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,2.96e-05
Time:,14:57:17,Log-Likelihood:,-375.3
No. Observations:,85,AIC:,766.6
Df Residuals:,77,BIC:,786.1
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,39.0993,17.470,2.238,0.028,4.312,73.887
C(Region)[T.E],-15.4451,9.807,-1.575,0.119,-34.973,4.082
C(Region)[T.N],-9.9728,9.432,-1.057,0.294,-28.753,8.808
C(Region)[T.S],-4.5754,7.380,-0.620,0.537,-19.270,10.119
C(Region)[T.W],-10.1122,7.275,-1.390,0.169,-24.598,4.374
Literacy,-0.1960,0.396,-0.495,0.622,-0.984,0.592
Wealth,0.4432,0.290,1.530,0.130,-0.133,1.020
Literacy:Wealth,0.0002,0.007,0.031,0.976,-0.013,0.013

0,1,2,3
Omnibus:,3.076,Durbin-Watson:,1.784
Prob(Omnibus):,0.215,Jarque-Bera (JB):,2.709
Skew:,-0.341,Prob(JB):,0.258
Kurtosis:,2.452,Cond. No.,15600.0


In [14]:
# Functions inside formulas: the regressor is np.log(Literacy), written
# directly in the formula string; the summary labels the term accordingly.
log_formula = "Lottery ~ np.log(Literacy)"
log_model = smf.ols(formula=log_formula, data=df)
res = log_model.fit()

res.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.161
Model:,OLS,Adj. R-squared:,0.151
Method:,Least Squares,F-statistic:,15.89
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,0.000144
Time:,14:59:07,Log-Likelihood:,-385.38
No. Observations:,85,AIC:,774.8
Df Residuals:,83,BIC:,779.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,115.6091,18.374,6.292,0.000,79.064,152.155
np.log(Literacy),-20.3940,5.116,-3.986,0.000,-30.570,-10.218

0,1,2,3
Omnibus:,8.907,Durbin-Watson:,2.019
Prob(Omnibus):,0.012,Jarque-Bera (JB):,3.299
Skew:,0.108,Prob(JB):,0.192
Kurtosis:,2.059,Cond. No.,28.7


In [15]:
# Using custom functions

def log_plus_1(x):
    """Return the natural logarithm of ``x`` with one added to the result.

    Note that this computes ``log(x) + 1``, not ``log(x + 1)``; it is not
    equivalent to ``np.log1p``.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) passed straight to ``np.log``.

    Returns
    -------
    Same kind of object ``np.log(x)`` produces, shifted up by 1.
    """
    natural_log = np.log(x)
    return natural_log + 1.0

# The formula refers to the user-defined log_plus_1 by name, just as it
# would to a NumPy function; the summary labels the term
# "log_plus_1(Literacy)".
custom_function_formula = "Lottery ~ log_plus_1(Literacy)"
custom_function_model = smf.ols(formula=custom_function_formula, data=df)
res = custom_function_model.fit()

res.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.161
Model:,OLS,Adj. R-squared:,0.151
Method:,Least Squares,F-statistic:,15.89
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,0.000144
Time:,15:00:20,Log-Likelihood:,-385.38
No. Observations:,85,AIC:,774.8
Df Residuals:,83,BIC:,779.7
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,136.0031,23.454,5.799,0.000,89.354,182.652
log_plus_1(Literacy),-20.3940,5.116,-3.986,0.000,-30.570,-10.218

0,1,2,3
Omnibus:,8.907,Durbin-Watson:,2.019
Prob(Omnibus):,0.012,Jarque-Bera (JB):,3.299
Skew:,0.108,Prob(JB):,0.192
Kurtosis:,2.059,Cond. No.,45.5
