In [5]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Get data
data = sm.datasets.get_rdataset("Guerry", "HistData").data
data.head(3)

Unnamed: 0,dept,Region,Department,Crime_pers,Crime_prop,Literacy,Donations,Infants,Suicides,MainCity,Wealth,Commerce,Clergy,Crime_parents,Infanticide,Donation_clergy,Lottery,Desertion,Instruction,Prostitutes,Distance,Area,Pop1831
0,1,E,Ain,28870,15890,37,5098,33120,35039,2:Med,73,58,11,71,60,69,41,55,46,13,218.372,5762,346.03
1,2,N,Aisne,26226,5521,51,8901,14572,12831,2:Med,22,10,82,4,82,36,38,82,24,327,65.945,7369,513.0
2,3,C,Allier,26747,7925,13,10973,17044,114121,2:Med,61,66,68,46,42,76,66,16,85,34,161.927,7340,298.26


In [6]:
data.columns

Index(['dept', 'Region', 'Department', 'Crime_pers', 'Crime_prop', 'Literacy',
       'Donations', 'Infants', 'Suicides', 'MainCity', 'Wealth', 'Commerce',
       'Clergy', 'Crime_parents', 'Infanticide', 'Donation_clergy', 'Lottery',
       'Desertion', 'Instruction', 'Prostitutes', 'Distance', 'Area',
       'Pop1831'],
      dtype='object')

In [9]:
# Fit OLS regression use ln
# With R style formulas
res = smf.ols("Lottery ~ Literacy + np.log(Pop1831)", data=data).fit()
res.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.348
Model:,OLS,Adj. R-squared:,0.333
Method:,Least Squares,F-statistic:,22.2
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,1.9e-08
Time:,13:48:34,Log-Likelihood:,-379.82
No. Observations:,86,AIC:,765.6
Df Residuals:,83,BIC:,773.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,246.4341,35.233,6.995,0.000,176.358,316.510
Literacy,-0.4889,0.128,-3.832,0.000,-0.743,-0.235
np.log(Pop1831),-31.3114,5.977,-5.239,0.000,-43.199,-19.424

0,1,2,3
Omnibus:,3.713,Durbin-Watson:,2.019
Prob(Omnibus):,0.156,Jarque-Bera (JB):,3.394
Skew:,-0.487,Prob(JB):,0.183
Kurtosis:,3.003,Cond. No.,702.0


In [15]:
# Using numpy arrays

n_obs = 100
X = np.random.random((n_obs, 2))
X = sm.add_constant(X)

beta = [1, 0.1, 0.5]
e = np.random.random(n_obs)
y = np.dot(X, beta) + e

res_2 = sm.OLS(y, X).fit()
res_2.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.208
Model:,OLS,Adj. R-squared:,0.192
Method:,Least Squares,F-statistic:,12.77
Date:,"Sun, 09 Jun 2024",Prob (F-statistic):,1.19e-05
Time:,13:52:11,Log-Likelihood:,-18.432
No. Observations:,100,AIC:,42.86
Df Residuals:,97,BIC:,50.68
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.4398,0.080,17.912,0.000,1.280,1.599
x1,0.3058,0.102,3.007,0.003,0.104,0.508
x2,0.3867,0.099,3.905,0.000,0.190,0.583

0,1,2,3
Omnibus:,34.31,Durbin-Watson:,1.9
Prob(Omnibus):,0.0,Jarque-Bera (JB):,6.054
Skew:,0.024,Prob(JB):,0.0485
Kurtosis:,1.796,Cond. No.,5.37
