In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from statsmodels.compat import lzip
from statsmodels.stats.outliers_influence import OLSInfluence
%matplotlib inline

In [4]:
# Load the "Guerry" dataset from the R package "HistData" through
# statsmodels' R-dataset helper and keep only its DataFrame (`.data`).
# NOTE(review): this presumably needs network access on a fresh machine,
# since no cache option is passed -- confirm before running offline.
df = sm.datasets.get_rdataset(dataname="Guerry", package="HistData").data

# Preview the first three rows to sanity-check the columns.
df.head(n=3)

Unnamed: 0,dept,Region,Department,Crime_pers,Crime_prop,Literacy,Donations,Infants,Suicides,MainCity,...,Crime_parents,Infanticide,Donation_clergy,Lottery,Desertion,Instruction,Prostitutes,Distance,Area,Pop1831
0,1,E,Ain,28870,15890,37,5098,33120,35039,2:Med,...,71,60,69,41,55,46,13,218.372,5762,346.03
1,2,N,Aisne,26226,5521,51,8901,14572,12831,2:Med,...,4,82,36,38,82,24,327,65.945,7369,513.0
2,3,C,Allier,26747,7925,13,10973,17044,114121,2:Med,...,46,42,76,66,16,85,34,161.927,7340,298.26


In [8]:
# Ordinary least squares: regress Lottery on Literacy and log-population.
# The formula calls np.log, so numpy must already be imported as `np`
# in the notebook namespace when this cell runs.
res = smf.ols(
    formula="Lottery ~ Literacy + np.log(Pop1831)",
    data=df,
).fit()

# Render the fitted model's summary tables as the cell output.
res.summary()

0,1,2,3
Dep. Variable:,Lottery,R-squared:,0.348
Model:,OLS,Adj. R-squared:,0.333
Method:,Least Squares,F-statistic:,22.2
Date:,"Fri, 14 Jun 2024",Prob (F-statistic):,1.9e-08
Time:,16:34:34,Log-Likelihood:,-379.82
No. Observations:,86,AIC:,765.6
Df Residuals:,83,BIC:,773.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,246.4341,35.233,6.995,0.000,176.358,316.510
Literacy,-0.4889,0.128,-3.832,0.000,-0.743,-0.235
np.log(Pop1831),-31.3114,5.977,-5.239,0.000,-43.199,-19.424

0,1,2,3
Omnibus:,3.713,Durbin-Watson:,2.019
Prob(Omnibus):,0.156,Jarque-Bera (JB):,3.394
Skew:,-0.487,Prob(JB):,0.183
Kurtosis:,3.003,Cond. No.,702.0


In [9]:
# Normality of residuals: Jarque-Bera test on the OLS residuals.
# The call yields four values; `name` labels them in the same order.
name = [
    "Jarque-Bera",
    "Chi^2 two-tail prob.",
    "Skew",
    "Kurtosis",
]
test = sms.jarque_bera(res.resid)

# Pair each label with its value for a readable cell output.
list(zip(name, test))

[('Jarque-Bera', 3.3936080248431755),
 ('Chi^2 two-tail prob.', 0.18326831231663288),
 ('Skew', -0.48658034311223436),
 ('Kurtosis', 3.003417757881633)]

In [10]:
# Show the raw, unlabelled Jarque-Bera result tuple held in `test`.
test

(3.3936080248431755,
 0.18326831231663288,
 -0.48658034311223436,
 3.003417757881633)

In [11]:
# Influence tests
#
# Wrap the fitted OLS result in OLSInfluence to get per-observation
# influence diagnostics. OLSInfluence is imported in the notebook's
# import cell so that every dependency is declared in one place.
test_class = OLSInfluence(res)

# dfbeta is displayed as a 2-D array; judging by the output it has one
# column per model parameter (Intercept, Literacy, np.log(Pop1831)).
# NOTE(review): `[5:, :]` skips the first five rows and prints all the
# remaining ones. If only a short preview was intended, `[:5, :]` is
# probably what was meant -- confirm before changing.
test_class.dfbeta[5:, :]

array([[ 8.26794653e-01, -9.37228508e-03, -3.91782694e-02],
       [-4.60565242e-01, -6.00836231e-03,  1.07905336e-01],
       [ 2.20296666e+00, -8.83371102e-03, -2.94410442e-01],
       [-3.09677967e+00, -1.44344868e-02,  5.88879087e-01],
       [-6.94207982e-01,  8.22842167e-04,  1.03706847e-01],
       [ 4.10842584e-01, -1.11761663e-02,  7.29888204e-02],
       [-1.89562753e-01,  2.08460190e-03, -6.30431760e-02],
       [ 2.38248735e+00, -6.58585145e-03, -3.89952698e-01],
       [ 4.44134010e+00, -7.37883826e-03, -6.57505666e-01],
       [ 3.65734696e-02, -2.06183483e-03,  3.90285143e-02],
       [ 1.48071908e-01,  4.65456319e-05, -2.83829919e-02],
       [-4.66538011e+00,  2.34268272e-02,  5.91701800e-01],
       [ 2.88977165e+00, -2.35317798e-02, -2.90569539e-01],
       [-6.22764720e-02,  1.28974407e-03,  5.15640285e-03],
       [-8.19834682e+00, -3.43559449e-02,  1.69772160e+00],
       [ 4.31761529e+00, -1.53701973e-02, -5.82861140e-01],
       [-4.22405692e+00, -2.94719098e-02