# Python statistics

In [25]:
# regular imports

import os
import sys
import sysconfig
from pathlib import Path
import shutil
import subprocess
import tempfile
import re
import difflib
import pypdf
import docx
import chardet
import datetime
import dateutil

import pandas as pd

import numpy as np
import math
import decimal
from decimal import Decimal
import fractions
from fractions import Fraction
import random
import scipy
import sympy
from pprint import pprint
import statsmodels
# import statsmodels.api as sm

from typing import (
    List, Dict, Set, Tuple, 
    Optional, Union,        
    Callable,               
    # TypeVar, Generic,       
    Any, Iterable, Sequence, Mapping, 
    # NewType,                
    # cast, TYPE_CHECKING    
)

# statsmodels.api as sm

## Statsmodels tutorial

[User Guide](https://www.statsmodels.org/stable/user-guide.html)

In [24]:
# open a web browser to the statsmodels documentation
statsmodels.tools.web.webdoc()

In [1]:
# helloworld example

import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

dat = sm.datasets.get_rdataset("Guerry", "HistData").data
results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.348
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     22.20
Date:                Fri, 29 Mar 2024   Prob (F-statistic):           1.90e-08
Time:                        00:10:35   Log-Likelihood:                -379.82
No. Observations:                  86   AIC:                             765.6
Df Residuals:                      83   BIC:                             773.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         246.4341     35.233     

In [1]:
# numpy example

import numpy as np
import statsmodels.api as sm
nobs = 100
X = np.random.random((nobs, 2))
X = sm.add_constant(X)
beta = [1, .1, .5]
e = np.random.random(nobs)
y = np.dot(X, beta) + e
results = sm.OLS(y, X).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.249
Model:                            OLS   Adj. R-squared:                  0.233
Method:                 Least Squares   F-statistic:                     16.07
Date:                Fri, 29 Mar 2024   Prob (F-statistic):           9.38e-07
Time:                        14:50:18   Log-Likelihood:                -8.1530
No. Observations:                 100   AIC:                             22.31
Df Residuals:                      97   BIC:                             30.12
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.4592      0.070     20.870      0.0

In [3]:
# pandas example

import statsmodels.api as sm
import pandas
from patsy import dmatrices

df = sm.datasets.get_rdataset("Guerry", "HistData").data

vars = ['Department', 'Lottery', 'Literacy', 'Wealth', 'Region']

df = df[vars]

df[-5:]

Unnamed: 0,Department,Lottery,Literacy,Wealth,Region
81,Vienne,40,25,68,W
82,Haute-Vienne,55,13,67,C
83,Vosges,14,62,82,E
84,Yonne,51,47,30,C
85,Corse,83,49,37,


In [4]:
df = df.dropna()
y, X = dmatrices('Lottery ~ Literacy + Wealth + Region', data=df, return_type='dataframe')

In [5]:
y[:3]

Unnamed: 0,Lottery
0,41.0
1,38.0
2,66.0


In [6]:
X[:3]

Unnamed: 0,Intercept,Region[T.E],Region[T.N],Region[T.S],Region[T.W],Literacy,Wealth
0,1.0,1.0,0.0,0.0,0.0,37.0,73.0
1,1.0,0.0,1.0,0.0,0.0,51.0,22.0
2,1.0,0.0,0.0,0.0,0.0,13.0,61.0


In [7]:
# model and fit

mod = sm.OLS(y, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                Lottery   R-squared:                       0.338
Model:                            OLS   Adj. R-squared:                  0.287
Method:                 Least Squares   F-statistic:                     6.636
Date:                Fri, 29 Mar 2024   Prob (F-statistic):           1.07e-05
Time:                        14:52:45   Log-Likelihood:                -375.30
No. Observations:                  85   AIC:                             764.6
Df Residuals:                      78   BIC:                             781.7
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept      38.6517      9.456      4.087      

In [8]:
# access results

res.params

Intercept      38.651655
Region[T.E]   -15.427785
Region[T.N]   -10.016961
Region[T.S]    -4.548257
Region[T.W]   -10.091276
Literacy       -0.185819
Wealth          0.451475
dtype: float64

In [9]:
res.rsquared

0.3379508691928822

## datasets

1. use statsmodels built-in datasets
2. pandas can be accessed `.data` 

参考 [Statsmodels Datasets](https://www.statsmodels.org/stable/datasets/index.html#available-datasets)

In [14]:
import statsmodels.api as sm

data = sm.datasets.longley.load_pandas()
pd_data = data.data

In [15]:
pd_data.head()

Unnamed: 0,TOTEMP,GNPDEFL,GNP,UNEMP,ARMED,POP,YEAR
0,60323.0,83.0,234289.0,2356.0,1590.0,107608.0,1947.0
1,61122.0,88.5,259426.0,2325.0,1456.0,108632.0,1948.0
2,60171.0,88.2,258054.0,3682.0,1616.0,109773.0,1949.0
3,61187.0,89.5,284599.0,3351.0,1650.0,110929.0,1950.0
4,63221.0,96.2,328975.0,2099.0,3099.0,112075.0,1951.0


In [16]:
pd_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   TOTEMP   16 non-null     float64
 1   GNPDEFL  16 non-null     float64
 2   GNP      16 non-null     float64
 3   UNEMP    16 non-null     float64
 4   ARMED    16 non-null     float64
 5   POP      16 non-null     float64
 6   YEAR     16 non-null     float64
dtypes: float64(7)
memory usage: 1.0 KB


In [17]:
pd_data.describe()

Unnamed: 0,TOTEMP,GNPDEFL,GNP,UNEMP,ARMED,POP,YEAR
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,65317.0,101.68125,387698.4375,3193.3125,2606.6875,117424.0,1954.5
std,3511.968356,10.791553,99394.937795,934.464247,695.919604,6956.101561,4.760952
min,60171.0,83.0,234289.0,1870.0,1456.0,107608.0,1947.0
25%,62712.5,94.525,317881.0,2348.25,2298.0,111788.5,1950.75
50%,65504.0,100.6,381427.0,3143.5,2717.5,116803.5,1954.5
75%,68290.5,111.25,454085.5,3842.5,3060.75,122304.0,1958.25
max,70551.0,116.9,554894.0,4806.0,3594.0,130081.0,1962.0


In [18]:
# Exogenous variables and endogenous variables have been divided
# Univariate datasets, however, do not have an exog attribute.

print(data.endog_name)
print(data.exog_name)

TOTEMP
['GNPDEFL', 'GNP', 'UNEMP', 'ARMED', 'POP', 'YEAR']


In [19]:
data.endog.head()

0    60323.0
1    61122.0
2    60171.0
3    61187.0
4    63221.0
Name: TOTEMP, dtype: float64

In [20]:
data.exog.head()

Unnamed: 0,GNPDEFL,GNP,UNEMP,ARMED,POP,YEAR
0,83.0,234289.0,2356.0,1590.0,107608.0,1947.0
1,88.5,259426.0,2325.0,1456.0,108632.0,1948.0
2,88.2,258054.0,3682.0,1616.0,109773.0,1949.0
3,89.5,284599.0,3351.0,1650.0,110929.0,1950.0
4,96.2,328975.0,2099.0,3099.0,112075.0,1951.0


In [21]:
# assign exog to x, endog to y, then fit

y, x = data.endog, data.exog
res = sm.OLS(y, x).fit()
res.params

GNPDEFL   -52.993570
GNP         0.071073
UNEMP      -0.423466
ARMED      -0.572569
POP        -0.414204
YEAR       48.417866
dtype: float64

In [22]:
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                 TOTEMP   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          5.052e+04
Date:                Mon, 01 Apr 2024   Prob (F-statistic):                    8.20e-22
Time:                        11:31:58   Log-Likelihood:                         -117.56
No. Observations:                  16   AIC:                                      247.1
Df Residuals:                      10   BIC:                                      251.8
Df Model:                           6                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------



## 