In [1]:
%matplotlib inline
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
sns.set_style('whitegrid')

In [3]:
data = pd.DataFrame(sm.datasets.macrodata.load().data)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   year      203 non-null    float64
 1   quarter   203 non-null    float64
 2   realgdp   203 non-null    float64
 3   realcons  203 non-null    float64
 4   realinv   203 non-null    float64
 5   realgovt  203 non-null    float64
 6   realdpi   203 non-null    float64
 7   cpi       203 non-null    float64
 8   m1        203 non-null    float64
 9   tbilrate  203 non-null    float64
 10  unemp     203 non-null    float64
 11  pop       203 non-null    float64
 12  infl      203 non-null    float64
 13  realint   203 non-null    float64
dtypes: float64(14)
memory usage: 22.3 KB


In [4]:

data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [5]:
data['growth_rate'] = data.realgdp.pct_change(4)

In [12]:
data.growth_rate.rolling(20).mean()

0           NaN
1           NaN
2           NaN
3           NaN
4           NaN
         ...   
198    0.026648
199    0.023803
200    0.020083
201    0.016144
202    0.013348
Name: growth_rate, Length: 203, dtype: float64

In [23]:
data['target'] = (data.growth_rate > data.growth_rate.rolling(20).mean()).astype(int).shift(-1)
data.quarter = data.quarter.astype(int)

In [18]:
data.target

0      0
1      0
2      0
3      0
4      0
      ..
198    0
199    0
200    0
201    0
202    0
Name: target, Length: 203, dtype: int32

In [21]:
(data.growth_rate > data.growth_rate.rolling(20).mean()).astype(int)

0      0
1      0
2      0
3      0
4      0
      ..
198    0
199    0
200    0
201    0
202    0
Name: growth_rate, Length: 203, dtype: int32

In [24]:
data.target.value_counts()

0.0    112
1.0     90
Name: target, dtype: int64

In [25]:
pct_cols = ['realcons', 'realinv', 'realgovt', 'realdpi', 'm1']
drop_cols = ['year', 'realgdp', 'pop', 'cpi', 'growth_rate']
data.loc[:, pct_cols] = data.loc[:, pct_cols].pct_change(4)

In [26]:
data = pd.get_dummies(data.drop(drop_cols, axis=1), columns=['quarter'], drop_first=True).dropna()

In [27]:
data

Unnamed: 0,realcons,realinv,realgovt,realdpi,m1,tbilrate,unemp,infl,realint,target,quarter_2,quarter_3,quarter_4
4,0.036957,0.156237,-0.016692,0.036356,-0.000716,3.50,5.2,2.31,1.19,0.0,0,0,0
5,0.034147,-0.040877,-0.043426,0.024170,-0.010586,2.68,5.2,0.14,2.55,0.0,1,0,0
6,0.019409,0.024718,-0.033758,0.026821,0.002847,2.36,5.6,2.70,-0.34,0.0,0,1,0
7,0.019673,-0.132257,-0.015738,0.018278,0.007857,2.29,6.3,1.21,1.08,0.0,0,0,1
8,0.009715,-0.196903,0.029544,0.014830,0.017908,2.37,6.8,-0.40,2.77,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,0.006404,-0.062758,0.069618,0.021976,0.028686,1.74,5.4,8.53,-6.79,0.0,1,0,0
198,-0.007273,-0.081144,0.078966,-0.004614,0.069243,1.17,6.0,-3.16,4.33,0.0,0,1,0
199,-0.017974,-0.125159,0.088814,0.003459,0.144548,0.12,6.9,-8.79,8.91,0.0,0,0,1
200,-0.015017,-0.251762,0.056091,0.010136,0.150867,0.22,8.1,0.94,-0.71,0.0,0,0,0


In [30]:
model = sm.Logit(data.target, sm.add_constant(data.drop('target', axis=1)))

In [31]:
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.342965
         Iterations 8


In [33]:
print(result.summary())

                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  198
Model:                          Logit   Df Residuals:                      185
Method:                           MLE   Df Model:                           12
Date:                Thu, 16 Feb 2023   Pseudo R-squ.:                  0.5022
Time:                        22:10:59   Log-Likelihood:                -67.907
converged:                       True   LL-Null:                       -136.42
Covariance Type:            nonrobust   LLR p-value:                 2.375e-23
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -8.5881      1.908     -4.502      0.000     -12.327      -4.849
realcons     130.1446     26.633      4.887      0.000      77.945     182.344
realinv       18.8414      4.053      4.648      0.0

In [34]:
result.predict()

array([8.86735658e-01, 2.00319140e-01, 1.87935294e-01, 2.83758530e-02,
       1.63660734e-03, 5.41483073e-02, 1.33917569e-01, 8.77443044e-01,
       9.50171874e-01, 7.13794296e-01, 5.94180001e-01, 4.11253507e-01,
       4.84696170e-01, 6.33572788e-01, 6.62811057e-01, 7.73213700e-01,
       8.02949468e-01, 7.26036591e-01, 8.04854989e-01, 8.08236296e-01,
       9.66547518e-01, 9.65853541e-01, 7.31067323e-01, 9.21749234e-01,
       7.35701495e-01, 3.16192231e-01, 1.36692496e-01, 1.26885663e-02,
       2.13540290e-04, 1.33683808e-03, 1.39750829e-03, 5.05143454e-03,
       3.04151923e-01, 6.34109778e-01, 7.45883940e-01, 6.74632320e-01,
       6.57604447e-01, 3.91816526e-01, 4.12899016e-02, 5.36175996e-02,
       2.03244712e-02, 2.90429580e-02, 5.47001183e-02, 4.12616821e-02,
       7.45840405e-01, 8.31946228e-01, 9.29786324e-01, 9.94689861e-01,
       9.77139446e-01, 9.91804481e-01, 9.95712323e-01, 9.92843509e-01,
       9.92585737e-01, 8.64692758e-01, 4.65877858e-01, 1.89391956e-01,
      