In [1]:
# some example data
import numpy as np

import pandas as pd

import statsmodels.api as sm



mdata = sm.datasets.macrodata.load_pandas().data

# prepare the dates index


# make a VAR model




In [2]:
"""
原始資料長相
"""
mdata


Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.980,139.7,2.82,5.8,177.146,0.00,0.00
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.150,141.7,3.08,5.1,177.830,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.260,1916.4,29.350,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.370,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.540,139.6,3.50,5.2,180.007,2.31,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,2008.0,3.0,13324.600,9267.7,1990.693,991.551,9838.3,216.889,1474.7,1.17,6.0,305.270,-3.16,4.33
199,2008.0,4.0,13141.920,9195.3,1857.661,1007.273,9920.4,212.174,1576.5,0.12,6.9,305.952,-8.79,8.91
200,2009.0,1.0,12925.410,9209.2,1558.494,996.287,9926.4,212.671,1592.8,0.22,8.1,306.547,0.94,-0.71
201,2009.0,2.0,12901.504,9189.0,1456.678,1023.528,10077.5,214.469,1653.6,0.18,9.2,307.226,3.37,-3.19


In [3]:
"""
要把 dataframe 的格式清理成以下格式
dates_from_str 要放入 index
其他則是 feature
"""
from statsmodels.tsa.base.datetools import dates_from_str
dates = mdata[['year', 'quarter']].astype(int).astype(str)
quarterly = dates["year"] + "Q" + dates["quarter"]
quarterly = dates_from_str(quarterly)
mdata.index = pd.DatetimeIndex(quarterly)
data = mdata.drop(columns=["year","quarter"])
data

Unnamed: 0,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
1959-03-31,2710.349,1707.4,286.898,470.045,1886.9,28.980,139.7,2.82,5.8,177.146,0.00,0.00
1959-06-30,2778.801,1733.7,310.859,481.301,1919.7,29.150,141.7,3.08,5.1,177.830,2.34,0.74
1959-09-30,2775.488,1751.8,289.226,491.260,1916.4,29.350,140.5,3.82,5.3,178.657,2.74,1.09
1959-12-31,2785.204,1753.7,299.356,484.052,1931.3,29.370,140.0,4.33,5.6,179.386,0.27,4.06
1960-03-31,2847.699,1770.5,331.722,462.199,1955.5,29.540,139.6,3.50,5.2,180.007,2.31,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...
2008-09-30,13324.600,9267.7,1990.693,991.551,9838.3,216.889,1474.7,1.17,6.0,305.270,-3.16,4.33
2008-12-31,13141.920,9195.3,1857.661,1007.273,9920.4,212.174,1576.5,0.12,6.9,305.952,-8.79,8.91
2009-03-31,12925.410,9209.2,1558.494,996.287,9926.4,212.671,1592.8,0.22,8.1,306.547,0.94,-0.71
2009-06-30,12901.504,9189.0,1456.678,1023.528,10077.5,214.469,1653.6,0.18,9.2,307.226,3.37,-3.19


In [4]:
from typing import Union
def vectorAutoregression(data:pd.DataFrame,maxlags:Union[int,str]="auto",ic:str=None):
    from statsmodels.tsa.api import VAR
    model = VAR(data)
    # ==== 這邊不要動 =====

    """ 
    這是套件設定的
    trend : str {"n", "c", "ct", "ctt"}
        * "n" - no deterministic terms
        * "c" - constant term
        * "ct" - constant and linear term
        * "ctt" - constant, linear, and quadratic term

    maxlags 不可以超過 max_estimable 的值
    maxlags 為模型擬合最大數值
    statemodel有設定條件，已經寫在下述的程式
    使用者要調整低於 maxlags
    """
    n_totobs = len(data)
    ntrend = 1 #len(trend) if trend.startswith("c") else 0
    neqs = data.shape[1]
    max_estimable = (n_totobs - neqs - ntrend) // (1 + neqs) # statemodel 套件給定的最大值，超出則會ERROR
    # ==== 這邊不要動 =====

    print(" maxlags 要小於等於: ", max_estimable)

    if maxlags == "auto":
        maxlags = max_estimable
    if maxlags > max_estimable:
        raise Exception(" maxlags 要小於等於: ", max_estimable)


    """
    ic 為評估模型的好壞
    ic = {'aic', 'fpe', 'hqic', 'bic', None}
    Information criterion to use for VAR order selection.
    aic : Akaike
    fpe : Final prediction error
    hqic : Hannan-Quinn
    bic : Bayesian a.k.a. Schwarz
    """

    results = model.fit(maxlags=maxlags, ic=ic)
    print(f"在最大 lag 數目為 {max_estimable} 的情況下，VAR 找出的最佳 lag 為: ",results.k_ar)
    return results

In [5]:
results_2 = vectorAutoregression(data,maxlags=10,ic="fpe")

 maxlags 要小於等於:  14
在最大 lag 數目為 14 的情況下，VAR 找出的最佳 lag 為:  6


  self._init_dates(dates, freq)


In [63]:
# results.summary() #15.7729  20.6162

In [8]:
results_2

<statsmodels.tsa.vector_ar.var_model.VARResultsWrapper at 0x214f3568820>

In [13]:
from statsmodels.tsa.vector_ar.var_model import VARResultsWrapper
def vectorAutoregressionRelationship(results:VARResultsWrapper,target:str,pvalue_threshold:int=0.05):
    # target  客人關心的 Y 是甚麼，Y 會包含在 results 中

    coef_df = results.params[target]
    pvalues_df = results.pvalues[target]

    # 合并系数和p值
    summary = pd.concat([coef_df, pvalues_df], axis=1)
    summary.columns = ['coef', 'pvalue']
    summary = summary.drop(index="const").reset_index()
    summary_index = summary["index"].str.split(".", expand=True).rename(columns={0:"time_lag",1:"feature"})
    summary_index["time_lag"] = summary_index["time_lag"].str.replace("L","").astype(int)
    summary = pd.concat([summary_index,summary],axis=1).drop(columns="index")
    summary = summary[summary["pvalue"]<pvalue_threshold].reset_index(drop=True)
    return summary

In [17]:
vectorAutoregressionRelationship(results=results_2,target="realgdp",pvalue_threshold=0.05)

Unnamed: 0,time_lag,feature,coef,pvalue
0,1,realcons,1.152813,4.2e-05
1,1,m1,-1.134177,0.031576
2,4,m1,-2.14543,0.019142
3,5,unemp,-86.909426,0.00752
4,6,unemp,39.947191,0.035746


In [31]:
r = results.test_causality(['realgdp','realinv'], ['realcons'], kind='f')

In [54]:

from statsmodels.stats.stattools import durbin_watson
out = durbin_watson(results.resid)

for col, val in zip(data.columns, out):
    print(col, ':', round(val, 2))

"""
The value of this statistic can vary between 0 and 4. The closer it is to the value 2, 
then there is no significant serial correlation. The closer to 0, there is a positive serial correlation, 
and the closer it is to 4 implies negative serial correlation.
"""

"""

Test statistic value of 0: Perfect positive autocorrelation
Test statistic value 0f 2: No autocorrelation
Test statistic value of 4: Perfect negative autocorrelation
"""

realgdp : 1.96
realcons : 2.02
realinv : 1.95


In [41]:
import pandas as pd

In [6]:
model.select_order(15)

<statsmodels.tsa.vector_ar.var_model.LagOrderResults at 0x2913a3f01f0>

In [65]:
"""# 提取滞后值
X_t_minus_1 = new_data['realgdp'].iloc[-2]
X_t_minus_2 = new_data['realgdp'].iloc[-3]
X_t_minus_9 = new_data['realgdp'].iloc[-10]

# 使用关系式进行预测
predicted_value = 0.3 * X_t_minus_1 + 1.1 * X_t_minus_9 - 0.88 * X_t_minus_2"""

"# 提取滞后值\nX_t_minus_1 = new_data['realgdp'].iloc[-2]\nX_t_minus_2 = new_data['realgdp'].iloc[-3]\nX_t_minus_9 = new_data['realgdp'].iloc[-10]\n\n# 使用关系式进行预测\npredicted_value = 0.3 * X_t_minus_1 + 1.1 * X_t_minus_9 - 0.88 * X_t_minus_2"