In [1]:
import pandas as pd
from typing import Union
from statsmodels.tsa.vector_ar.var_model import VARResultsWrapper
from statsmodels.tsa.api import VAR

import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.filterwarnings("ignore")

In [2]:
# NOTE(review): the original author flagged this function as problematic.
# Known limitations are listed in the docstring below.
def time_series_format_preprocessing(df: pd.DataFrame, interval, datetime_col, set_index_flag=False,
                                     datetime_format='%Y/%m/%d %H:%M'):
    """Parse a datetime column and re-sample the frame onto a regular time grid.

    The frame is indexed by (original index, datetime), rows sharing a timestamp
    are collapsed to the first occurrence, and ``DataFrame.asfreq`` inserts the
    missing timestamps. Rows created by ``asfreq`` are all-NaN, including their
    value in the original-index level.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied, never modified. Only a single-level index is
        supported, because the datetime level is addressed as index level 1.
    interval : str or pandas offset
        Target frequency passed to ``asfreq`` (e.g. year start "YS",
        month start "MS", week "W", day "D", hour "H", minute "T", second "S").
    datetime_col : str
        Name of the column holding the timestamps.
    set_index_flag : bool, default False
        If True, return the frame indexed by (original index, datetime).
        If False, move the datetime level back into a regular column.
    datetime_format : str, default '%Y/%m/%d %H:%M'
        ``strftime`` pattern used to parse ``datetime_col``. Previously this was
        hard-coded; the default keeps the old behaviour.

    Returns
    -------
    pd.DataFrame
        The re-sampled frame.

    Raises
    ------
    ValueError
        If a value in ``datetime_col`` does not match ``datetime_format``
        (``pd.to_datetime`` is called with its default ``errors='raise'``).
    """
    df = df.copy()
    df[datetime_col] = pd.to_datetime(df[datetime_col], format=datetime_format)
    # Rows whose timestamp was missing to begin with cannot be placed on the grid.
    df = df.dropna(subset=[datetime_col])

    # Keep the original index as level 0 and the timestamp as level 1.
    df = df.set_index(datetime_col, append=True)
    df = df[~df.index.duplicated(keep='first')]

    # Unnamed index levels cannot be turned back into an index by name later on,
    # so give them the same placeholder names reset_index() would use.
    level_names = [
        name if name is not None else f"level_{position}"
        for position, name in enumerate(df.index.names)
    ]
    df.index.names = level_names

    # asfreq() needs a unique, datetime-only index: move the original index into
    # a column and keep the first row for every timestamp.
    by_time = df.reset_index(level=level_names[0])
    by_time = by_time[~by_time.index.duplicated(keep='first')]

    # Re-sample, then rebuild the (original index, datetime) MultiIndex.
    # reset_index() discards the freq attribute, hence the explicit assignment below.
    resampled = by_time.asfreq(interval).reset_index()
    df = resampled.set_index(level_names)

    df.index.levels[1].freq = interval
    # Level 0 is the original index, level 1 is the datetime index.
    return df if set_index_flag else df.reset_index(drop=False, level=df.index.names[1])

In [2]:
class VectorAutoregression:
    """Workflow helper around statsmodels' ``VAR``.

    Typical use: ``fit`` (clean features and fit the model), ``getRelationship``
    (extract the significant lagged terms for one target) and ``shift_transform``
    (materialise those lags as new columns).

    Methods report problems by returning ``{"info": [message]}`` instead of
    raising, so callers must check for a ``dict`` result.
    """

    def __init__(self):
        # Cleaned frame the current model was fitted on (set by fit()).
        self.data = None
        # statsmodels VAR results of the last successful fit().
        self.results = None
        # Significant lag table produced by the last getRelationship() call.
        self.summary = None

    def remove_collinearity(self, data, remain: list = None,
                            relevance_threshold: float = 0.3) -> Union[pd.DataFrame, dict]:
        """Drop features that are collinear with the rest of the data.

        Steps: replace +/-inf by NaN and forward/backward fill, drop features
        whose variance inflation factor is NaN or infinite, then keep only the
        features whose absolute correlation with at least one other feature is
        below ``relevance_threshold``.

        Parameters
        ----------
        data : pd.DataFrame
            Raw data (not modified).
        remain : list, optional
            Features that must be kept (typically the target Y). They are
            always re-added, taken from the cleaned data so that no NaN/inf
            values are reintroduced.
        relevance_threshold : float, default 0.3
            Correlation bound below which a pair of features counts as
            weakly related.

        Returns
        -------
        pd.DataFrame or dict
            The reduced frame, with columns in their original order followed by
            any re-added ``remain`` columns; or an ``{"info": [...]}`` dict
            when fewer than two columns are available.
        """
        if data.shape[1] < 2:
            return {"info": ["The dataset must have at least 2 dimensions."]}

        cleaned = data.replace([np.inf, -np.inf], np.nan).ffill().bfill()

        vif = pd.Series(
            [variance_inflation_factor(cleaned.values, i) for i in range(cleaned.shape[1])],
            index=cleaned.columns,
        )
        # Discard features whose VIF is NaN or infinite.
        usable = vif.replace([np.inf, -np.inf], np.nan).dropna().index.to_list()
        candidates = cleaned[usable]

        # A feature survives if it is weakly correlated with at least one other
        # feature. The diagonal (self-correlation) must not count as a pair.
        corr = candidates.corr()
        weak = corr.abs().lt(abs(relevance_threshold)).to_numpy(copy=True)
        np.fill_diagonal(weak, False)
        # Boolean selection on the column index keeps the original column order,
        # which makes the result reproducible (a set() would not).
        kept = corr.columns[weak.any(axis=0)].to_list()
        reduced = candidates[kept]

        if remain is not None:
            missing = [col for col in dict.fromkeys(remain) if col not in reduced.columns]
            if missing:
                reduced = pd.concat([reduced, cleaned[missing]], axis=1)

        if reduced.shape[1] >= 2:
            return reduced
        return {"info": ["Data exhibits multicollinearity, making analysis impossible"]}

    def cal_maxLag(self, data: pd.DataFrame):
        """Return the largest lag order considered estimable for ``data``.

        Uses ``(n_obs - n_equations - n_trend) // (1 + n_equations)`` with one
        trend term (a constant), and never returns less than 1.
        """
        n_totobs = len(data)
        ntrend = 1  # a single constant term is assumed
        neqs = data.shape[1]
        max_estimable = (n_totobs - neqs - ntrend) // (1 + neqs)
        return max(max_estimable, 1)

    def fit(self, data: pd.DataFrame, maxlags: Union[int, str] = "auto", ic: str = None, remain: list = None):
        """Clean ``data`` and fit a VAR model on it.

        Parameters
        ----------
        data : pd.DataFrame
            Observations, one column per variable.
        maxlags : int or "auto", default "auto"
            Maximum lag order. "auto" uses the value from ``cal_maxLag``; an
            integer must not exceed that value.
        ic : {'aic', 'fpe', 'hqic', 'bic', None}
            Information criterion used by statsmodels for order selection
            (Akaike, final prediction error, Hannan-Quinn, Bayesian/Schwarz).
        remain : list, optional
            Features forwarded to ``remove_collinearity`` as columns to keep.

        Returns
        -------
        None or dict
            ``None`` on success (results are stored on the instance); an
            ``{"info": [...]}`` dict when the data cannot be analysed or
            ``maxlags`` is too large.
        """
        # Forget any previous model first: a failed refit must not leave stale
        # results/summary behind for getRelationship()/shift_transform().
        self.data = None
        self.results = None
        self.summary = None

        clean_data = self.remove_collinearity(data.copy(), remain)
        if isinstance(clean_data, dict):
            return clean_data

        model = VAR(clean_data)
        # Do not change: maxlags must not exceed the bound computed by
        # cal_maxLag (written to mirror the condition imposed by statsmodels).
        max_estimable = self.cal_maxLag(clean_data)

        print("maxlags 要小於等於: ", max_estimable)
        if maxlags == "auto":
            maxlags = max_estimable
        if maxlags > max_estimable:
            return {"info": [f"MaxLags must be less than or equal to: {max_estimable}"]}

        self.results = model.fit(maxlags=maxlags, ic=ic)
        self.data = clean_data
        print(f"在最大 lag 數目為 {max_estimable} 的情況下，VAR 找出的最佳 lag 為: ",self.results.k_ar)

    @staticmethod
    def _summarize_lags(coef: pd.Series, pvalues: pd.Series, pvalue_threshold: float):
        """Build the (time_lag, feature, coef, pvalue) table of significant terms.

        ``coef`` and ``pvalues`` are indexed by labels of the form
        ``"L<lag>.<feature>"`` plus an optional ``"const"`` entry.
        """
        table = pd.concat([coef, pvalues], axis=1)
        table.columns = ["coef", "pvalue"]
        table = table.drop(index="const", errors="ignore")
        if table.empty:
            return {"info": ["there is no results found from VAR"]}

        # Split on the first dot only: feature names may themselves contain dots.
        parts = table.index.to_series().str.split(".", n=1, expand=True)
        summary = pd.DataFrame({
            "time_lag": parts[0].str.replace("L", "", regex=False).astype(int).to_numpy(),
            "feature": parts[1].to_numpy(),
            "coef": table["coef"].to_numpy(),
            "pvalue": table["pvalue"].to_numpy(),
        })
        return summary[summary["pvalue"] < pvalue_threshold].reset_index(drop=True)

    def getRelationship(self, target: str, pvalue_threshold: float = 0.05) -> Union[pd.DataFrame, dict]:
        """Return the lagged terms that significantly explain ``target``.

        Parameters
        ----------
        target : str
            The target variable (Y) of interest.
        pvalue_threshold : float, default 0.05
            Only terms with a p-value strictly below this value are kept.

        Returns
        -------
        pd.DataFrame or dict
            Columns ``time_lag``, ``feature``, ``coef``, ``pvalue``. The table is
            also stored in ``self.summary``. An ``{"info": [...]}`` dict is
            returned when no fitted model is available or the model has no
            lagged terms.
        """
        if not isinstance(self.results, VARResultsWrapper):
            return {"info": ["there is no significant time lag"]}

        summary = self._summarize_lags(
            self.results.params[target],
            self.results.pvalues[target],
            pvalue_threshold,
        )
        if isinstance(summary, pd.DataFrame):
            self.summary = summary
        return summary

    def shift_transform(self, data, remain_origin: list = None, exclude_lag: list = None,
                        reference: pd.DataFrame = None) -> Union[pd.DataFrame, dict]:
        """Add the significant lag columns found by ``getRelationship`` to ``data``.

        For every (feature, time_lag) row of ``self.summary`` a column
        ``<feature>_lag_<time_lag>`` is created; the un-lagged feature columns
        are then dropped.

        Parameters
        ----------
        data : pd.DataFrame
            Frame to transform (not modified).
        remain_origin : list, optional
            Features whose original column is kept next to its lag columns.
            ``None``      -> X_1_lag_1, X_1_lag_2, ...
            ``["X_1"]``   -> X_1, X_1_lag_1, X_1_lag_2, ...
        exclude_lag : list, optional
            Features for which no lag columns are created; they stay in the
            output unchanged.
            ``None``      -> X_1_lag_1, X_1_lag_2, X_2_lag_1, ...
            ``["X_1"]``   -> X_1, X_2_lag_1, X_2_lag_2, ...
        reference : pd.DataFrame, optional
            Earlier observations (e.g. the training split). When given, its last
            ``time_lag`` values supply the leading lag values, so no row is lost.
            When omitted, every row containing a NaN is dropped.

        Returns
        -------
        pd.DataFrame or dict
            The transformed frame with its index reset into a column, or an
            ``{"info": [...]}`` dict when ``getRelationship`` has not produced
            a summary yet.
        """
        summary = self.summary
        if summary is None:
            return {"info": ["due to there is no significant time lag, data could not be transformed"]}

        data = data.copy()
        if exclude_lag:
            summary = summary[~summary["feature"].isin(exclude_lag)]

        for feature, time_lag in zip(summary["feature"], summary["time_lag"]):
            time_lag = int(time_lag)
            lag_column = f"{feature}_lag_{time_lag}"
            if reference is None:
                data[lag_column] = data[feature].shift(time_lag)
                continue
            # Prepend the reference tail and shift the stitched series. Filling
            # by NaN mask instead would break when `data` has its own NaNs or
            # fewer rows than the lag.
            history = reference[feature].tail(time_lag)
            stitched = pd.concat([history, data[feature]], ignore_index=True)
            data[lag_column] = stitched.shift(time_lag).iloc[len(history):].to_numpy()

        if reference is None:
            # Without a reference the first rows have no lag values.
            # Note this drops rows with a NaN in ANY column.
            data = data.dropna()

        # Un-lagged source columns are removed unless the caller asked to keep them.
        to_drop = list(dict.fromkeys(summary["feature"]))
        if remain_origin:
            keep = set(remain_origin)
            to_drop = [feature for feature in to_drop if feature not in keep]

        return data.drop(columns=to_drop).reset_index()

In [7]:
#time_series_format_preprocessing(data,interval="1D",datetime_col="Process Start Time")


In [3]:
# Load the chiller data set, dropping the CONTEXTID column.
# NOTE(review): absolute, machine-specific path - this cell only runs on the
# author's machine; consider a configurable data directory.
# NOTE(review): `data` is reassigned by a later cell that loads a different CSV.
data = pd.read_csv("C:/Users/foresight_User/Data/測試資料/Chiller_CH14(01~24).csv").drop(columns=["CONTEXTID"])


# Regular-grid re-sampling is currently disabled; the raw TIMETAG strings are used as the index.
#data = time_series_format_preprocessing(data,interval="5T",datetime_col="TIMETAG")
data = data.set_index("TIMETAG")

In [26]:
# Load the VISERA data set without its "Context Name" and "Metrology Start Time"
# columns and index it by "Process Start Time" (left as read from the CSV, not parsed here).
# NOTE(review): absolute, machine-specific path; this assignment replaces any
# previously loaded `data`.
data = pd.read_csv("C:/Users/foresight_User/Data/測試資料/VISERA_170.csv").drop(columns=["Context Name","Metrology Start Time"])
data = data.set_index("Process Start Time")

In [27]:
# Positional hold-out split: the last 10 rows form the test set, everything before them the training set.
train_data = data.iloc[:-10,:]
test_data = data.iloc[-10:,:]

In [28]:
# Display (rows, columns) of the training split.
train_data.shape


(157, 89)

In [29]:
# Display (rows, columns) of the test split.
test_data.shape

(10, 89)

In [30]:
# Display (rows, columns) of the training split again.
# NOTE(review): this cell repeats an earlier shape check.
train_data.shape

(157, 89)

In [31]:


# Fit a VAR with at most one lag on the training split, extract the lagged terms
# that are significant for the target, and show the shape of the transformed
# training frame (no reference is given, so rows with NaN are dropped).
# NOTE(review): the dict that fit()/getRelationship() may return on failure is not checked here.
maxlags = 1
target= "Point1"
var = VectorAutoregression()
var.fit(data=train_data,maxlags=maxlags,ic=None)
var_result = var.getRelationship(target=target)
var.shift_transform(data=train_data,remain_origin=None,exclude_lag=[target],reference=None).shape


maxlags 要小於等於:  1
在最大 lag 數目為 1 的情況下，VAR 找出的最佳 lag 為:  1


(156, 90)

In [32]:
# Transform the test split, using the training split as reference for the leading lag values.
var.shift_transform(data=test_data,remain_origin=None,exclude_lag=[target],reference=train_data).shape


(10, 90)

In [33]:
# Same transform without a reference: rows containing NaN are dropped instead of filled.
var.shift_transform(data=test_data,remain_origin=None,exclude_lag=[target],reference=None).shape

(9, 90)

In [34]:
# Display the test split's shape after the shift_transform calls.
test_data.shape

(10, 89)

In [35]:
# Display the training split's shape after the shift_transform calls.
train_data.shape

(157, 89)

In [None]:
# Scratch experiment: add one lag column for `feature` directly on `data`.
# NOTE(review): `feature` and `time_lag` are only assigned in a cell further down,
# so this cell fails on a fresh top-to-bottom run, and it mutates `data` in place.
data[feature+f"_lag_{time_lag}"] = data[feature].shift(time_lag)

In [77]:
# Scratch variables for the manual lag experiment.
# NOTE(review): they are read by a cell placed above this one.
feature = "Point1"
time_lag = 1


In [78]:
# Display the full test split.
# NOTE(review): the stored output shows extra Point1_* lag columns, presumably
# added by ad-hoc cells that are no longer in the notebook - confirm before relying on it.
test_data
#test_data[feature+f"_lag_{time_lag}"].fillna(fill_values)

Unnamed: 0_level_0,Point1,Point2,Point3,Point4,Point5,Point6,Point7,Point8,Point9,Point10,...,PROCESS_EB_UpperTemp_Step=8_Filter0_Mean,PROCESS_EB_ChamberPressure_Step=8_Filter0_Range,PROCESS_EB_DeposhieldTopTemp_Step=8_Filter0_Accumulation,PROCESS_EB_RFVppLo_Step=8_Filter0_Counter,PROCESS_EB_LowerTemp_Step=8_Filter0_Range,PROCESS_EB_UpperTemp_Step=8_Filter0_Range,PROCESS_RF_RFPowerUp_Acc_Step=1_Filter0_Mean,Point1_1111111,Point1_lag_1,Point1_lag_3
Process Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022/5/1 09:53,7073.292,7275.228,7285.186,7352.964,7418.669,7430.941,7373.803,7307.144,7268.157,7117.329,...,150.022346,0.3,53703.2,358,0.8,0.4,3811382.0,,7059.702,7140.515
2022/5/2 10:23,7012.749,7123.311,7117.467,7148.851,7239.325,7251.051,7141.674,7163.87,7125.42,6934.874,...,150.023429,0.5,52530.2,350,0.9,0.4,4584658.0,7073.292,7073.292,7276.312
2022/5/3 09:15,7063.947,7322.537,7330.782,7387.779,7417.847,7420.947,7381.857,7345.661,7322.941,7181.11,...,150.035714,0.9,67232.4,448,0.8,0.4,5357422.0,7012.749,7012.749,7059.702
2022/5/4 10:13,7077.869,7182.421,7194.109,7261.567,7299.64,7325.255,7281.166,7230.282,7182.761,6959.051,...,150.015184,0.3,69150.0,461,0.7,0.4,6131381.0,7063.947,7063.947,7073.292
2022/5/5 11:36,7071.598,7212.919,7222.816,7284.498,7301.849,7314.892,7276.631,7233.178,7215.0,6987.414,...,150.007312,0.2,69792.3,465,0.7,0.4,6905126.0,7077.869,7077.869,7012.749
2022/5/6 10:21,7113.737,7380.136,7398.638,7431.838,7457.345,7456.634,7408.698,7349.272,7373.771,7247.665,...,150.012695,0.3,67363.3,449,0.8,0.4,7678175.0,7071.598,7071.598,7063.947
2022/5/7 09:42,7087.395,7237.731,7250.205,7269.661,7296.107,7328.334,7284.018,7232.585,7224.836,6963.775,...,150.024868,0.3,56732.5,378,0.8,0.4,8458623.0,7113.737,7113.737,7077.869
2022/5/8 09:34,7046.015,7264.215,7289.584,7326.882,7371.567,7376.23,7286.673,7298.051,7253.413,7052.751,...,150.031222,0.5,33165.3,221,0.8,0.4,9242707.0,7087.395,7087.395,7071.598
2022/5/9 10:17,7075.952,7273.234,7291.476,7341.604,7392.016,7425.716,7394.17,7346.548,7308.362,7088.084,...,150.0125,0.0,55234.3,368,0.2,0.3,10027830.0,7046.015,7046.015,7113.737
2022/5/10 10:04,7014.477,7135.582,7148.397,7196.372,7196.816,7159.28,7178.483,7168.414,7152.545,6918.514,...,150.037296,0.4,64384.4,429,0.8,0.4,10812096.0,7075.952,7075.952,7087.395


In [76]:
# Display the full test split.
# NOTE(review): the execution count is out of order relative to the neighbouring cells.
test_data

Unnamed: 0_level_0,Point1,Point2,Point3,Point4,Point5,Point6,Point7,Point8,Point9,Point10,...,PROCESS_EB_UpperTemp_Step=8_Filter0_Mean,PROCESS_EB_ChamberPressure_Step=8_Filter0_Range,PROCESS_EB_DeposhieldTopTemp_Step=8_Filter0_Accumulation,PROCESS_EB_RFVppLo_Step=8_Filter0_Counter,PROCESS_EB_LowerTemp_Step=8_Filter0_Range,PROCESS_EB_UpperTemp_Step=8_Filter0_Range,PROCESS_RF_RFPowerUp_Acc_Step=1_Filter0_Mean,Point1_1111111,Point1_lag_1,Point1_lag_3
Process Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022/5/1 09:53,7073.292,7275.228,7285.186,7352.964,7418.669,7430.941,7373.803,7307.144,7268.157,7117.329,...,150.022346,0.3,53703.2,358,0.8,0.4,3811382.0,,,7140.515
2022/5/2 10:23,7012.749,7123.311,7117.467,7148.851,7239.325,7251.051,7141.674,7163.87,7125.42,6934.874,...,150.023429,0.5,52530.2,350,0.9,0.4,4584658.0,7073.292,7073.292,7276.312
2022/5/3 09:15,7063.947,7322.537,7330.782,7387.779,7417.847,7420.947,7381.857,7345.661,7322.941,7181.11,...,150.035714,0.9,67232.4,448,0.8,0.4,5357422.0,7012.749,7012.749,7059.702
2022/5/4 10:13,7077.869,7182.421,7194.109,7261.567,7299.64,7325.255,7281.166,7230.282,7182.761,6959.051,...,150.015184,0.3,69150.0,461,0.7,0.4,6131381.0,7063.947,7063.947,7073.292
2022/5/5 11:36,7071.598,7212.919,7222.816,7284.498,7301.849,7314.892,7276.631,7233.178,7215.0,6987.414,...,150.007312,0.2,69792.3,465,0.7,0.4,6905126.0,7077.869,7077.869,7012.749
2022/5/6 10:21,7113.737,7380.136,7398.638,7431.838,7457.345,7456.634,7408.698,7349.272,7373.771,7247.665,...,150.012695,0.3,67363.3,449,0.8,0.4,7678175.0,7071.598,7071.598,7063.947
2022/5/7 09:42,7087.395,7237.731,7250.205,7269.661,7296.107,7328.334,7284.018,7232.585,7224.836,6963.775,...,150.024868,0.3,56732.5,378,0.8,0.4,8458623.0,7113.737,7113.737,7077.869
2022/5/8 09:34,7046.015,7264.215,7289.584,7326.882,7371.567,7376.23,7286.673,7298.051,7253.413,7052.751,...,150.031222,0.5,33165.3,221,0.8,0.4,9242707.0,7087.395,7087.395,7071.598
2022/5/9 10:17,7075.952,7273.234,7291.476,7341.604,7392.016,7425.716,7394.17,7346.548,7308.362,7088.084,...,150.0125,0.0,55234.3,368,0.2,0.3,10027830.0,7046.015,7046.015,7113.737
2022/5/10 10:04,7014.477,7135.582,7148.397,7196.372,7196.816,7159.28,7178.483,7168.414,7152.545,6918.514,...,150.037296,0.4,64384.4,429,0.8,0.4,10812096.0,7075.952,7075.952,7087.395


In [56]:
# Display the full test split.
# NOTE(review): the stored output comes from an earlier kernel state than the cells above.
test_data

Unnamed: 0_level_0,Point1,Point2,Point3,Point4,Point5,Point6,Point7,Point8,Point9,Point10,...,PROCESS_EB_TopGap_Step=8_Filter0_Mean,PROCESS_EB_UpperTemp_Step=8_Filter0_Mean,PROCESS_EB_ChamberPressure_Step=8_Filter0_Range,PROCESS_EB_DeposhieldTopTemp_Step=8_Filter0_Accumulation,PROCESS_EB_RFVppLo_Step=8_Filter0_Counter,PROCESS_EB_LowerTemp_Step=8_Filter0_Range,PROCESS_EB_UpperTemp_Step=8_Filter0_Range,PROCESS_RF_RFPowerUp_Acc_Step=1_Filter0_Mean,Point1_1111111,Point1_lag_1
Process Start Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022/5/1 09:53,7073.292,7275.228,7285.186,7352.964,7418.669,7430.941,7373.803,7307.144,7268.157,7117.329,...,38.996453,150.022346,0.3,53703.2,358,0.8,0.4,3811382.0,,
2022/5/2 10:23,7012.749,7123.311,7117.467,7148.851,7239.325,7251.051,7141.674,7163.87,7125.42,6934.874,...,38.996657,150.023429,0.5,52530.2,350,0.9,0.4,4584658.0,7073.292,7073.292
2022/5/3 09:15,7063.947,7322.537,7330.782,7387.779,7417.847,7420.947,7381.857,7345.661,7322.941,7181.11,...,38.99654,150.035714,0.9,67232.4,448,0.8,0.4,5357422.0,7012.749,7012.749
2022/5/4 10:13,7077.869,7182.421,7194.109,7261.567,7299.64,7325.255,7281.166,7230.282,7182.761,6959.051,...,38.996377,150.015184,0.3,69150.0,461,0.7,0.4,6131381.0,7063.947,7063.947
2022/5/5 11:36,7071.598,7212.919,7222.816,7284.498,7301.849,7314.892,7276.631,7233.178,7215.0,6987.414,...,38.99372,150.007312,0.2,69792.3,465,0.7,0.4,6905126.0,7077.869,7077.869
2022/5/6 10:21,7113.737,7380.136,7398.638,7431.838,7457.345,7456.634,7408.698,7349.272,7373.771,7247.665,...,38.996236,150.012695,0.3,67363.3,449,0.8,0.4,7678175.0,7071.598,7071.598
2022/5/7 09:42,7087.395,7237.731,7250.205,7269.661,7296.107,7328.334,7284.018,7232.585,7224.836,6963.775,...,38.99418,150.024868,0.3,56732.5,378,0.8,0.4,8458623.0,7113.737,7113.737
2022/5/8 09:34,7046.015,7264.215,7289.584,7326.882,7371.567,7376.23,7286.673,7298.051,7253.413,7052.751,...,38.996606,150.031222,0.5,33165.3,221,0.8,0.4,9242707.0,7087.395,7087.395
2022/5/9 10:17,7075.952,7273.234,7291.476,7341.604,7392.016,7425.716,7394.17,7346.548,7308.362,7088.084,...,38.99644,150.0125,0.0,55234.3,368,0.2,0.3,10027830.0,7046.015,7046.015
2022/5/10 10:04,7014.477,7135.582,7148.397,7196.372,7196.816,7159.28,7178.483,7168.414,7152.545,6918.514,...,38.99634,150.037296,0.4,64384.4,429,0.8,0.4,10812096.0,7075.952,7075.952
