In [80]:
# ---------------
# Pytofi 檢定分析
# ---------------
import joblib
import pandas as pd
import numpy as np
from tofi import CIT
from sklearn.ensemble import RandomForestClassifier

# Step 1: load the persisted model, the fitted preprocessor and the raw
# train/test splits that the rest of the analysis depends on.
print("--- 步驟 1: 載入所有必要檔案 ---")
try:
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code; only load artifacts that come from a trusted source.
    # The final model is stored without its preprocessing step and is used as
    # the learner.
    learner = joblib.load('final_model_only.joblib')
    preprocessor = joblib.load('preprocessor.joblib')

    # Raw (untransformed) data; the targets are squeezed after reading.
    X_train = pd.read_csv('X_train_data.csv')
    X_test = pd.read_csv('X_test_data.csv')
    y_train = pd.read_csv('y_train_data.csv').squeeze()
    y_test = pd.read_csv('y_test_data.csv').squeeze()
except FileNotFoundError as e:
    print(f"錯誤：找不到檔案 {e.filename}，請確認所有檔案都在同一個資料夾中。")
    # Stop here: continuing would only fail later with a confusing NameError,
    # or silently reuse stale variables left over from an earlier run.
    raise
else:
    print("模型、預處理器和資料集全部載入成功！")
    print("Learner 的類型:", type(learner))

# Step 2: transform every split up front so that all later steps operate on
# the preprocessed feature space.
print("\n--- 步驟 2: 準備 Pytofi 分析 ---")

# Run both splits through the loaded preprocessor.
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Column names of the transformed feature space.
transformed_feature_names = preprocessor.get_feature_names_out()

# Wrap the transformed matrices in DataFrames so columns can be addressed by
# name.
X_train_transformed_df, X_test_transformed_df = (
    pd.DataFrame(matrix, columns=transformed_feature_names)
    for matrix in (X_train_transformed, X_test_transformed)
)
print(f"資料已轉換，維度: {X_train_transformed_df.shape}")

# Feature to analyse.
removal_feature = 'cat__無法節制時間的網路活動項目_其他_0'

# Fit the sampler: a random forest that predicts the removal feature from all
# remaining transformed features of the training split.
sampler_inputs = X_train_transformed_df.drop(columns=removal_feature)
sampler_target = X_train_transformed_df[removal_feature]
sampler = RandomForestClassifier(random_state=42).fit(sampler_inputs, sampler_target)
print(f"為特徵 '{removal_feature}' 成功訓練好 Sampler。")

# Step 3: conditional independence test (CIT) -- announce the stage.
print("\n--- 步驟 3: 執行 Pytofi 條件獨立性測試 ---")


from sklearn.base import BaseEstimator

class ProbabilisticLearner(BaseEstimator):
    """Adapter whose ``predict`` returns a probability instead of a class label.

    ``predict`` forwards to the wrapped model's ``predict_proba`` and keeps
    only column 1 of the result.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)`` and ``predict_proba(X)``.
        NOTE(review): taking column 1 assumes a binary classifier whose
        second column is the positive class -- confirm for the loaded model.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y):
        """Fit the wrapped model on ``(X, y)`` and return this wrapper.

        Returning ``self`` (rather than the inner model) follows the
        scikit-learn convention and keeps ``wrapper.fit(X, y).predict(X)``
        on the probability-valued ``predict`` defined below.
        """
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Return column 1 of the wrapped model's ``predict_proba(X)``."""
        return self.model.predict_proba(X)[:, 1]


# Wrap the loaded model so that its predict() yields probabilities.
wrapped_learner = ProbabilisticLearner(learner)

# Cast the test labels to float once.
# NOTE(review): presumably needed so the labels are numeric on the same scale
# as the predicted probabilities for the loss -- confirm against tofi's docs.
y_test_float = y_test.astype(float)

# Configure the test for the selected feature using the "CPI" method.
cpi = CIT(
    learner=wrapped_learner,
    sampler=sampler,
    removal=removal_feature,
    method="CPI",
)

# The value returned by infer() is not used; results are read via summarize().
_ = cpi.infer(X_test_transformed_df, y_test_float)

print("\n--- 分析結果 ---")
# Kept as the final expression so the notebook renders the summary table.
cpi.summarize()

--- 步驟 1: 載入所有必要檔案 ---
模型、預處理器和資料集全部載入成功！
Learner 的類型: <class 'xgboost.sklearn.XGBClassifier'>

--- 步驟 2: 準備 Pytofi 分析 ---
資料已轉換，維度: (1114, 61)
為特徵 'cat__無法節制時間的網路活動項目_其他_0' 成功訓練好 Sampler。

--- 步驟 3: 執行 Pytofi 條件獨立性測試 ---

--- 分析結果 ---
Inferer Summary (cross_fit=False, combine=False)
 + Method: CPI (double_split=None, perturb_size=None)
 + Null Distribution: Normality (n_copies=1, n_permutations=None)
 + Loss Function: Mean Squared Error (reverse=False)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0_level_0,size,estimate,std_error,p_value
removal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cat__無法節制時間的網路活動項目_其他_0,372,0.001973,0.001641,0.114647
