In [1]:
import warnings
import pickle
import copy
import torch
import json
import sys
import os
import numpy as np
import pandas as pd
import torch.nn as nn
from transformers import BertModel
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from console1 import generate_partition
from model import BertClassifier

# F_P = {
#     0: [0,1],
#     1: [2,3],
#     2: [4,5,6,7,8,9,10,11],
#     3: [12,13],
# }


In [2]:
# Load the full data set without its 'Output' column and keep the per-column
# mean / standard deviation that later cells use for standardization.
df = pd.read_csv('data_with_output.csv').drop(columns=['Output'])
df_mean, df_std = df.mean(), df.std()

# Instances to be explained; the first CSV column is used as the row index.
sample_df = pd.read_csv('sampled_df100.csv', index_col=0)
print(sample_df.head(5))


       X_-TACT_TIME_mean  X_-CONVEYOR_SPEED_mean    PUMP_high  PUMP_low  \
16010                150                    3150  37494.28000  12305.50   
19201                165                    4200  38257.14000  12518.56   
21232                140                    3150  45430.73333   9255.75   
33991                160                    3150  47262.94000  12518.58   
25200                170                    3150  49986.46667  12039.14   

       CLN1_over-etching-ratio  CLN1_EPT_time  clean_count  \
16010                 0.002079          12987            5   
19201                 0.006691          13002            5   
21232                 0.004852          12984            5   
33991                 0.002744          12027            5   
25200                 0.000750          12000            7   

       EPT_clean_count_ratio  NH3_TREAT_-RF_FREQ-max  \
16010            2597.400000                   13970   
19201            2600.400000                   13935   
21232   

In [3]:
# Universe of feature indices (0..13) and the subset whose combinations are enumerated.
u_set = list(range(14))
consider1 = [0, 1, 6, 7, 8, 9, 10]
leng = list(range(1, len(consider1) + 1))

# Every non-empty combination of `consider1`, ordered by size and then in
# itertools.combinations order.
consider_combine1 = [comb for r in leng for comb in combinations(consider1, r)]

# Complement of each combination with respect to the full feature universe.
consider_combine_c1 = [tuple(set(u_set) - set(comb)) for comb in consider_combine1]

# One two-group partition per combination: group 0 is the combination itself,
# group 1 is every remaining feature.
all_feature_dict1 = [
    {0: list(comb), 1: list(comb_c)}
    for comb, comb_c in zip(consider_combine1, consider_combine_c1)
]


In [7]:
# Universe of feature indices (0..13) and the subset combined in this pass.
u_set = list(range(14))
consider = [0, 1, 2, 4, 5, 6, 7, 8, 9, 10]
leng = list(range(1, 4))

# All combinations of size 1..3 drawn from `consider`.
consider_combine0 = [comb for r in leng for comb in combinations(consider, r)]

# Keep only the combinations that are not already in `consider_combine1`,
# then add the four single-feature groups 3, 11, 12 and 13.
consider_combine = list(set(consider_combine0) - set(consider_combine1))
consider_combine.extend([(3,), (11,), (12,), (13,)])

# Complement of each combination with respect to the full feature universe.
consider_combine_c = [tuple(set(u_set) - set(comb)) for comb in consider_combine]

# One two-group partition per combination: group 0 is the combination itself,
# group 1 is every remaining feature.
all_feature_dict = [
    {0: list(comb), 1: list(comb_c)}
    for comb, comb_c in zip(consider_combine, consider_combine_c)
]

In [8]:
# Order the partitions by their group-0 feature list (list comparison is
# lexicographic) so the processing order no longer depends on set iteration.
all_feature_dict = sorted(all_feature_dict, key=lambda partition: partition[0])

In [9]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Load the stage-1 checkpoint and move it onto the selected device.
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints that come from a trusted source.
s1_model_path = 'stage_1_checkpoint.pth'
s1_model =  torch.load(s1_model_path, map_location=device).to(device)

# Read the parameter grouping for the selected tool from the JSON config.
# The `with` block closes the file on exit, so no explicit close() is needed.
json_file_path = 'controllable_para_v014_14.json'
tool_name = 'ASCVD'
with open(json_file_path, 'r') as f:
    params = json.load(f)[tool_name]
print(params)

# Flat list of feature names; filled while the feature translation is built.
target_features = []

''' 
    feature_translation is a list of tuples, each containing two integers.
    The i-th tuple records the (row, column) position of the i-th feature in the
    (number of sub-processes) x (number of features) matrix that is used as the
    input of the prediction model.
    Ex.
    [(0, 0), (0, 1), 
     (1, 2), (1, 3), 
     (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), 
     (3, 12),(3, 13)]
'''
# Flatten the per-sub-process feature lists and build the feature translation
# list that maps each feature to its (sub-process row, column) slot in the model input.
feature_translation = []
sub_op_num = 0
for entry in params.values():
    if not isinstance(entry, str):
        # A list of feature names: one sub-process. Its features occupy the next
        # free columns, all on the row of the current sub-process.
        target_features.extend(entry)
        feature_translation.extend(
            [(sub_op_num, col) for col, _ in enumerate(entry, start=len(feature_translation))]
        )
        sub_op_num += 1
    else:
        # A single string entry is recorded as a name only; it gets no matrix slot.
        target_features.append(entry)

print(target_features)
print(f"There are {sub_op_num} sub processes. So the feature translation is {feature_translation}")

def padding_zero(df, flag):
    """Expand every flat parameter row of ``df`` into a zero-padded 2-D matrix.

    For each row an all-zero array of shape
    ``(sub_op_num, len(feature_translation))`` is created, and the row's i-th
    value is written at the (row, column) position ``feature_translation[i]``.
    Relies on the module-level ``sub_op_num`` and ``feature_translation``.

    Parameters
    ----------
    df : pandas.DataFrame
        One sample per row. Values are read by position; only the first
        ``len(feature_translation)`` columns are used.
    flag : int
        Output format selector:
        ``1`` returns a DataFrame with a single column ``'X'`` holding one
        ndarray per sample (per the original comments, used by bert.py);
        ``2`` returns a list of nested Python lists (used by bert_du.py).

    Returns
    -------
    pandas.DataFrame or list
        The padded matrices in the format selected by ``flag``.

    Raises
    ------
    ValueError
        If ``flag`` is neither 1 nor 2. Previously such a flag silently
        produced an empty list.
    """
    if flag not in (1, 2):
        raise ValueError(f"padding_zero: unsupported flag {flag!r}; expected 1 or 2")

    data_arr = df.to_numpy()
    n_features = len(feature_translation)
    result = []
    for data in data_arr:
        # (number of sub-processes / chambers) x (total number of parameters)
        padded = np.zeros((sub_op_num, n_features))
        for i, (row, col) in enumerate(feature_translation):
            padded[row][col] = data[i]
        # flag 1 keeps the ndarray, flag 2 converts it to nested lists.
        result.append(padded if flag == 1 else padded.tolist())

    if flag == 1:
        result = pd.DataFrame({'X': result})
    return result

def model_inference(data):
    """Run the stage-1 model on ``data`` and summarise its index-0 probability.

    No scaling is applied here; the callers in this notebook pass frames that
    are already standardized. Relies on the module-level ``s1_model`` and
    ``padding_zero``.

    Parameters
    ----------
    data : pandas.DataFrame
        One sample per row. An ``'Output'`` column, if present, is dropped
        before inference.

    Returns
    -------
    tuple
        ``(data_expectation_value, data_output_array)`` where
        ``data_output_array`` holds, for every row in input order, the softmax
        output at index 0 along dimension 1, and ``data_expectation_value`` is
        the mean of that array.

    Raises
    ------
    ValueError
        If ``data`` has no rows. Previously this surfaced as a DataLoader
        ``batch_size=0`` error of the same type.
    """
    if 'Output' in data.columns:
        data = data.drop(columns=['Output'])
    if len(data) == 0:
        raise ValueError("model_inference: received an empty DataFrame")

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    s1_model.eval()

    # Expand each flat row into its zero-padded 2-D layout and stack the rows
    # into one float tensor with the sample dimension first.
    data_4d = padding_zero(data, flag=1)
    data_4d_array = np.stack(data_4d['X'].tolist())
    data_4d_tensor = torch.tensor(data_4d_array, dtype=torch.float)

    my_dataset = TensorDataset(data_4d_tensor)
    batch_size = min(256, int(data_4d_tensor.size()[0]))
    # num_workers=0: the data is already in memory, and this function is called
    # once per coalition, so starting worker processes on every call is pure
    # overhead. Batch order and results are unchanged.
    my_loader = DataLoader(my_dataset, batch_size=batch_size, num_workers=0)

    index0_probs = []
    with torch.no_grad():
        for batch_data in my_loader:
            # Move the batch to the target device and run the model.
            batch_output = s1_model(batch_data[0].to(device))
            probs = torch.nn.functional.softmax(batch_output, dim=1)
            # Keep only the index-0 entry of every row, one tensor per batch.
            index0_probs.append(probs[:, 0].cpu())

    data_output_array = torch.cat(index0_probs).numpy()
    data_expectation_value = data_output_array.mean()
    return data_expectation_value, data_output_array

{'EQ': ['X_-TACT_TIME_mean', 'X_-CONVEYOR_SPEED_mean'], 'PUMP': ['PUMP_high', 'PUMP_low'], 'CH': ['CLN1_over-etching-ratio', 'CLN1_EPT_time', 'clean_count', 'EPT_clean_count_ratio', 'NH3_TREAT_-RF_FREQ-max', 'NH3_TREAT_-RF_FREQ-range', 'NH3_TREAT_-RF_FREQ-mean', 'NP_3_-MFC_VOL_SIH4-range'], 'VENT': ['VENT_high', 'VENT_low'], 'y': 'Output'}
['X_-TACT_TIME_mean', 'X_-CONVEYOR_SPEED_mean', 'PUMP_high', 'PUMP_low', 'CLN1_over-etching-ratio', 'CLN1_EPT_time', 'clean_count', 'EPT_clean_count_ratio', 'NH3_TREAT_-RF_FREQ-max', 'NH3_TREAT_-RF_FREQ-range', 'NH3_TREAT_-RF_FREQ-mean', 'NP_3_-MFC_VOL_SIH4-range', 'VENT_high', 'VENT_low', 'Output']
There are 4 sub processes. So the feature translation is [(0, 0), (0, 1), (1, 2), (1, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (3, 12), (3, 13)]


In [10]:
# Suppress FutureWarning messages from this point on.
# NOTE(review): which library emits the warnings being hidden is not visible
# here — confirm they are safe to ignore.
warnings.filterwarnings("ignore", category=FutureWarning)

# First, calculate the output mean of the experiment data
# base_mean, details = model_inference(s_df)
# print(f'Base mean: {base_mean}')
# create a DataFrame for saving the vertex values
# import pickle

# # Open the pickle file in binary read mode
# with open('savedata/E_vi_20240612.pkl', 'rb') as f:
#     # Load the data from the file
#     data = pickle.load(f)

# # The data is now available in the 'data' variable
# print(data.shape)
# vertex_df = data

In [12]:
# Tunables for the vertex generation below.
N_INSTANCES = 50        # number of leading rows of sample_df to explain
N_EVAL_SAMPLES = 1000   # background rows drawn from df for every instance
SAVE_DIR = 'C:/Users/User/Desktop/folder/pyfile/shap_residual/savedata'

# For every feature partition, estimate the value of each coalition for each
# instance: the mean model output when the coalition's features are fixed to
# the instance's values, minus the mean output on the unmodified background.
count_n = 0
for F_P in all_feature_dict:
    running_f = F_P[0]
    print(f'now running {running_f}')
    AUO_coalitions, _, _ = generate_partition(F_P)
    colname = [tuple(i.tolist()) for i in AUO_coalitions]
    vertices_fl = f'{SAVE_DIR}/E_vi_feature{running_f}.pkl'

    vertex_records = []  # one {coalition: value} dict per instance
    for i_index in range(N_INSTANCES):
        instance = sample_df.iloc[i_index]
        # Standardize the instance with the same statistics as the background.
        # reindex() pins the feature order to df's columns, so positional
        # coalition indices address the same features in both objects.
        std_instance = (instance.reindex(df.columns) - df_mean) / df_std

        # A fresh background sample is drawn for every instance.
        # NOTE(review): the draw is unseeded, so results differ between runs.
        eval_df = df.sample(n=N_EVAL_SAMPLES)
        std_eval_df = (eval_df - df_mean) / df_std
        base_mean, _ = model_inference(std_eval_df)

        coalition_estimated_values = {}
        for coalition in AUO_coalitions:
            if len(coalition) != 0:
                # Work on a copy so the background sample stays untouched.
                vi_df = std_eval_df.copy()
                # Fix: write the STANDARDIZED instance values. The frame being
                # modified is standardized, and the raw values used before put
                # unscaled numbers into the model input.
                vi_df.iloc[:, coalition] = std_instance.iloc[coalition].to_numpy(dtype=float)
                exp, _ = model_inference(vi_df)
            else:
                # The empty coalition changes nothing, so its value is zero.
                exp = base_mean
            coalition_estimated_values[tuple(coalition)] = exp - base_mean
            count_n += 1
            if count_n % 100 == 0:
                print(f"gen {count_n} vertex")
        vertex_records.append(coalition_estimated_values)

    # Assemble the table once instead of growing it row by row. The empty
    # frame with the coalition columns is kept as the first operand so the
    # column layout matches what the incremental concat produced.
    vertex_df = pd.concat(
        [pd.DataFrame(columns=colname), pd.DataFrame(vertex_records)],
        ignore_index=True,
    )
    with open(vertices_fl, 'wb') as f:
        pickle.dump(vertex_df, f)
        print(f'{running_f} done')

now running [0, 1, 2]
gen 100 vertex
gen 200 vertex
[0, 1, 2] done
now running [0, 1, 4]
gen 300 vertex
gen 400 vertex
[0, 1, 4] done
now running [0, 1, 5]
gen 500 vertex
gen 600 vertex
[0, 1, 5] done
now running [0, 2]
gen 700 vertex
gen 800 vertex
[0, 2] done
now running [0, 2, 4]
gen 900 vertex
gen 1000 vertex
[0, 2, 4] done
now running [0, 2, 5]
gen 1100 vertex
gen 1200 vertex
[0, 2, 5] done
now running [0, 2, 6]
gen 1300 vertex
gen 1400 vertex
[0, 2, 6] done
now running [0, 2, 7]
gen 1500 vertex
gen 1600 vertex
[0, 2, 7] done
now running [0, 2, 8]
gen 1700 vertex
gen 1800 vertex
[0, 2, 8] done
now running [0, 2, 9]
gen 1900 vertex
gen 2000 vertex
[0, 2, 9] done
now running [0, 2, 10]
gen 2100 vertex
gen 2200 vertex
[0, 2, 10] done
now running [0, 4]
gen 2300 vertex
gen 2400 vertex
[0, 4] done
now running [0, 4, 5]
gen 2500 vertex
gen 2600 vertex
[0, 4, 5] done
now running [0, 4, 6]
gen 2700 vertex
gen 2800 vertex
[0, 4, 6] done
now running [0, 4, 7]
gen 2900 vertex
gen 3000 vertex

In [12]:
# Report the size of the vertex table currently in memory, then pickle it.
# NOTE(review): the target is the literal file name 'vertices_fl' in the
# working directory, not the path held by the `vertices_fl` variable —
# confirm this is intended.
print(vertex_df.shape)
with open('vertices_fl', 'wb') as pickle_out:
    pickle.dump(vertex_df, pickle_out)

(100, 16)
