In [1]:
# === 1. 匯入套件 ===
import pandas as pd
import numpy as np
from google.cloud import bigquery
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout, Reshape, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from IPython.display import FileLink, display
import joblib
import os

2025-06-18 14:36:58.421629: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-18 14:36:59.336418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib
2025-06-18 14:36:59.336526: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local

In [2]:
# === 2. Load data from BigQuery ===
# Fetch every row of the labelled table whose action_group is not NULL, then
# order the rows by user_pseudo_id and, within each user, by event_time.
PROJECT_ID = "lab-martech-cxl"
client = bigquery.Client(project=PROJECT_ID, location="asia-east1")
query = "SELECT * FROM `lab-martech-cxl.cobine.DataLabel_1v5` WHERE action_group IS NOT NULL"
df = (
    client.query(query)
    .to_dataframe()
    .sort_values(by=['user_pseudo_id', 'event_time'])
)



In [3]:
# === 3. Basic parameters ===
# Number of consecutive events that make up one input sequence.
SEQ_LEN = 10

# Columns that are label-encoded and fed through embedding layers.
cat_features = [
    'action_group',
    'source',
    'medium',
    'platform',
]

# Columns that are fed to the model as standardised numeric sequences.
num_features = [
    'staytime',
    'has_shared',
    'revisit_count',
]

In [4]:
# === 4. Label-encode the categorical columns ===
# Each categorical column is cast to str, fitted with its own LabelEncoder and
# replaced in `df` by the resulting integer codes. The fitted encoder is kept
# in `encoders` and also written to encoder_<col>_v4.pkl.
#
# The string values are preserved in a side column (`<col>__raw_str`) and the
# encoder is always fitted from that column. Because this cell overwrites
# `df[col]`, fitting from `df[col]` directly would, on a re-run, fit the encoder
# on the already-encoded integers (as strings) and overwrite the saved encoder
# files with that mapping. Keeping the raw values makes the cell idempotent.
encoders = {}
for col in cat_features:
    raw_col = f'{col}__raw_str'
    if raw_col not in df.columns:
        # First run on this DataFrame: snapshot the original values as strings.
        df[raw_col] = df[col].astype(str)

    le = LabelEncoder()
    df[col] = le.fit_transform(df[raw_col])
    encoders[col] = le
    joblib.dump(le, f'encoder_{col}_v4.pkl')

In [5]:
# === 5. Build sliding-window sequence samples ===
# For every user, slide a window of SEQ_LEN consecutive rows over that user's
# rows (in the order they appear in `df`). The window supplies the input
# features; the row immediately after the window supplies the three targets.
sequences, y_action, y_online, y_o2o = [], [], [], []

# Categorical columns first, numeric columns after, matching the order the
# later cells rely on when they index into each sample.
feature_order = cat_features + num_features

for _, user_events in df.groupby('user_pseudo_id'):
    n_windows = len(user_events) - SEQ_LEN
    if n_windows < 1:
        # Fewer than SEQ_LEN + 1 rows: no full window plus target row exists.
        continue

    for start in range(n_windows):
        stop = start + SEQ_LEN
        window = user_events.iloc[start:stop]
        next_event = user_events.iloc[stop]

        # One length-SEQ_LEN array per feature column.
        sequences.append([window[name].values for name in feature_order])
        y_action.append(next_event['action_group'])
        y_online.append(next_event['is_online_converted'])
        y_o2o.append(next_event['is_o2o_reserved'])

In [6]:
# === 6. Padding + numeric standardisation ===
# Regroup the per-sample feature lists into one array per feature:
#   * categorical features go through pad_sequences (pre-padding, maxlen=SEQ_LEN);
#   * numeric features get NaN replaced by 0.0 and are standardised with a
#     StandardScaler, which is stored in `scalers` and written to
#     scaler_feature_<col>_v4.pkl.
# The targets collected earlier are converted to numpy arrays at the end.
#
# NOTE(review): each scaler is fitted on the whole (n_samples, SEQ_LEN) array,
# i.e. on all samples before the train/val/test split, and StandardScaler
# standardises each column (window position) separately. Confirm both are
# intended; changing either would alter the saved scaler files.
if not sequences:
    # Fail with a clear message instead of an IndexError on sequences[0].
    raise ValueError("No sequence samples were built; cannot pad or scale an empty dataset.")

X = []
scalers = {}  # fitted StandardScaler per numeric column

n_cat = len(cat_features)

# Categorical features occupy positions [0, n_cat) of every sample.
for cat_idx in range(n_cat):
    per_sample = [sample[cat_idx] for sample in sequences]
    X.append(pad_sequences(per_sample, maxlen=SEQ_LEN, padding='pre'))

# Numeric features follow the categorical ones inside every sample.
for offset, num_col in enumerate(num_features):
    per_sample = [sample[n_cat + offset] for sample in sequences]

    feature_array = np.array(per_sample, dtype=float)
    feature_array = np.nan_to_num(feature_array, nan=0.0)

    scaler = StandardScaler()
    scaled = scaler.fit_transform(feature_array)

    # Persist the fitted scaler so the same transform can be reapplied later.
    joblib.dump(scaler, f"scaler_feature_{num_col}_v4.pkl")
    scalers[num_col] = scaler

    X.append(scaled)

# Every entry is already an ndarray; asarray avoids copying them again.
X = [np.asarray(x) for x in X]
y_action = np.array(y_action)
y_online = np.array(y_online)
y_o2o = np.array(y_o2o)


In [7]:
# === 7. Train / validation / test split (70/20/10) ===
# First hold out 10% as the test set, then take 2/9 of the remaining 90%
# (= 20% of all samples) as the validation set.
#
# The per-feature arrays in `X` are passed to train_test_split directly rather
# than being zipped into one Python tuple per sample and re-stacked afterwards.
# train_test_split draws one set of row indices per call and applies it to every
# array it is given, so all features and targets stay aligned, without creating
# millions of small Python objects.
#
# NOTE(review): the split is by sample, and samples are overlapping windows of
# the same users' events, so windows from one user can land in different splits.
# Confirm this is acceptable for the reported test metrics.


def unzip_X(X_packed):
    """Turn a list of per-sample feature tuples into a list of per-feature arrays.

    No longer used by this cell; kept in case other cells still call it.
    """
    return [np.array([x[i] for x in X_packed]) for i in range(len(X_packed[0]))]


def _split_features_and_targets(features, targets, test_size, random_state=42):
    """Split aligned feature arrays and target arrays with one shared shuffle.

    Args:
        features: list of arrays, one per model input, all with the same length.
        targets: sequence of target arrays with that same length.
        test_size: fraction of samples placed in the second (held-out) part.
        random_state: seed forwarded to train_test_split.

    Returns:
        (features_first, features_second, targets_first, targets_second), where
        the feature parts are lists and the target parts are tuples, in the
        same order as the inputs.
    """
    pieces = train_test_split(
        *features, *targets, test_size=test_size, random_state=random_state
    )
    # train_test_split returns [a_first, a_second, b_first, b_second, ...].
    first_parts, second_parts = pieces[0::2], pieces[1::2]
    n_features = len(features)
    return (
        list(first_parts[:n_features]),
        list(second_parts[:n_features]),
        tuple(first_parts[n_features:]),
        tuple(second_parts[n_features:]),
    )


(X_temp, X_test,
 (y_action_temp, y_online_temp, y_o2o_temp),
 (y_action_test, y_online_test, y_o2o_test)) = _split_features_and_targets(
    X, (y_action, y_online, y_o2o), test_size=0.1
)

(X_train, X_val,
 (y_action_train, y_online_train, y_o2o_train),
 (y_action_val, y_online_val, y_o2o_val)) = _split_features_and_targets(
    X_temp, (y_action_temp, y_online_temp, y_o2o_temp), test_size=2/9
)

# Sanity check: the three splits partition the full sample set.
assert len(X_train[0]) + len(X_val[0]) + len(X_test[0]) == len(y_action)

In [10]:
# === 8. Build the model ===
# Multi-input, multi-output network: every feature is a length-SEQ_LEN sequence.
# Categorical features are embedded, numeric features are reshaped to
# (SEQ_LEN, 1), everything is concatenated per timestep and fed through one
# LSTM whose output is shared by three heads (next action_group, online
# conversion, O2O reservation).
from tensorflow.keras.metrics import Precision, Recall

##### Input layers
num_classes = len(encoders['action_group'].classes_)
inputs, embeddings = [], []
emb_dims = {'action_group': 64, 'source': 8, 'medium': 8, 'platform': 8}

##### Categorical features
# NOTE(review): the LabelEncoder codes start at 0, so code 0 is a real category,
# while mask_zero=True declares index 0 to be padding. Confirm the intended
# padding/masking scheme before relying on the embedding mask.
for feat in cat_features:
    inp = Input(shape=(SEQ_LEN,), name=f'in_{feat}')
    inputs.append(inp)
    # Size the vocabulary from the fitted encoder rather than from the current
    # contents of `df`, so it stays correct even if `df` was filtered or
    # otherwise modified after encoding.
    vocab_size = len(encoders[feat].classes_) + 1
    emb = Embedding(input_dim=vocab_size, output_dim=emb_dims[feat], mask_zero=True)(inp)
    embeddings.append(emb)

##### Numeric features
for feat in num_features:
    inp = Input(shape=(SEQ_LEN,), name=f'in_{feat}')
    inputs.append(inp)
    # Add a trailing feature axis so the tensor can be concatenated with the embeddings.
    reshaped = Reshape((SEQ_LEN, 1))(inp)
    embeddings.append(reshaped)

x = Concatenate()(embeddings)  # join all features along the last axis
# NOTE(review): Masking derives its mask from the concatenated values (a
# timestep is masked only when every value equals 0) — verify how this
# interacts with the mask produced by the Embedding layers above.
x = Masking()(x)
x = LSTM(128, return_sequences=False)(x)  # summarise the sequence into one vector
x = Dropout(0.3)(x)  # regularisation against overfitting

##### Output heads
# Next action_group (multi-class)
out_action = Dense(64, activation='relu')(x)
out_action = Dense(num_classes, activation='softmax', name='out_action')(out_action)

# Online conversion (binary)
out_online = Dense(32, activation='relu')(x)
out_online = Dense(1, activation='sigmoid', name='out_online')(out_online)

# O2O reservation (binary)
out_o2o = Dense(32, activation='relu')(x)
out_o2o = Dense(1, activation='sigmoid', name='out_o2o')(out_o2o)

model = Model(inputs=inputs, outputs=[out_action, out_online, out_o2o])
model.compile(optimizer='adam',
              loss={'out_action': 'sparse_categorical_crossentropy',
                    'out_online': 'binary_crossentropy',
                    'out_o2o': 'binary_crossentropy'},
              metrics={'out_action': 'accuracy',
                       'out_online': ['accuracy', Precision(name='precision_online'), Recall(name='recall_online')],
                       'out_o2o': ['accuracy', Precision(name='precision_o2o'), Recall(name='recall_o2o')]})

model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 in_action_group (InputLayer)   [(None, 10)]         0           []                               
                                                                                                  
 in_source (InputLayer)         [(None, 10)]         0           []                               
                                                                                                  
 in_medium (InputLayer)         [(None, 10)]         0           []                               
                                                                                                  
 in_platform (InputLayer)       [(None, 10)]         0           []                               
                                                                                            

In [11]:
# === 9. Train the model ===
# Both callbacks monitor the validation loss (their default, spelled out here):
# training stops after 3 epochs without improvement, and the best model seen so
# far is written to lstm_multi_output_model_v4.h5.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
checkpoint = ModelCheckpoint(
    filepath='lstm_multi_output_model_v4.h5',
    monitor='val_loss',
    save_best_only=True,
)

# Targets are listed in the same order as the model outputs: action, online, o2o.
train_targets = [y_action_train, y_online_train, y_o2o_train]
val_targets = [y_action_val, y_online_val, y_o2o_val]

model.fit(
    x=X_train,
    y=train_targets,
    validation_data=(X_val, val_targets),
    batch_size=512,
    epochs=20,
    callbacks=[early_stop, checkpoint],
)

Epoch 1/20


2025-06-18 15:07:56.987738: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_40/output/_23'
2025-06-18 15:07:57.354201: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8900
2025-06-18 15:07:57.492910: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x55fb8d38af60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-06-18 15:07:57.492951: I tensorflow/compiler/

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fcd79e82b90>

In [12]:
# === 10. Evaluate on the test set ===
# Print every loss/metric reported by model.evaluate next to its name.
# The header previously started with "\在", an invalid escape sequence that
# printed a literal backslash; a leading newline ("\n") was intended.
print("\n在測試集上的評估結果：")
results = model.evaluate(X_test, [y_action_test, y_online_test, y_o2o_test], batch_size=512)
for name, val in zip(model.metrics_names, results):
    print(f"{name}: {val:.4f}")

\在測試集上的評估結果：
loss: 1.0447
out_action_loss: 0.8176
out_online_loss: 0.1472
out_o2o_loss: 0.0799
out_action_accuracy: 0.7492
out_online_accuracy: 0.9448
out_online_precision_online: 0.9488
out_online_recall_online: 0.8927
out_o2o_accuracy: 0.9675
out_o2o_precision_o2o: 0.8817
out_o2o_recall_o2o: 0.7519


In [None]:
# Top-N accuracy of the action_group head
from tensorflow.keras.models import load_model
from sklearn.metrics import top_k_accuracy_score

# Reload the model file written by the ModelCheckpoint callback during training.
model = load_model('lstm_multi_output_model_v4.h5')

y_pred_prob = model.predict(X_test)[0]  # softmax probabilities of the action head only
y_true = y_action_test  # true class ids

# Tell the scorer which class each probability column belongs to. Without
# `labels`, the classes are inferred from y_true, and the call fails whenever
# some class happens to be absent from the test split.
all_labels = np.arange(y_pred_prob.shape[1])

top1_acc = top_k_accuracy_score(y_true, y_pred_prob, k=1, labels=all_labels)
top3_acc = top_k_accuracy_score(y_true, y_pred_prob, k=3, labels=all_labels)
top5_acc = top_k_accuracy_score(y_true, y_pred_prob, k=5, labels=all_labels)

print(f"Top-1 Accuracy: {top1_acc:.4f}")
print(f"Top-3 Accuracy: {top3_acc:.4f}")
print(f"Top-5 Accuracy: {top5_acc:.4f}")
