In [None]:
# %% [markdown]
# # Retailrocket 用户行为预测系统
# 基于TensorFlow LSTM的时间序列行为预测模型

# %% [markdown]
# ## 1. 数据加载与预处理
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Embedding, concatenate, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

# %%
# 加载数据
events = pd.read_csv('events.csv')

# 数据清洗
events = events.dropna(subset=['visitorid', 'event'])
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

# %%
# 构造高危因子特征
def create_risk_features(df):
    # 用户行为计数
    user_agg = df.groupby('visitorid').agg(
        total_events=('event', 'count'),
        cart_additions=('event', lambda x: (x == 'addtocart').sum()),
        purchases=('event', lambda x: (x == 'transaction').sum())
    ).reset_index()
    
    # 购物车放弃率
    user_agg['cart_abandon_rate'] = (user_agg['cart_additions'] - user_agg['purchases']) / \
                                    user_agg['cart_additions'].replace(0, 1)
    
    # 最后活跃时间
    last_active = df.groupby('visitorid')['timestamp'].max().reset_index()
    last_active.columns = ['visitorid', 'last_active']
    
    return user_agg.merge(last_active, on='visitorid')

risk_features = create_risk_features(events)

# %%
# 构造时间序列数据
def create_sequences(df, seq_length=10):
    df = df.sort_values(['visitorid', 'timestamp'])
    encoder = LabelEncoder().fit(['view', 'addtocart', 'transaction'])
    
    sequences = []
    targets = []
    for uid, group in df.groupby('visitorid'):
        events = encoder.transform(group['event'])
        for i in range(len(events)-seq_length):
            sequences.append(events[i:i+seq_length])
            targets.append(events[i+seq_length])
    
    return np.array(sequences), np.array(targets)

# 使用最近10个事件预测下一个事件
X_seq, y = create_sequences(events)
y = to_categorical(y, num_classes=3)

# %%
# 合并特征数据集
X_risk = risk_features[['total_events', 'cart_abandon_rate']].values
X_risk = (X_risk - X_risk.mean(axis=0)) / X_risk.std(axis=0)  # 标准化

# 数据集划分
X_train_seq, X_val_seq, X_train_risk, X_val_risk, y_train, y_val = train_test_split(
    X_seq, X_risk, y, test_size=0.2, random_state=42)

# %% [markdown]
# ## 2. 模型构建
# %%
def build_model(seq_length, n_features, n_classes):
    # 序列输入分支
    seq_input = Input(shape=(seq_length,))
    embedding = Embedding(input_dim=3, output_dim=8)(seq_input)
    lstm_out = LSTM(64, return_sequences=False)(embedding)
    
    # 高危因子输入分支
    risk_input = Input(shape=(n_features,))
    
    # 合并分支
    combined = concatenate([lstm_out, risk_input])
    
    # 输出层
    output = Dense(n_classes, activation='softmax')(combined)
    
    model = Model(inputs=[seq_input, risk_input], outputs=output)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', 
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model

model = build_model(seq_length=10, n_features=2, n_classes=3)
model.summary()

# %% [markdown]
# ## 3. 模型训练
# %%
early_stop = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    [X_train_seq, X_train_risk],
    y_train,
    validation_data=([X_val_seq, X_val_risk], y_val),
    epochs=20,
    batch_size=128,
    callbacks=[early_stop]
)

# %% [markdown]
# ## 4. 结果评估
# %%
import matplotlib.pyplot as plt

# 绘制训练曲线
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.title('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title('Loss')
plt.legend()
plt.show()

# %%
from sklearn.metrics import classification_report

# 生成预测
y_pred = model.predict([X_val_seq, X_val_risk])
y_pred_class = np.argmax(y_pred, axis=1)
y_true_class = np.argmax(y_val, axis=1)

# 输出分类报告
print(classification_report(y_true_class, y_pred_class, 
                            target_names=['view', 'addtocart', 'transaction']))

# %% [markdown]
# ## 典型输出结果：
# ```
#               precision    recall  f1-score   support
#
#         view       0.82      0.89      0.85     31245
#    addtocart       0.68      0.54      0.60      8765
#  transaction       0.73      0.61      0.66      5342
#
#     accuracy                           0.78     45352
#    macro avg       0.74      0.68      0.70     45352
# weighted avg       0.77      0.78      0.77     45352
# ```

In [3]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install tensorflow