In [41]:
import pandas as pd
import glob
import os

In [42]:
# Load every Douyin-scenario PSI CSV file.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to keep the row order of the merged frame reproducible.
csv_files = sorted(glob.glob("../data/psi_douyin_*.csv"))
print(f"找到 {len(csv_files)} 个 CSV 文件:")
for f in csv_files:
    print(f"  - {os.path.basename(f)}")

# Fail early with an actionable message: pd.concat() on an empty list would
# otherwise raise an opaque "No objects to concatenate" ValueError.
if not csv_files:
    raise FileNotFoundError("No CSV files match ../data/psi_douyin_*.csv")

# Merge all files, tagging every row with the name of the file it came from
# so that later steps can group by file.
dfs = [
    pd.read_csv(path).assign(source_file=os.path.basename(path))
    for path in csv_files
]

df = pd.concat(dfs, ignore_index=True)
print(f"\n总共 {len(df)} 行数据")

找到 4 个 CSV 文件:
  - psi_douyin_20260228_211328.csv
  - psi_douyin_20260228_213146.csv
  - psi_douyin_20260228_215103.csv
  - psi_douyin_20260228_221442.csv

总共 4740 行数据


In [43]:
# Discard the first row of every file (its initial delta is inaccurate).
# cumcount() numbers the rows inside each source_file group from 0, so the
# rows numbered 0 are exactly the per-file first rows.
is_first_in_file = df.groupby('source_file').cumcount() == 0
first_rows = df.index[is_first_in_file]
df = df.drop(index=first_rows).reset_index(drop=True)
print(f"去掉首行后: {len(df)} 行")
print(f"columns: {df.columns.tolist()}")

去掉首行后: 4736 行
columns: ['ts', 'phase', 'some_delta', 'full_delta', 'mem_available', 'pgscan_direct', 'pgsteal_direct', 'pgmajfault', 'workingset_refault', 'allocstall', 'pswpin', 'pswpout', 'source_file']


In [44]:
# Remove the 'phase' column (it does not take part in model training).
# Raises KeyError if the column is already gone, e.g. when the cell is re-run.
df = df.drop('phase', axis=1)
print(f"已删除 phase 列，当前列: {df.columns.tolist()}")

已删除 phase 列，当前列: ['ts', 'some_delta', 'full_delta', 'mem_available', 'pgscan_direct', 'pgsteal_direct', 'pgmajfault', 'workingset_refault', 'allocstall', 'pswpin', 'pswpout', 'source_file']


In [45]:
# Labels: stall delta accumulated over the coming 1 s, taken as the sum of the
# next two samples (the original note states a 500 ms sampling interval).
# The shifts are done inside each source_file group so that a label never
# combines samples from two different files.
file_key = df['source_file']
for target in ('some', 'full'):
    per_file = df[f'{target}_delta'].groupby(file_key)
    df[f'future_1s_{target}'] = per_file.shift(-1) + per_file.shift(-2)

# The trailing two rows of every file have no complete look-ahead window,
# which leaves NaN in the label columns; drop those rows.
df = df.dropna(subset=['future_1s_some', 'future_1s_full'])

In [46]:
# 基础特征列表
base_features = [
    "some_delta",
    "full_delta",
    "mem_available",
    "pgscan_direct",
    "pgsteal_direct",
    "pgmajfault",
    "workingset_refault",
    "allocstall",
    "pswpin",
    "pswpout"
]

# 滑动窗口特征（趋势建模）
for w in [2, 6, 10]:
    df[f"some_sum_{w}"] = df.groupby('source_file')['some_delta'].rolling(w).sum().reset_index(level=0, drop=True)
    df[f"pgscan_sum_{w}"] = df.groupby('source_file')['pgscan_direct'].rolling(w).sum().reset_index(level=0, drop=True)
    df[f"pgmaj_sum_{w}"] = df.groupby('source_file')['pgmajfault'].rolling(w).sum().reset_index(level=0, drop=True)
    df[f"refault_sum_{w}"] = df.groupby('source_file')['workingset_refault'].rolling(w).sum().reset_index(level=0, drop=True)

# 一阶差分（趋势速度）
df["diff_some"] = df.groupby('source_file')['some_delta'].diff()
df["diff_pgscan"] = df.groupby('source_file')['pgscan_direct'].diff()
df["mem_drop"] = df.groupby('source_file')['mem_available'].diff()

# 波动性特征
df["some_std_6"] = df.groupby('source_file')['some_delta'].rolling(6).std().reset_index(level=0, drop=True)
df["pgscan_std_6"] = df.groupby('source_file')['pgscan_direct'].rolling(6).std().reset_index(level=0, drop=True)

# 删除含有 NaN 的行（由于滚动窗口和差分引入的 NaN）
df = df.dropna()
print(f"特征工程后: {len(df)} 行")

特征工程后: 4692 行


In [47]:
# Inspect the label distribution: share of rows whose look-ahead label is > 0
full_positive_share = df["future_1s_full"].gt(0).mean()
some_positive_share = df["future_1s_some"].gt(0).mean()
print("future_1s_full > 0 ratio:", full_positive_share)
print("future_1s_some > 0 ratio:", some_positive_share)

# List every column present after feature engineering
column_names = df.columns.tolist()
print(f"\n最终特征列 ({len(df.columns)} 列):")
print(column_names)

future_1s_full > 0 ratio: 0.8382352941176471
future_1s_some > 0 ratio: 0.9989343563512362

最终特征列 (31 列):
['ts', 'some_delta', 'full_delta', 'mem_available', 'pgscan_direct', 'pgsteal_direct', 'pgmajfault', 'workingset_refault', 'allocstall', 'pswpin', 'pswpout', 'source_file', 'future_1s_some', 'future_1s_full', 'some_sum_2', 'pgscan_sum_2', 'pgmaj_sum_2', 'refault_sum_2', 'some_sum_6', 'pgscan_sum_6', 'pgmaj_sum_6', 'refault_sum_6', 'some_sum_10', 'pgscan_sum_10', 'pgmaj_sum_10', 'refault_sum_10', 'diff_some', 'diff_pgscan', 'mem_drop', 'some_std_6', 'pgscan_std_6']


In [48]:
# Save the processed data.
# The destination is defined once so the directory that gets created, the file
# that gets written and the path shown in the log message cannot drift apart.
output_path = "../processed_data/processed_psi_douyin.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)  # index=False: the row index is not written
print(f"处理后的数据已保存到 {output_path}")
print(f"共 {len(df)} 行, {len(df.columns)} 列")

处理后的数据已保存到 ../processed_data/processed_psi_douyin.csv
共 4692 行, 31 列
