In [25]:
import pandas as pd
import glob
import os

In [26]:
# Locate every PSI capture CSV. glob returns matches in arbitrary order, so the
# paths are sorted to make the row order of the merged frame deterministic.
# NOTE(review): the file names appear to embed a YYYYMMDD_HHMMSS timestamp, in
# which case lexicographic order is also chronological order -- confirm.
csv_files = sorted(glob.glob("../data/psi_data_*.csv"))
print(f"找到 {len(csv_files)} 个 CSV 文件:")
for f in csv_files:
    print(f"  - {os.path.basename(f)}")

# Fail early with a clear message; otherwise pd.concat below raises an opaque
# "No objects to concatenate" ValueError.
if not csv_files:
    raise FileNotFoundError("No CSV files match ../data/psi_data_*.csv")

# Merge all files, tagging each row with the file it came from so later steps
# can tell separate recordings apart.
dfs = [
    pd.read_csv(f).assign(source_file=os.path.basename(f))
    for f in csv_files
]

df = pd.concat(dfs, ignore_index=True)
print(f"\n总共 {len(df)} 行数据")

找到 1 个 CSV 文件:
  - psi_data_20260228_173302.csv

总共 3383 行数据


In [27]:
# Prediction targets: the PSI delta accumulated over the next 1 second, taken as
# the sum of the next two samples (presumably a 0.5 s sampling interval --
# TODO confirm against the collector).
HORIZON = 2  # number of future samples summed into each label


def _future_sum(series):
    """Return, for each row, the sum of the next HORIZON values of `series`."""
    return sum(series.shift(-step) for step in range(1, HORIZON + 1))


# Shift within each source file: the merged frame is a concatenation of separate
# recordings, and a label must not be built from the start of the next file.
for kind in ("some", "full"):
    df[f"future_1s_{kind}"] = (
        df.groupby("source_file", sort=False)[f"{kind}_delta"].transform(_future_sum)
    )

# Drop the last HORIZON rows of every file (they have no complete future window).
# cumcount(ascending=False) counts how many rows follow each row in its file.
samples_after = df.groupby("source_file", sort=False).cumcount(ascending=False)
df = df[samples_after >= HORIZON].copy()

In [28]:
# Raw columns used as base features (not referenced further in this cell).
base_features = [
    "some_delta",
    "full_delta",
    "mem_available",
    "pgscan_direct",
    "pgsteal_direct",
    "pgmajfault",
    "allocstall",
    "pswpin",
    "pswpout"
]


def _per_file(column):
    """Return `column` of the current `df`, grouped by the CSV each row came from.

    Every rolling/diff feature below is computed through this grouping so that a
    window never mixes samples from two different recordings.
    """
    return df.groupby("source_file", sort=False)[column]


# Sliding-window sums (trend modelling). Keys are the output-name prefixes,
# values the source columns; insertion order fixes the resulting column order.
rolling_sum_sources = {
    "some": "some_delta",
    "pgscan": "pgscan_direct",
    "pgmaj": "pgmajfault",
}
for w in [2, 6, 10]:
    for prefix, column in rolling_sum_sources.items():
        df[f"{prefix}_sum_{w}"] = _per_file(column).transform(
            lambda s, w=w: s.rolling(w).sum()
        )

# First-order differences (rate of change between consecutive samples).
df["diff_some"] = _per_file("some_delta").diff()
df["diff_pgscan"] = _per_file("pgscan_direct").diff()
df["mem_drop"] = _per_file("mem_available").diff()

# Volatility features: standard deviation over the last 6 samples.
df["some_std_6"] = _per_file("some_delta").transform(lambda s: s.rolling(6).std())
df["pgscan_std_6"] = _per_file("pgscan_direct").transform(lambda s: s.rolling(6).std())


# Drop every row that contains a NaN in any column. This removes the warm-up
# rows at the start of each file, where the rolling windows and diffs are
# undefined.
df = df.dropna()

In [30]:
# Report, for each target, the fraction of rows whose label is strictly
# positive, to see how sparse the `full` and `some` targets are.
sparsity_checks = (
    ("future_1s_full > 0 ratio:", "future_1s_full"),
    ("future_1s_some > 0 ratio:", "future_1s_some"),
)
for label, target in sparsity_checks:
    positive_ratio = df[target].gt(0).mean()
    print(label, positive_ratio)

future_1s_full > 0 ratio: 0.4765717674970344
future_1s_some > 0 ratio: 0.9148873072360617


In [31]:
# Save the processed data. The target directory is created first because
# DataFrame.to_csv does not create missing parent directories and would raise
# an OSError on a fresh checkout.
output_path = "../processed_data/processed_psi_data.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"\n处理后的数据已保存到 {output_path}")


处理后的数据已保存到 ../processed_data/processed_psi_data.csv
