In [1]:
import pandas as pd
import glob
import os

In [2]:
# Load every Douyin-scenario PSI capture and merge them into one DataFrame.
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so the
# list is sorted to keep the row order of `df` (and positional previews such
# as df.head()) reproducible across machines and re-runs.
csv_files = sorted(glob.glob("../data/psi_douyin_*.csv"))
print(f"找到 {len(csv_files)} 个 CSV 文件:")
for f in csv_files:
    print(f"  - {os.path.basename(f)}")

# Fail early with an actionable message; without this guard pd.concat([])
# raises the opaque "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(
        "No CSV files matched '../data/psi_douyin_*.csv' "
        f"(current working directory: {os.getcwd()})"
    )

# Read each capture and tag every row with the name of the file it came from,
# so per-file breakdowns remain possible after the merge.
dfs = []
for f in csv_files:
    df_temp = pd.read_csv(f)
    df_temp['source_file'] = os.path.basename(f)
    dfs.append(df_temp)

# ignore_index=True discards the per-file row indices and renumbers 0..N-1.
df = pd.concat(dfs, ignore_index=True)
print(f"\n总共 {len(df)} 行数据")

找到 4 个 CSV 文件:
  - psi_douyin_20260228_211328.csv
  - psi_douyin_20260228_213146.csv
  - psi_douyin_20260228_215103.csv
  - psi_douyin_20260228_221442.csv

总共 4740 行数据


In [3]:
# Structural overview of the merged frame `df`.
# Each entry pairs a heading with a deferred computation, so every section is
# evaluated and printed strictly in sequence, one after another.
overview_sections = (
    # First ten rows.
    ("=== 数据预览 ===", lambda: df.head(10)),
    # Column dtypes.
    ("\n=== 数据类型 ===", lambda: df.dtypes),
    # Summary statistics as produced by DataFrame.describe().
    ("\n=== 统计信息 ===", lambda: df.describe()),
    # Missing-value count per column.
    ("\n=== 缺失值统计 ===", lambda: df.isna().sum()),
    # Number of rows contributed by each source file.
    ("\n=== 每个文件的行数 ===", lambda: df.groupby('source_file').size()),
)
for heading, build_view in overview_sections:
    print(heading)
    print(build_view())

=== 数据预览 ===
              ts       phase  some_delta  full_delta  mem_available  \
0  1772283575770        idle     5757707     1466496        3986832   
1  1772283576619        idle         108           0        3925240   
2  1772283577468        idle          20           0        3923728   
3  1772283578319  cold_start         567          25        3869840   
4  1772283579112  cold_start        1773         572        3720076   
5  1772283579873  cold_start       10933        3459        3642868   
6  1772283580591  cold_start       12848        4657        3576884   
7  1772283581309  cold_start       26485        8268        3544192   
8  1772283582187  cold_start         503           1        3566472   
9  1772283583041  cold_start          13           0        3499368   

   pgscan_direct  pgsteal_direct  pgmajfault  workingset_refault  allocstall  \
0        2890445         1776059      708884             1554259        3342   
1              0               0         383 

In [4]:
# Distribution of samples across the values of the `phase` column.
print("=== Phase 分布 ===")
phase_column = df['phase']
# Absolute number of samples per phase, most frequent first.
print(phase_column.value_counts())

print("\n=== 每个 Phase 的样本数占比 ===")
# Same counts expressed as a fraction of all samples, rounded to 4 decimals.
phase_share = phase_column.value_counts(normalize=True)
print(phase_share.round(4))

=== Phase 分布 ===
phase
douyin_browse          3299
cold_start              624
switch_news             127
switch_mobileqq          73
switch_cloudmusic        70
switch_xhs               70
switch_taobao            59
switch_bili              58
switch_weibo             45
launch_douyin            42
switch_qqmusic           42
switch_meituan           35
switch_BaiduMap          33
switch_mall              32
switch_searchbox         26
switch_AlipayGphone      24
switch_mm                19
switch_phone             17
switch_pinduoduo         16
switch_minimap           16
idle                     13
Name: count, dtype: int64

=== 每个 Phase 的样本数占比 ===
phase
douyin_browse          0.6960
cold_start             0.1316
switch_news            0.0268
switch_mobileqq        0.0154
switch_cloudmusic      0.0148
switch_xhs             0.0148
switch_taobao          0.0124
switch_bili            0.0122
switch_weibo           0.0095
launch_douyin          0.0089
switch_qqmusic         0.0089
sw

In [5]:
# Per-phase comparison of the key metrics (mean of each column, 1 decimal).
# NOTE(review): in the data preview, row 0 (phase "idle") carries values that
# are orders of magnitude larger than the idle rows right after it; it looks
# like a cumulative baseline rather than a per-interval delta. If so, the
# "idle" means below are dominated by such rows -- confirm before relying on
# them.
print("=== 各阶段关键指标均值 ===")
key_metric_columns = [
    'some_delta',
    'full_delta',
    'mem_available',
    'pgscan_direct',
    'pgmajfault',
    'workingset_refault',
    'pswpin',
    'pswpout',
]
metrics_by_phase = df.groupby('phase')[key_metric_columns]
phase_stats = metrics_by_phase.mean().round(1)
print(phase_stats)

=== 各阶段关键指标均值 ===
                     some_delta  full_delta  mem_available  pgscan_direct  \
phase                                                                       
cold_start              18965.7      5525.2      2632940.6        10623.3   
douyin_browse            9658.8      3445.1      1602933.9          322.5   
idle                   456038.1    113772.9      3815704.9       369486.5   
launch_douyin           66019.3     25682.5      1676699.5         3509.6   
switch_AlipayGphone     15712.4      7205.2      1615862.8          622.6   
switch_BaiduMap         52215.9     18024.2      1560396.6         3166.5   
switch_bili             27118.2     11973.6      1501975.7          733.1   
switch_cloudmusic       22212.9      8281.4      1586629.7         1365.1   
switch_mall             17521.5      7816.4      1335464.4          722.5   
switch_meituan          57814.1     16849.6      1649149.9         1670.3   
switch_minimap          21094.2      3878.8      1584469.5