In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# The cleaned data file sits in the same directory as this notebook, so a
# relative file name is enough to load it.
clean_data_file = 'a_stock_daily_data_clean.parquet'

try:
    df = pd.read_parquet(clean_data_file)
except FileNotFoundError:
    print(f"❌ 错误：找不到已清洗的数据文件 '{clean_data_file}'！")
    print("   请确保你已经成功运行了 data_cleaner 中的 run_cleaning_pipeline.py 脚本。")
    # Re-raise so the cell stops here. Without this, execution falls through
    # to df.head() below, which fails with a NameError on a fresh kernel or
    # silently shows a stale `df` left over from an earlier run.
    raise
else:
    print(f"✅ 数据加载成功！文件: {clean_data_file}")
    print(f"   共加载 {len(df)} 条数据。")

# Show the first few rows for a quick look at the data.
df.head()

✅ 数据加载成功！文件: a_stock_daily_data_clean.parquet
   共加载 10216839 条数据。


Unnamed: 0,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount,name,list_date
0,2010-01-04,24.52,24.58,23.68,23.71,24.37,-0.66,-2.71,241922.76,580249.5,平安银行,1991-04-03
1,2010-01-05,23.75,23.9,22.75,23.3,23.71,-0.41,-1.73,556499.82,1293477.0,平安银行,1991-04-03
2,2010-01-06,23.25,23.25,22.72,22.9,23.3,-0.4,-1.72,412143.13,944453.7,平安银行,1991-04-03
3,2010-01-07,22.9,23.05,22.4,22.65,22.9,-0.25,-1.09,355336.85,804166.3,平安银行,1991-04-03
4,2010-01-08,22.5,22.75,22.35,22.6,22.65,-0.05,-0.22,288543.06,650667.4,平安银行,1991-04-03


In [2]:
# --- Check 1: make sure no ST stocks remain ---
print("正在检查ST股票...")
# Blacklist rule from section 3.1.4 of "Factor Investing: Methods and
# Practice" (因子投资：方法与实践).
# regex=False: 'ST' is a literal substring, not a pattern.
# na=False: a missing name is treated as "not ST". Otherwise the mask would
# contain NaN, which boolean indexing rejects with a ValueError. This check
# runs before the missing-value check, so it cannot assume names are filled.
st_stocks = df[df['name'].str.contains('ST', regex=False, na=False)]

if st_stocks.empty:
    print("✅ ST股票已全部剔除，验证通过！")
else:
    print("❌ 验证失败：数据中仍存在ST股票！")
    print("残留的ST股票列表：")
    print(st_stocks['name'].unique())

正在检查ST股票...
✅ ST股票已全部剔除，验证通过！


In [3]:
# --- Check 2: make sure no stocks listed for a year or less remain ---
print("正在检查次新股...")
# Blacklist rule from section 3.1.4 of "Factor Investing: Methods and
# Practice" (因子投资：方法与实践).
# Recompute the number of days since listing for this check. It is kept as a
# standalone Series so `df` itself is never mutated: the cell stays safe to
# re-run and does not leave a helper column behind if it fails midway.
days_on_market = (df['trade_date'] - df['list_date']).dt.days

# Rows with 365 or fewer days on the market. The helper values are attached
# positionally (to_numpy) to the filtered rows only, so no index alignment
# is involved and the full frame is not copied.
is_new_stock = days_on_market <= 365
new_stocks = df.loc[is_new_stock].assign(
    days_on_market=days_on_market[is_new_stock].to_numpy()
)

if new_stocks.empty:
    print("✅ 次新股已全部剔除，验证通过！")
else:
    print("❌ 验证失败：数据中仍存在上市不足一年的股票！")
    print("残留的次新股记录示例：")
    # Only request identifier columns that are actually present. Asking for
    # 'ts_code' unconditionally raises a KeyError when the loaded frame has
    # no such column, which would hide the very records this branch reports.
    id_cols = [col for col in ('ts_code', 'name') if col in new_stocks.columns]
    print(new_stocks[id_cols + ['trade_date', 'list_date', 'days_on_market']].head())

正在检查次新股...
✅ 次新股已全部剔除，验证通过！


In [4]:
# --- Check 3: outlier handling (return clipping) ---
print("正在检查 `pct_chg` 列的范围...")
# Treatment described in section 3.1.2 of "Factor Investing: Methods and
# Practice" (因子投资：方法与实践).
pct_chg = df['pct_chg']
min_pct_chg, max_pct_chg = pct_chg.min(), pct_chg.max()

print(f"收益率范围：[{min_pct_chg:.2f}%, {max_pct_chg:.2f}%]")

# Both extremes must lie inside the closed interval [-10, 10].
within_limits = -10 <= min_pct_chg and max_pct_chg <= 10
if not within_limits:
    print("❌ 验证失败：存在超出范围的收益率！")
else:
    print("✅ 离群值处理验证通过，所有收益率均在 [-10, 10] 区间内！")

# Descriptive statistics as an overall summary of the column.
df[['pct_chg']].describe()

正在检查 `pct_chg` 列的范围...
收益率范围：[-10.00%, 10.00%]
✅ 离群值处理验证通过，所有收益率均在 [-10, 10] 区间内！


Unnamed: 0,pct_chg
count,10216840.0
mean,0.05219337
std,2.897599
min,-10.0
25%,-1.3909
50%,0.0
75%,1.36
max,10.0


In [5]:
# --- Check 4: missing values ---
print("正在检查是否存在缺失值...")
# Treatment described in section 3.1.2 of "Factor Investing: Methods and
# Practice" (因子投资：方法与实践).
missing_values = df.isna().sum()

# Keep only the columns that still contain at least one missing value.
missing_cols = missing_values[missing_values.gt(0)]

if not missing_cols.empty:
    print("❌ 验证失败：以下列仍然存在缺失值！")
    print(missing_cols)
else:
    print("✅ 缺失值处理验证通过，数据集中没有NaN值！")

# .info() gives a fuller view of each column's dtype.
print("\n--- DataFrame.info() ---")
df.info()

正在检查是否存在缺失值...
✅ 缺失值处理验证通过，数据集中没有NaN值！

--- DataFrame.info() ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10216839 entries, 0 to 10216838
Data columns (total 12 columns):
 #   Column      Dtype         
---  ------      -----         
 0   trade_date  datetime64[ns]
 1   open        float64       
 2   high        float64       
 3   low         float64       
 4   close       float64       
 5   pre_close   float64       
 6   change      float64       
 7   pct_chg     float64       
 8   vol         float64       
 9   amount      float64       
 10  name        object        
 11  list_date   datetime64[ns]
dtypes: datetime64[ns](2), float64(9), object(1)
memory usage: 935.4+ MB


In [3]:
# --- Load check: cleaned index daily data ---
clean_data_file = 'a_index_daily_data_clean.parquet'

try:
    df = pd.read_parquet(clean_data_file)
except FileNotFoundError:
    print(f"❌ 错误：找不到已清洗的数据文件 '{clean_data_file}'！")
    print("   请确保你已经成功运行了 data_cleaner 中的 run_cleaning_pipeline.py 脚本。")
    # Re-raise so the cell stops here. `df` may still hold a different
    # dataset from an earlier cell, and falling through to df.head() would
    # display that stale frame as if it were this file.
    raise
else:
    print(f"✅ 数据加载成功！文件: {clean_data_file}")
    print(f"   共加载 {len(df)} 条数据。")

# Show the first few rows for a quick look at the data.
df.head()

✅ 数据加载成功！文件: a_index_daily_data_clean.parquet
   共加载 22956 条数据。


Unnamed: 0,trade_date,close,open,high,low,pre_close,change,pct_chg,vol,amount
0,2010-01-04,3243.76,3289.75,3295.279,3243.319,3277.139,-33.379,-1.0185,109447927.0,133773500.0
1,2010-01-05,3282.179,3254.468,3290.512,3221.462,3243.76,38.419,1.1844,126115066.0,161858200.0
2,2010-01-06,3254.215,3277.517,3295.868,3253.044,3282.179,-27.964,-0.852,123651384.0,158040100.0
3,2010-01-07,3192.776,3253.991,3268.819,3176.707,3254.215,-61.439,-1.888,128652827.0,157229600.0
4,2010-01-08,3195.997,3177.259,3198.92,3149.017,3192.776,3.221,0.1009,98377147.0,121739900.0


In [4]:
# --- Load check: cleaned balance sheet data ---
clean_data_file = 'a_stock_balancesheet_data_clean.parquet'

try:
    df = pd.read_parquet(clean_data_file)
except FileNotFoundError:
    print(f"❌ 错误：找不到已清洗的数据文件 '{clean_data_file}'！")
    print("   请确保你已经成功运行了 data_cleaner 中的 run_cleaning_pipeline.py 脚本。")
    # Re-raise so the cell stops here. `df` may still hold a different
    # dataset from an earlier cell, and falling through to df.head() would
    # display that stale frame as if it were this file.
    raise
else:
    print(f"✅ 数据加载成功！文件: {clean_data_file}")
    print(f"   共加载 {len(df)} 条数据。")

# Show the first few rows for a quick look at the data.
df.head()

✅ 数据加载成功！文件: a_stock_balancesheet_data_clean.parquet
   共加载 356393 条数据。


Unnamed: 0,ts_code,ann_date,f_ann_date,end_date,report_type,comp_type,end_type,total_share,cap_rese,undistr_porfit,...,accounts_receiv_bill,accounts_pay,oth_rcv_total,fix_assets_total,cip_total,oth_pay_total,long_pay_total,debt_invest,oth_debt_invest,update_flag
0,000001.SZ,20250823,20250823,20250630,1,2,,1655173000.0,2658589000.0,1856439000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,000001.SZ,20250823,20250823,20250630,1,2,2.0,1655173000.0,2658589000.0,1856439000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,751372000000.0,180612000000.0,1
2,000001.SZ,20250419,20250419,20250331,1,2,1.0,1655173000.0,2658589000.0,1856439000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,777634000000.0,173453000000.0,1
3,000001.SZ,20250315,20250315,20241231,1,2,4.0,1655173000.0,2658589000.0,1856439000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,785075000000.0,176655000000.0,1
4,000001.SZ,20241019,20241019,20240930,1,2,3.0,1655173000.0,2658589000.0,1856439000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,777403000000.0,179794000000.0,0


In [5]:
# --- Load check: cleaned cash flow statement data ---
clean_data_file = 'a_stock_cashflow_data_clean.parquet'

try:
    df = pd.read_parquet(clean_data_file)
except FileNotFoundError:
    print(f"❌ 错误：找不到已清洗的数据文件 '{clean_data_file}'！")
    print("   请确保你已经成功运行了 data_cleaner 中的 run_cleaning_pipeline.py 脚本。")
    # Re-raise so the cell stops here. `df` may still hold a different
    # dataset from an earlier cell, and falling through to df.head() would
    # display that stale frame as if it were this file.
    raise
else:
    print(f"✅ 数据加载成功！文件: {clean_data_file}")
    print(f"   共加载 {len(df)} 条数据。")

# Show the first few rows for a quick look at the data.
df.head()

✅ 数据加载成功！文件: a_stock_cashflow_data_clean.parquet
   共加载 338328 条数据。


Unnamed: 0,ts_code,ann_date,f_ann_date,end_date,comp_type,report_type,end_type,net_profit,finan_exp,c_fr_sale_sg,...,net_dism_capital_add,net_cash_rece_sec,credit_impa_loss,use_right_asset_dep,oth_loss_asset,end_bal_cash,beg_bal_cash,end_bal_cash_equ,beg_bal_cash_equ,update_flag
0,000001.SZ,20250823,20250823,20250630,2,1,2,48807390.0,0.0,0.0,...,2835000000.0,0.0,19385000000.0,938000000.0,65000000.0,159103400.0,140040000.0,300944000000.0,253525000000.0,0
1,000001.SZ,20250419,20250419,20250331,2,1,1,0.0,0.0,0.0,...,11907000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,000001.SZ,20250315,20250315,20241231,2,1,4,48807390.0,0.0,0.0,...,62516000000.0,0.0,48939000000.0,2247000000.0,0.0,159103400.0,140040000.0,253525000000.0,294532000000.0,0
3,000001.SZ,20250315,20250315,20241231,2,1,4,48807390.0,0.0,0.0,...,62516000000.0,0.0,48939000000.0,2247000000.0,489000000.0,159103400.0,140040000.0,253525000000.0,294532000000.0,1
4,000001.SZ,20241019,20241019,20240930,2,1,3,0.0,0.0,0.0,...,48026000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [6]:
# --- Load check: cleaned income statement data ---
clean_data_file = 'a_stock_income_data_clean.parquet'

try:
    df = pd.read_parquet(clean_data_file)
except FileNotFoundError:
    print(f"❌ 错误：找不到已清洗的数据文件 '{clean_data_file}'！")
    print("   请确保你已经成功运行了 data_cleaner 中的 run_cleaning_pipeline.py 脚本。")
    # Re-raise so the cell stops here. `df` may still hold a different
    # dataset from an earlier cell, and falling through to df.head() would
    # display that stale frame as if it were this file.
    raise
else:
    print(f"✅ 数据加载成功！文件: {clean_data_file}")
    print(f"   共加载 {len(df)} 条数据。")

# Show the first few rows for a quick look at the data.
df.head()

✅ 数据加载成功！文件: a_stock_income_data_clean.parquet
   共加载 362154 条数据。


Unnamed: 0,ts_code,ann_date,f_ann_date,end_date,report_type,comp_type,end_type,basic_eps,diluted_eps,total_revenue,...,withdra_biz_devfund,withdra_rese_fund,withdra_oth_ersu,workers_welfare,distr_profit_shrhder,prfshare_payable_dvd,comshare_payable_dvd,capit_comstock_div,continued_net_profit,update_flag
0,000001.SZ,20250823,20250823,20250630,1,2,2,0.762692,0.708214,3399053000.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,56252800.0,1
1,000001.SZ,20250419,20250419,20250331,1,2,1,0.62,0.62,3399053000.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,56252800.0,1
2,000001.SZ,20250315,20250315,20241231,1,2,4,0.762692,0.708214,3399053000.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,56252800.0,1
3,000001.SZ,20241019,20241019,20240930,1,2,3,0.762692,0.708214,3399053000.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,56252800.0,1
4,000001.SZ,20240816,20240816,20240630,1,2,2,0.762692,0.708214,3399053000.0,...,,,0.0,0.0,0.0,0.0,0.0,0.0,56252800.0,1
