## 准备依赖环境
在运行其余单元前，请先安装/更新读取 Parquet 所需的 `pandas`、`pyarrow` 与 `fastparquet`。

In [5]:
%pip install --quiet pandas pyarrow fastparquet

Note: you may need to restart the kernel to use updated packages.


## 载入 Parquet 数据
导入 `pandas` 与 `pathlib.Path`，并将 `data/csmar_0110sample.parquet` 读取为 DataFrame。

In [6]:
from pathlib import Path
import pandas as pd

RAW_CANDIDATES = [
    Path("csmar_0110sample.parquet"),
    Path("data/csmar_0110sample.parquet"),
    Path("../data/csmar_0110sample.parquet"),
]
ENGINE = "pyarrow"  # 可改为 fastparquet 以测试不同后端

for candidate in RAW_CANDIDATES:
    if candidate.exists():
        DATA_PATH = candidate
        break
else:
    raise FileNotFoundError("Could not locate csmar_0110sample.parquet in expected locations.")

df = pd.read_parquet(DATA_PATH, engine=ENGINE)
print(f"Loaded {DATA_PATH} with shape {df.shape} using {ENGINE}.")

Loaded csmar_0110sample.parquet with shape (100712, 13) using pyarrow.


## 查看表结构与基础信息
使用 `df.info()`、`df.dtypes` 与 `df.columns` 快速了解字段类型和总行列数。

In [7]:
row_count, col_count = df.shape
print(f"Rows: {row_count:,} | Columns: {col_count}")
print("Column names (first 20 shown):")
print(df.columns.tolist()[:20])

print("\nDataFrame info:")
df.info()

display(df.dtypes.to_frame(name="dtype"))

Rows: 100,712 | Columns: 13
Column names (first 20 shown):
['stkcd', 'year', 'month', 'ret', 'size', 'r11', 'bm', 'ep', 'roe', 'ivff', 'beta', 'tur', 'srev']

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100712 entries, 0 to 100711
Data columns (total 13 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   stkcd   100712 non-null  float64
 1   year    100712 non-null  float64
 2   month   100712 non-null  float64
 3   ret     100712 non-null  float64
 4   size    100712 non-null  float64
 5   r11     100482 non-null  float64
 6   bm      96791 non-null   float64
 7   ep      87176 non-null   float64
 8   roe     96014 non-null   float64
 9   ivff    100687 non-null  float64
 10  beta    91135 non-null   float64
 11  tur     100712 non-null  float64
 12  srev    100455 non-null  float64
dtypes: float64(13)
memory usage: 10.0 MB


Unnamed: 0,dtype
stkcd,float64
year,float64
month,float64
ret,float64
size,float64
r11,float64
bm,float64
ep,float64
roe,float64
ivff,float64


## 预览样本数据
通过 `head()`、`tail()` 与 `sample()` 在不同位置抽样，便于核查数据是否如预期。

In [8]:
display(df.head())
display(df.tail())
display(df.sample(n=min(5, len(df)), random_state=0))

Unnamed: 0,stkcd,year,month,ret,size,r11,bm,ep,roe,ivff,beta,tur,srev
0,2.0,2001.0,1.0,0.065335,5577979.16,0.526364,0.519452,0.054286,0.053467,0.007631,1.328661,0.014812,-0.001934
1,2.0,2001.0,2.0,-0.079552,5952768.32,0.313276,0.519452,0.054286,0.053467,0.011457,1.324697,0.015279,0.066283
2,2.0,2001.0,3.0,0.096909,5490262.55,-0.003385,0.519452,0.054286,0.053467,0.013954,1.325665,0.012114,-0.07871
3,2.0,2001.0,4.0,-0.038208,6032510.7,0.074358,0.519452,0.054286,0.06204,0.008473,1.321602,0.010159,0.097368
4,2.0,2001.0,5.0,-0.027919,5813219.17,0.200568,0.519452,0.054286,0.06204,0.006057,1.218307,0.009749,-0.037588


Unnamed: 0,stkcd,year,month,ret,size,r11,bm,ep,roe,ivff,beta,tur,srev
100707,601999.0,2010.0,8.0,0.033647,1750840.0,-0.120581,0.236569,0.017195,0.022266,0.017566,1.062866,0.011004,0.160608
100708,601999.0,2010.0,9.0,-0.000223,1813000.0,0.233295,0.236569,0.017195,0.007811,0.013419,1.059267,0.010649,0.034117
100709,601999.0,2010.0,10.0,0.09737,1815960.0,0.329478,0.236569,0.017195,0.007811,0.019768,1.057593,0.010872,0.000532
100710,601999.0,2010.0,11.0,-0.178487,1996520.0,0.234614,0.236569,0.017195,0.001159,0.02244,1.05074,0.011787,0.098301
100711,601999.0,2010.0,12.0,-0.048168,1644280.0,0.146657,0.236569,0.017195,0.001159,0.02285,1.061028,0.011328,-0.177614


Unnamed: 0,stkcd,year,month,ret,size,r11,bm,ep,roe,ivff,beta,tur,srev
79026,600626.0,2009.0,11.0,0.043599,3541744.18,1.149489,1.005635,0.05134,0.010229,0.011152,1.141858,0.029096,0.146128
51966,600168.0,2008.0,1.0,-0.156407,1726758.0,1.047613,0.876125,0.035185,0.005611,0.023398,1.094522,0.020243,0.265572
22836,822.0,2007.0,8.0,0.209009,6453330.75,1.741626,0.847206,0.093079,0.034271,0.034611,1.10262,0.02383,0.118352
90349,600790.0,2010.0,7.0,0.135457,3063087.72,0.077874,0.239949,0.007165,0.011696,0.013847,1.262751,0.028492,-0.14336
8749,522.0,2002.0,7.0,0.045279,1268009.28,-0.311734,0.132147,,,0.009286,0.946636,0.003566,0.074539


## 执行基础筛选与统计
选择可用的字段做简单筛选并查看描述性统计，确认数据可用于后续建模。

In [9]:
numeric_cols = df.select_dtypes(include="number").columns.tolist()
if numeric_cols:
    target_col = numeric_cols[0]
    threshold = df[target_col].median()
    filtered_df = df[df[target_col] > threshold]
    print(f"Filtering {target_col} > median ({threshold:.4g}) -> {len(filtered_df):,} rows")
else:
    target_col = df.columns[0]
    filtered_df = df
    print("No numeric columns detected; skipping threshold filter.")

print("\nDescribe (first 10 columns shown):")
display(df.describe(include="all").transpose().head(10))

print(f"\nTop values for {target_col}:")
display(df[target_col].value_counts(dropna=False).head(10))

print("\nFiltered sample (up to 5 rows):")
display(filtered_df.head())

Filtering stkcd > median (6.001e+05) -> 50,293 rows

Describe (first 10 columns shown):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
stkcd,100712.0,361976.7,293039.6,2.0,876.0,600145.0,600578.0,601999.0
year,100712.0,2005.862,2.895431,2001.0,2003.0,2006.0,2008.0,2010.0
month,100712.0,6.511389,3.472162,1.0,3.0,7.0,10.0,12.0
ret,100712.0,0.0119974,0.1442838,-0.783737,-0.07198,0.002538,0.08458675,1.892659
size,100712.0,3542280.0,12930550.0,70083.96,812523.3225,1485626.0,3156000.0,1788049000.0
r11,100482.0,0.2711957,0.885901,-0.920918,-0.248325,-0.01005385,0.5112201,24.08918
bm,96791.0,0.4505261,0.2983485,0.001007,0.231743,0.3783042,0.5930598,4.413232
ep,87176.0,0.03574977,0.03148655,4e-06,0.01514,0.02733812,0.04663083,0.4099058
roe,96014.0,0.02686763,0.09711665,-7.206794,0.007,0.02084493,0.03985859,16.84678
ivff,100687.0,0.01803064,0.01151837,0.0,0.011181,0.01626946,0.02304891,1.148091



Top values for stkcd:


stkcd
600630.0    120
600059.0    120
600009.0    120
600601.0    120
88.0        119
600628.0    119
601.0       119
960.0       119
600123.0    119
600138.0    119
Name: count, dtype: int64


Filtered sample (up to 5 rows):


Unnamed: 0,stkcd,year,month,ret,size,r11,bm,ep,roe,ivff,beta,tur,srev
50419,600146.0,2001.0,4.0,0.258924,876600.0,0.087622,0.284986,0.004267,0.019283,0.018509,,0.007927,0.267725
50420,600146.0,2001.0,5.0,0.243529,1105200.0,0.381097,0.284986,0.004267,0.019283,0.025827,,0.007686,0.25926
50421,600146.0,2001.0,6.0,-0.003164,1376400.0,0.64199,0.284986,0.004267,0.019283,0.031774,,0.008123,0.244036
50422,600146.0,2001.0,7.0,-0.102685,1374600.0,1.34707,0.179687,0.003253,0.019283,0.0235,0.717162,0.007708,-0.002588
50423,600146.0,2001.0,8.0,0.017561,1236000.0,1.173683,0.179687,0.003253,0.019283,0.02655,0.775403,0.00746,-0.102044
