# 数据概览
加载 train/test 数据并检查列信息与缺失情况。

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display


def _count_empty_strings(series: pd.Series) -> int:
    if not (pd.api.types.is_string_dtype(series.dtype) or series.dtype == object):
        return 0
    return int(series.dropna().eq('').sum())


def build_missing_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise missing data for every column of ``df``.

    A value counts as missing when it is null (``isna``) or when it is an
    empty string as reported by ``_count_empty_strings``.

    Args:
        df: Frame to inspect.

    Returns:
        A frame indexed by column name (index name ``'column'``) and sorted
        by ``missing_total`` in descending order, with the columns
        ``missing_null``, ``missing_empty``, ``missing_total``,
        ``available`` and ``missing_ratio``. ``missing_ratio`` is NaN when
        ``df`` has no rows. A frame without columns yields an empty summary
        with the same layout.
    """
    summary_columns = [
        'column',
        'missing_null',
        'missing_empty',
        'missing_total',
        'available',
        'missing_ratio',
    ]
    total_rows = len(df)
    records = []
    # items() hands back each column as a Series by position, so duplicated
    # column labels do not turn the lookup into a DataFrame.
    for column, col_series in df.items():
        null_count = int(col_series.isna().sum())
        empty_count = _count_empty_strings(col_series)
        missing_total = null_count + empty_count
        records.append(
            {
                'column': column,
                'missing_null': null_count,
                'missing_empty': empty_count,
                'missing_total': missing_total,
                # Clamped at zero so the count can never go negative.
                'available': max(total_rows - missing_total, 0),
                'missing_ratio': missing_total / total_rows if total_rows else float('nan'),
            }
        )

    # Declaring the layout explicitly keeps 'column' present even when there
    # are no records, so set_index() works for a frame with no columns.
    summary_df = (
        pd.DataFrame(records, columns=summary_columns)
        .set_index('column')
        # Stable sort: columns with equal counts keep their original order.
        .sort_values('missing_total', ascending=False, kind='stable')
    )
    return summary_df


# Locate the parquet files: look under the current working directory first,
# then under its parent.
working_dir = Path.cwd()
for project_root in (working_dir, working_dir.parent):
    train_path = project_root / 'data/parquet/train.parquet'
    test_path = project_root / 'data/parquet/test.parquet'
    if train_path.exists() and test_path.exists():
        break
else:
    # Fail early and name both searched locations instead of failing later
    # inside read_parquet.
    raise FileNotFoundError(
        'train.parquet / test.parquet not found under data/parquet in '
        f'{working_dir} or {working_dir.parent}'
    )

train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)
# Stack both splits into one frame; the added 'dataset' column records which
# split each row came from.
combined_df = pd.concat(
    [train_df.assign(dataset='train'), test_df.assign(dataset='test')],
    ignore_index=True,
)

# Print the row count of each split and of the stacked frame, one per line.
row_count_lines = [
    f'Train rows: {len(train_df):,}',
    f'Test rows: {len(test_df):,}',
    f'Combined rows: {len(combined_df):,}',
]
print('\n'.join(row_count_lines))

column_names = combined_df.columns.tolist()
print('Columns:', column_names)

# Per-column missing-value table for the stacked frame, rendered by IPython.
missing_summary = build_missing_summary(combined_df)
display(missing_summary)

# Scale the canvas with the number of summarised columns: width is clamped
# to the 8-18 range and height never drops below 4.
n_summary_rows = len(missing_summary)
fig_width = min(18, max(8, n_summary_rows * 0.8))
fig_height = max(4, n_summary_rows * 0.35)
fig, ax = plt.subplots(figsize=(fig_width, fig_height))

# One bar per column, in the order of the summary table.
missing_summary['missing_total'].plot.bar(ax=ax, color='#1f77b4')
ax.set(
    title='Missing values per column (null + empty)',
    xlabel='Column',
    ylabel='Missing count',
)
ax.tick_params(axis='x', rotation=45, labelsize=9)
plt.tight_layout()
plt.show()
