In [138]:
import pandas as pd
from datetime import datetime
import io

In [139]:
def read_file(filepath):
    """Load the transaction tables embedded in a text/markdown export.

    Only lines containing ``<tr>`` are kept; they are stripped and joined
    into a single HTML string, which is handed to ``pandas.read_html``.
    The first row of every parsed table is used as its header.

    Parameters
    ----------
    filepath : str or path-like
        File to read; it is decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        Rows of all parsed tables stacked under a fresh 0..n-1 index.
        Only the columns listed in ``keep`` are retained, and they always
        appear in the order of ``keep`` regardless of their order in the
        file. A column from ``keep`` that is absent from the file is simply
        absent from the result. Cell values are returned as parsed by
        ``read_html``; no type conversion is done here.

    Raises
    ------
    OSError
        If the file cannot be opened.
    Exception
        Whatever ``pandas.read_html`` / ``pandas.concat`` raise when no
        table can be parsed from the collected lines.
    """
    keep = ['记账日期', '记账时间', '币别', '金额', '余额', '交易名称', '附言', '对方账户名',]

    with open(filepath, 'r', encoding='utf-8') as f:
        html_lines = [line.strip() for line in f if '<tr>' in line]
    html_io = io.StringIO(''.join(html_lines))

    tables = []
    for table in pd.read_html(html_io):
        # The header arrives as the first data row; promote it, then drop it.
        table.columns = table.iloc[0]
        body = table.iloc[1:]
        # Select in `keep` order rather than file order: downstream code
        # relabels these columns by position.
        wanted = [name for name in keep if name in body.columns]
        tables.append(body.loc[:, wanted])

    combined = pd.concat(tables, ignore_index=True)
    # Tables with differing column subsets make concat append late-appearing
    # columns at the end; restore the `keep` order on the final frame.
    ordered = [name for name in keep if name in combined.columns]
    return combined.loc[:, ordered]

In [140]:
# Parse the statement export (path is relative to the working directory).
raw = read_file('boc_test.md')

In [141]:
# Show the parsed frame as the cell output.
raw

Unnamed: 0,记账日期,记账时间,币别,金额,余额,交易名称,附言,对方账户名
0,2025-09-17,08:19:43,人民币,-9.75,400.20,网上快捷支付,支付宝,支付宝
1,2025-09-16,13:27:03,人民币,-10.00,409.95,网上快捷支付,支付宝,支付宝
2,2025-09-16,08:19:58,人民币,-1.50,419.95,网上快捷支付,支付宝,支付宝
3,2025-09-15,19:47:03,人民币,-28.90,421.45,网上快捷支付,支付宝,支付宝
4,2025-09-15,12:51:15,人民币,-16.70,450.35,网上快捷支付,支付宝,支付宝
...,...,...,...,...,...,...,...,...
201,2025-06-25,14:03:45,美元,7097.00,7097.00,购汇,---,阅一多
202,2025-06-23,16:49:47,人民币,-122.80,1313.84,网上快捷支付,财付通,财付通
203,2025-06-23,08:08:53,人民币,-18.90,1436.64,网上快捷支付,支付宝,支付宝
204,2025-06-21,23:41:38,人民币,943.90,1455.54,银联入账,阅一多支付宝余额提现,支付宝(中国)网络技术有限公司


In [142]:
def clean_raw(raw: pd.DataFrame) -> pd.DataFrame:
    """Normalise a parsed statement frame and drop routine payment rows.

    The input is copied, so the caller's frame is never modified.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame with exactly eight columns. They are relabelled **by
        position** as ``date, time, currency, amount, balance, name, note,
        counterparty``, so the column order of the input matters.

    Returns
    -------
    pandas.DataFrame
        A new frame, keeping the original index labels of the surviving
        rows, in which:

        * ``date`` is parsed from ``%Y-%m-%d`` text into datetimes,
        * ``currency`` is translated to a code ('人民币' -> 'CNY',
          '美元' -> 'USD'); any other label is kept as-is rather than
          being turned into NaN,
        * ``amount`` and ``balance`` are floats,
        * rows whose ``name`` is one of '网上快捷支付', '银联入账',
          '网上快捷退款', '网上快捷提现' are removed.

    Raises
    ------
    ValueError
        If the frame does not have eight columns, or a date / amount /
        balance value cannot be parsed.
    """
    currency_codes = {'人民币': 'CNY', '美元': 'USD'}
    excluded_names = ['网上快捷支付', '银联入账', '网上快捷退款', '网上快捷提现']

    cleaned = raw.copy()
    cleaned.columns = ['date', 'time', 'currency', 'amount', 'balance', 'name', 'note', 'counterparty']

    cleaned['date'] = pd.to_datetime(cleaned['date'], format='%Y-%m-%d')
    # Series.map yields NaN for labels missing from the dict; fall back to the
    # original label so rows in other currencies do not lose that information.
    cleaned['currency'] = cleaned['currency'].map(currency_codes).fillna(cleaned['currency'])
    cleaned['amount'] = cleaned['amount'].astype(float)
    cleaned['balance'] = cleaned['balance'].astype(float)

    # A boolean mask filters by row, so it stays correct even when index
    # labels are not unique (label-based drop would remove every duplicate).
    is_excluded = cleaned['name'].isin(excluded_names)
    return cleaned.loc[~is_excluded].copy()

In [143]:
# Clean a copy of the parsed frame; `raw` itself is left unchanged.
cleaned = clean_raw(raw)
# Show the cleaned frame as the cell output.
cleaned

Unnamed: 0,date,time,currency,amount,balance,name,note,counterparty
50,2025-09-02,15:52:03,CNY,850.0,2797.65,跨行转账,代转,陈萍
74,2025-08-21,07:49:34,CNY,-1707.7,185.3,转账支出,------------------------------------,门联霖
76,2025-08-17,11:00:59,CNY,1093.0,1093.0,跨行转账,报销回校机票,陈军
78,2025-08-14,12:35:24,CNY,2000.0,2225.4,跨行转账,旅游报销,陈军
121,2025-07-29,16:34:10,CNY,3398.0,5223.12,跨行转账,赞助switch二代,陈军
126,2025-07-29,08:51:48,CNY,2000.0,2421.47,跨行转账,八月,陈军
168,2025-07-20,11:56:55,CNY,500.0,1159.12,跨行转账,帮转,陈莘
171,2025-07-10,12:14:59,CNY,-1702.4,794.43,自助取款,------------------,------------------
186,2025-06-30,23:27:14,CNY,3500.0,6202.59,跨行转账,陈萍,6222021202005570511
194,2025-06-25,15:47:09,CNY,-49123.16,2720.94,POS消费,XIAMEN UNIVERSITY MTS IA SEPANT MTS,000009129054512
