In [1]:
import torch

In [2]:
# Report the installed PyTorch build so the environment is reproducible.
print(f"torch: {torch.__version__}")

torch: 2.9.1+cu126


In [3]:
# Summarize the CUDA runtime as seen by PyTorch. The device name is only
# queried when CUDA is usable; otherwise "None" is printed in its place.
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print(f"GPU name: {None}")

CUDA available: True
CUDA version: 12.6
GPU count: 1
GPU name: NVIDIA GeForce RTX 4060 Laptop GPU


In [8]:
import numpy as np
# Report the installed NumPy release alongside the torch/CUDA versions above.
print(f"{np.__version__}")


2.2.6


# tabular

In [1]:
import os, json
import numpy as np
import pandas as pd

def load_tab_bucket_as_df(root, bucket_id=0, decode_cats=False):
    """Load one bucket of the ``tab_tensor_v1`` export into a pandas DataFrame.

    The bucket directory is ``<root>/tab_tensor_v1/bucket_<bucket_id:03d>`` and
    is read as follows:

    * ``schema.json``       -> must provide the keys ``num_cols`` and ``cat_cols``
    * ``sample_id.npy``     -> one identifier per row (converted to ``str``)
    * ``numeric.npy``       -> 2-D array whose columns are named by ``num_cols``
    * ``cat_<ii>_<col>.npy``-> one integer index array per entry of ``cat_cols``

    Args:
        root: Directory that contains the ``tab_tensor_v1`` folder.
        bucket_id: Integer bucket number; formatted with three digits.
        decode_cats: When true, ``<root>/tab_tensor_v1/vocabs.json`` (a mapping
            ``column -> {token: index}``) is loaded and every categorical
            column is translated from indices back to tokens. Indices without
            a vocabulary entry become ``"__UNK__"``.

    Returns:
        ``(df, schema)`` where ``df`` has the columns ``sample_id``, then the
        numeric columns, then the categorical columns, and ``schema`` is the
        parsed ``schema.json`` dictionary.

    Raises:
        FileNotFoundError: If ``schema.json``, ``sample_id.npy``,
            ``numeric.npy`` or (with ``decode_cats``) ``vocabs.json`` is absent.
        KeyError: If the schema lacks ``num_cols`` or ``cat_cols``.

    Warns:
        RuntimeWarning: If a categorical ``.npy`` file is missing. The column
            is still created and filled with ``0`` (best-effort behaviour), but
            the caller is told because ``0`` cannot be distinguished from a
            genuine index afterwards.
    """
    import warnings  # local import: the notebook's import cell does not provide it

    bucket_dir = os.path.join(root, "tab_tensor_v1", f"bucket_{bucket_id:03d}")
    schema_fp = os.path.join(bucket_dir, "schema.json")
    with open(schema_fp, "r", encoding="utf-8") as f:
        schema = json.load(f)

    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code. Only point this function at trusted exports.
    sample_id = np.load(os.path.join(bucket_dir, "sample_id.npy"), allow_pickle=True)
    X_num = np.load(os.path.join(bucket_dir, "numeric.npy"))  # presumably float32 -- confirm with exporter
    num_cols = schema["num_cols"]
    cat_cols = schema["cat_cols"]

    df = pd.DataFrame(X_num, columns=num_cols)
    df.insert(0, "sample_id", sample_id.astype(str))

    # Read every categorical column as integer indices.
    missing_cols = []
    for i, c in enumerate(cat_cols):
        fp = os.path.join(bucket_dir, f"cat_{i:02d}_{c}.npy")
        if not os.path.exists(fp):
            # Tolerate a missing file: keep the column so the frame layout is
            # stable, but remember it so the caller can be warned below.
            missing_cols.append(c)
            df[c] = 0
            continue
        df[c] = np.load(fp).astype(np.int64)

    if missing_cols:
        warnings.warn(
            f"{bucket_dir}: no .npy file for categorical column(s) "
            f"{missing_cols}; filled with 0",
            RuntimeWarning,
            stacklevel=2,
        )

    if decode_cats:
        # vocabs.json lives in the tab_tensor_v1 root, shared by all buckets.
        vocabs_fp = os.path.join(root, "tab_tensor_v1", "vocabs.json")
        with open(vocabs_fp, "r", encoding="utf-8") as f:
            vocabs = json.load(f)

        for c in cat_cols:
            # Invert token -> idx into idx -> token for this column.
            idx_to_tok = {int(idx): tok for tok, idx in vocabs.get(c, {}).items()}
            # Bind the per-column mapping as a default argument so the lambda
            # never depends on the loop variable's later value.
            df[c] = df[c].map(lambda x, _m=idx_to_tok: _m.get(int(x), "__UNK__"))

    return df, schema

# Usage: load bucket 0 with the categorical columns left as integer indices,
# then show the frame's shape followed by its first rows.
ROOT = r"E:/NUS/data/perdata/train_text_all_samples"
df0, schema0 = load_tab_bucket_as_df(root=ROOT, bucket_id=0, decode_cats=False)
print(df0.shape, df0.head(), sep="\n")


(430, 24)
  sample_id  anchor_age  hadm_count  icustay_count  transfer_count  \
0  s_000000        52.0         1.0            1.0             6.0   
1  s_000256        69.0         1.0            1.0             4.0   
2  s_000512        68.0         1.0            1.0             6.0   
3  s_000768        87.0         1.0            1.0             2.0   
4  s_001024        56.0         1.0            1.0            10.0   

   presc_count  presc_unique_drug  proc_count  proc_unique_icd  icu_los_hours  \
0         24.0               17.0         0.0              0.0       9.846389   
1         87.0               52.0         0.0              0.0      17.021111   
2        158.0               71.0         0.0              0.0      20.616112   
3         78.0               52.0         0.0              0.0     116.258057   
4        298.0               72.0         0.0              0.0     123.025833   

   ...  insurance  language  marital_status  first_careunit  \
0  ...          1  

In [5]:
import os, json
import numpy as np
import torch

# --- Locate the bucket and read its schema ---------------------------------
ROOT = r"E:/NUS/data/perdata/train_text_all_samples"
bucket = 0
bucket_dir = os.path.join(ROOT, "tab_tensor_v1", f"bucket_{bucket:03d}")

schema_path = os.path.join(bucket_dir, "schema.json")
with open(schema_path, "r", encoding="utf-8") as f:
    schema = json.load(f)

# --- Load the raw arrays ----------------------------------------------------
numeric_path = os.path.join(bucket_dir, "numeric.npy")
X_num = np.load(numeric_path).astype(np.float32)
num_cols = schema["num_cols"]
cat_cols = schema["cat_cols"]

# One int64 index array per categorical column, in schema order.
cat_list = [
    np.load(os.path.join(bucket_dir, f"cat_{i:02d}_{c}.npy")).astype(np.int64)
    for i, c in enumerate(cat_cols)
]

# --- Move everything to the compute device ----------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
x_num = torch.from_numpy(X_num).to(device)

cat_tensors = []
for cat_array in cat_list:
    cat_tensors.append(torch.from_numpy(cat_array).to(device))

# (Optional) standardization. Bucket-local statistics are used here only as a
# demonstration; for real training, use statistics of the full training set.
mean = x_num.mean(dim=0, keepdim=True)
std = torch.clamp(x_num.std(dim=0, keepdim=True), min=1e-6)  # avoid division by ~0
x_num_std = torch.div(x_num - mean, std)

# --- Build the encoder -------------------------------------------------------
from data import TabularEncoder  # adjust to the actual import path in your project

encoder_kwargs = {
    "numeric_dim": len(num_cols),
    "categorical_cardinalities": schema["cat_cardinalities"],
    "cat_embed_dim": 16,
    "reg_weights": {"ent": 0.0, "lap": 0.0, "stein": 0.0},
}
model = TabularEncoder(**encoder_kwargs).to(device)

# --- Single forward pass as a smoke test -------------------------------------
tab_embed, loss = model(x_num_std, cat_tensors)
loss_value = float(loss.detach().cpu())
print("tab_embed:", tab_embed.shape, "loss:", loss_value)


  from .autonotebook import tqdm as notebook_tqdm


tab_embed: torch.Size([430, 233]) loss: 1.5567512512207031


In [1]:
import os, json
import numpy as np
import torch

# Variant: categorical embeddings + MLP encoder (TabularEncoderCMLP).

# --- Locate the bucket and read its schema ---------------------------------
ROOT = r"E:/NUS/data/perdata/train_text_all_samples"
bucket = 0
bucket_dir = os.path.join(ROOT, "tab_tensor_v1", f"bucket_{bucket:03d}")

schema_path = os.path.join(bucket_dir, "schema.json")
with open(schema_path, "r", encoding="utf-8") as f:
    schema = json.load(f)

# --- Load the raw arrays ----------------------------------------------------
numeric_path = os.path.join(bucket_dir, "numeric.npy")
X_num = np.load(numeric_path).astype(np.float32)
num_cols = schema["num_cols"]
cat_cols = schema["cat_cols"]

# One int64 index array per categorical column, in schema order.
cat_list = [
    np.load(os.path.join(bucket_dir, f"cat_{i:02d}_{c}.npy")).astype(np.int64)
    for i, c in enumerate(cat_cols)
]

# --- Move everything to the compute device ----------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
x_num = torch.from_numpy(X_num).to(device)

cat_tensors = []
for cat_array in cat_list:
    cat_tensors.append(torch.from_numpy(cat_array).to(device))

# (Optional) standardization. Bucket-local statistics are used here only as a
# demonstration; for real training, use statistics of the full training set.
mean = x_num.mean(dim=0, keepdim=True)
std = torch.clamp(x_num.std(dim=0, keepdim=True), min=1e-6)  # avoid division by ~0
x_num_std = torch.div(x_num - mean, std)

# --- Build the encoder -------------------------------------------------------
from data import TabularEncoderCMLP  # adjust to the actual import path in your project

encoder_kwargs = {
    "numeric_dim": len(num_cols),
    "categorical_cardinalities": schema["cat_cardinalities"],
    "cat_embed_dim": 16,
    "reg_weights": {"ent": 0.0, "lap": 0.0, "stein": 0.0},
}
model = TabularEncoderCMLP(**encoder_kwargs).to(device)

# --- Single forward pass as a smoke test -------------------------------------
tab_embed, loss = model(x_num_std, cat_tensors)
loss_value = float(loss.detach().cpu())
print("tab_embed:", tab_embed.shape, "loss:", loss_value)

  from .autonotebook import tqdm as notebook_tqdm


tab_embed: torch.Size([430, 352]) loss: 0.0
