In [8]:
import os
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import argparse

In [9]:
def create_manifest(tiles_basedir, sentences_basedir, output_dir, val_split=0.05):
    """
    Scan tile and sentence directories and write CSV manifests for OpenCLIP training.

    For every ``<sample_id>_sentences_hvg`` folder under ``sentences_basedir`` the
    matching ``<sample_id>_tiles`` folder is looked up under ``tiles_basedir``.
    Each ``<name>.txt`` sentence file is paired with ``<name>.png`` in the tile
    folder. Pairs whose image is missing, or whose sentence is empty after
    stripping whitespace, are left out of the manifest.

    Directory listings are sorted so that the manifest row order -- and therefore
    the seeded train/validation split -- does not depend on the order in which
    the filesystem happens to enumerate entries.

    Args:
        tiles_basedir (str): The root directory containing the sample tile folders (e.g., '.../TENX101_BN1_tiles').
        sentences_basedir (str): The root directory containing the sample sentence folders.
        output_dir (str): Directory where the manifest CSV files will be saved (created if missing).
        val_split (float): The fraction of data to use for the validation set.
            A value <= 0 writes only the training manifest.

    Raises:
        ValueError: If no usable image-sentence pair is found.
    """
    sentence_suffix = '_sentences_hvg'
    tile_suffix = '_tiles'  # Or another suffix if it's different
    text_ext = '.txt'
    image_ext = '.png'

    os.makedirs(output_dir, exist_ok=True)

    manifest_data = []
    empty_sentence_count = 0
    print("Scanning directories to build manifest...")

    # Assumes sentence directories match tile directories but with a different suffix.
    # Sorted: os.listdir() returns entries in arbitrary order.
    # isdir: a stray *file* with the suffix would make the inner listdir fail.
    sentence_dirs = sorted(
        d for d in os.listdir(sentences_basedir)
        if d.endswith(sentence_suffix) and os.path.isdir(os.path.join(sentences_basedir, d))
    )

    for sent_dir_name in tqdm(sentence_dirs, desc="Processing Samples"):
        # Strip only the trailing suffix; str.replace would also rewrite an
        # occurrence of the suffix in the middle of the sample id.
        sample_id = sent_dir_name[:-len(sentence_suffix)]

        sent_dir_path = os.path.join(sentences_basedir, sent_dir_name)
        tile_dir_path = os.path.join(tiles_basedir, f"{sample_id}{tile_suffix}")

        if not os.path.isdir(tile_dir_path):
            print(f"Warning: Tile directory not found for {sample_id}, skipping.")
            continue

        for sentence_file in sorted(os.listdir(sent_dir_path)):
            if not sentence_file.endswith(text_ext):
                continue

            # Swap only the extension:
            # 'TENX101_BN1_2544_13499.txt' -> 'TENX101_BN1_2544_13499.png'
            image_file = sentence_file[:-len(text_ext)] + image_ext
            image_path = os.path.join(tile_dir_path, image_file)

            # Only keep sentences whose tile image actually exists.
            if not os.path.exists(image_path):
                continue

            # Explicit encoding so the result does not depend on the platform default.
            with open(os.path.join(sent_dir_path, sentence_file), 'r', encoding='utf-8') as f:
                gene_sentence = f.read().strip()

            # An empty caption carries no text signal, and pandas reads an empty
            # CSV field back as a missing value by default.
            if not gene_sentence:
                empty_sentence_count += 1
                continue

            manifest_data.append({
                'image_path': image_path,
                'gene_sentence': gene_sentence
            })

    if empty_sentence_count:
        print(f"Warning: skipped {empty_sentence_count} empty sentence file(s).")

    if not manifest_data:
        raise ValueError("No matching image-sentence pairs were found. Check your directory paths and file naming conventions.")

    # Convert to a DataFrame
    df = pd.DataFrame(manifest_data)
    print(f"Successfully found {len(df)} matching image-sentence pairs.")

    # Split into training and validation sets.
    # NOTE(review): the split is per row (per tile), so tiles from the same sample
    # can land in both sets; use a group-wise split if sample-level hold-out is needed.
    if val_split > 0:
        train_df, val_df = train_test_split(df, test_size=val_split, random_state=42)
        val_path = os.path.join(output_dir, 'validation_manifest.csv')
        val_df.to_csv(val_path, index=False, sep=',')
        print(f"Validation manifest saved to: {val_path} ({len(val_df)} samples)")
    else:
        train_df = df

    train_path = os.path.join(output_dir, 'train_manifest.csv')
    train_df.to_csv(train_path, index=False, sep=',')
    print(f"Training manifest saved to: {train_path} ({len(train_df)} samples)")

In [10]:
# Root folder that create_manifest scans for per-sample `*_sentences_hvg` directories.
sentences_basedir = '/cwStorage/nodecw_group/jijh/hest_sentences'

In [11]:
# Root folder expected to hold the matching `<sample_id>_tiles` directories
# (passed to create_manifest as `tiles_basedir`).
tile_basedir = '/cwStorage/nodecw_group/jijh/hest_output'

In [12]:
# Destination directory for train_manifest.csv and validation_manifest.csv.
output_dir = "/cwStorage/nodecw_group/jijh/openclip_train"

In [13]:
# Build the manifests, holding out 5% of the image-sentence pairs for validation.
create_manifest(tiles_basedir=tile_basedir,
                sentences_basedir=sentences_basedir,
                output_dir=output_dir,
                val_split=0.05)

Scanning directories to build manifest...


Processing Samples:   0%|          | 0/168 [00:00<?, ?it/s]

Successfully found 307724 matching image-sentence pairs.
Validation manifest saved to: /cwStorage/nodecw_group/jijh/openclip_train/validation_manifest.csv (15387 samples)
Training manifest saved to: /cwStorage/nodecw_group/jijh/openclip_train/train_manifest.csv (292337 samples)


# CLIP Training: Fine-tuning OpenCLIP on the Tile–Sentence Manifests

In [7]:
# ==============================================================================
#           Cell: OpenCLIP Fine-tuning Launcher (Robust Version)
# ==============================================================================
import os
import sys
import subprocess
import shlex
import fcntl
import select
import datetime

# --- 1. Training Configuration ---
class OpenClipFinetuneConfig:
    """Static settings for the OpenCLIP fine-tuning run, plus the launch-command builder.

    Every setting is a class attribute evaluated once, when the class body runs;
    in particular TIMESTAMP (and the LOGS_DIR derived from it) is fixed at that moment.
    """

    # --- Hardware & distributed training ---
    GPU_IDS = [0, 1, 2]
    NUM_GPUS = len(GPU_IDS)
    DDP_MASTER_PORT = 29501

    # --- Paths (absolute) ---
    PROJECT_ROOT = "/cwStorage/nodecw_group/jijh/openclip_train"
    OPEN_CLIP_SRC_DIR = os.path.join(PROJECT_ROOT, "open_clip/src")
    TRAIN_MANIFEST = os.path.join(PROJECT_ROOT, "train_manifest.csv")
    VAL_MANIFEST = os.path.join(PROJECT_ROOT, "validation_manifest.csv")

    # One timestamped log directory per class definition.
    TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    LOGS_DIR = os.path.join(PROJECT_ROOT, "train_log", f"finetune_{TIMESTAMP}")

    PRETRAINED_MODEL_PATH = os.path.join(PROJECT_ROOT, "open_clip/CLIP-ViT-B-32/open_clip_model.safetensors")

    # --- Model & data hyperparameters ---
    MODEL_NAME = "ViT-B-32"
    DATASET_TYPE = "csv"
    CSV_SEPARATOR = ","
    CSV_IMG_KEY = "image_path"
    CSV_CAPTION_KEY = "gene_sentence"

    # --- Training hyperparameters ---
    EPOCHS = 10
    BATCH_SIZE_PER_GPU = 128
    LEARNING_RATE = 5e-6
    WEIGHT_DECAY = 0.1
    WARMUP_STEPS = 500
    PRECISION = "amp_bfloat16"  # For H100
    NUM_WORKERS = 8

    # --- Logging & saving ---
    REPORT_TO = "tensorboard"
    SAVE_FREQUENCY = 1
    LOG_EVERY_N_STEPS = 100

    @classmethod
    def build_command(cls):
        """
        Assemble the argument list that launches training via ``torch.distributed.run``.

        The command is deliberately kept lean: only core options are passed, to avoid
        'unrecognized arguments' errors from the training script.

        Returns:
            list[str]: argv suitable for ``subprocess.Popen``.
        """
        # Distributed launcher, run with the same interpreter as this kernel.
        launcher = [
            sys.executable, "-m", "torch.distributed.run",
            f"--nproc_per_node={cls.NUM_GPUS}",
            f"--master_port={cls.DDP_MASTER_PORT}",
        ]

        # Absolute path to the training entry point.
        entry_script = os.path.join(cls.OPEN_CLIP_SRC_DIR, "open_clip_train/main.py")

        option_pairs = (
            # Core arguments
            ("--model", cls.MODEL_NAME),
            ("--pretrained", cls.PRETRAINED_MODEL_PATH),
            ("--train-data", cls.TRAIN_MANIFEST),
            ("--val-data", cls.VAL_MANIFEST),
            ("--dataset-type", cls.DATASET_TYPE),
            ("--csv-separator", cls.CSV_SEPARATOR),
            ("--csv-img-key", cls.CSV_IMG_KEY),
            ("--csv-caption-key", cls.CSV_CAPTION_KEY),
            ("--logs", cls.LOGS_DIR),
            # Training hyperparameters
            ("--epochs", str(cls.EPOCHS)),
            ("--batch-size", str(cls.BATCH_SIZE_PER_GPU)),
            ("--lr", str(cls.LEARNING_RATE)),
            ("--wd", str(cls.WEIGHT_DECAY)),
            ("--warmup", str(cls.WARMUP_STEPS)),
            ("--precision", cls.PRECISION),
            # System and logging arguments
            ("--workers", str(cls.NUM_WORKERS)),
            ("--report-to", cls.REPORT_TO),
            ("--save-frequency", str(cls.SAVE_FREQUENCY)),
            ("--log-every-n-steps", str(cls.LOG_EVERY_N_STEPS)),
        )

        argv = launcher + [entry_script]
        for flag, value in option_pairs:
            argv.extend((flag, value))
        return argv

In [8]:
# --- 2. 实例化配置并准备环境 ---
# All settings are class attributes, so LOGS_DIR (and the TIMESTAMP embedded in it)
# was fixed when the class body ran, not when this instance is created.
config = OpenClipFinetuneConfig()
# Create the run's log directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(config.LOGS_DIR, exist_ok=True)
print(f"✅ 配置加载成功。日志和模型将保存到: {config.LOGS_DIR}")


✅ 配置加载成功。日志和模型将保存到: /cwStorage/nodecw_group/jijh/openclip_train/train_log/finetune_20250619-212110


In [None]:
# --- 3. Check that the key files and directories exist ---
# Local import: only this cell needs it (incremental decoding of the child's raw output).
import codecs

paths_to_check = {
    "训练脚本": os.path.join(config.OPEN_CLIP_SRC_DIR, "open_clip_train/main.py"),
    "训练清单文件": config.TRAIN_MANIFEST,
    "验证清单文件": config.VAL_MANIFEST,
    "预训练模型": config.PRETRAINED_MODEL_PATH,
}
all_paths_ok = True
for name, path in paths_to_check.items():
    if not os.path.exists(path):
        print(f"❌ 错误: {name} 未找到，路径: {path}")
        all_paths_ok = False

if all_paths_ok:
    print("✅ 所有关键文件和目录均已找到。")

    # --- 4. Build and run the command ---
    command_list = config.build_command()

    execution_env = os.environ.copy()
    # Restrict the child processes to the configured GPUs.
    execution_env["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, config.GPU_IDS))
    # Python block-buffers stdout when it is a pipe; unbuffered output keeps the
    # streamed log close to real time. A value already set by the user wins.
    execution_env.setdefault("PYTHONUNBUFFERED", "1")

    print("\n" + "="*80)
    print("      即将执行以下精简命令:")
    print("="*80)
    print(shlex.join(command_list))
    print("="*80, "\n")

    # --- Run the command and stream its output live ---
    process = None
    try:
        # Note: no `cwd` is needed because the main script path is already absolute.
        # The pipe is opened in unbuffered binary mode and left BLOCKING. Setting
        # O_NONBLOCK on a text-mode pipe makes read() fail with None/TypeError
        # whenever no data is ready.
        process = subprocess.Popen(
            command_list,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge stderr so log ordering is preserved
            env=execution_env,
            bufsize=0,
        )

        print("--- [ 训练开始，实时日志如下 ] ---\n")

        # Incremental decoder: a multi-byte UTF-8 character split across two
        # reads is decoded correctly instead of becoming replacement characters.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        stdout_fd = process.stdout.fileno()

        while True:
            # Blocks until some output is available, then returns whatever is
            # ready (up to 64 KiB). Returns b"" once every writer closed the pipe.
            chunk = os.read(stdout_fd, 65536)
            if not chunk:
                break
            text = decoder.decode(chunk)
            if text:
                sys.stdout.write(text)
                sys.stdout.flush()

        # Flush any bytes the decoder was still holding back.
        tail = decoder.decode(b"", final=True)
        if tail:
            sys.stdout.write(tail)
            sys.stdout.flush()

        exit_code = process.wait()
        print(f"\n--- [ 训练结束，退出码: {exit_code} ] ---")
        if exit_code == 0:
            print("✅ 微调任务成功完成！")
        else:
            print("❌ 微调任务失败。请检查上面的日志输出以获取详细错误信息。")

    except Exception as e:
        print(f"❌ 执行过程中发生意外错误: {e}")

    finally:
        # Runs on normal exit, on errors and on a kernel interrupt
        # (KeyboardInterrupt is not an Exception, so it bypasses the handler
        # above and still propagates after this cleanup).
        if process is not None:
            if process.poll() is None:
                # Do not leave the launcher running after the cell stops.
                print("\n⚠️ 训练进程仍在运行，正在终止...")
                process.terminate()
                try:
                    process.wait(timeout=30)
                except subprocess.TimeoutExpired:
                    process.kill()
                    process.wait()
            if process.stdout is not None:
                process.stdout.close()

else:
    print("\n请修正上述路径错误后再运行此单元格。")

✅ 所有关键文件和目录均已找到。

      即将执行以下精简命令:
/public/home/jijh/micromamba/envs/gigapath/bin/python -m torch.distributed.run --nproc_per_node=3 --master_port=29501 /cwStorage/nodecw_group/jijh/openclip_train/open_clip/src/open_clip_train/main.py --model ViT-B-32 --pretrained /cwStorage/nodecw_group/jijh/openclip_train/open_clip/CLIP-ViT-B-32/open_clip_model.safetensors --train-data /cwStorage/nodecw_group/jijh/openclip_train/train_manifest.csv --val-data /cwStorage/nodecw_group/jijh/openclip_train/validation_manifest.csv --dataset-type csv --csv-separator , --csv-img-key image_path --csv-caption-key gene_sentence --logs /cwStorage/nodecw_group/jijh/openclip_train/train_log/finetune_20250619-212110 --epochs 10 --batch-size 128 --lr 5e-06 --wd 0.1 --warmup 500 --precision amp_bfloat16 --workers 8 --report-to tensorboard --save-frequency 1 --log-every-n-steps 100

--- [ 训练开始，实时日志如下 ] ---


*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 