In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

def find_svs_files(base_dir):
    svs_files = []
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith(".svs"):
                full_path = os.path.join(root, file)
                svs_files.append(full_path)
    return svs_files

base_directory = "./PKG - HER2 tumor ROIs_v3"  
svs_paths = find_svs_files(base_directory)

print(f"Total SVS files found: {len(svs_paths)}")
for path in svs_paths[:5]:
    print(path)

Total SVS files found: 277
./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SVS/Her2Pos_Case_70.svs
./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SVS/Her2Pos_Case_47.svs
./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SVS/Her2Neg_Case_39.svs
./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SVS/Her2Neg_Case_14.svs
./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SVS/Her2Neg_Case_01.svs


In [3]:
csv_path = "./PKG - HER2 tumor ROIs_v3/TCGA_BRCA_Filtered/HER2_TCGA_clean.csv"
df_meta = pd.read_csv(csv_path)
print("Available columns:")
print(df_meta.columns.tolist())
df_meta.head()

Available columns:
['Unnamed: 0', 'Slide', 'Clinical.HER2.status', 'HER2A.status', 'HER2.copy.number']


Unnamed: 0.1,Unnamed: 0,Slide,Clinical.HER2.status,HER2A.status,HER2.copy.number
0,1,TCGA-B6-A0WV-01Z-00-DX1,Negative,non-HER2A,1.7
1,2,TCGA-AO-A129-01Z-00-DX1,Negative,non-HER2A,1.68
2,3,TCGA-AR-A24Q-01Z-00-DX1,Negative,non-HER2A,1.68
3,5,TCGA-E9-A249-01Z-00-DX1,Negative,non-HER2A,1.67
4,6,TCGA-A8-A095-01Z-00-DX1,Negative,non-HER2A,1.65


In [4]:
df_meta_clean = df_meta[['Slide', 'Clinical.HER2.status']].copy()
def map_label(status):
    status = str(status).lower()
    if 'positive' in status:
        return 1
    elif 'negative' in status:
        return 0
    else:
        return None

df_meta_clean['label'] = df_meta_clean['Clinical.HER2.status'].apply(map_label)
df_meta_clean = df_meta_clean.dropna(subset=['label'])
df_meta_clean['label'] = df_meta_clean['label'].astype(int)
df_meta_clean.head()

Unnamed: 0,Slide,Clinical.HER2.status,label
0,TCGA-B6-A0WV-01Z-00-DX1,Negative,0
1,TCGA-AO-A129-01Z-00-DX1,Negative,0
2,TCGA-AR-A24Q-01Z-00-DX1,Negative,0
3,TCGA-E9-A249-01Z-00-DX1,Negative,0
4,TCGA-A8-A095-01Z-00-DX1,Negative,0


In [13]:
def find_svs_files(folder):
    return [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith(".svs")
    ]

cohort1_path = "./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SVS"
cohort2_path = "./PKG - HER2 tumor ROIs_v3/Yale_trastuzumab_response_cohort/SVS"
svs_paths = find_svs_files(cohort1_path) + find_svs_files(cohort2_path)

def extract_slide_and_label(path):
    filename = os.path.basename(path)
    slide = filename.split(".")[0]
    label = 1 if "POS" in slide.upper() else 0  
    return slide, label

data = [extract_slide_and_label(p) + (p,) for p in svs_paths]
df_merged = pd.DataFrame(data, columns=["Slide", "label", "file_path"])

print(f"Total labeled slides: {len(df_merged)}")
df_merged.head()


Total labeled slides: 277


Unnamed: 0,Slide,label,file_path
0,Her2Pos_Case_70,1,./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SV...
1,Her2Pos_Case_47,1,./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SV...
2,Her2Neg_Case_39,0,./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SV...
3,Her2Neg_Case_14,0,./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SV...
4,Her2Neg_Case_01,0,./PKG - HER2 tumor ROIs_v3/Yale_HER2_cohort/SV...


In [None]:
import openslide
tile_output_dir = "./tiles"
os.makedirs(tile_output_dir, exist_ok=True)

tile_size = 256

max_tiles_per_slide = 20

def extract_tiles_from_slide(svs_path, label, slide_id):
    try:
        slide = openslide.OpenSlide(svs_path)
        width, height = slide.dimensions
        count = 0

        for y in range(0, height, tile_size):
            for x in range(0, width, tile_size):
                if count >= max_tiles_per_slide:
                    return

                tile = slide.read_region((x, y), 0, (tile_size, tile_size)).convert("RGB")
                tile_name = f"{slide_id}_tile_{count}_label_{label}.png"
                tile.save(os.path.join(tile_output_dir, tile_name))
                count += 1

    except Exception as e:
        print(f"Error with {svs_path}: {e}")

for i, row in df_merged.iterrows():
    extract_tiles_from_slide(row['file_path'], row['label'], row['Slide'])

print("Tile extraction complete!")

Tile extraction complete!
