In [2]:
import pandas as pd
import os
import random
from concurrent.futures import ThreadPoolExecutor

import cv2 as cv
import numpy as np
import pyvips
import matplotlib.pyplot as plt
import plotly.express as px
from tqdm import tqdm

In [3]:

projects = ["BRCA", "COAD", "GBM", "HNSC", "LIHC", "LUAD", "LUSC", "OV", "PRAD", "THCA", "UCEC"]

mut_info = pd.read_csv("mut_info.tsv", sep="\t")
mut_info = mut_info.reset_index()
mut_info.columns = ['Patient ID'] + mut_info.columns[1:].tolist()
mut_info["Patient ID"] = mut_info["Patient ID"].str.rsplit('-', n=1).str[0]
pID_mut = list(mut_info["Patient ID"].unique())

summary_data = []

for project in projects:
    filename = f"data/TCGA-{project}_summary.csv"
    if not os.path.exists(filename):
        print(f"File {filename} not found. Skipping.")
        continue

    proj_data = pd.read_csv(filename, skiprows=2)

    proj_data = proj_data.dropna(subset=["Patient ID", "Diagnostic Slides"])

    proj_data = proj_data[proj_data["Diagnostic Slides"] != 0]

    proj_data["Patient ID"] = proj_data["Patient ID"].str.strip()

    pID_proj = list(proj_data["Patient ID"].unique())

    common_patients = sorted(set(pID_mut) & set(pID_proj))
    non_common_patients = sorted(set(pID_proj) - set(pID_mut))

    common_df = pd.DataFrame({"Patient ID": common_patients})
    common_df.to_csv(f"common_patients_{project}.csv", index=False)

    summary_data.append({
        "Project": project,
        "Total Patients in Summary": len(pID_proj),
        "Common with mut_info": len(common_patients),
        "Non-common in Summary": len(non_common_patients)
    })

    print(f"Processed {project}: {len(common_patients)} common, {len(non_common_patients)} non-common")


# Save universal summary CSV
summary_df = pd.DataFrame(summary_data)
summary_df.to_csv("common_patients_summary.csv", index=False)


Processed BRCA: 1011 common, 51 non-common
Processed COAD: 419 common, 32 non-common
Processed GBM: 94 common, 295 non-common
Processed HNSC: 419 common, 31 non-common
Processed LIHC: 349 common, 16 non-common
Processed LUAD: 461 common, 17 non-common
Processed LUSC: 456 common, 22 non-common
Processed OV: 72 common, 34 non-common
Processed PRAD: 382 common, 21 non-common
Processed THCA: 443 common, 63 non-common
Processed UCEC: 482 common, 23 non-common


Data pref for tile generation

In [4]:
mut_info = pd.read_csv("mut_info.tsv", sep="\t")
mut_info = mut_info.reset_index()
mut_info.columns = ['Patient ID'] + mut_info.columns[1:].tolist()
mut_info["Patient ID"] = mut_info["Patient ID"].str.rsplit('-', n=1).str[0]

In [5]:
binary_mute_info = mut_info.loc[:, mut_info.columns != 'Patient ID'].applymap(lambda x: 1 if x != "wild" else 0)
binary_mute_info.insert(0, 'Patient ID', mut_info['Patient ID'])

  binary_mute_info = mut_info.loc[:, mut_info.columns != 'Patient ID'].applymap(lambda x: 1 if x != "wild" else 0)


Now we want to exlude patientIDs not in our Ovarian Cacner data.

In [6]:
common_OV_patients = pd.read_csv("common_patients_OV.csv")
binary_mute_info = binary_mute_info[binary_mute_info["Patient ID"].isin(common_OV_patients["Patient ID"])]

Time to generete tiles for each patient CSV. I will use my previous code for LungInsight to geneate tiles.

In [7]:
def pyvips_to_numpy(vips_image):
    return np.ndarray(buffer=vips_image.write_to_memory(),
                      dtype=np.uint8,
                      shape=[vips_image.height, vips_image.width, vips_image.bands])

In [8]:
def process_tile_2(x, y, width, height, patch_size_w, patch_size_h, lower_bnd_intensity, upper_bnd_intensity, file_path, output_dir, slide):
    patch_coordinates = []

    actual_patch_w = min(patch_size_w, width - x)
    actual_patch_h = min(patch_size_h, height - y)

    patch_coordinates.append((x, y, actual_patch_w, actual_patch_h))

    tile = slide.crop(x, y, actual_patch_w, actual_patch_h)
    tile_array = pyvips_to_numpy(tile)
    mean_value = np.mean(tile_array)

    if lower_bnd_intensity < mean_value <= upper_bnd_intensity:
        return None
    
    output_filename = os.path.expanduser(f"{output_dir}/{output_dir.split("/")[-1].split("_")[0]}_tile_{x}_{y}.png")
    tile.pngsave(output_filename, compression=9)

    df = binary_mute_info[binary_mute_info['Patient ID'] == output_dir.split("/")[-1].split("_")[0]]
    if (not os.path.exists(os.path.expanduser(f"{output_dir}/{output_dir.split("/")[-1].split("_")[0]}.csv")) and not df.empty):
        df.to_csv(os.path.expanduser(f"{output_dir}/{output_dir.split("/")[-1].split("_")[0]}.csv"), index=False)

    return output_filename

In [9]:
def generate_tiles(width, height, patch_size_w, patch_size_h, lower_bnd_intensity, upper_bnd_intensity, file_path, output_dir, slide):
    with ThreadPoolExecutor() as executor:
        futures = []
        for y in range(0, height, patch_size_h):
            for x in range(0, width, patch_size_w):
                futures.append(executor.submit(process_tile_2, x, y, width, height,
                                               patch_size_w, patch_size_h,
                                               lower_bnd_intensity, upper_bnd_intensity,
                                               file_path, output_dir, slide))
        for future in futures:
            _ = future.result()

In [10]:
# ==== PARAMETERS ====
patch_size_w = 1024
patch_size_h = 1024
lower_bnd_intensity = 210
upper_bnd_intensity = 255

# ==== PATHS ====
base_slide_path = os.path.expanduser("~/Documents/Data/TCGAOV/tcga_data/slides/TCGA-OV/")
tile_output_base = os.path.expanduser("~/Documents/Data/TCGAOV/tcga_data/tiles/")

# ==== GET VALID PATIENT IDs ====
valid_patient_ids = set(binary_mute_info["Patient ID"])

# ==== WALK THROUGH FOLDERS ====
for patient_folder in tqdm(os.listdir(base_slide_path), desc="Processing Patient Folders"):
    full_patient_path = os.path.join(base_slide_path, patient_folder)

    # Skip if not in common patients
    if patient_folder not in valid_patient_ids:
        continue

    if not os.path.isdir(full_patient_path):
        continue

    svs_files = [f for f in os.listdir(full_patient_path) if f.endswith(".svs")]
    if not svs_files:
        continue

    for svs_file in svs_files:
        full_svs_path = os.path.join(full_patient_path, svs_file)
        slide = pyvips.Image.new_from_file(full_svs_path, level=1)
        width = slide.width
        height = slide.height

        print(f"Processing {svs_file} for patient {patient_folder}: {width}x{height}")

        output_dir = os.path.join(tile_output_base, f"{patient_folder}_{os.path.splitext(svs_file)[0]}")
        os.makedirs(output_dir, exist_ok=True)

        generate_tiles(
            width=width,
            height=height,
            patch_size_w=patch_size_w,
            patch_size_h=patch_size_h,
            lower_bnd_intensity=lower_bnd_intensity,
            upper_bnd_intensity=upper_bnd_intensity,
            file_path=full_svs_path,
            output_dir=output_dir,
            slide=slide
        )

Processing Patient Folders:   0%|          | 0/72 [00:00<?, ?it/s]

Processing TCGA-23-1118-01Z-00-DX1.D69E130E-33C7-4F8A-99AF-400B81378578.svs for patient TCGA-23-1118: 27389x16629


Processing Patient Folders:   1%|▏         | 1/72 [00:24<28:56, 24.46s/it]

Processing TCGA-OY-A56Q-01Z-00-DX1.F1556F26-8845-4962-9004-70F9747B46B7.svs for patient TCGA-OY-A56Q: 38317x11164


Processing Patient Folders:   3%|▎         | 2/72 [00:43<25:04, 21.49s/it]

Processing TCGA-25-2404-01Z-00-DX1.7193B185-31C3-41D0-9194-86DEDF15F096.svs for patient TCGA-25-2404: 27901x22749


Processing Patient Folders:   4%|▍         | 3/72 [01:16<30:30, 26.53s/it]

Processing TCGA-57-1585-01Z-00-DX1.934E0F30-F804-4CFA-92D5-41576DA6C367.svs for patient TCGA-57-1585: 10311x9443


Processing Patient Folders:   6%|▌         | 4/72 [01:21<20:37, 18.21s/it]

Processing TCGA-25-1316-01Z-00-DX1.A9B7F47C-5BDC-4B7D-BC9F-C4C9DBA663C8.svs for patient TCGA-25-1316: 31389x19397


Processing Patient Folders:   7%|▋         | 5/72 [02:08<31:49, 28.50s/it]

Processing TCGA-25-1329-01Z-00-DX1.6DD994F3-E426-4749-8ECA-9A0B87FF04DC.svs for patient TCGA-25-1329: 34877x23089


Processing Patient Folders:   8%|▊         | 6/72 [03:01<40:34, 36.88s/it]

Processing TCGA-25-1634-01Z-00-DX1.6E449C67-922B-409A-84B4-581B9B468682.svs for patient TCGA-25-1634: 28898x20551


Processing Patient Folders:  10%|▉         | 7/72 [03:38<39:43, 36.68s/it]

Processing TCGA-57-1993-01Z-00-DX1.464F641B-AA9F-49A7-A224-88D9847159DF.svs for patient TCGA-57-1993: 7913x10563


Processing Patient Folders:  11%|█         | 8/72 [03:43<28:34, 26.78s/it]

Processing TCGA-25-1320-01Z-00-DX1.30C154CE-C75B-4815-BF05-7D011B761993.svs for patient TCGA-25-1320: 31887x22480


Processing Patient Folders:  12%|█▎        | 9/72 [04:45<39:47, 37.90s/it]

Processing TCGA-23-1114-01Z-00-DX1.26CCA42E-4947-4318-A983-D3B31603482E.svs for patient TCGA-23-1114: 36353x23634


Processing Patient Folders:  14%|█▍        | 10/72 [06:01<51:17, 49.64s/it]

Processing TCGA-25-1312-01Z-00-DX1.733EC7A7-0FC8-4DDC-B366-DF5A45D6BB4E.svs for patient TCGA-25-1312: 41354x22306


Processing Patient Folders:  15%|█▌        | 11/72 [07:12<57:00, 56.07s/it]

Processing TCGA-25-1627-01Z-00-DX1.7596F7F5-D705-46B4-B3A6-833D31482D28.svs for patient TCGA-25-1627: 20428x20007


Processing Patient Folders:  17%|█▋        | 12/72 [07:42<48:03, 48.06s/it]

Processing TCGA-25-2392-01Z-00-DX1.C37932E5-973F-444D-8CEB-1BED4279165E.svs for patient TCGA-25-2392: 30393x15630


Processing Patient Folders:  18%|█▊        | 13/72 [08:17<43:29, 44.23s/it]

Processing TCGA-23-2084-01Z-00-DX1.94780F2B-9A88-47ED-982B-0048081309C9.svs for patient TCGA-23-2084: 19421x18541


Processing Patient Folders:  19%|█▉        | 14/72 [08:44<37:38, 38.94s/it]

Processing TCGA-13-A5FT-01Z-00-DX1.2B292DC8-7336-4CD9-AB1A-F6F482E6151A.svs for patient TCGA-13-A5FT: 24467x10739


Processing Patient Folders:  21%|██        | 15/72 [08:58<29:57, 31.53s/it]

Processing TCGA-25-2400-01Z-00-DX1.AAD20514-6FF7-47A5-89DB-8AA15DCC101B.svs for patient TCGA-25-2400: 26905x18544


Processing Patient Folders:  22%|██▏       | 16/72 [09:41<32:40, 35.01s/it]

Processing TCGA-25-1317-01Z-00-DX1.2E7A7AF9-4D04-4EBD-8C79-7117F76511D5.svs for patient TCGA-25-1317: 36372x22453


Processing Patient Folders:  24%|██▎       | 17/72 [10:02<28:06, 30.66s/it]

Processing TCGA-25-2393-01Z-00-DX1.A311D005-D661-4851-A54F-EE3310C9CD8A.svs for patient TCGA-25-2393: 27901x19867


Processing Patient Folders:  25%|██▌       | 18/72 [10:43<30:24, 33.78s/it]

Processing TCGA-25-1328-01Z-00-DX1.A0145C96-19A9-4F2D-8570-ECF039F99405.svs for patient TCGA-25-1328: 25908x14978


Processing Patient Folders:  26%|██▋       | 19/72 [11:10<28:09, 31.87s/it]

Processing TCGA-25-1623-01Z-00-DX1.28D89405-5F27-47CF-ABC7-01AB494C28D2.svs for patient TCGA-25-1623: 31389x17794


Processing Patient Folders:  28%|██▊       | 20/72 [12:01<32:31, 37.54s/it]

Processing TCGA-57-1994-01Z-00-DX1.A0798185-30C5-4C32-B1A8-B440D4F4ED95.svs for patient TCGA-57-1994: 12230x13388


Processing Patient Folders:  29%|██▉       | 21/72 [12:10<24:31, 28.85s/it]

Processing TCGA-23-1026-01Z-00-DX1.2875B4F7-D6B2-4C72-8A68-4E7C92D04BF0.svs for patient TCGA-23-1026: 27887x21821


Processing Patient Folders:  31%|███       | 22/72 [12:52<27:25, 32.91s/it]

Processing TCGA-25-2401-01Z-00-DX1.AEA99982-DB6B-4178-BEDC-6E1F605B8D7C.svs for patient TCGA-25-2401: 26905x15181


Processing Patient Folders:  32%|███▏      | 23/72 [13:25<26:56, 32.99s/it]

Processing TCGA-25-2398-01Z-00-DX1.3F0E9A15-DBC8-43A0-8C94-20B02757CD10.svs for patient TCGA-25-2398: 21922x17529


Processing Patient Folders:  33%|███▎      | 24/72 [13:50<24:19, 30.40s/it]

Processing TCGA-25-1630-01Z-00-DX1.75C53351-1511-4CCE-97AC-1650010AC568.svs for patient TCGA-25-1630: 28898x20830


Processing Patient Folders:  35%|███▍      | 25/72 [14:29<25:49, 32.97s/it]

Processing TCGA-25-1632-01Z-00-DX1.B911E6FF-D6A9-4AC7-873C-58B2059DE7E6.svs for patient TCGA-25-1632: 38863x17014


Processing Patient Folders:  36%|███▌      | 26/72 [15:14<28:12, 36.80s/it]

Processing TCGA-59-A5PD-01Z-00-DX1.0A4D38A2-7B15-4550-B00C-4276485A9613.svs for patient TCGA-59-A5PD: 29273x21255


Processing Patient Folders:  38%|███▊      | 27/72 [16:00<29:32, 39.38s/it]

Processing TCGA-23-1023-01Z-00-DX1.0C96E118-A4D9-4A9A-B95E-C0AA114D2483.svs for patient TCGA-23-1023: 29879x16626


Processing Patient Folders:  39%|███▉      | 28/72 [16:44<30:02, 40.97s/it]

Processing TCGA-23-1119-01Z-00-DX1.44D3D70A-A421-4962-8385-2A3CF8E9880A.svs for patient TCGA-23-1119: 29879x23642


Processing Patient Folders:  40%|████      | 29/72 [17:53<35:11, 49.11s/it]

Processing TCGA-25-1870-01Z-00-DX1.780BB62D-71C1-4B21-A2F4-61AC4B0234EE.svs for patient TCGA-25-1870: 21922x15567


Processing Patient Folders:  42%|████▏     | 30/72 [18:23<30:23, 43.43s/it]

Processing TCGA-VG-A8LO-01A-02-DX2.9B58474C-DAC0-4D45-B13C-0A1EA9E1BC32.svs for patient TCGA-VG-A8LO: 23406x7860
Processing TCGA-VG-A8LO-01A-01-DX1.B39A4D64-82A1-4A04-8AB6-918F3058B83B.svs for patient TCGA-VG-A8LO: 22410x6451


Processing Patient Folders:  43%|████▎     | 31/72 [18:40<24:16, 35.53s/it]

Processing TCGA-25-1313-01Z-00-DX1.5A2732D5-01D6-4285-9283-1839E1703C27.svs for patient TCGA-25-1313: 33880x15960


Processing Patient Folders:  44%|████▍     | 32/72 [19:29<26:27, 39.68s/it]

Processing TCGA-23-1123-01Z-00-DX1.764A8E5C-F2E7-441D-B9FB-23ECF7183740.svs for patient TCGA-23-1123: 24899x23410


Processing Patient Folders:  46%|████▌     | 33/72 [20:22<28:26, 43.75s/it]

Processing TCGA-25-2391-01Z-00-DX1.3930615C-785B-48D4-BD69-FD87932A518A.svs for patient TCGA-25-2391: 34379x23128


Processing Patient Folders:  47%|████▋     | 34/72 [21:33<32:50, 51.85s/it]

Processing TCGA-23-1122-01Z-00-DX1.991C230C-E3B4-4DAC-9DCC-69E9FA7DF772.svs for patient TCGA-23-1122: 32369x22608


Processing Patient Folders:  49%|████▊     | 35/72 [22:35<33:48, 54.82s/it]

Processing TCGA-23-1120-01Z-00-DX1.59367B12-17F1-41AA-A6FB-8D940365176A.svs for patient TCGA-23-1120: 30875x18210


Processing Patient Folders:  50%|█████     | 36/72 [23:19<30:57, 51.61s/it]

Processing TCGA-23-2077-01Z-00-DX1.9EB664FA-CC13-4A49-89BA-8A4573CE48F1.svs for patient TCGA-23-2077: 25895x18573


Processing Patient Folders:  51%|█████▏    | 37/72 [23:58<27:58, 47.96s/it]

Processing TCGA-25-2042-01Z-00-DX1.79198B62-0E0B-46C4-8D35-FE20582B8035.svs for patient TCGA-25-2042: 28898x19229


Processing Patient Folders:  53%|█████▎    | 38/72 [24:45<26:59, 47.64s/it]

Processing TCGA-25-1318-01Z-00-DX1.742CB28E-DB0A-4BE1-8047-5265FE80F861.svs for patient TCGA-25-1318: 36870x22214


Processing Patient Folders:  54%|█████▍    | 39/72 [25:28<25:22, 46.13s/it]

Processing TCGA-23-1027-01Z-00-DX1.53F9DFF4-6811-4184-B2FD-1F6706B948FD.svs for patient TCGA-23-1027: 24899x20477


Processing Patient Folders:  56%|█████▌    | 40/72 [26:09<23:44, 44.53s/it]

Processing TCGA-25-1635-01Z-00-DX1.D21D2855-43F0-4766-934A-3CE1E3A099B9.svs for patient TCGA-25-1635: 34877x24009


Processing Patient Folders:  57%|█████▋    | 41/72 [27:00<24:00, 46.46s/it]

Processing TCGA-23-1109-01Z-00-DX1.FB0886B9-8108-4A74-8A29-FEA4381AD69C.svs for patient TCGA-23-1109: 28883x22731


Processing Patient Folders:  58%|█████▊    | 42/72 [27:55<24:37, 49.24s/it]

Processing TCGA-25-1633-01Z-00-DX1.B3F91D16-A9AA-479F-8326-DD3E280C8759.svs for patient TCGA-25-1633: 31887x18237


Processing Patient Folders:  60%|█████▉    | 43/72 [28:31<21:46, 45.06s/it]

Processing TCGA-23-1028-01Z-00-DX1.117B4B7B-F796-4D33-A645-CD80E5C43E6D.svs for patient TCGA-23-1028: 19421x15569


Processing Patient Folders:  61%|██████    | 44/72 [28:53<17:53, 38.33s/it]

Processing TCGA-23-1809-01Z-00-DX1.A8767DEC-EB1A-4EDC-906E-C0560A30455F.svs for patient TCGA-23-1809: 27389x21504


Processing Patient Folders:  62%|██████▎   | 45/72 [29:39<18:12, 40.48s/it]

Processing TCGA-23-1030-01Z-00-DX1.FC93147F-FB55-4C62-B187-337A5C107096.svs for patient TCGA-23-1030: 27389x23169


Processing Patient Folders:  64%|██████▍   | 46/72 [30:29<18:48, 43.42s/it]

Processing TCGA-23-1110-01Z-00-DX1.61C2DBB8-7F5C-4878-BEDC-6AE061A2D7D4.svs for patient TCGA-23-1110: 30377x22056


Processing Patient Folders:  65%|██████▌   | 47/72 [31:25<19:35, 47.03s/it]

Processing TCGA-25-1319-01Z-00-DX1.71EFB946-ACAF-4BA6-8855-D336268D87F0.svs for patient TCGA-25-1319: 37866x24218


Processing Patient Folders:  67%|██████▋   | 48/72 [32:35<21:37, 54.05s/it]

Processing TCGA-WR-A838-01Z-00-DX1.5FE22DE4-CEFB-45F6-9299-505023A8F3BA.svs for patient TCGA-WR-A838: 19920x19752


Processing Patient Folders:  68%|██████▊   | 49/72 [33:05<17:55, 46.78s/it]

Processing TCGA-25-1877-01Z-00-DX1.ECD98484-514C-4E9A-913B-F7EC55D6A8FE.svs for patient TCGA-25-1877: 41354x20103


Processing Patient Folders:  69%|██████▉   | 50/72 [34:14<19:36, 53.48s/it]

Processing TCGA-25-1323-01Z-00-DX1.1B82D1C9-2310-4AAB-B531-45C1191E5FF8.svs for patient TCGA-25-1323: 29396x19791


Processing Patient Folders:  71%|███████   | 51/72 [34:46<16:26, 46.97s/it]

Processing TCGA-57-1586-01Z-00-DX1.5C85A9B1-9ACC-4D62-BA09-91F8BDB2A058.svs for patient TCGA-57-1586: 9352x11526


Processing Patient Folders:  72%|███████▏  | 52/72 [34:55<11:51, 35.58s/it]

Processing TCGA-23-1022-01Z-00-DX1.AF9E523E-CB0F-4AB5-AD43-C96731BF9141.svs for patient TCGA-23-1022: 23903x20756


Processing Patient Folders:  74%|███████▎  | 53/72 [35:34<11:36, 36.64s/it]

Processing TCGA-57-1583-01Z-00-DX1.88ACB045-1DBD-4F0B-A6D0-53B6C208805C.svs for patient TCGA-57-1583: 14388x16196


Processing Patient Folders:  75%|███████▌  | 54/72 [35:51<09:13, 30.77s/it]

Processing TCGA-25-1628-01Z-00-DX1.8275623E-8A30-4ABB-8846-34DAEE43523E.svs for patient TCGA-25-1628: 24912x21180


Processing Patient Folders:  76%|███████▋  | 55/72 [36:28<09:13, 32.58s/it]

Processing TCGA-23-1024-01Z-00-DX1.B9194D3F-C6F4-4FC8-B0CA-6E347FF4F885.svs for patient TCGA-23-1024: 27389x21901


Processing Patient Folders:  78%|███████▊  | 56/72 [37:08<09:18, 34.89s/it]

Processing TCGA-23-1116-01Z-00-DX1.FC5FC1E1-56DD-498C-AA9F-4434C6E733A8.svs for patient TCGA-23-1116: 27389x21822


Processing Patient Folders:  79%|███████▉  | 57/72 [37:46<08:55, 35.67s/it]

Processing TCGA-25-1321-01Z-00-DX1.56C898D8-14CE-4C2D-B5C9-AC33FF91FE68.svs for patient TCGA-25-1321: 30891x21837


Processing Patient Folders:  81%|████████  | 58/72 [38:15<07:52, 33.73s/it]

Processing TCGA-23-1021-01Z-00-DX1.F07C221B-D401-47A5-9519-10DE59CA1E9D.svs for patient TCGA-23-1021: 25895x16195


Processing Patient Folders:  82%|████████▏ | 59/72 [38:40<06:46, 31.30s/it]

Processing TCGA-25-2409-01Z-00-DX1.34FFD6A9-4919-4DF5-8415-215CC833F12D.svs for patient TCGA-25-2409: 27403x19426


Processing Patient Folders:  83%|████████▎ | 60/72 [39:11<06:12, 31.03s/it]

Processing TCGA-23-1111-01Z-00-DX1.A99421E1-C73E-42AE-990F-E5E6D26EDDBE.svs for patient TCGA-23-1111: 23903x13859


Processing Patient Folders:  85%|████████▍ | 61/72 [39:29<04:59, 27.27s/it]

Processing TCGA-25-1631-01Z-00-DX1.7CA82CFB-E5F6-4148-8B5E-088B7FE6331F.svs for patient TCGA-25-1631: 35873x22532


Processing Patient Folders:  86%|████████▌ | 62/72 [40:06<05:00, 30.05s/it]

Processing TCGA-23-2078-01Z-00-DX1.E31A9C1E-00EF-4B0D-A15D-6A5C2D00E986.svs for patient TCGA-23-2078: 28385x17817


Processing Patient Folders:  88%|████████▊ | 63/72 [40:27<04:07, 27.51s/it]

Processing TCGA-25-2396-01Z-00-DX1.3E755B49-EFC5-49B8-A381-0B06DAE56961.svs for patient TCGA-25-2396: 28400x20189


Processing Patient Folders:  89%|████████▉ | 64/72 [40:59<03:49, 28.67s/it]

Processing TCGA-25-1322-01Z-00-DX1.20DA1932-908F-44E5-B2F2-43E41E06165B.svs for patient TCGA-25-1322: 36372x18569


Processing Patient Folders:  90%|█████████ | 65/72 [41:32<03:29, 29.95s/it]

Processing TCGA-25-1315-01Z-00-DX1.BC434CCE-7E21-479F-8A5D-1FC7F601C8C5.svs for patient TCGA-25-1315: 28400x19268


Processing Patient Folders:  92%|█████████▏| 66/72 [41:45<02:29, 24.86s/it]

Processing TCGA-5X-AA5U-01Z-00-DX1.1A03C105-4086-4AAB-B821-A9E8156FA704.svs for patient TCGA-5X-AA5U: 25896x15382


Processing Patient Folders:  93%|█████████▎| 67/72 [42:05<01:57, 23.54s/it]

Processing TCGA-23-1029-01Z-00-DX1.0044B39A-51B3-4F76-90A9-00CDF851DE2A.svs for patient TCGA-23-1029: 25397x18177


Processing Patient Folders:  94%|█████████▍| 68/72 [42:31<01:37, 24.29s/it]

Processing TCGA-57-1582-01Z-00-DX1.BDF02DAA-E520-46BD-A479-58CAA4354B1A.svs for patient TCGA-57-1582: 6474x5606


Processing Patient Folders:  96%|█████████▌| 69/72 [42:34<00:53, 17.71s/it]

Processing TCGA-25-2399-01Z-00-DX1.B9A34CB0-9EBB-4435-8EDE-E4B9078565B1.svs for patient TCGA-25-2399: 34877x20006


Processing Patient Folders:  97%|█████████▋| 70/72 [43:10<00:46, 23.31s/it]

Processing TCGA-25-1326-01Z-00-DX1.686E3548-8891-42ED-ABBA-BE3DA6D4CAAB.svs for patient TCGA-25-1326: 20926x19568


Processing Patient Folders:  99%|█████████▊| 71/72 [43:25<00:20, 20.73s/it]

Processing TCGA-23-1113-01Z-00-DX1.D72B1587-444A-4DEC-8D92-703699822BE2.svs for patient TCGA-23-1113: 25895x19207


Processing Patient Folders: 100%|██████████| 72/72 [43:44<00:00, 36.45s/it]


'TCGA-23-1113'

# Using `Virchow2` to 1- Transform into new embeddings  2- Cluster tiles to see what will be clusters at the end.
Virchow2 is a self-supervised vision transformer pretrained using 3.1M whole slide histopathology images. The model can be used as a tile-level feature extractor (frozen or finetuned) to achieve state-of-the-art results for a wide variety of downstream computational pathology use cases.

[link](https://huggingface.co/paige-ai/Virchow2)

In [None]:
from dotenv import load_dotenv
load_dotenv()
TOKEN = os.getenv("HUGGINGFACE_TOKEN")

In [None]:
import os
import torch
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
from timm.layers import SwiGLUPacked
from PIL import Image
import numpy as np
from huggingface_hub import login
from sklearn.cluster import KMeans
import glob
import shutil

login(token= TOKEN)  

model = timm.create_model("hf-hub:paige-ai/Virchow2", pretrained=True, mlp_layer=SwiGLUPacked, act_layer=torch.nn.SiLU)
model = model.eval() 

transforms = create_transform(**resolve_data_config(model.pretrained_cfg, model=model))

def extract_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    image = transforms(image).unsqueeze(0) 
    with torch.inference_mode():
        output = model(image)  
    class_token = output[:, 0]
    patch_tokens = output[:, 5:]
    embedding = torch.cat([class_token, patch_tokens.mean(1)], dim=-1)
    return embedding.cpu().numpy()

tile_dir = "images" 
output_dir_cluster0 = "Class 0"  
output_dir_cluster1 = "Class 1"  
output_dir_cluster2 = "Class 2"  
output_dir_cluster3 = "Class 3"  
os.makedirs(output_dir_cluster0, exist_ok=True)
os.makedirs(output_dir_cluster1, exist_ok=True)
os.makedirs(output_dir_cluster2, exist_ok=True)
os.makedirs(output_dir_cluster3, exist_ok=True)

tile_paths = glob.glob(os.path.join(tile_dir, "*.png"))
embeddings = []
for tile_path in tile_paths:
    embedding = extract_embedding(tile_path)
    embeddings.append(embedding)

embeddings = np.vstack(embeddings)

kmeans = KMeans(n_clusters=4, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

for tile_path, label in zip(tile_paths, cluster_labels):
    tile_name = os.path.basename(tile_path)
    if label == 0:
        shutil.copy(tile_path, os.path.join(output_dir_cluster0, tile_name))
    elif label == 1:
        shutil.copy(tile_path, os.path.join(output_dir_cluster1, tile_name))
    elif label == 2:
        shutil.copy(tile_path, os.path.join(output_dir_cluster2, tile_name))
    else:
        shutil.copy(tile_path, os.path.join(output_dir_cluster3, tile_name))

print(f"Tiles clustered into {output_dir_cluster0}, {output_dir_cluster1}, {output_dir_cluster2}, and {output_dir_cluster3}.")

Tiles clustered into Class 0, Class 1, Class 2, and Class 3. Inspect clusters to identify cancerous tiles.
