In [206]:
import os

# TCGA study abbreviations.
# Each abbreviation identifies one cancer-type study,
# e.g. LAML (acute myeloid leukemia), BRCA (breast cancer), LUAD (lung adenocarcinoma).
study_abbreviations = [
    "LAML", "ACC", "BLCA", "LGG", "BRCA", "CESC", "CHOL", "LCML", "COAD", "CNTL",
    "ESCA", "FPPP", "GBM", "HNSC", "KICH", "KIRC", "KIRP", "LIHC", "LUAD", "LUSC",
    "DLBC", "MESO", "MISC", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM",
    "STAD", "TGCT", "THYM", "THCA", "UCS", "UCEC", "UVM"
]

# Map every study abbreviation to the list of top-level entries found for it.
# The extra 'unmatched' key collects entries whose name contains no abbreviation.
study_folder_dict = {abbreviation: [] for abbreviation in study_abbreviations}
study_folder_dict['unmatched'] = []

# Root directory under which the pathology image drives live.
directory_root = "/Pathology_ImageNet/TCGA_data"

# Inspect HDD1 through HDD4 in order.
for i in [1, 2, 3, 4]:
    directory_path = f"{directory_root}/HDD{i}"

    # os.listdir returns every top-level entry of the drive (files as well as folders).
    folder_names = os.listdir(directory_path)

    for folder_name in folder_names:
        print(folder_name)  # show the entry currently being processed

        # Case-sensitive substring match. The first abbreviation (in list order)
        # found in the name wins, so a name containing both COAD and READ is
        # filed under COAD only.
        for abbreviation in study_abbreviations:
            if abbreviation in folder_name:
                # Matched: store the full path under that study.
                study_folder_dict[abbreviation].append(f'{directory_path}/{folder_name}')
                break
        else:
            # No abbreviation matched: record "<HDD number>/<entry name>".
            study_folder_dict['unmatched'].append(f'{i}/{folder_name}')

# Final result: paths per study plus the list of unmatched entries.
print(study_folder_dict)


DLBC_TCGA_Dx_tissue
HNSC_TCGA_images
LGG_TCGA_images_1
LGG_TCGA_images_2
LUSC_TCGA_images
OV_TCGA_images_1
OV_TCGA_images_2
OV_TCGA_images_3
PAAD_TCGA_Dx_tissue_images
System Volume Information
TCGA_Pathology_Report_병리진단리포트_내려받은 파일
THCA_TCGA_Dx_2
UCEC_TCGA_images_Dx_1
UCEC_TCGA_images_Dx_2
Install Western Digital Software for Mac.dmg
Install Western Digital Software for Windows.exe
BLCA_TCGA_Dx_images_1
BLCA_TCGA_Dx_images_2
CESC_TCGA_Dx_tissue_images
COAD_TCGA_Dx_images
COAD_TCGA_READ_tissue_images
ESCA_TCGA_Dx_tissue_images
SARC_TCGA_Dx_images_1
SARC_TCGA_Dx_images_2
SARC_TCGA_Dx_images_3
STAD_TCGA_Dx_images
STAD_TCGA_tissue_images
System Volume Information
THYM_TCGA_Dx_images
Install Western Digital Software for Mac.dmg
Install Western Digital Software for Windows.exe
TCGA Study Abbreviations _ NCI Genomic Data Commons.pdf
ACC_TCGA_Dx_tissue_images
Bile duct cancer_seoul and guri
KIRC_TCGA_Dx_images
LUAD_TCGA_TS_images_1
PCPG_TCGA_Dx_tissue_images
System Volume Information
TCGA_Data

In [207]:
study_folder_dict['unmatched']
# On inspection, none of these entries contain .svs files except
# "Bile duct cancer_seoul and guri" on HDD3.
# That folder corresponds to the CHOL study.

['1/System Volume Information',
 '1/TCGA_Pathology_Report_병리진단리포트_내려받은 파일',
 '1/Install Western Digital Software for Mac.dmg',
 '1/Install Western Digital Software for Windows.exe',
 '2/System Volume Information',
 '2/Install Western Digital Software for Mac.dmg',
 '2/Install Western Digital Software for Windows.exe',
 '2/TCGA Study Abbreviations _ NCI Genomic Data Commons.pdf',
 '3/Bile duct cancer_seoul and guri',
 '3/System Volume Information',
 '3/TCGA_Data_full',
 '3/TCGA_Pathology_Report_병리진단리포트_내려받은 파일',
 '3/TCGA_병리진단리포트_내려받은 파일',
 '3/2021-11-24_TCGA_images_list.xlsx',
 '3/Install Western Digital Software for Mac.dmg',
 '3/Install Western Digital Software for Windows.exe',
 '3/TCGA Study Abbreviations _ NCI Genomic Data Commons.pdf',
 '4/System Volume Information',
 '4/Install Western Digital Software for Mac.dmg',
 '4/Install Western Digital Software for Windows.exe']

In [208]:
# Manually file the one unmatched folder that holds slides under CHOL, then
# drop the 'unmatched' bucket. Both steps are guarded so that re-running this
# cell neither appends the path twice nor raises KeyError on the missing key.
bile_duct_folder = f'{directory_root}/HDD3/Bile duct cancer_seoul and guri'
if bile_duct_folder not in study_folder_dict['CHOL']:
    study_folder_dict['CHOL'].append(bile_duct_folder)
if 'unmatched' in study_folder_dict:
    del study_folder_dict['unmatched']

In [209]:
# Compare the number of study abbreviations with the number of keys in the dict.
print(len(study_abbreviations))
print(len(study_folder_dict))


# Display the abbreviation -> folder-path mapping as the cell output.
study_folder_dict
# Disabled alternative (kept for reference): remove abbreviations that have
# no folders and show the pruned dict.
# empty_key_list = []
# for key in study_folder_dict:
#     if study_folder_dict[key] == []:
#         empty_key_list.append(key)
# for key in empty_key_list:
#     del study_folder_dict[key] 

# print(len(study_folder_dict))
# study_folder_dict

37
37


{'LAML': [],
 'ACC': ['/Pathology_ImageNet/TCGA_data/HDD3/ACC_TCGA_Dx_tissue_images'],
 'BLCA': ['/Pathology_ImageNet/TCGA_data/HDD2/BLCA_TCGA_Dx_images_1',
  '/Pathology_ImageNet/TCGA_data/HDD2/BLCA_TCGA_Dx_images_2'],
 'LGG': ['/Pathology_ImageNet/TCGA_data/HDD1/LGG_TCGA_images_1',
  '/Pathology_ImageNet/TCGA_data/HDD1/LGG_TCGA_images_2'],
 'BRCA': ['/Pathology_ImageNet/TCGA_data/HDD4/BRCA_TCGA_images'],
 'CESC': ['/Pathology_ImageNet/TCGA_data/HDD2/CESC_TCGA_Dx_tissue_images'],
 'CHOL': ['/Pathology_ImageNet/TCGA_data/HDD4/CHOL_TCGA_Dx_tissue',
  '/Pathology_ImageNet/TCGA_data/HDD3/Bile duct cancer_seoul and guri'],
 'LCML': [],
 'COAD': ['/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_Dx_images',
  '/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images'],
 'CNTL': [],
 'ESCA': ['/Pathology_ImageNet/TCGA_data/HDD2/ESCA_TCGA_Dx_tissue_images'],
 'FPPP': [],
 'GBM': ['/Pathology_ImageNet/TCGA_data/HDD4/GBM_TCGA_images'],
 'HNSC': ['/Pathology_ImageNet/TCGA_data/HDD1/HNSC_TCGA_

In [210]:
# Count the .svs files under the folder whose name contains two study
# abbreviations (COAD and READ).
dup_site_folder = f'{directory_root}/HDD2/COAD_TCGA_READ_tissue_images'

# Listing the folder first makes a missing path fail loudly; os.walk alone
# would silently yield nothing and report a count of 0.
folder_names = os.listdir(dup_site_folder)

# Each endswith() result is a bool, so summing them counts the matches.
dup_site_svs_count = sum(
    filename.endswith('.svs')
    for _dirpath, _dirnames, filenames in os.walk(dup_site_folder)
    for filename in filenames
)
dup_site_svs_count

1347

In [211]:
# Count .svs files per study by matching study abbreviations against the
# path of the folder that contains each file.
svs_root = '/Pathology_ImageNet/TCGA_data'

# One counter per study abbreviation, all starting at zero.
study_folder_svs_cnt_dict = dict.fromkeys(study_abbreviations, 0)

count = 0           # total number of (file, study) matches recorded
svs_path_list = []  # defined here so the cell runs on a fresh kernel

print('lets start')
for foldername, subfolders, filenames in os.walk(svs_root):
    if 'COAD_TCGA_READ_tissue_images' in foldername:
        # This folder name matches both COAD and READ, so it is excluded here.
        # Clearing the directory list in place stops os.walk from descending
        # into a subtree that would be skipped anyway.
        subfolders[:] = []
        continue

    svs_filenames = [filename for filename in filenames if filename.endswith('.svs')]
    if not svs_filenames:
        continue

    # The folder path is the same for every file in this iteration, so the
    # matching studies are computed once per folder instead of once per file.
    matched_studies = [
        abbreviation for abbreviation in study_abbreviations
        if abbreviation in foldername
    ]
    if 'Bile duct cancer' in foldername:
        # This folder carries no abbreviation in its name; it is counted as CHOL.
        matched_studies.append('CHOL')

    for filename in svs_filenames:
        for abbreviation in matched_studies:
            study_folder_svs_cnt_dict[abbreviation] += 1
            count += 1

        # Report every file that matched zero or several studies.
        if len(matched_studies) != 1:
            print(len(matched_studies))
            print(foldername)

        svs_path_list.append(os.path.join(foldername, filename))
study_folder_svs_cnt_dict

lets start
0
/Pathology_ImageNet/TCGA_data/HDD3/TCGA_Data_full/Big_data_GEO, TCGA from cbioportal/TCGA_lung/Virture_slide/TCGA-05-4249_gdc_download_20180921_030041.915357/b6fea18e-9615-49c1-a10b-1d466639c2bd
0
/Pathology_ImageNet/TCGA_data/HDD3/TCGA_Data_full/Big_data_GEO, TCGA from cbioportal/TCGA_lung/Virture_slide/TCGA-05-4384_gdc_download_20180921_063843.200227/f8ac5923-4633-4606-8aab-bbdc3238ce1e
0
/Pathology_ImageNet/TCGA_data/HDD3/TCGA_Data_full/tcga/maf/TCGA_lung/Virture_slide/TCGA-05-4249_gdc_download_20180921_030041.915357/b6fea18e-9615-49c1-a10b-1d466639c2bd
0
/Pathology_ImageNet/TCGA_data/HDD3/TCGA_Data_full/tcga/maf/TCGA_lung/Virture_slide/TCGA-05-4384_gdc_download_20180921_063843.200227/f8ac5923-4633-4606-8aab-bbdc3238ce1e


{'LAML': 0,
 'ACC': 323,
 'BLCA': 457,
 'LGG': 844,
 'BRCA': 1180,
 'CESC': 604,
 'CHOL': 342,
 'LCML': 0,
 'COAD': 542,
 'CNTL': 0,
 'ESCA': 396,
 'FPPP': 0,
 'GBM': 860,
 'HNSC': 472,
 'KICH': 0,
 'KIRC': 519,
 'KIRP': 0,
 'LIHC': 379,
 'LUAD': 1746,
 'LUSC': 512,
 'DLBC': 103,
 'MESO': 87,
 'MISC': 0,
 'OV': 1481,
 'PAAD': 466,
 'PCPG': 385,
 'PRAD': 449,
 'READ': 166,
 'SARC': 600,
 'SKCM': 475,
 'STAD': 1197,
 'TGCT': 254,
 'THYM': 318,
 'THCA': 1051,
 'UCS': 87,
 'UCEC': 566,
 'UVM': 80}

In [212]:
# Report the total match count and the number of studies before pruning.
print(count)
print(len(study_folder_svs_cnt_dict))

# Keep only the studies for which at least one .svs file was counted.
nonzero_cnt_dict = {}
for study, svs_cnt in study_folder_svs_cnt_dict.items():
    if svs_cnt != 0:
        nonzero_cnt_dict[study] = svs_cnt
study_folder_svs_cnt_dict = nonzero_cnt_dict
print(len(study_folder_svs_cnt_dict))


16941
37
30


In [213]:
# Display the per-study .svs counts currently held in study_folder_svs_cnt_dict.
study_folder_svs_cnt_dict

{'ACC': 323,
 'BLCA': 457,
 'LGG': 844,
 'BRCA': 1180,
 'CESC': 604,
 'CHOL': 342,
 'COAD': 542,
 'ESCA': 396,
 'GBM': 860,
 'HNSC': 472,
 'KIRC': 519,
 'LIHC': 379,
 'LUAD': 1746,
 'LUSC': 512,
 'DLBC': 103,
 'MESO': 87,
 'OV': 1481,
 'PAAD': 466,
 'PCPG': 385,
 'PRAD': 449,
 'READ': 166,
 'SARC': 600,
 'SKCM': 475,
 'STAD': 1197,
 'TGCT': 254,
 'THYM': 318,
 'THCA': 1051,
 'UCS': 87,
 'UCEC': 566,
 'UVM': 80}

In [214]:
# Add up the per-study counts to get the overall number of counted .svs files.
total_sum = 0
for svs_cnt in study_folder_svs_cnt_dict.values():
    total_sum += svs_cnt
print(total_sum)

16941


In [215]:
from tqdm.auto import tqdm

svs_root = '/Pathology_ImageNet/TCGA_data'

# Pass 1: walk the tree once, recording every .svs path together with the
# folder that contains it. Each path is stored exactly once.
svs_path_list = []
svs_folder_list = []
print('lets start')
for foldername, subfolders, filenames in os.walk(svs_root):
    for filename in filenames:
        if filename.endswith('.svs'):
            svs_path_list.append(os.path.join(foldername, filename))
            svs_folder_list.append(foldername)

# Total number of .svs files found under svs_root.
count = len(svs_path_list)
print(count)

# Pass 2: match study abbreviations against each file's folder path, reusing
# the folders collected above instead of walking the filesystem again.
# Matches are tallied in a fresh dict so that counts already stored in
# study_folder_svs_cnt_dict are not inflated, and abbreviations that were
# removed from that dict cannot raise KeyError.
svs_match_cnt_dict = dict.fromkeys(study_abbreviations, 0)
count = 0  # from here on: number of (file, study) matches
for foldername in svs_folder_list:
    dup_count = 0
    for abbreviation in study_abbreviations:
        if abbreviation in foldername:
            svs_match_cnt_dict[abbreviation] += 1
            count += 1
            dup_count += 1
    # A folder path containing several abbreviations is counted more than once.
    if dup_count > 1:
        print(foldername)

print(count)

lets start
18292
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00072739-854a-492e-9091-bbccd9afbd41
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/006f83ef-d354-4295-84e1-ff363a4268c8
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00a6bf92-0bd4-4a73-9049-8ba149c95bb4
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00af0821-adaf-4dd8-98d8-bd788ec11c7e
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00c08e0e-4798-49e1-80ed-6047dd68366b
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00ec5f18-43e5-42a9-b2e6-4cbde27f05f1
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00ee3f20-b3de-403f-b3fd-0b703b683075
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/01133468-fb29-474e-9de9-456001ed1f1f
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/012017cb-c598-4785-a99d-6da6363e16bc
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/0193e629-b

In [216]:
# COAD: colon adenocarcinoma
# READ: rectum adenocarcinoma

# Print the containing folder of every .svs file whose folder path matches
# more than one study abbreviation. This cell only reports; it does not touch
# svs_path_list, count or the per-study counters, so re-running it cannot
# duplicate paths or inflate counts.
for foldername, subfolders, filenames in os.walk(svs_root):
    matched_studies = [
        abbreviation for abbreviation in study_abbreviations
        if abbreviation in foldername
    ]
    if len(matched_studies) <= 1:
        continue

    for filename in filenames:
        if filename.endswith('.svs'):
            print(foldername)

/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00072739-854a-492e-9091-bbccd9afbd41
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/006f83ef-d354-4295-84e1-ff363a4268c8
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00a6bf92-0bd4-4a73-9049-8ba149c95bb4
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00af0821-adaf-4dd8-98d8-bd788ec11c7e
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00c08e0e-4798-49e1-80ed-6047dd68366b
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00ec5f18-43e5-42a9-b2e6-4cbde27f05f1
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/00ee3f20-b3de-403f-b3fd-0b703b683075
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/01133468-fb29-474e-9de9-456001ed1f1f
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/012017cb-c598-4785-a99d-6da6363e16bc
/Pathology_ImageNet/TCGA_data/HDD2/COAD_TCGA_READ_tissue_images/0193e629-b296-4e43-9f2b-b89