# Loading Image Data and Creating Train/Test Datasets

In [26]:
import zipfile 
import os
import torch

# Images include 5 classes
# 5000: Colon Adenocarcinomas (colonca)
# 5000: Benign Colonic Tissues (colonn)
# 5000: Lung Adenocarcinomas (lungaca)
# 5000: Lung Squamous Cell Carcinomas (lungscc)
# 5000: Benign Lung Tissues (lungn)

# Collect the name of every file found anywhere below ./images/.
# Only the bare file names are kept; the directory part reported by
# os.walk is discarded.
img_paths = [
    file_name
    for _dirpath, _dirnames, file_names in os.walk('./images/')
    for file_name in file_names
]

len(img_paths)

25000

In [21]:
# Load the pre-trained backbone checkpoint
backbone_weights_path = 'models/BEPH_backbone.pth'

# Map every stored tensor onto the CPU so no GPU is required for loading;
# weights_only=True restricts unpickling to tensors and plain containers.
cpu_device = torch.device('cpu')
ck = torch.load(
    backbone_weights_path,
    map_location=cpu_device,
    weights_only=True,
)

In [59]:
# Build the annotated train/test lists and write them to ./meta/
#
# Every output line has the form "./images/<file name> <label>", where the
# label is 1 for the carcinoma classes and 0 for the benign ones.
# NOTE(review): the entries of img_paths are treated as file names located
# directly in ./images/ -- confirm that no images live in subdirectories.

# (file-name prefix, label) per class, in the order in which the classes
# are interleaved in the output files
class_labels = (
    ('colonca', 1),
    ('colonn', 0),
    ('lungaca', 1),
    ('lungscc', 1),
    ('lungn', 0),
)
train_fraction = 0.7

# Separate by class; the first matching prefix wins and files matching no
# prefix are ignored
files_by_class = {prefix: [] for prefix, _ in class_labels}
for file in img_paths:
    for prefix, class_files in files_by_class.items():
        if file.startswith(prefix):
            class_files.append(file)
            break

# The order of img_paths depends on the filesystem; sorting each class makes
# the split reproducible across runs and machines
for class_files in files_by_class.values():
    class_files.sort()

colonca = files_by_class['colonca']
colonn = files_by_class['colonn']
lungaca = files_by_class['lungaca']
lungscc = files_by_class['lungscc']
lungn = files_by_class['lungn']

# Use the same number of images from every class (the size of the smallest
# class) instead of assuming exactly 5000, so a short class cannot cause an
# IndexError and the classes stay balanced
per_class = min(len(class_files) for class_files in files_by_class.values())
if per_class == 0:
    empty = [prefix for prefix, found in files_by_class.items() if not found]
    raise ValueError(f'no images found for class prefix(es): {empty}')

# Break up data into 70/30 train/test
split_at = int(per_class * train_fraction)
train_idx = list(range(split_at))
test_idx = list(range(split_at, per_class))


def _annotate(indices):
    """Return '<file name> <label>' entries for *indices*, class-interleaved."""
    return [
        f'{files_by_class[prefix][i]} {label}'
        for i in indices
        for prefix, label in class_labels
    ]


# Annotated image paths
train_files = _annotate(train_idx)
test_files = _annotate(test_idx)

# open() does not create missing directories
os.makedirs('./meta', exist_ok=True)

with open('./meta/train.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'./images/{entry}\n' for entry in train_files)

with open('./meta/test.txt', 'w', encoding='utf-8') as f:
    f.writelines(f'./images/{entry}\n' for entry in test_files)