In [1]:
from pathlib import Path
from dotenv import load_dotenv
import os

load_dotenv()

import sys
import os
notebook_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(notebook_dir, '..')))
volumes_dir = Path(os.getenv("VOLUME_DIR"))

import torch
from torchvision.transforms import v2

from training.experiment_config import config
from training.dataloader import get_data_loader

from tqdm import tqdm
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")



In [2]:
train = pd.read_csv(config.CSV_DIR_TRAIN)
train_ones = train[train["label"] == 1].reset_index(drop=True)
train_zeros = train[train["label"] == 0].reset_index(drop=True)

image_loader = get_data_loader(
		data_dir=config.DATADIR,
		dataset=train_ones,
		mode=config.MODE,
		workers=8,
		batch_size=config.BATCH_SIZE,
		size_px=config.SIZE_PX,
		size_mm=config.SIZE_MM,
		rotations=config.ROTATION,
		translations=config.TRANSLATION,
		)

In [24]:
train_ones.head()

Unnamed: 0,PatientID,SeriesInstanceUID,StudyDate,CoordX,CoordY,CoordZ,LesionID,AnnotationID,NoduleID,Age_at_StudyDate,Gender,label
0,104839,1.2.840.113654.2.55.30936757672243920224031101...,19990102,122.39,71.28,-220.84,1,104839_1_19990102,104839_1,55,Male,1
1,109345,1.2.840.113654.2.55.30950357318430274435755858...,19990102,57.68,-104.72,-62.51,1,109345_1_19990102,109345_1,62,Male,1
2,202952,1.3.6.1.4.1.14519.5.2.1.7009.9004.209354409551...,19990102,87.32,34.9,-80.11,1,202952_1_19990102,202952_1,56,Female,1
3,112901,1.2.840.113654.2.55.20259099074923358727086752...,20000102,-71.53,38.67,-37.77,1,112901_1_20000102,112901_1,69,Female,1
4,211570,1.3.6.1.4.1.14519.5.2.1.7009.9004.195185449252...,20000102,-56.2,-17.22,-71.04,1,211570_1_20000102,211570_1,64,Male,1


In [None]:
for data in image_loader:
	data_id = data['ID']

	image_location = volumes_dir / "luna25_nodule_blocks" / "image" / f"{data_id}.npy"
	metadata_location = volumes_dir / "luna25_nodule_blocks" / "metadata" / f"{data_id}.npy"
	
	volume = np.load(str(image_location))
	metadata = np.load(str(metadata_location), allow_pickle=True).item()

In [4]:
image_location = volumes_dir / "luna25_nodule_blocks" / "image" / f"112901_1_20000102.npy"
metadata_location = volumes_dir / "luna25_nodule_blocks" / "metadata" / f"112901_1_20000102.npy"

volume = np.load(str(image_location))
metadata = np.load(str(metadata_location), allow_pickle=True).item()

In [5]:
volume.shape

(64, 128, 128)

In [6]:
metadata

{'origin': array([-117.46499634,    3.57499695, -106.4593811 ]),
 'spacing': array([2.5     , 0.546875, 0.546875]),
 'transform': array([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])}

In [None]:
import matplotlib.pyplot as plt
image_np = image.permute(1, 2, 0).cpu().numpy()
img_np = new_image.permute(1, 2, 0).cpu().numpy()

plt.imshow(img_np)
plt.axis('off')
plt.title(f"Modified image for ID: {data_id}\nLabel: {label.item()}")
plt.show()

plt.imshow(image_np)
plt.axis('off')
plt.show()

In [38]:
n_samples_to_make = (len(train_zeros) - len(train_ones))
n_samples_to_make

4034

In [4]:
for data in tqdm(image_loader):
	continue

100%|██████████| 448/448 [00:37<00:00, 11.89it/s] 


In [39]:
i = 0
while i <= (n_samples_to_make):
	for data in tqdm(image_loader):

		label = data["label"][0].float() #.to(device).item()
		image = data["image"][0]

		# input augmentation
		transforms = v2.Compose([
			v2.RandomHorizontalFlip(p=0.3),
			v2.RandomAffine(degrees=(-20, 20), translate=(0.1, 0.1), scale=(0.9, 1.1)),
		])
		new_image = transforms(image)
		new_image = torch.stack([new_image[0]]*64, dim=0)
		
		# guardar nuevo bloque de imagen
		new_AnnotationID = f"1_1_{str(i)}"
		new_image_location = volumes_dir / "luna25_nodule_blocks" / "image" / f"{new_AnnotationID}.npy"
		
		new_metadata_location = volumes_dir / "luna25_nodule_blocks" / "metadata" / f"{new_AnnotationID}.npy"
		dummy_metadata = {
			"origin": np.array([0.0, 0.0, 0.0]),
			"spacing": np.array([1.0, 1.0, 1.0]),
			"transform": np.eye(3)
		}
		
		np.save(str(new_image_location), new_image.numpy())
		np.save(str(new_metadata_location), dummy_metadata)

		train_ones = pd.concat([
			train_ones,
			pd.DataFrame([{
				"AnnotationID": new_AnnotationID,
				"label": label.item(),
			}])],
			ignore_index=True)

		i+=1

	print(f"""Added {i} new images to the dataset.
Currently, there are {len(train_ones)} positive samples and {len(train_zeros)} negative samples.\n""")

100%|██████████| 448/448 [01:03<00:00,  7.10it/s]


Added 448 new images to the dataset.
Currently, there are 896 positive samples and 4482 negative samples.



100%|██████████| 448/448 [01:01<00:00,  7.23it/s]


Added 896 new images to the dataset.
Currently, there are 1344 positive samples and 4482 negative samples.



100%|██████████| 448/448 [00:57<00:00,  7.73it/s]


Added 1344 new images to the dataset.
Currently, there are 1792 positive samples and 4482 negative samples.



100%|██████████| 448/448 [00:57<00:00,  7.75it/s]


Added 1792 new images to the dataset.
Currently, there are 2240 positive samples and 4482 negative samples.



100%|██████████| 448/448 [00:59<00:00,  7.49it/s]


Added 2240 new images to the dataset.
Currently, there are 2688 positive samples and 4482 negative samples.



100%|██████████| 448/448 [00:59<00:00,  7.51it/s]


Added 2688 new images to the dataset.
Currently, there are 3136 positive samples and 4482 negative samples.



100%|██████████| 448/448 [00:58<00:00,  7.60it/s]


Added 3136 new images to the dataset.
Currently, there are 3584 positive samples and 4482 negative samples.



100%|██████████| 448/448 [01:01<00:00,  7.34it/s]


Added 3584 new images to the dataset.
Currently, there are 4032 positive samples and 4482 negative samples.



100%|██████████| 448/448 [01:01<00:00,  7.33it/s]


Added 4032 new images to the dataset.
Currently, there are 4480 positive samples and 4482 negative samples.



100%|██████████| 448/448 [00:59<00:00,  7.49it/s]

Added 4480 new images to the dataset.
Currently, there are 4928 positive samples and 4482 negative samples.






In [42]:
new_image.shape

torch.Size([64, 64, 64])

In [43]:
4482 - 4928

-446

In [44]:
train_new = pd.concat([train_zeros, train_ones], ignore_index=True)
train_new.label.value_counts()

1.0    4928
0.0    4482
Name: label, dtype: int64

In [45]:
train_new = train_new[:-446]
train_new.label.value_counts()

0.0    4482
1.0    4482
Name: label, dtype: int64

In [46]:
#train_new = pd.concat([train_ones, train_zeros], ignore_index=True)

new_csv_location = volumes_dir / "train_augmented.csv"
if not new_csv_location.parent.exists():
	new_csv_location.parent.mkdir(parents=True)
train_new.to_csv(str(new_csv_location), index=False)