In [2]:
from pathlib import Path
from dotenv import load_dotenv
import os

# Populate os.environ from a .env file (python-dotenv) before VOLUME_DIR is read below.
load_dotenv()

import sys
import os

# Make the parent of the current working directory importable so that the
# `training.*` imports further down resolve (assumes the kernel's cwd is the
# notebook's own folder -- TODO confirm).
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, '..'))
if project_root not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(project_root)

# Fail with an actionable message instead of the opaque TypeError that
# Path(None) raises when the variable is missing.
volume_dir_env = os.getenv("VOLUME_DIR")
if volume_dir_env is None:
    raise RuntimeError(
        "Environment variable VOLUME_DIR is not set; export it or add it to the .env file."
    )
volumes_dir = Path(volume_dir_env)

from imblearn.over_sampling import SMOTE

from training.experiment_config import config
from training.dataloader import get_data_loader

from tqdm import tqdm
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=UserWarning, module="torchvision")
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")


## Data Import

In [3]:
# Load the training table from the CSV path configured in `config.CSV_DIR_TRAIN`.
train = pd.read_csv(config.CSV_DIR_TRAIN)

### Approach 1: Full Dataset, batch size 1


In [None]:
# Approach 1 walks the dataset one sample at a time: the collection loop reads
# only index 0 of every batch, so the loader has to yield batches of exactly
# one sample. Using config.BATCH_SIZE here would silently discard every other
# sample of each batch.
image_loader = get_data_loader(
    data_dir=config.DATADIR,
    dataset=train,
    mode=config.MODE,
    workers=8,
    batch_size=1,
    size_px=config.SIZE_PX,
    size_mm=config.SIZE_MM,
    rotations=config.ROTATION,
    translations=config.TRANSLATION,
)

In [None]:
# Collect one flattened single-channel image and one float label per loader
# iteration. Only the first sample of each batch is read, so the loader is
# expected to deliver batches of size 1.
X, y = [], []
for batch in tqdm(image_loader):
    first_label = batch["label"][0].float().item()
    first_channel = batch["image"][0][0].numpy()  # sample 0, channel 0
    X.append(first_channel.reshape(-1))  # flatten to a 1-D pixel vector
    y.append(first_label)

### Approach 2: Full Dataset, one batch

In [None]:
# Approach 2 loads the whole dataset as a single batch. The consuming loop
# overwrites X and y on every iteration, so any batch size smaller than the
# dataset would keep only the last batch.
image_loader = get_data_loader(
    data_dir=config.DATADIR,
    dataset=train,
    mode=config.MODE,
    workers=8,
    batch_size=len(train),
    size_px=config.SIZE_PX,
    size_mm=config.SIZE_MM,
    rotations=config.ROTATION,
    translations=config.TRANSLATION,
)

In [None]:
# Turn each batch into a 2-D feature matrix and a label column. X and y are
# reassigned on every iteration, so they end up holding the last batch only
# (the entire dataset when the loader yields exactly one batch).
for batch in tqdm(image_loader):
    label_column = batch["label"].cpu().numpy().reshape(-1, 1)
    channel0 = batch["image"][:, 0, :, :].cpu().numpy()  # drop all channels but the first
    flattened = channel0.reshape(len(channel0), -1)  # one row of pixels per sample
    X, y = flattened, label_column

### Approach 3: Dataset in batches of size n


In [6]:
# Approach 3: let the loader batch the dataset with the configured batch size.
loader_options = dict(
    data_dir=config.DATADIR,
    dataset=train,
    mode=config.MODE,
    workers=8,
    batch_size=config.BATCH_SIZE,
    size_px=config.SIZE_PX,
    size_mm=config.SIZE_MM,
    rotations=config.ROTATION,
    translations=config.TRANSLATION,
)
image_loader = get_data_loader(**loader_options)

In [7]:
# Accumulate per-batch feature matrices and label columns; they are joined
# into single arrays later on.
X, y = [], []
for batch in tqdm(image_loader):
    # Labels as a (batch, 1) column.
    batch_labels = batch["label"].cpu().numpy().reshape(-1, 1)
    # Keep channel 0 only; the indexing implies a 4-D (batch, channel, H, W) tensor.
    channel0 = batch["image"][:, 0, :, :].cpu().numpy()

    # Flatten every image into one row -> (batch, H * W).
    X.append(channel0.reshape(len(channel0), -1))
    y.append(batch_labels)

100%|██████████| 78/78 [01:14<00:00,  1.05it/s]


### Approach 4: Dataset in batches, but split outside the data loader

In [None]:
# Cut the training table into roughly equal chunks of about SPLIT_SIZE rows.
SPLIT_SIZE = 400
# At least one chunk: np.array_split rejects 0 sections, which the plain
# integer division produces for tables shorter than SPLIT_SIZE.
n_splits = max(1, len(train) // SPLIT_SIZE)
# Split row positions rather than the DataFrame itself (array_split on a
# DataFrame is deprecated in recent pandas); each chunk keeps its original index.
splits = [train.iloc[rows] for rows in np.array_split(np.arange(len(train)), n_splits)]

In [None]:
# Approach 4: build one loader per pre-computed split and gather every sample.
# Each batch contributes a (batch, pixels) matrix and a (batch, 1) label
# column -- the same layout Approach 3 produces -- so the concatenation cell in
# the SMOTE section can consume the result. Reading only index 0 of each batch
# would drop every other sample whenever the batch size is larger than 1.
X = []
y = []

for split in splits:

    image_loader = get_data_loader(
        data_dir=config.DATADIR,
        dataset=split,
        mode=config.MODE,
        workers=8,
        batch_size=config.BATCH_SIZE,
        size_px=config.SIZE_PX,
        size_mm=config.SIZE_MM,
        rotations=config.ROTATION,
        translations=config.TRANSLATION,
    )

    for data in tqdm(image_loader):
        # Labels stay floats, as before, but are kept for the whole batch.
        labels = data["label"].float().cpu().numpy().reshape(-1, 1)
        first_channel = data["image"][:, 0].cpu().numpy()  # just one channel

        X.append(first_channel.reshape(first_channel.shape[0], -1))
        y.append(labels)

## SMOTE

In [12]:
# Normalise whatever the chosen approach collected into
#   X: (n_samples, n_features)   and   y: (n_samples, 1).
# Promoting every chunk to 2-D first makes this work for lists of per-batch
# matrices, lists of single flattened images with scalar labels, and arrays
# that are already stacked. The last case also makes the cell safe to re-run:
# a plain np.concatenate over a 2-D array would flatten it to 1-D.
X = np.concatenate([np.atleast_2d(chunk) for chunk in X], axis=0)
y = np.concatenate([np.reshape(chunk, (-1, 1)) for chunk in y], axis=0)

In [14]:
# Sanity check: show the shapes of the feature matrix and the label array before resampling.
X.shape, y.shape

((4930, 4096), (4930, 1))

In [16]:
# Oversample with SMOTE; the fixed random_state keeps the synthetic samples
# reproducible between runs.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X, y=y)

In [18]:
# Show the shapes of the resampled features and labels.
X_resampled.shape, y_resampled.shape

((8964, 4096), (8964,))

In [19]:
# Mean of the resampled labels; with 0/1 labels a value of 0.5 indicates balanced classes.
y_resampled.mean()

0.5

In [22]:
# One row per resampled sample: the feature columns first, then the label
# appended as the last column.
resampled_df = pd.DataFrame(X_resampled)
resampled_df.insert(resampled_df.shape[1], 'label', y_resampled)
resampled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4087,4088,4089,4090,4091,4092,4093,4094,4095,label
0,0.723571,0.700714,0.598571,0.365000,0.293571,0.297143,0.183571,0.137143,0.257143,0.281429,...,0.637143,0.679286,0.612857,0.639286,0.594286,0.600000,0.628571,0.712857,0.761429,0
1,0.050000,0.055000,0.074286,0.096429,0.087143,0.067857,0.103571,0.145714,0.130000,0.125000,...,0.624286,0.630000,0.640000,0.640714,0.650000,0.660714,0.655714,0.643571,0.640000,0
2,0.707143,0.743571,0.787143,0.824286,0.840000,0.850714,0.855000,0.827857,0.775000,0.710000,...,0.038571,0.047857,0.035000,0.031429,0.064286,0.091429,0.067857,0.075000,0.098571,0
3,0.685714,0.694286,0.702143,0.702857,0.682143,0.685714,0.692143,0.675714,0.676429,0.672857,...,0.067857,0.068571,0.072143,0.091429,0.112857,0.100000,0.072143,0.062857,0.060714,1
4,0.090714,0.081429,0.085000,0.096429,0.099286,0.092857,0.083571,0.079286,0.085000,0.094286,...,0.758571,0.758571,0.762857,0.763571,0.759286,0.756429,0.763571,0.767857,0.762143,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8959,0.115259,0.095540,0.143314,0.211304,0.139227,0.051218,0.037262,0.046887,0.054858,0.053168,...,0.739546,0.738658,0.734843,0.740645,0.750662,0.759617,0.757614,0.751474,0.734672,1
8960,0.079643,0.072427,0.064181,0.056712,0.064717,0.081355,0.099033,0.100956,0.088477,0.092017,...,0.750609,0.751985,0.748435,0.757353,0.759181,0.744149,0.741702,0.736996,0.750473,1
8961,0.080052,0.088848,0.078978,0.070583,0.074327,0.066771,0.069605,0.084959,0.099787,0.117774,...,0.781184,0.787761,0.783706,0.770176,0.755795,0.750411,0.741238,0.715884,0.698115,1
8962,0.757060,0.752829,0.748826,0.748045,0.749889,0.731074,0.722071,0.720846,0.719756,0.783752,...,0.092890,0.091080,0.094319,0.092868,0.088560,0.089230,0.086683,0.077425,0.080299,1


In [23]:
# Persist the resampled table as CSV inside the volume directory.
dataset_path = volumes_dir / 'smote_dataset.csv'
# Create the target directory up front so the export cannot fail on a fresh volume.
dataset_path.parent.mkdir(parents=True, exist_ok=True)
resampled_df.to_csv(dataset_path, index=False)

## Convert the resampled data to the required format

In [31]:
# Build a table with the same columns as `train` for the resampled samples.
# Only AnnotationID (synthetic id "1_2_<row position>") and label are filled;
# every other column stays empty.
# All rows are created in one go: growing a DataFrame with pd.concat inside a
# loop copies the whole frame on every iteration (quadratic time).
synthetic_rows = pd.DataFrame({
    "AnnotationID": [f"1_2_{i}" for i in range(len(resampled_df))],
    "label": resampled_df["label"].to_numpy(),
})
# Concatenating onto an empty frame keeps the column order of `train`.
dummy_df = pd.concat(
    [pd.DataFrame(columns=train.columns), synthetic_rows],
    ignore_index=True,
)

In [34]:
# Preview the first rows of the placeholder table.
dummy_df.head()

Unnamed: 0,PatientID,SeriesInstanceUID,StudyDate,CoordX,CoordY,CoordZ,LesionID,AnnotationID,NoduleID,Age_at_StudyDate,Gender,label
0,,,,,,,,1_2_0,,,,0
1,,,,,,,,1_2_1,,,,0
2,,,,,,,,1_2_2,,,,0
3,,,,,,,,1_2_3,,,,1
4,,,,,,,,1_2_4,,,,0


In [33]:
# Write the placeholder table next to the other volume data.
new_csv_location = volumes_dir / "train_smote.csv"
# exist_ok=True replaces the separate exists() check, which could race with
# another process creating the directory in between.
new_csv_location.parent.mkdir(parents=True, exist_ok=True)
dummy_df.to_csv(str(new_csv_location), index=False)

In [36]:
# Write every resampled row to disk as an image block plus a metadata file,
# named after its synthetic AnnotationID ("1_2_<row position>").
blocks_dir = volumes_dir / "luna25_nodule_blocks"
image_dir = blocks_dir / "image"
metadata_dir = blocks_dir / "metadata"
# np.save does not create missing directories, so make sure both targets exist.
for target_dir in (image_dir, metadata_dir):
    target_dir.mkdir(parents=True, exist_ok=True)

# Placeholder geometry shared by all synthetic samples; built once because it
# does not change between iterations.
dummy_metadata = {
    "origin": np.array([0.0, 0.0, 0.0]),
    "spacing": np.array([1.0, 1.0, 1.0]),
    "transform": np.eye(3)
}

for i in range(len(resampled_df)):
    # Every column except the last one (the label) holds the flattened image;
    # the 64x64 shape is hard-coded -- presumably it mirrors config.SIZE_PX, TODO confirm.
    new_image = resampled_df.iloc[i, :-1].values.reshape(1, 64, 64)
    # Repeat the single 2-D slice 64 times along a new first axis -> (64, 64, 64).
    new_image = np.stack([new_image[0]] * 64, axis=0)

    new_AnnotationID = f"1_2_{str(i)}"
    new_image_location = image_dir / f"{new_AnnotationID}.npy"
    new_metadata_location = metadata_dir / f"{new_AnnotationID}.npy"

    np.save(str(new_image_location), new_image)
    # A dict is stored as a pickled object array; loading it needs allow_pickle=True.
    np.save(str(new_metadata_location), dummy_metadata)