# Index

1. [Imports](#Imports)
2. [Dataset Importation](#Dataset-Importation)
3. [Data Exploration](#Data-Exploration)
4. [Data Cleaning](#Data-Cleaning)
5. [Sampling patients to reduce data size](#Sampling-patients-to-reduce-data-size)
   - [Checking distribution from original to sampled](#Checking-distribution-from-original-to-sampled)
   - [Final Patients](#Final-Patients)
6. [Preprocessing (patient data)](#Preprocessing-patient-data)
   - [Demographic Data](#Demographic-Data)
7. [Creating the Data Set (Images)](#Creating-the-Data-Set-Images)
     - [Preprocess images](#Preprocess-images)
     - [Split the data](#Split-the-data)
8. [Data Augmentation for Training and Validation](#Data-Augmentation-for-Training-and-Validation)
   - [Visualizing the final augmented images and the bboxes](#Visualizing-the-final-augmented-images-and-the-bboxes)
   - [Dataset](#Dataset)

## Imports

In [1]:
# OS and File Management
import os
import shutil
import random
from glob import glob
import pickle
import xml.etree.ElementTree as ET
import re
import os
from collections import defaultdict
import re

# Scientific Computing and Data Processing
import numpy as np
import pandas as pd

# Image Processing and Augmentation
import cv2
import pydicom
import albumentations as A
from albumentations import (
    Compose, HorizontalFlip, RandomBrightnessContrast, Affine,
    GaussianBlur, CLAHE, RandomGamma, GaussNoise
)

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Machine Learning Utilities
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Custom Utilities
from VisualizationTools.get_data_from_XML import XML_preprocessor, get_category
from VisualizationTools.get_gt import get_gt
from VisualizationTools.getUID import getUID_path
from VisualizationTools.utils import loadFileInformation
from functions_DataPreprocessing import *

import os
import shutil
from collections import defaultdict
from glob import glob
import os
import cv2
import random
import numpy as np
from glob import glob
from tqdm import tqdm
from collections import defaultdict
from albumentations import (
    Compose, HorizontalFlip, RandomBrightnessContrast, Affine,
    GaussianBlur, CLAHE, RandomGamma
)

  check_for_updates()


# Preprocessing (Image data)

In [2]:
# Load the clinical statistics spreadsheet into `df`.
# NOTE(review): the path is absolute and user-specific; it has to be adjusted
# before this cell can run on another machine.
df = pd.read_excel('/Users/catarinasilva/Desktop/Master Thesis/lung_cancer/statistics_clinical.xlsx')

In [3]:
# IDs of the patients kept for the analysis, listed in their original order
# and laid out one prefix group (E, B, A, G) at a time.
patient_ids = [
    # 'E' prefix
    'E0001', 'E0002', 'E0003', 'E0004', 'E0005',
    # 'B' prefix
    'B0001', 'B0002', 'B0003', 'B0004', 'B0005', 'B0006', 'B0007', 'B0008',
    'B0009', 'B0011', 'B0013', 'B0014', 'B0015', 'B0016', 'B0017', 'B0018',
    'B0019', 'B0020', 'B0021', 'B0022', 'B0023', 'B0024', 'B0025', 'B0026',
    'B0027', 'B0028', 'B0031', 'B0033', 'B0034', 'B0036', 'B0037', 'B0038',
    'B0040', 'B0041', 'B0042', 'B0043', 'B0044',
    # 'A' prefix
    'A0001', 'A0003', 'A0005', 'A0006', 'A0008', 'A0010', 'A0012', 'A0013',
    'A0014', 'A0016', 'A0017', 'A0019', 'A0021', 'A0022', 'A0024', 'A0026',
    'A0027', 'A0029', 'A0030', 'A0032', 'A0036', 'A0037', 'A0039', 'A0042',
    'A0044', 'A0045', 'A0047', 'A0048', 'A0051', 'A0053', 'A0057', 'A0059',
    'A0061', 'A0066', 'A0068', 'A0073', 'A0075', 'A0077', 'A0086', 'A0089',
    'A0091', 'A0092', 'A0095', 'A0097', 'A0103', 'A0105', 'A0106', 'A0108',
    'A0109', 'A0112', 'A0118', 'A0120', 'A0121', 'A0125', 'A0128', 'A0132',
    'A0135', 'A0137', 'A0140', 'A0147', 'A0149', 'A0157', 'A0160', 'A0162',
    'A0163', 'A0168', 'A0169', 'A0170', 'A0171', 'A0179', 'A0182', 'A0183',
    'A0184', 'A0185', 'A0187', 'A0189', 'A0192',
    # 'G' prefix
    'G0001', 'G0002', 'G0003', 'G0004', 'G0005', 'G0006', 'G0007', 'G0008',
    'G0010', 'G0011', 'G0012', 'G0013', 'G0014', 'G0015', 'G0017', 'G0018',
    'G0019', 'G0024', 'G0025', 'G0026', 'G0028', 'G0029', 'G0030', 'G0031',
    'G0032', 'G0033', 'G0036', 'G0039', 'G0040', 'G0041', 'G0042', 'G0043',
    'G0044', 'G0046', 'G0049', 'G0050', 'G0056', 'G0062',
]

In [4]:
# Keep only the rows whose NewPatientID appears in patient_ids.
df_final = df[df['NewPatientID'].isin(patient_ids)]

In [5]:
df_final.shape

(157, 10)

In [6]:
df_final.describe()

Unnamed: 0,No.,Age,weight (kg),N-Stage,Smoking History
count,157.0,157.0,155.0,157.0,157.0
mean,192.006369,60.076433,65.046452,1.280255,0.477707
std,114.528996,10.182653,11.675917,1.270142,0.501101
min,1.0,28.0,35.5,0.0,0.0
25%,85.0,54.0,57.0,0.0,0.0
50%,253.0,61.0,65.0,1.0,0.0
75%,293.0,67.0,71.0,3.0,1.0
max,355.0,83.0,98.0,3.0,1.0


In [7]:
df_final.describe(include='object')

Unnamed: 0,NewPatientID,Sex,T-Stage,Ｍ-Stage,Histopathological grading
count,157,157,157,157,82
unique,157,2,10,5,7
top,A0001,M,1c,0,G3
freq,1,91,34,88,54


In [8]:
# Build a per-column overview of missing values in df_final: the NaN count,
# its share of all rows (formatted as a percentage) and the total row count.
total_observations = len(df_final)
missing_per_column = df_final.isna().sum()

nan_info_list = [
    {
        'Column Name': column_name,
        'NaN Count': missing,
        'Percentage of NaNs': f"{(missing / total_observations) * 100:.2f}%",
        'Total Observations': total_observations,
    }
    for column_name, missing in missing_per_column.items()
]

nan_info = pd.DataFrame(nan_info_list)
display(nan_info)

Unnamed: 0,Column Name,NaN Count,Percentage of NaNs,Total Observations
0,No.,0,0.00%,157
1,NewPatientID,0,0.00%,157
2,Sex,0,0.00%,157
3,Age,0,0.00%,157
4,weight (kg),2,1.27%,157
5,T-Stage,0,0.00%,157
6,N-Stage,0,0.00%,157
7,Ｍ-Stage,0,0.00%,157
8,Histopathological grading,75,47.77%,157
9,Smoking History,0,0.00%,157


In [9]:
df_final['T-Stage'].value_counts()

T-Stage
1c    34
3     31
2     29
2a    21
4     15
1b    14
1a     6
2b     5
is     1
1      1
Name: count, dtype: int64

In [10]:
df_final['N-Stage'].value_counts()

N-Stage
0    60
3    50
1    43
2     4
Name: count, dtype: int64

In [11]:
df_final['Ｍ-Stage'].value_counts()

Ｍ-Stage
0     88
1a    21
1b    19
1     19
1c    10
Name: count, dtype: int64

In [12]:
# Rename the ID, age and sex columns to 'PatientID', 'age' and 'gender',
# the same names the NSCLC-Radiomics clinical table uses for those fields.
df_final = df_final.rename(columns={'NewPatientID': 'PatientID', 'Age': 'age', 'Sex': 'gender'})

In [13]:
df_final.columns

Index(['No.', 'PatientID', 'gender', 'age', 'weight (kg)', 'T-Stage',
       'N-Stage', 'Ｍ-Stage', 'Histopathological grading', 'Smoking History'],
      dtype='object')

### Demographic Data

In [14]:
# Load the NSCLC-Radiomics (Lung1) clinical CSV. This rebinds `df`, so the
# spreadsheet frame previously held under that name is no longer reachable
# through it.
# NOTE(review): the path is absolute and user-specific; adjust it before
# running on another machine.
df = pd.read_csv('/Users/catarinasilva/Desktop/Master Thesis/NSCLC data/NSCLC-Radiomics-Lung1.clinical-version3-Oct-2019.csv')

In [15]:
# PatientIDs selected for class G.
selected_patients_g = [
    'LUNG1-007', 'LUNG1-010', 'LUNG1-014', 'LUNG1-027',
    'LUNG1-050', 'LUNG1-061', 'LUNG1-063', 'LUNG1-064',
    'LUNG1-078', 'LUNG1-082', 'LUNG1-086', 'LUNG1-098',
    'LUNG1-122', 'LUNG1-135', 'LUNG1-145', 'LUNG1-146',
    'LUNG1-150', 'LUNG1-167', 'LUNG1-202', 'LUNG1-206',
    'LUNG1-222', 'LUNG1-253', 'LUNG1-264', 'LUNG1-266',
    'LUNG1-295', 'LUNG1-296', 'LUNG1-302', 'LUNG1-321',
    'LUNG1-325', 'LUNG1-326', 'LUNG1-329', 'LUNG1-334',
    'LUNG1-339', 'LUNG1-346', 'LUNG1-347', 'LUNG1-355',
    'LUNG1-356', 'LUNG1-358', 'LUNG1-359', 'LUNG1-362',
    'LUNG1-365', 'LUNG1-366', 'LUNG1-369', 'LUNG1-371',
    'LUNG1-374', 'LUNG1-378', 'LUNG1-379', 'LUNG1-380',
    'LUNG1-383', 'LUNG1-386', 'LUNG1-389', 'LUNG1-391',
    'LUNG1-399', 'LUNG1-404', 'LUNG1-413', 'LUNG1-420',
]

# Keep a row when the patient is one of the selected class-G IDs or when its
# histology is 'large cell'.
is_selected_g = df['PatientID'].isin(selected_patients_g)
is_large_cell = df['Histology'] == 'large cell'
filtered_df = df[is_selected_g | is_large_cell]

# Show the resulting frame.
display(filtered_df)

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1
6,LUNG1-007,81.5288,2.0,2,0,IIIa,squamous cell carcinoma,male,137,1
9,LUNG1-010,71.0554,4.0,3,0,IIIb,squamous cell carcinoma,female,2119,0
13,LUNG1-014,66.7707,4.0,0,0,IIIb,squamous cell carcinoma,male,1247,1
...,...,...,...,...,...,...,...,...,...,...
390,LUNG1-391,60.0822,2.0,0,0,I,squamous cell carcinoma,female,256,1
398,LUNG1-399,77.7973,1.0,2,0,IIIa,squamous cell carcinoma,female,2835,0
403,LUNG1-404,74.2356,3.0,2,0,IIIa,squamous cell carcinoma,male,280,1
412,LUNG1-413,60.5315,4.0,2,0,IIIb,squamous cell carcinoma,female,246,1


In [16]:
filtered_df.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1
6,LUNG1-007,81.5288,2.0,2,0,IIIa,squamous cell carcinoma,male,137,1
9,LUNG1-010,71.0554,4.0,3,0,IIIb,squamous cell carcinoma,female,2119,0
13,LUNG1-014,66.7707,4.0,0,0,IIIb,squamous cell carcinoma,male,1247,1


In [17]:
filtered_df.columns

Index(['PatientID', 'age', 'clinical.T.Stage', 'Clinical.N.Stage',
       'Clinical.M.Stage', 'Overall.Stage', 'Histology', 'gender',
       'Survival.time', 'deadstatus.event'],
      dtype='object')

In [18]:
filtered_df.shape

(170, 10)

In [19]:
filtered_df['Histology'].value_counts()

Histology
large cell                 114
squamous cell carcinoma     56
Name: count, dtype: int64

In [20]:
# Build a per-column overview of missing values in filtered_df: the NaN count,
# its share of all rows (formatted as a percentage) and the total row count.
total_observations = len(filtered_df)
missing_per_column = filtered_df.isna().sum()

nan_info_list = [
    {
        'Column Name': column_name,
        'NaN Count': missing,
        'Percentage of NaNs': f"{(missing / total_observations) * 100:.2f}%",
        'Total Observations': total_observations,
    }
    for column_name, missing in missing_per_column.items()
]

nan_info = pd.DataFrame(nan_info_list)
display(nan_info)

Unnamed: 0,Column Name,NaN Count,Percentage of NaNs,Total Observations
0,PatientID,0,0.00%,170
1,age,5,2.94%,170
2,clinical.T.Stage,0,0.00%,170
3,Clinical.N.Stage,0,0.00%,170
4,Clinical.M.Stage,0,0.00%,170
5,Overall.Stage,1,0.59%,170
6,Histology,0,0.00%,170
7,gender,0,0.00%,170
8,Survival.time,0,0.00%,170
9,deadstatus.event,0,0.00%,170


In [21]:
filtered_df.isna().sum()

PatientID           0
age                 5
clinical.T.Stage    0
Clinical.N.Stage    0
Clinical.M.Stage    0
Overall.Stage       1
Histology           0
gender              0
Survival.time       0
deadstatus.event    0
dtype: int64

In [22]:
# Select the rows of filtered_df that are missing a value in at least one of
# the two columns checked here, then show them.
columns_to_check = ['Overall.Stage', 'age']
has_missing_value = filtered_df[columns_to_check].isna().any(axis=1)
rows_with_nan = filtered_df[has_missing_value]

display(rows_with_nan)

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
271,LUNG1-272,60.1396,5.0,2,0,,large cell,male,288,1
274,LUNG1-275,,2.0,3,0,IIIb,large cell,male,173,1
302,LUNG1-303,,2.0,0,0,I,large cell,male,24,1
307,LUNG1-308,,2.0,1,0,II,large cell,female,213,1
338,LUNG1-339,,4.0,2,0,IIIb,squamous cell carcinoma,male,120,1
353,LUNG1-354,,1.0,2,0,IIIa,large cell,female,617,1


In [23]:
filtered_df.describe()

Unnamed: 0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Survival.time,deadstatus.event
count,165.0,170.0,170.0,170.0,170.0,170.0
mean,67.681981,2.482353,1.594118,0.035294,1071.370588,0.870588
std,10.231576,1.152427,1.194153,0.324432,1041.80046,0.336647
min,33.6849,1.0,0.0,0.0,10.0,0.0
25%,60.6959,2.0,0.0,0.0,265.25,1.0
50%,68.1068,2.0,2.0,0.0,637.0,1.0
75%,74.9295,4.0,2.75,0.0,1595.75,1.0
max,91.7043,5.0,4.0,3.0,4328.0,1.0


In [24]:
filtered_df.describe(include='object')

Unnamed: 0,PatientID,Overall.Stage,Histology,gender
count,170,169,170,170
unique,170,4,2,2
top,LUNG1-001,IIIb,large cell,male
freq,1,76,114,106


In [25]:
filtered_df['Histology'].value_counts()

Histology
large cell                 114
squamous cell carcinoma     56
Name: count, dtype: int64

In [26]:
# Rename the clinical staging columns to 'T-Stage', 'N-Stage' and 'Ｍ-Stage',
# the column names carried by the spreadsheet-derived frame.
# NOTE(review): the 'Ｍ-Stage' key starts with a fullwidth 'Ｍ' (U+FF2D), not
# an ASCII 'M'. A lookup for ASCII 'M-Stage' will not match this column —
# confirm which spelling downstream code expects.
filtered_df = filtered_df.rename(columns={'clinical.T.Stage': 'T-Stage', 'Clinical.N.Stage': 'N-Stage', 'Clinical.M.Stage': 'Ｍ-Stage'})

In [27]:
filtered_df.columns

Index(['PatientID', 'age', 'T-Stage', 'N-Stage', 'Ｍ-Stage', 'Overall.Stage',
       'Histology', 'gender', 'Survival.time', 'deadstatus.event'],
      dtype='object')

In [28]:
filtered_df['T-Stage'].value_counts()

T-Stage
2.0    64
4.0    50
1.0    38
3.0    17
5.0     1
Name: count, dtype: int64

In [29]:
filtered_df['N-Stage'].value_counts()

N-Stage
2    66
0    53
3    41
1     8
4     2
Name: count, dtype: int64

In [30]:
filtered_df['Ｍ-Stage'].value_counts()

Ｍ-Stage
0    168
3      2
Name: count, dtype: int64

In [31]:
filtered_df['Overall.Stage'].value_counts()

Overall.Stage
IIIb    76
IIIa    55
I       27
II      11
Name: count, dtype: int64

## Splitting

In [82]:
image_dir = '../../data/yolo/yolo_2datasets/patient_split_2datasets/train/images'

# Query the image counts for the T, N and M staging columns in turn and
# print each result as soon as it is available.
stage_counts = {}
for stage_col in ('T-Stage', 'N-Stage', 'M-Stage'):
    stage_counts[stage_col] = count_images_per_target_stage_TNM(
        image_dir, df_concat, stage_col=stage_col
    )
    print(f"{stage_col} image counts:", stage_counts[stage_col])

# Expose each result under its own name as well.
t_counts = stage_counts['T-Stage']
n_counts = stage_counts['N-Stage']
m_counts = stage_counts['M-Stage']

T-Stage image counts: {'1': 3562, '3': 1429, '0': 1802, '2': 2441}
N-Stage image counts: {'0': 3543, '3': 1967, '1': 2684, '2': 1040}
M-Stage image counts: {'1': 2866, '0': 6368}


In [83]:
image_dir = '../../data/yolo/yolo_2datasets/patient_split_2datasets/val/images'

# Query the image counts for the T, N and M staging columns in turn and
# print each result as soon as it is available.
stage_counts = {}
for stage_col in ('T-Stage', 'N-Stage', 'M-Stage'):
    stage_counts[stage_col] = count_images_per_target_stage_TNM(
        image_dir, df_concat, stage_col=stage_col
    )
    print(f"{stage_col} image counts:", stage_counts[stage_col])

# Expose each result under its own name as well.
t_counts = stage_counts['T-Stage']
n_counts = stage_counts['N-Stage']
m_counts = stage_counts['M-Stage']

T-Stage image counts: {'1': 683, '2': 542, '0': 362, '3': 250}
N-Stage image counts: {'3': 268, '0': 797, '1': 604, '2': 168}
M-Stage image counts: {'0': 1373, '1': 464}


In [84]:
image_dir = '../../data/yolo/yolo_2datasets/patient_split_2datasets/test/images'

# Query the image counts for the T, N and M staging columns in turn and
# print each result as soon as it is available.
stage_counts = {}
for stage_col in ('T-Stage', 'N-Stage', 'M-Stage'):
    stage_counts[stage_col] = count_images_per_target_stage_TNM(
        image_dir, df_concat, stage_col=stage_col
    )
    print(f"{stage_col} image counts:", stage_counts[stage_col])

# Expose each result under its own name as well.
t_counts = stage_counts['T-Stage']
n_counts = stage_counts['N-Stage']
m_counts = stage_counts['M-Stage']

T-Stage image counts: {'3': 167, '0': 752, '1': 642, '2': 126}
N-Stage image counts: {'3': 215, '0': 577, '1': 726, '2': 169}
M-Stage image counts: {'1': 323, '0': 1364}


## Augmentation

In [145]:
image_tnm_train_dir = '../../data/tnm/train/images'

# Query the image counts for the T, N and M staging columns in turn and
# print each result as soon as it is available.
stage_counts = {}
for stage_col in ('T-Stage', 'N-Stage', 'M-Stage'):
    stage_counts[stage_col] = count_images_per_target_stage_TNM(
        image_tnm_train_dir, df_concat, stage_col=stage_col
    )
    print(f"{stage_col} image counts:", stage_counts[stage_col])

# Expose each result under its own name as well.
t_counts = stage_counts['T-Stage']
n_counts = stage_counts['N-Stage']
m_counts = stage_counts['M-Stage']

T-Stage image counts: {'1': 3562, '3': 1429, '0': 1802, '2': 2441}
N-Stage image counts: {'0': 3543, '3': 1967, '1': 2684, '2': 1040}
M-Stage image counts: {'1': 2866, '0': 6368}


In [147]:
# Fetch the image count for every stage combination found in the training
# image folder.
tmn_image_counts = count_images_per_TMN_combination(image_tnm_train_dir, df_concat)

# List the combinations from most to fewest images; combinations with equal
# counts keep the order in which the mapping yields them.
ranked_combos = sorted(
    tmn_image_counts.items(),
    key=lambda item: item[1],
    reverse=True,
)
for combo, count in ranked_combos:
    print(f"TMN Combo {combo}: {count} images")

TMN Combo ('1', '1', '0'): 930 images
TMN Combo ('1', '0', '0'): 847 images
TMN Combo ('2', '1', '1'): 818 images
TMN Combo ('2', '0', '0'): 759 images
TMN Combo ('1', '3', '0'): 645 images
TMN Combo ('0', '0', '0'): 541 images
TMN Combo ('1', '2', '0'): 512 images
TMN Combo ('3', '0', '0'): 374 images
TMN Combo ('2', '0', '1'): 360 images
TMN Combo ('0', '3', '0'): 359 images
TMN Combo ('1', '0', '1'): 350 images
TMN Combo ('3', '2', '0'): 318 images
TMN Combo ('0', '1', '1'): 315 images
TMN Combo ('3', '1', '0'): 273 images
TMN Combo ('3', '3', '0'): 265 images
TMN Combo ('0', '0', '1'): 256 images
TMN Combo ('1', '3', '1'): 190 images
TMN Combo ('2', '3', '0'): 177 images
TMN Combo ('2', '3', '1'): 164 images
TMN Combo ('0', '3', '1'): 122 images
TMN Combo ('0', '2', '0'): 119 images
TMN Combo ('3', '1', '1'): 98 images
TMN Combo ('1', '1', '1'): 88 images
TMN Combo ('2', '2', '0'): 87 images
TMN Combo ('0', '1', '0'): 86 images
TMN Combo ('2', '1', '0'): 76 images
TMN Combo ('3', '

### Train

In [170]:
# Query the image counts for the T, N and M staging columns in turn and
# print each result as soon as it is available.
stage_counts = {}
for stage_col in ('T-Stage', 'N-Stage', 'M-Stage'):
    stage_counts[stage_col] = count_images_per_target_stage_TNM(
        image_tnm_train_dir, df_concat, stage_col=stage_col
    )
    print(f"{stage_col} image counts:", stage_counts[stage_col])

# Expose each result under its own name as well.
t_counts = stage_counts['T-Stage']
n_counts = stage_counts['N-Stage']
m_counts = stage_counts['M-Stage']

T-Stage image counts: {'1': 2862, '3': 2829, '0': 2802, '2': 2941}
N-Stage image counts: {'0': 2843, '2': 2640, '3': 2967, '1': 2984}
M-Stage image counts: {'1': 5566, '0': 5868}


### Val

In [254]:
image_tnm_val_dir = '../../data/tnm/val/images'

# Fetch the image count for every stage combination found in the validation
# image folder.
tmn_image_counts = count_images_per_TMN_combination(image_tnm_val_dir, df_concat)

# List the combinations from most to fewest images; combinations with equal
# counts keep the order in which the mapping yields them.
ranked_combos = sorted(
    tmn_image_counts.items(),
    key=lambda item: item[1],
    reverse=True,
)
for combo, count in ranked_combos:
    print(f"TMN Combo {combo}: {count} images")

TMN Combo ('2', '1', '1'): 311 images
TMN Combo ('3', '2', '0'): 246 images
TMN Combo ('1', '3', '1'): 212 images
TMN Combo ('2', '0', '1'): 146 images
TMN Combo ('0', '0', '0'): 141 images
TMN Combo ('0', '3', '1'): 116 images
TMN Combo ('3', '1', '1'): 79 images
TMN Combo ('3', '0', '0'): 79 images
TMN Combo ('0', '1', '0'): 78 images
TMN Combo ('1', '2', '0'): 68 images
TMN Combo ('0', '2', '0'): 66 images
TMN Combo ('1', '0', '0'): 54 images
TMN Combo ('2', '2', '0'): 48 images
TMN Combo ('3', '3', '0'): 46 images
TMN Combo ('1', '1', '0'): 36 images
TMN Combo ('1', '3', '0'): 33 images
TMN Combo ('0', '3', '0'): 21 images
TMN Combo ('2', '0', '0'): 17 images
TMN Combo ('2', '3', '0'): 10 images


In [256]:
# Query the image counts for the T, N and M staging columns in turn and
# print each result as soon as it is available.
stage_counts = {}
for stage_col in ('T-Stage', 'N-Stage', 'M-Stage'):
    stage_counts[stage_col] = count_images_per_target_stage_TNM(
        image_tnm_val_dir, df_concat, stage_col=stage_col
    )
    print(f"{stage_col} image counts:", stage_counts[stage_col])

# Expose each result under its own name as well.
t_counts = stage_counts['T-Stage']
n_counts = stage_counts['N-Stage']
m_counts = stage_counts['M-Stage']

T-Stage image counts: {'1': 403, '2': 532, '3': 450, '0': 422}
N-Stage image counts: {'3': 438, '0': 437, '1': 504, '2': 428}
M-Stage image counts: {'1': 864, '0': 943}
