# Imports

In [1]:
import sys
import os
import pandas as pd


# Put the project root (two directories above the current working directory)
# at the front of the import path so the `src` package can be imported.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)


# NUMERICAL & IMAGE PROCESSING
import numpy as np
import cv2
import pydicom
import matplotlib.pyplot as plt


# RTSTRUCT PROCESSING
from src.preprocessing.rtstruct.rtstruct_io import (
    load_rtstruct,
    load_ct_images,
    read_dicom_image
)

from src.preprocessing.rtstruct.rtstruct_parsing import (
    extract_tumor_bboxes
)

from src.preprocessing.rtstruct.rtstruct_to_yolo import (
    save_yolo_format,
    save_images_as_jpeg,
    load_yolo_labels,
    draw_bboxes
)

from src.preprocessing.rtstruct.rtstruct_visualization import (
    visualize_bboxes,
    display_all_images_for_patient
)

from src.preprocessing.rtstruct.rtstruct_batch_processing import (
    process_all_patients
)



# NSCLC FILE PREPROCESSING
from src.utils.file_utils import (
    rename_files_in_folder,
    count_files_by_prefix,
    update_label_files
)

from src.utils.subject_utils import (
    count_files_and_patients,
    get_patient_images,
    get_patient_images_v2,       
    sample_patients_by_image_count,
    sample_patients
)


# NSCLC SPLITTING PIPELINE
from src.splitting.dataset_splitting import (
    split_data_nsclc,
    copy_sampled_files,
    copy_files,
    main
)

# Patient Processing
# NOTE(review): process_all_patients is already imported earlier in this cell
# from the same module, so this second import is redundant.
from src.preprocessing.rtstruct.rtstruct_batch_processing import process_all_patients

# Render matplotlib figures at 120 DPI by default.
plt.rcParams["figure.dpi"] = 120

# Demographic Data

## Read Data

In [13]:
# Clinical table, read from the CSV under data/ two directories above the working directory.
clinical_csv_path = '../../data/NSCLC-Radiomics-Lung1.clinical-version3-Oct-2019.csv'
df = pd.read_csv(clinical_csv_path)

In [14]:
# Preview the first rows of the clinical table (head() returns 5 rows by default).
df.head()

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1
1,LUNG1-002,83.8001,2.0,0,0,I,squamous cell carcinoma,male,155,1
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1
3,LUNG1-004,70.8802,2.0,1,0,II,squamous cell carcinoma,male,141,1
4,LUNG1-005,80.4819,4.0,2,0,IIIb,squamous cell carcinoma,male,353,1


## Data Exploration

In [15]:
# List the column labels of the clinical table.
df.columns

Index(['PatientID', 'age', 'clinical.T.Stage', 'Clinical.N.Stage',
       'Clinical.M.Stage', 'Overall.Stage', 'Histology', 'gender',
       'Survival.time', 'deadstatus.event'],
      dtype='object')

In [16]:
# Rows whose histology is either the literal label 'nos' or missing.
histology_is_nos_or_nan = df['Histology'].isin(['nos', np.nan])
df[histology_is_nos_or_nan]

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
12,LUNG1-013,65.3635,2.0,0,0,I,nos,male,3614,1
15,LUNG1-016,79.1129,2.0,0,0,I,nos,male,101,1
18,LUNG1-019,74.8200,2.0,0,0,I,nos,male,336,1
19,LUNG1-020,76.9692,2.0,3,0,IIIb,,male,139,1
20,LUNG1-021,54.6475,3.0,3,0,IIIb,,male,326,1
...,...,...,...,...,...,...,...,...,...,...
350,LUNG1-351,52.6434,4.0,2,0,IIIb,nos,female,463,1
352,LUNG1-353,78.3080,2.0,0,0,I,,male,182,1
393,LUNG1-394,72.2219,1.0,0,0,I,,male,344,1
401,LUNG1-402,68.6932,4.0,0,0,IIIb,,male,1617,1


In [17]:
# Drop rows whose histology is 'nos', missing, or 'adenocarcinoma';
# `df` is rebound to the remaining rows.
excluded_histologies = ['nos', np.nan, 'adenocarcinoma']
keep_rows = ~df['Histology'].isin(excluded_histologies)
df = df[keep_rows]

In [18]:
# (rows, columns) of `df` after the histology filter.
df.shape

(266, 10)

In [19]:
# Number of rows per remaining histology label (value_counts skips NaN by default).
df['Histology'].value_counts()

Histology
squamous cell carcinoma    152
large cell                 114
Name: count, dtype: int64

### Missing Values

In [20]:
def summarize_missing_values(frame):
    """Build a per-column missing-value report for ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``frame`` with the columns
        'Column Name', 'NaN Count', 'Percentage of NaNs' (two decimals
        followed by '%') and 'Total Observations' (the row count of ``frame``).
    """
    # A single boolean mask feeds both the counts and the percentages,
    # replacing the explicit Python loop over the columns.
    missing_mask = frame.isna()
    # The mean of a boolean column is the fraction of True values; on a
    # zero-row frame it is NaN, so no explicit division by the row count is needed.
    percent_missing = missing_mask.mean() * 100
    return pd.DataFrame({
        'Column Name': list(frame.columns),
        'NaN Count': missing_mask.sum().to_numpy(),
        'Percentage of NaNs': [f"{pct:.2f}%" for pct in percent_missing],
        'Total Observations': len(frame),
    })


nan_info = summarize_missing_values(df)

display(nan_info)

Unnamed: 0,Column Name,NaN Count,Percentage of NaNs,Total Observations
0,PatientID,0,0.00%,266
1,age,7,2.63%,266
2,clinical.T.Stage,0,0.00%,266
3,Clinical.N.Stage,0,0.00%,266
4,Clinical.M.Stage,0,0.00%,266
5,Overall.Stage,1,0.38%,266
6,Histology,0,0.00%,266
7,gender,0,0.00%,266
8,Survival.time,0,0.00%,266
9,deadstatus.event,0,0.00%,266


In [21]:
# Per-column count of missing values, returned as a Series indexed by column name.
df.isna().sum()

PatientID           0
age                 7
clinical.T.Stage    0
Clinical.N.Stage    0
Clinical.M.Stage    0
Overall.Stage       1
Histology           0
gender              0
Survival.time       0
deadstatus.event    0
dtype: int64

In [22]:
# Collect and show every row in which 'Overall.Stage' or 'age' is missing.
stage_missing = df['Overall.Stage'].isna()
age_missing = df['age'].isna()
rows_with_nan = df[stage_missing | age_missing]

display(rows_with_nan)

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
271,LUNG1-272,60.1396,5.0,2,0,,large cell,male,288,1
274,LUNG1-275,,2.0,3,0,IIIb,large cell,male,173,1
298,LUNG1-299,,1.0,0,0,IIIb,squamous cell carcinoma,male,1005,1
302,LUNG1-303,,2.0,0,0,I,large cell,male,24,1
307,LUNG1-308,,2.0,1,0,II,large cell,female,213,1
338,LUNG1-339,,4.0,2,0,IIIb,squamous cell carcinoma,male,120,1
340,LUNG1-341,,2.0,0,0,I,squamous cell carcinoma,male,1157,1
353,LUNG1-354,,1.0,2,0,IIIa,large cell,female,617,1


In [23]:
# Summary statistics; with default arguments on a mixed-dtype frame
# describe() reports the numeric columns only.
df.describe()

Unnamed: 0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Survival.time,deadstatus.event
count,259.0,266.0,266.0,266.0,266.0,266.0
mean,68.8131,2.62406,1.443609,0.022556,955.834586,0.894737
std,10.161052,1.109829,1.196937,0.259642,976.612993,0.307471
min,33.6849,1.0,0.0,0.0,10.0,0.0
25%,61.7857,2.0,0.0,0.0,258.0,1.0
50%,69.1034,2.0,2.0,0.0,551.0,1.0
75%,76.7242,4.0,2.0,0.0,1361.0,1.0
max,91.7043,5.0,4.0,3.0,4328.0,1.0


In [24]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include='object')

Unnamed: 0,PatientID,Overall.Stage,Histology,gender
count,266,265,266,266
unique,266,4,2,2
top,LUNG1-001,IIIb,squamous cell carcinoma,male
freq,1,119,152,183


In [25]:
# Number of rows per histology label.
# NOTE(review): same expression as an earlier cell of this section — possibly redundant.
df['Histology'].value_counts()

Histology
squamous cell carcinoma    152
large cell                 114
Name: count, dtype: int64

In [26]:
# Number of rows per 'clinical.T.Stage' value, most frequent first.
df['clinical.T.Stage'].value_counts()

clinical.T.Stage
2.0    102
4.0     86
1.0     43
3.0     34
5.0      1
Name: count, dtype: int64

In [27]:
# Number of rows per 'Clinical.N.Stage' value, most frequent first.
df['Clinical.N.Stage'].value_counts()

Clinical.N.Stage
2    98
0    95
3    53
1    17
4     3
Name: count, dtype: int64

In [28]:
# Number of rows per 'Clinical.M.Stage' value, most frequent first.
df['Clinical.M.Stage'].value_counts()

Clinical.M.Stage
0    264
3      2
Name: count, dtype: int64

In [29]:
# Number of rows per 'Overall.Stage' value (rows with a missing stage are not counted).
df['Overall.Stage'].value_counts()

Overall.Stage
IIIb    119
IIIa     80
I        38
II       28
Name: count, dtype: int64

## Sampled G patients + E patients (added to LUNG-PET-CT dataset)

In [30]:
# Hand-picked PatientIDs for class G.
selected_patients_g = [
    'LUNG1-007', 'LUNG1-010', 'LUNG1-014', 'LUNG1-027',
    'LUNG1-050', 'LUNG1-061', 'LUNG1-063', 'LUNG1-064',
    'LUNG1-078', 'LUNG1-082', 'LUNG1-086', 'LUNG1-098',
    'LUNG1-122', 'LUNG1-135', 'LUNG1-145', 'LUNG1-146',
    'LUNG1-150', 'LUNG1-167', 'LUNG1-202', 'LUNG1-206',
    'LUNG1-222', 'LUNG1-253', 'LUNG1-264', 'LUNG1-266',
    'LUNG1-295', 'LUNG1-296', 'LUNG1-302', 'LUNG1-321',
    'LUNG1-325', 'LUNG1-326', 'LUNG1-329', 'LUNG1-334',
    'LUNG1-339', 'LUNG1-346', 'LUNG1-347', 'LUNG1-355',
    'LUNG1-356', 'LUNG1-358', 'LUNG1-359', 'LUNG1-362',
    'LUNG1-365', 'LUNG1-366', 'LUNG1-369', 'LUNG1-371',
    'LUNG1-374', 'LUNG1-378', 'LUNG1-379', 'LUNG1-380',
    'LUNG1-383', 'LUNG1-386', 'LUNG1-389', 'LUNG1-391',
    'LUNG1-399', 'LUNG1-404', 'LUNG1-413', 'LUNG1-420',
]

# Keep a row when its PatientID is in the class-G selection OR its histology is
# 'large cell' (presumably the "E patients" of the section title — confirm).
in_selected_g = df['PatientID'].isin(selected_patients_g)
is_large_cell = df['Histology'].eq('large cell')
filtered_df = df[in_selected_g | is_large_cell]

# Show the resulting subset.
display(filtered_df)

Unnamed: 0,PatientID,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Overall.Stage,Histology,gender,Survival.time,deadstatus.event
0,LUNG1-001,78.7515,2.0,3,0,IIIb,large cell,male,2165,1
2,LUNG1-003,68.1807,2.0,3,0,IIIb,large cell,male,256,1
6,LUNG1-007,81.5288,2.0,2,0,IIIa,squamous cell carcinoma,male,137,1
9,LUNG1-010,71.0554,4.0,3,0,IIIb,squamous cell carcinoma,female,2119,0
13,LUNG1-014,66.7707,4.0,0,0,IIIb,squamous cell carcinoma,male,1247,1
...,...,...,...,...,...,...,...,...,...,...
390,LUNG1-391,60.0822,2.0,0,0,I,squamous cell carcinoma,female,256,1
398,LUNG1-399,77.7973,1.0,2,0,IIIa,squamous cell carcinoma,female,2835,0
403,LUNG1-404,74.2356,3.0,2,0,IIIa,squamous cell carcinoma,male,280,1
412,LUNG1-413,60.5315,4.0,2,0,IIIb,squamous cell carcinoma,female,246,1


In [31]:
# Missing-value report for `filtered_df`: one record per column holding the
# NaN count, its share of all rows, and the row count.
total_observations = len(filtered_df)

missing_per_column = [
    (name, filtered_df[name].isna().sum()) for name in filtered_df.columns
]

nan_info_list = [
    {
        'Column Name': name,
        'NaN Count': n_missing,
        'Percentage of NaNs': f"{(n_missing / total_observations) * 100:.2f}%",
        'Total Observations': total_observations,
    }
    for name, n_missing in missing_per_column
]

nan_info = pd.DataFrame(nan_info_list)

display(nan_info)

Unnamed: 0,Column Name,NaN Count,Percentage of NaNs,Total Observations
0,PatientID,0,0.00%,170
1,age,5,2.94%,170
2,clinical.T.Stage,0,0.00%,170
3,Clinical.N.Stage,0,0.00%,170
4,Clinical.M.Stage,0,0.00%,170
5,Overall.Stage,1,0.59%,170
6,Histology,0,0.00%,170
7,gender,0,0.00%,170
8,Survival.time,0,0.00%,170
9,deadstatus.event,0,0.00%,170


In [32]:
# Summary statistics of `filtered_df`; with default arguments on a mixed-dtype
# frame describe() reports the numeric columns only.
filtered_df.describe()

Unnamed: 0,age,clinical.T.Stage,Clinical.N.Stage,Clinical.M.Stage,Survival.time,deadstatus.event
count,165.0,170.0,170.0,170.0,170.0,170.0
mean,67.681981,2.482353,1.594118,0.035294,1071.370588,0.870588
std,10.231576,1.152427,1.194153,0.324432,1041.80046,0.336647
min,33.6849,1.0,0.0,0.0,10.0,0.0
25%,60.6959,2.0,0.0,0.0,265.25,1.0
50%,68.1068,2.0,2.0,0.0,637.0,1.0
75%,74.9295,4.0,2.75,0.0,1595.75,1.0
max,91.7043,5.0,4.0,3.0,4328.0,1.0


In [33]:
# Summary (count / unique / top / freq) of the object-dtype columns of `filtered_df`.
filtered_df.describe(include='object')

Unnamed: 0,PatientID,Overall.Stage,Histology,gender
count,170,169,170,170
unique,170,4,2,2
top,LUNG1-001,IIIb,large cell,male
freq,1,76,114,106


In [34]:
# Number of rows per histology label in `filtered_df`, most frequent first.
filtered_df['Histology'].value_counts()

Histology
large cell                 114
squamous cell carcinoma     56
Name: count, dtype: int64