In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import os

# Data Analysis

In [2]:
# Read the three training tables in one pass: radiomic features,
# clinical features, and the targets file.
data_radiomics, data_clinical, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'x_train/features/radiomics.csv',
        'x_train/features/clinical_data.csv',
        'y_train.csv',
    )
)

### Study of the clinical dataset.

In [3]:
# Preview the first five records of the clinical table.
data_clinical.head(n=5)

Unnamed: 0,PatientID,Histology,Mstage,Nstage,SourceDataset,Tstage,age
0,202,Adenocarcinoma,0,0,l2,2,66.0
1,371,LargeCell,0,2,l1,4,64.5722
2,246,SquamousCellCarcinoma,0,3,l1,2,66.0452
3,240,Nos,0,2,l1,3,59.3566
4,284,SquamousCellCarcinoma,0,3,l1,4,71.0554


In [4]:
# Normalise the headers first: strip leading/trailing white space so the
# column names can be referenced reliably below.
data_clinical.columns = data_clinical.columns.str.strip()

# Remove the unused 'SourceDataset' column, then discard every row that
# still contains at least one missing value.
data_clinical = data_clinical.drop(columns=['SourceDataset']).dropna()

In [5]:
# Count the rows that are exact duplicates of an earlier row
# (0 means every row is unique).
duplicate_row_count = data_clinical.duplicated().sum()
print(duplicate_row_count)

0


In [6]:
# Expand 'Histology' into one indicator column per category, stored as
# integers (0/1) rather than booleans.
data_clinical = pd.get_dummies(
    data=data_clinical,
    columns=['Histology'],
    dtype=int,
)

In [7]:
# Preview the clinical table after cleaning and one-hot encoding.
data_clinical.head(n=5)

Unnamed: 0,PatientID,Mstage,Nstage,Tstage,age,Histology_Adenocarcinoma,Histology_LargeCell,Histology_Nos,Histology_SquamousCellCarcinoma
0,202,0,0,2,66.0,1,0,0,0
1,371,0,2,4,64.5722,0,1,0,0
2,246,0,3,2,66.0452,0,0,0,1
3,240,0,2,3,59.3566,0,0,1,0
4,284,0,3,4,71.0554,0,0,0,1


### Study of the radiomic dataset.

In [8]:
# Preview the first five records of the radiomics table.
data_radiomics.head(n=5)

Unnamed: 0,PatientID,Compactness_1,Compactness_2,Maximum_Diameter,Spherical_Disproportion,Sphericity,Surface_Area,Surface_Volume_Area,Voxel_Volume
0,202,0.027815,0.274892,48.559242,1.537964,0.65021,5431.33321,0.275228,19786.0
1,371,0.023015,0.18821,75.703368,1.744961,0.573079,10369.568729,0.240727,43168.0
2,246,0.027348,0.26574,70.434367,1.55542,0.642913,10558.818691,0.200766,52655.0
3,240,0.026811,0.255406,46.8188,1.57612,0.634469,4221.412123,0.323878,13074.0
4,284,0.023691,0.199424,53.795911,1.71162,0.584242,5295.900331,0.327241,16237.0


In [9]:
# Number of missing entries per column of the radiomics table.
data_radiomics.isna().sum()

PatientID                  0
Compactness_1              0
Compactness_2              0
Maximum_Diameter           0
Spherical_Disproportion    0
Sphericity                 0
Surface_Area               0
Surface_Volume_Area        0
Voxel_Volume               0
dtype: int64

In [10]:
# Strip leading/trailing white space from the headers so the column names
# below match exactly.
data_radiomics.columns = data_radiomics.columns.str.strip()

# Keep only the radiomic features used later on; the three listed columns
# are discarded.
data_radiomics = data_radiomics.drop(
    columns=['Surface_Area', 'Compactness_1', 'Spherical_Disproportion']
)

In [11]:
# Preview the radiomics table once the unused columns are gone.
data_radiomics.head(n=5)

Unnamed: 0,PatientID,Compactness_2,Maximum_Diameter,Sphericity,Surface_Volume_Area,Voxel_Volume
0,202,0.274892,48.559242,0.65021,0.275228,19786.0
1,371,0.18821,75.703368,0.573079,0.240727,43168.0
2,246,0.26574,70.434367,0.642913,0.200766,52655.0
3,240,0.255406,46.8188,0.634469,0.323878,13074.0
4,284,0.199424,53.795911,0.584242,0.327241,16237.0


In [12]:
# Join clinical features, radiomic features and targets on 'PatientID'.
# Both merges use the default inner join, so only patients present in all
# three tables are kept.
data = (
    data_clinical
    .merge(data_radiomics, on='PatientID')
    .merge(y_train, on='PatientID')
)

### Include the survival time inside the dataset.

In [13]:
# Preview the first twenty rows of the merged table.
data.head(n=20)

Unnamed: 0,PatientID,Mstage,Nstage,Tstage,age,Histology_Adenocarcinoma,Histology_LargeCell,Histology_Nos,Histology_SquamousCellCarcinoma,Compactness_2,Maximum_Diameter,Sphericity,Surface_Volume_Area,Voxel_Volume,SurvivalTime,Event
0,202,0,0,2,66.0,1,0,0,0,0.274892,48.559242,0.65021,0.275228,19786.0,1378,0
1,371,0,2,4,64.5722,0,1,0,0,0.18821,75.703368,0.573079,0.240727,43168.0,379,1
2,246,0,3,2,66.0452,0,0,0,1,0.26574,70.434367,0.642913,0.200766,52655.0,573,1
3,240,0,2,3,59.3566,0,0,1,0,0.255406,46.8188,0.634469,0.323878,13074.0,959,0
4,284,0,3,4,71.0554,0,0,0,1,0.199424,53.795911,0.584242,0.327241,16237.0,2119,0
5,348,0,2,2,65.0212,0,0,0,1,0.341038,63.74951,0.698663,0.197602,43036.0,706,1
6,384,0,0,3,78.7105,0,0,0,1,0.284341,81.767964,0.657577,0.170328,80565.0,78,1
7,244,0,0,1,70.0,1,0,0,0,0.149037,47.180504,0.530189,0.563329,4295.0,1369,1
8,100,0,0,4,74.4504,1,0,0,0,0.279116,43.393548,0.653524,0.330283,11286.0,197,1
9,173,0,2,4,53.0842,1,0,0,0,0.120841,112.222992,0.494392,0.180436,159487.0,196,1


In [14]:
# Persist the merged table to disk; the row index is not written.
data.to_csv(path_or_buf='train.csv', index=False)

### Images

We need to separate the images of patients that were removed from the dataset (for example, rows dropped because of missing values) from the images we will actually use.

In [33]:
# Image path.
image_path = 'x_train/images/'
# Destination folder for the images whose patient is not in `data`.
discarded_image_path = 'x_train/images_ret/'
# os.rename fails when the target folder is missing, so create it up front.
os.makedirs(discarded_image_path, exist_ok=True)

# Build the ID lookup once instead of scanning the whole column per file.
known_patient_ids = set(data['PatientID'].tolist())

# Move away every image whose patient ID does not appear in the data.
for file in os.listdir(image_path):
    # Skip the macOS folder-metadata file; it has no patient ID to parse.
    if 'DS_Store' in file:
        continue
    # Get the PatientID: drop the extension and the 'patient_' prefix.
    patient_id = int(file.split('.')[0].replace('patient_', ''))

    # Only images of patients that are NOT in the data are discarded.
    # (The previous check was inverted and moved the images to keep.)
    if patient_id not in known_patient_ids:
        print('Patient ID not found in the data:', patient_id)
        # Move the file to the images_ret folder.
        os.rename(
            os.path.join(image_path, file),
            os.path.join(discarded_image_path, file),
        )