1. Load the CSV file into a DataFrame.
2. Parse the "Radiomics" column, as it contains JSON data.
3. Remove columns with the same values across all rows.

In [98]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# create random seed for reproducibility
random_state = 42

# Load the data from DF_Radiomics_noduls_with_diagnose.csv
file_path = "DF_Radiomics_noduls_with_diagnose.csv"
data = pd.read_csv(file_path)


# Convert the 'Labels' column to an integer
data['Labels'] = data['Labels'].astype(int)

# drop all rows where the label == 0
data = data[data.Labels != 0]

# Parse the JSON in the 'Radiomics' column
data['Radiomics'] = data['Radiomics'].apply(json.loads)

# Convert the 'Radiomics' column into separate columns
radiomics_data = pd.json_normalize(data['Radiomics'])


# Drop the original 'Radiomics' column
data = data.drop('Radiomics', axis=1)


# Reset the indices of both DataFrames
data = data.reset_index(drop=True)
radiomics_data = radiomics_data.reset_index(drop=True)

# Combine the data with the new radiomics columns
data = pd.concat([data, radiomics_data], axis=1)

# Remove columns with the same value across all rows
data = data.loc[:, (data != data.iloc[0]).any()]

#remove columns with all NaN values
data = data.dropna(axis=1, how='all')

print(data.shape)


(309, 103)


In [99]:
data.head()

Unnamed: 0,Patient,Node,Labels,diagnostics_Image-original_Hash,diagnostics_Image-original_Spacing,diagnostics_Image-original_Size,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_Hash,...,original_gldm_GrayLevelNonUniformity,original_gldm_GrayLevelVariance,original_gldm_HighGrayLevelEmphasis,original_gldm_LargeDependenceEmphasis,original_gldm_LargeDependenceHighGrayLevelEmphasis,original_gldm_LargeDependenceLowGrayLevelEmphasis,original_gldm_LowGrayLevelEmphasis,original_gldm_SmallDependenceEmphasis,original_gldm_SmallDependenceHighGrayLevelEmphasis,original_gldm_SmallDependenceLowGrayLevelEmphasis
0,LIDC-IDRI-0068,Node_N1,3,bea2c9750ea59a0bebb6d3bd63ffacc40fcf6a28,"[0.683594, 0.683594, 1.25]","[512, 512, 261]",-1026.065264,-3024.0,3071.0,0506d1d0d6522eddd1640c8ea75c2fc5a9266270,...,7.355556,60.706173,469.644444,23.444444,16578.377778,0.053875,0.021012,0.488461,152.929922,0.019809
1,LIDC-IDRI-0068,Node_N1,3,bea2c9750ea59a0bebb6d3bd63ffacc40fcf6a28,"[0.683594, 0.683594, 1.25]","[512, 512, 261]",-1026.065264,-3024.0,3071.0,9d7da356d43e2f7ad7f374f6c193e97f6088d7c7,...,7.467153,72.801002,471.051095,17.49635,13573.328467,0.11065,0.024328,0.494688,165.356306,0.010062
2,LIDC-IDRI-0068,Node_N1,3,bea2c9750ea59a0bebb6d3bd63ffacc40fcf6a28,"[0.683594, 0.683594, 1.25]","[512, 512, 261]",-1026.065264,-3024.0,3071.0,c0a43747a23d26b107e21614525f2fd8870ffefc,...,7.685185,43.527006,277.787037,20.37037,9310.490741,0.084481,0.031811,0.463956,84.174037,0.027819
3,LIDC-IDRI-0068,Node_N1,3,bea2c9750ea59a0bebb6d3bd63ffacc40fcf6a28,"[0.683594, 0.683594, 1.25]","[512, 512, 261]",-1026.065264,-3024.0,3071.0,72a09dc3f5d5d146b13402b8ef109422cc3f38a5,...,6.78022,35.367709,229.21978,18.78022,7065.923077,0.084783,0.026368,0.465301,67.725183,0.021973
4,LIDC-IDRI-0072,Node_N1,1,54705f26f9320581c90452445aa820fe9630d5e9,"[0.732422, 0.732422, 1.25]","[512, 512, 305]",-871.93633,-3024.0,3071.0,05efcefff38c73903c3d7839bb987a49176f6068,...,629.334146,45.147393,1253.131545,28.918031,43475.541623,0.020967,0.001319,0.262518,254.476429,0.000632


In [100]:
#remove hash columns
data = data.drop(['diagnostics_Image-original_Hash', 'diagnostics_Mask-original_Hash'], axis=1)

# ok looks like all the objeckt columns except of "Patient" & "Node" are in this form [0.683594, 0.683594, 1.25] which is a list of multiple floats
# exploade them into multiple columns

object_columns = data.select_dtypes(include=['object']).columns.tolist()

# Remove 'Patient' and 'Node' from the list
object_columns.remove('Patient')
object_columns.remove('Node')

# Explode the lists in each object column into multiple columns
for column in object_columns:
    # Convert each list to a Series and expand it into multiple columns
    expanded_columns = data[column].apply(pd.Series)
    
    # Rename the expanded columns to have the original column name as a prefix
    expanded_columns = expanded_columns.rename(columns=lambda x: f"{column}_{x}")
    
    # Drop the original column from the DataFrame
    data = data.drop(column, axis=1)
    
    # Concatenate the expanded columns to the DataFrame
    data = pd.concat([data, expanded_columns], axis=1)

In [103]:
# Create a stratified split
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['Labels'])

# Save the data to CSV files
train_data.to_csv('DF_Radiomics_noduls_with_diagnose_train_data.csv', index=False)
test_data.to_csv('DF_Radiomics_noduls_with_diagnose_test_data.csv', index=False)

In [104]:
print("Train data:", train_data.shape)
print("Test data:", test_data.shape)

Train data: (247, 118)
Test data: (62, 118)


In [106]:
train_data.head()

Unnamed: 0,Patient,Node,Labels,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,diagnostics_Mask-original_VolumeNum,original_firstorder_10Percentile,original_firstorder_90Percentile,...,diagnostics_Mask-original_BoundingBox_2,diagnostics_Mask-original_BoundingBox_3,diagnostics_Mask-original_BoundingBox_4,diagnostics_Mask-original_BoundingBox_5,diagnostics_Mask-original_CenterOfMassIndex_0,diagnostics_Mask-original_CenterOfMassIndex_1,diagnostics_Mask-original_CenterOfMassIndex_2,diagnostics_Mask-original_CenterOfMass_0,diagnostics_Mask-original_CenterOfMass_1,diagnostics_Mask-original_CenterOfMass_2
163,LIDC-IDRI-0247,Node_N1,1,-766.499705,-2048.0,3071.0,26,1,-417.5,134.5,...,83,7,6,1,151.153846,279.769231,83.0,-65.528846,14.855769,-107.75
192,LIDC-IDRI-0258,Node_N1,2,-712.560507,-2048.0,3071.0,253,1,-511.0,-98.0,...,87,13,13,3,351.913043,354.434783,87.924901,56.538859,69.211957,-107.937747
228,LIDC-IDRI-0276,Node_N1,1,-774.785474,-2048.0,3071.0,355,1,-289.2,194.6,...,51,13,14,5,352.177465,357.143662,53.08169,68.62478,71.116637,-224.295775
158,LIDC-IDRI-0246,Node_N1,1,-753.441353,-2048.0,3071.0,178,1,-448.9,178.0,...,93,12,12,3,308.140449,174.146067,94.179775,40.034726,-81.148385,-149.800562
225,LIDC-IDRI-0275,Node_N1,1,-734.181876,-2048.0,3071.0,25,1,-585.4,-321.8,...,64,5,5,2,126.48,369.96,64.64,-90.428065,84.579872,-252.9


Scaling the data

In [109]:
from sklearn.preprocessing import StandardScaler

# Get all column names
all_columns = train_data.columns.tolist()

# Exclude the first three columns
features = all_columns[3:]

# Create a stratified split
train_data, test_data = train_test_split(data, test_size=0.2, stratify=data['Labels'])

# Create a scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
train_data[features] = scaler.fit_transform(train_data[features])
test_data[features] = scaler.transform(test_data[features])

# Save the data to CSV files
train_data.to_csv('DF_Radiomics_noduls_with_diagnose_train_data_scaled.csv', index=False)
test_data.to_csv('DF_Radiomics_noduls_with_diagnose_test_data_scaled.csv', index=False)

# Fully Connected Neural Network

In [111]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# Assuming 'data' is your pandas DataFrame
# Ensure the DataFrame only contains numeric values
data = data.apply(pd.to_numeric, errors='coerce')

# Split data into features and labels
X = data.drop('Labels', axis=1).values
y = data['Labels'].values

# Convert to PyTorch tensors
X_tensor = torch.tensor(X).float()
y_tensor = torch.tensor(y).float()

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, stratify=y_tensor)

# Create TensorDatasets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

In [112]:
print("Train data:", train_dataset.tensors[0].shape)

Train data: torch.Size([247, 117])


In [124]:
# Model
class FCNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(FCNN, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            #nn.Dropout(0.5),  # Dropout for regularization
            nn.Linear(hidden_size, hidden_size*2),
            nn.ReLU(),
            nn.Linear(hidden_size*2, hidden_size),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(hidden_size, output_size)
        )

    def forward(self, x):
        return self.net(x)

# Hyperparameters
input_size = train_dataset.tensors[0].shape[1]  # Get the number of features from your dataset
hidden_size = 64  # You can tune this
output_size = 3   # 3 labels 
learning_rate = 0.0001
batch_size = 32
epochs = 3  # Adjust based on your runtime requirement
clip_value = 1  # for gradient clipping

# Initialize model, loss function, and optimizer
model = FCNN(input_size, hidden_size, output_size).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)  # L2 regularization

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [122]:
#check if cuda is available, print the gpu model name
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(torch.cuda.get_device_name(0))
    # Move model to the device
    model = model.to(device)

NVIDIA GeForce RTX 3090


In [123]:
# Training loop
model.fc = nn.Linear(num_features, output_size)  # 'num_output_neurons' is the number of output neurons in your linear layer

# Training loop
model.train()
for epoch in range(epochs):
    for inputs, targets in train_loader:
        # Move inputs and targets to the device
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        # Convert targets to LongTensor
        targets = targets.long()

        loss = criterion(outputs.squeeze(), targets)
        loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
        
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [6,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [7,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [10,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [11,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [18,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [21,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:250: nll_lo

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [116]:
# Evaluate the model
# Evaluation function
def evaluate(model, data_loader, device):
    model.eval()  # Set the model to evaluation mode
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  # Disable gradient calculations
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)  # Get the index of the max log-probability

            total_predictions += targets.size(0)
            correct_predictions += (predicted == targets).sum().item()

    accuracy = correct_predictions / total_predictions
    return accuracy

# Use the function
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
accuracy = evaluate(model, test_loader, device)
print(f'Accuracy: {accuracy * 100}%')

Accuracy: 0.0%
