# Neural Network approach

## - Loading the required libraries

In [1]:
## Imports

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

## -Reading and defining the dataset 

In [2]:
## Reading data

# Columns removed before modelling: the diagnostics_* metadata columns plus
# 'Sid' and 'Unnamed: 0'.
columns_to_drop = [
    'diagnostics_Mask-original_CenterOfMass',
    'diagnostics_Mask-original_CenterOfMassIndex',
    'diagnostics_Versions_PyRadiomics',
    'diagnostics_Versions_Numpy',
    'diagnostics_Versions_SimpleITK',
    'diagnostics_Versions_PyWavelet',
    'diagnostics_Versions_Python',
    'diagnostics_Configuration_Settings',
    'diagnostics_Configuration_EnabledImageTypes',
    'diagnostics_Image-original_Hash',
    'diagnostics_Image-original_Dimensionality',
    'diagnostics_Image-original_Spacing',
    'diagnostics_Image-original_Size',
    'diagnostics_Image-original_Mean',
    'diagnostics_Image-original_Minimum',
    'diagnostics_Image-original_Maximum',
    'diagnostics_Mask-original_Hash',
    'diagnostics_Mask-original_Spacing',
    'diagnostics_Mask-original_Size',
    'diagnostics_Mask-original_BoundingBox',
    'diagnostics_Mask-original_VolumeNum',
    'Sid',
    'Unnamed: 0',
]

# Load the CSV, discard every row containing a missing value (checked across
# all columns, including the ones dropped next), then remove the columns above.
data = (
    pd.read_csv("D:/DATASET LIDC-IDRI/processeddata/features.csv")
    .dropna()
    .drop(columns=columns_to_drop)
)

### Instead of deleting the entries with mal=3 or mapping them all to 0 or 1, we had the idea of mapping the "high" 3s to 1 and the "low" 3s to 0. We did that by mapping the malignancy scores with {1: 0, 2: 0, 4: 1, 5: 1} and separating the data into the rows labeled 0 or 1 and the rows rated 3. We then trained a model on the binary dataset and used it to predict a label for each of the rows rated 3.

In [3]:
## Separating the data into the 0/1 rows and the rows rated 3

# Work on a copy: a plain `df = data` would make both names refer to the same
# frame, so adding the helper column below would also modify `data`.
df = data.copy()

# Ratings 1 and 2 become 0, ratings 4 and 5 become 1; any rating missing from
# the dictionary (such as 3) becomes NaN.
df['malignancy_mapped'] = df['Malignancy'].map({1: 0, 2: 0, 4: 1, 5: 1})

# Rows that received a binary label
df01 = df[df['malignancy_mapped'].isin([0, 1])]

# Rows rated 3, which are labeled later by a model
df3 = df[df['Malignancy'] == 3]

In [4]:
# Columns that are targets or identifiers rather than model inputs
non_feature_columns = ['Malignancy', 'malignancy_mapped', 'Pid']

# Separate the features and labels for the 0-1 labeled instances
features_01 = df01.drop(columns=non_feature_columns)
labels_01 = df01['malignancy_mapped']

# Separate the features for label 3 instances
features_3 = df3.drop(columns=non_feature_columns)

# Train a model on 0-1 labeled instances.
# NOTE(review): only the 80% training part is used for fitting; X_test / y_test
# are never evaluated in this cell. The split is by row, not by patient.
# NOTE(review): the features are not standardized before the logistic
# regression - confirm that the solver converges within max_iter.
X_train, X_test, y_train, y_test = train_test_split(features_01, labels_01, test_size=0.2, random_state=50)
model = LogisticRegression(max_iter=1000)  # You can use any model of your choice
model.fit(X_train, y_train)

# Predict probabilities for label 3 instances
probs = model.predict_proba(features_3)

# Threshold to determine the 3+s and the 3-s: a row becomes 1 when the
# probability in the second column is at least the threshold, otherwise 0.
# NOTE(review): how 0.4885 was chosen is not documented here.
threshold = 0.4885
labels_3_mapped = np.where(probs[:, 1] >= threshold, 1, 0)
mapped_labels_3 = pd.Series(labels_3_mapped, index=df3.index)

# Combine all the labels, restoring the original row order
final_labels = pd.concat([labels_01, mapped_labels_3], axis=0).sort_index()

# Updating the dataset.
# errors='ignore' keeps this cell re-runnable: on a second run the two helper
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['malignancy_mapped', 'Malignancy'], errors='ignore')
# Assignment aligns on the index; rows without an entry in final_labels get NaN.
# NOTE(review): the labels of the rows rated 3 are produced by a model fitted on
# rows from all patients, including patients that may later be held out for
# testing - confirm that this is acceptable for the evaluation.
df['Malv2'] = final_labels

In [5]:
# Number of rows per final label value
df.loc[:, "Malv2"].value_counts()

0.0    7186
1.0    7162
Name: Malv2, dtype: int64

## -Splitting the data

#### Even though our dataset is built around slices, with one slice per row, we split the data by patient rather than by slice, so that slices from the same patient are never separated. This way, all the slices from a given patient end up in the same split (either train or test).


In [6]:
## Splitting patients into groups

# One entry per distinct patient id
all_patients = df['Pid'].unique()  # 732 (if we remove mal=3)

# Assign 20% of the patients to the test group and the rest to the train group
train_patients, test_patients = train_test_split(
    all_patients,
    test_size=0.2,
    random_state=50,
)

# Select the rows belonging to each patient group
in_train_group = df['Pid'].isin(train_patients)
in_test_group = df['Pid'].isin(test_patients)
train_data = df[in_train_group]
test_data = df[in_test_group]

In [7]:
## Train Test Split

# Patient id and target are not model inputs
non_feature_cols = ['Pid', 'Malv2']

# Targets
y_train = train_data['Malv2']
y_test = test_data['Malv2']

# Feature tables
X_train = train_data.drop(columns=non_feature_cols)
X_test = test_data.drop(columns=non_feature_cols)

# Standard scaling: the scaler is fitted on the training features only and
# then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## -Feature engineering

#### We are using the Random Forest algorithm to select the 75 best features

In [8]:
## Feature engineering (Random Forest)

# RandomForestClassifier is imported in the notebook's import cell together
# with the other dependencies.

# Fit a Random Forest on the training split only
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=50)
rf_classifier.fit(X_train, y_train)

# One importance value per feature column of X_train
feature_importances = rf_classifier.feature_importances_

# Column indices ordered from most to least important
feature_indices = feature_importances.argsort()[::-1]

# Keep the k highest-ranked columns (all of them if there are fewer than k)
k = 75
top_k_features_indices = feature_indices[:k]

# Apply the same column selection to both splits
X_train_selected = X_train[:, top_k_features_indices]
X_test_selected = X_test[:, top_k_features_indices]

In [9]:
## Apply the feature engineering

# From here on the working matrices hold only the selected feature columns
X_train, X_test = X_train_selected, X_test_selected

## -Neural Network

In [10]:
## NN approach

# Seed NumPy and TensorFlow so that weight initialisation, dropout and batch
# shuffling are repeatable between runs of this cell.
# NOTE(review): some hardware/kernels can still introduce small run-to-run
# differences - confirm if exact reproducibility is required.
np.random.seed(50)
tf.random.set_seed(50)

# Define the model
input_shape = X_train.shape[1]  # number of feature columns fed to the network

# Two hidden ReLU layers with dropout, followed by a single sigmoid output unit
model = Sequential([
    layers.Dense(128, activation='relu', input_shape=(input_shape,)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.002), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [11]:
## NN Results

# Score the trained network on the test set; the model was compiled with one
# metric (accuracy), so evaluate() yields the loss followed by the accuracy
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation
print(f'Test Accuracy: {test_accuracy:.2f}')

Test Accuracy: 0.71


In [14]:
## Find patient

# Patient id to look up
pid = "LIDC-IDRI-0101"

# Check both groups explicitly: an id that is missing from the train group is
# not necessarily in the test group - it may not be in the dataset at all.
if pid in train_patients:
    print("Patient in train set")
elif pid in test_patients:
    print("Patient in test set")
else:
    print("Patient not found in dataset")

Patient in train set
