## Neural Network approach

-Realize the data is inbalanced

-Testing an approach of eliminating all entries with label 3

-Performing data split by patients, in train, test and val sets

-Changing the labeling to binary classification

-Implementing Random Forest to perform feature engineering, selecting the 75 best features

-Defining, compiling and training and testing the NN model 

-Evaluating the model's performance 

-Trying SVM (with AdaBoost)

### - Loading the required libraries

In [1]:
## Imports

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

### -Reading and defining the dataset

In [2]:
## Reading data

data = pd.read_csv("D:/DATASET LIDC-IDRI/processeddata/features.csv")
data = data.dropna()
data=data.drop(['diagnostics_Mask-original_CenterOfMass','diagnostics_Mask-original_CenterOfMassIndex','diagnostics_Versions_PyRadiomics','diagnostics_Versions_Numpy','diagnostics_Versions_SimpleITK','diagnostics_Versions_PyWavelet','diagnostics_Versions_Python','diagnostics_Configuration_Settings','diagnostics_Configuration_EnabledImageTypes','diagnostics_Image-original_Hash','diagnostics_Image-original_Dimensionality','diagnostics_Image-original_Spacing','diagnostics_Image-original_Size','diagnostics_Image-original_Mean','diagnostics_Image-original_Minimum','diagnostics_Image-original_Maximum','diagnostics_Mask-original_Hash','diagnostics_Mask-original_Spacing','diagnostics_Mask-original_Size','diagnostics_Mask-original_BoundingBox','diagnostics_Mask-original_VolumeNum','Sid','Unnamed: 0'],axis=1)

### - Verifying class imbalance and erasing all entries with the label 3

In [3]:
## Class inbalance??

data["Malignancy"].value_counts()

3    4262
2    3286
5    2511
4    2228
1    2061
Name: Malignancy, dtype: int64

In [4]:
## Erasing entries with mal==3 (10082 left)
i = data[data['Malignancy'] == 3].index
data = data.drop(i)
data = data.reset_index(drop=True)

### -Splitting the data

In [5]:
## Splitting patients into groups

all_patients = data['Pid'].unique() #732 (if we remove mal=3)

# Spliting patients into train, test and val groups
train_patients, temp_patients = train_test_split(all_patients, test_size=0.3, random_state=42)
test_patients, val_patients = train_test_split(temp_patients, test_size=0.5, random_state=42)

# Creating the train, test and val datasets
train_data = data[data['Pid'].isin(train_patients)]
test_data = data[data['Pid'].isin(test_patients)]
val_data = data[data['Pid'].isin(val_patients)]

In [6]:
## Train Test Split

# Creating the train, test and val sets
X_train = train_data.drop(['Pid','Malignancy'], axis=1)
X_test = test_data.drop(['Pid','Malignancy'], axis=1)
X_val = val_data.drop(['Pid','Malignancy'], axis=1)
y_train = train_data['Malignancy']
y_test = test_data['Malignancy']
y_val = val_data['Malignancy']

# Mapping malignancy levels to binary labels
y_train = y_train.map({1: 0, 2: 0, 4: 1, 5: 1})
y_test = y_test.map({1: 0, 2: 0, 4: 1, 5: 1})
y_val = y_val.map({1: 0, 2: 0, 4: 1, 5: 1})

# Standard Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

### -Feature engineering

In [8]:
## Feature engineering (Random Forest)

from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit the classifier to the data
rf_classifier.fit(X_train, y_train)

# Get feature importances from the trained Random Forest model
feature_importances = rf_classifier.feature_importances_

# Sort features based on their importance
feature_indices = feature_importances.argsort()[::-1]  # Sort in descending order

# Select the top K important features
k = 75
top_k_features_indices = feature_indices[:k]

# Filter your data to keep only the selected features
X_train_selected = X_train[:, top_k_features_indices]
X_test_selected = X_test[:, top_k_features_indices]
X_val_selected = X_val[:, top_k_features_indices]

In [9]:
## Apply the feature engineering

X_train = X_train_selected
X_test = X_test_selected
X_val = X_val_selected
print(len(X_train[0]))

75


### -Neural Network

In [10]:
## NN aproach

# Define the model
input_shape = X_train.shape[1]

model = Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.002), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [11]:
## NN Results

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.2f}')

'''
sem 3: 0.62
com 3=0: 0.68
com 3=1: 0.62

Decidimos explorar a opção sem 3
50 features ANOVA: 0.62
50 features RF: 0.63
'''

Test Accuracy: 0.64


'\nsem 3: 0.62\ncom 3=0: 0.68\ncom 3=1: 0.62\n\nDecidimos explorar a opção sem 3\n50 features ANOVA: 0.62\n50 features RF: 0.63\n'

In [12]:
## K-NN approach (Reused algorithms from ML1 project)

# We decided to make that in a separate notebook since it did not perform the best
# It was very coputationaly expensive to run, not worth

### -SVM (with AdaBoost)

In [13]:
## SVM approach (AdaBoost)

from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif

# Assuming X_train_selected, X_test_selected, y_train, y_test are your selected features and labels

# Initialize an SVM classifier (as the base estimator)
svm_classifier = SVC(kernel='linear', C=1.0, random_state=42)

# Initialize AdaBoost classifier using the SVM as the base estimator
adaboost_clf = AdaBoostClassifier(base_estimator=svm_classifier, n_estimators=50, algorithm='SAMME', random_state=42)

# Fit the AdaBoost classifier on the training data
adaboost_clf.fit(X_train_selected, y_train)

# Evaluate on test data
accuracy = adaboost_clf.score(X_test_selected, y_test)
print(f"Accuracy of AdaBoost with SVM base estimator on test set: {accuracy}")

# Evaluate on validation data
accuracy_val = adaboost_clf.score(X_val_selected, y_val)
print(f"Accuracy of AdaBoost with SVM base estimator on validation set: {accuracy_val}")

Accuracy of AdaBoost with SVM base estimator on test set: 0.5997574287446937
Accuracy of AdaBoost with SVM base estimator on validation set: 0.6977535738597685
