In [138]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input, ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences



### 1. Load the data

In [48]:
# Metadata table describing the images; it lives in the local "data" directory
filename = 'metadata_filtered.csv'
metadata_path = os.path.join("data", filename)

df = pd.read_csv(metadata_path)

### So, what's my goal here?
I have a few hundred records, but the time series are very short.

In [59]:
# Encode the diagnosis as a binary label: benign -> 0, malignant -> 1.
# Any other value in 'benign_malignant' becomes NaN because Series.map is used.
label_codes = {'benign': 0, 'malignant': 1}
df['target'] = df['benign_malignant'].map(label_codes)
df.head(3)

Unnamed: 0.1,Unnamed: 0,image_name,patient_id,lesion_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,2,ISIC_0052212,IP_2842074,IL_9087444,female,50.0,lower extremity,nevus,benign,0
1,3,ISIC_0068279,IP_6890425,IL_4255399,female,45.0,head/neck,unknown,benign,0
2,6,ISIC_0074542,IP_4698288,IL_5017890,male,25.0,lower extremity,unknown,benign,0


In [139]:
# ImageNet-pretrained ResNet50 without its classification head; pooling='avg'
# makes the network emit one pooled feature vector per image.
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    pooling='avg',
)

# Wrap the backbone so features are read from the same input/output tensors.
model_feature_extractor = Model(
    inputs=base_model.input,
    outputs=base_model.output,
)

In [140]:
def load_and_preprocess_image(image_path, image_dir=os.path.join('data', 'train')):
    """
    Load one image from disk and return its ResNet50 feature vector.

    Parameters
    ----------
    image_path : str
        Image identifier WITHOUT the file extension (e.g. an 'image_name' value
        from the metadata table); '.jpg' is appended to it.
    image_dir : str, optional
        Directory that holds the JPEG files. Defaults to
        os.path.join('data', 'train'), which is the location the notebook uses.

    Returns
    -------
    numpy.ndarray
        Output of `model_feature_extractor` for this image with singleton axes
        removed; with pooling='avg' that is a (2048,) vector.
    """
    full_path = os.path.join(image_dir, image_path + '.jpg')

    # ResNet50 expects 224x224 inputs
    img = load_img(full_path, target_size=(224, 224))
    img_array = img_to_array(img)

    # Add the batch axis the model expects, then apply ResNet50's own preprocessing
    img_array_expanded = np.expand_dims(img_array, axis=0)
    preprocessed_img = preprocess_input(img_array_expanded)

    # verbose=0: this function is called once per image inside a loop, so the
    # default per-call progress bar would flood the notebook output.
    features = model_feature_extractor.predict(preprocessed_img, verbose=0)

    # Drop the batch axis: (1, 2048) -> (2048,)
    return np.squeeze(features)

### Find patient records with the same body location present in multiple appointments

In [154]:
# Distinct patient identifiers, in order of first appearance in the metadata
unique_patients = pd.unique(df['patient_id'])

In [155]:
# Build one image sequence per (patient, body area): one feature vector per time step.
train_arr = []  # each entry is a list of per-image feature vectors (one sequence)
label_arr = []  # parallel to train_arr: the per-step 'target' labels of each sequence

# Iterate over each unique patient
for patient_id in unique_patients:
    patient_df = df[df['patient_id'] == patient_id]

    # Iterate over each unique body area for the current patient
    for anatom_site in patient_df['anatom_site_general_challenge'].unique():
        anatom_site_df = patient_df[patient_df['anatom_site_general_challenge'] == anatom_site]

        # Keep exactly one whole row per age, ordered by age (the temporal axis used here).
        # Within an age, rows are sorted with target descending so a malignant row is
        # preferred. GroupBy.first() is deliberately avoided: it returns the first
        # non-null value of each column separately, so the image name and the label
        # could come from different rows. Rows without an age or an image name cannot
        # form a time step and are dropped first.
        first_images_by_age = (
            anatom_site_df
            .dropna(subset=['age_approx', 'image_name'])
            .sort_values(by=['age_approx', 'target'], ascending=[True, False])
            .drop_duplicates(subset='age_approx', keep='first')
        )

        # A sequence needs at least two distinct ages for this body area
        if len(first_images_by_age) <= 1:
            continue

        sub_arr = []
        sub_labels = []

        # Extract the features and the label of the selected image for each age
        for _, row in first_images_by_age.iterrows():
            sub_arr.append(load_and_preprocess_image(row['image_name']))
            sub_labels.append(row['target'])

        # Store the finished sequence for this patient / body area
        train_arr.append(sub_arr)
        label_arr.append(sub_labels)




In [156]:
# Report how many sequences were built and how many contain at least one label equal to 1
n_sequences = len(label_arr)
n_with_melanoma = sum(1 for labels in label_arr if 1 in labels)

print(f"Number of training sequences: {n_sequences}")
print(f"Number of training sequences with melanoma: {n_with_melanoma}")

Number of training sequences: 1801
Number of training sequences with melanoma: 206


In [162]:

# Every sequence is padded (or truncated by pad_sequences) to this many time steps
MAX_SEQ_LEN = 4

# Pad sequences to ensure uniform length; padded steps are all-zero feature vectors
X = pad_sequences(train_arr, padding='post', dtype='float32', maxlen=MAX_SEQ_LEN)

# Since we're predicting the next image, we shift the labels by one: the label at
# step t is the target of the image at step t + 1.
# NOTE(review): the final real step has no successor, so it gets a dummy label 0.
# That step's features are not zero, so it is presumably still scored as "benign"
# in the loss and accuracy - confirm, and consider excluding it.
y = [seq[1:] + [0] for seq in label_arr]
y = pad_sequences(y, padding='post', dtype='float32', maxlen=MAX_SEQ_LEN)

# One-hot encode for the softmax / categorical cross-entropy output. num_classes is
# given explicitly so the last axis is always 2, even if no label equals 1.
y = to_categorical(y, num_classes=2)

In [163]:
# First carve off 30% of the sequences as a temporary hold-out set ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# ... then halve that hold-out into validation and test splits.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Training set size: 1260
Validation set size: 270
Test set size: 271


In [164]:
# Sequence model emitting a two-class softmax at every time step
n_steps, n_features = X.shape[1], X.shape[2]

model = Sequential()
# Time steps whose features all equal 0. (the padding) are masked out
model.add(Masking(mask_value=0., input_shape=(n_steps, n_features)))
# return_sequences=True keeps one LSTM output per time step; tune the 64 units as needed
model.add(LSTM(64, return_sequences=True))
# Per-step output layer
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_6 (Masking)         (None, 4, 2048)           0         
                                                                 
 lstm_6 (LSTM)               (None, 4, 64)             540928    
                                                                 
 dense_6 (Dense)             (None, 4, 2)              130       
                                                                 
Total params: 541058 (2.06 MB)
Trainable params: 541058 (2.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [165]:
# Monitor training on the validation split. The test split must stay untouched
# until the final evaluation, otherwise the reported test metrics are not held out.
model.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x3322864d0>

In [167]:
# Print the layer-by-layer summary of the model again, this time after training
model.summary()


Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_6 (Masking)         (None, 4, 2048)           0         
                                                                 
 lstm_6 (LSTM)               (None, 4, 64)             540928    
                                                                 
 dense_6 (Dense)             (None, 4, 2)              130       
                                                                 
Total params: 541058 (2.06 MB)
Trainable params: 541058 (2.06 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [166]:
# Score the trained model on the validation split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_val, y=y_val)

print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")


1/9 [==>...........................] - ETA: 0s - loss: 0.1603 - accuracy: 0.9487

Validation Loss: 0.12850692868232727
Validation Accuracy: 0.9713831543922424


In [168]:
# Final check on the test split: evaluate() returns the loss followed by the accuracy
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)

# Report both numbers
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


Test Loss: 0.13724789023399353
Test Accuracy: 0.9711538553237915
