Mount AWS S3 bucket containing parquet data files

In [0]:

# S3 location ("<bucket>/<prefix>/") of the raw parquet files, and the DBFS CSV
# file from which the AWS key pair is read.
AWS_S3_BUCKET = "databricks-workspace-stack-brad-personal-bucket/AD_MRI_classification/raw/"
KEY_FILE = "/FileStore/tables/brad_databricks_personal_accessKeys_new.csv"

# extract aws credentials from hidden table
aws_keys_df = spark.read.format("csv").option("header", "true").option("sep", ",").load(KEY_FILE)

# Fetch the first data row once (the original collected the whole frame twice,
# i.e. two Spark jobs) and fail with a clear message if the file has no rows.
aws_keys_row = aws_keys_df.first()
if aws_keys_row is None:
    raise ValueError(f"No credential row found in {KEY_FILE}")

# Column 0 is used as the access key and column 1 as the secret key.
ACCESS_KEY = aws_keys_row[0]
SECRET_KEY = aws_keys_row[1]

# specify bucket and mount point
# The mount is named after the last path component of the bucket prefix;
# stripping the trailing slash first makes this work with or without it.
MOUNT_NAME = f"/mnt/{AWS_S3_BUCKET.rstrip('/').split('/')[-1]}"
SOURCE_URL = f"s3a://{AWS_S3_BUCKET}"
EXTRA_CONFIGS = {"fs.s3a.access.key": ACCESS_KEY, "fs.s3a.secret.key": SECRET_KEY}

# mount bucket (skipped when the mount point already exists so the cell can be re-run)
if any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    print(f"{MOUNT_NAME} is already mounted.")
else:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME, extra_configs=EXTRA_CONFIGS)
    print(f"{MOUNT_NAME} is now mounted.")

Import libraries

In [0]:
# "standard"
import numpy as np
import pandas as pd

# standard library
import importlib.metadata
import os
import pickle
import sys

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# misc
import boto3
import cv2
import magic
from IPython.display import clear_output
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

# Preprocessing
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# project-local helpers (the src directory must be on sys.path before the import)
sys.path.append('/Workspace/Users/bjedelma@gmail.com/Alzheimers-MRI-Classification/src')
from img_preprocessing import dict_to_image

Load and format training data

In [0]:
# NOTE(review): BASE_DIR is a local Windows path and is not referenced by any
# cell visible in this notebook — confirm whether it is still needed.
BASE_DIR = "C:/Users/bedelman/Documents/GitHub/Alzheimers-MRI-Classification/data/raw/"

'''
Label meanings
0 - Mild dementia
1 - Moderate dementia
2 - No dementia
3 - Very mild dementia
'''
# Display names indexed by the integer class label described above.
Lab = [
    'Mild',
    'Moderate',
    'None',
    'Very Mild',
]

# NOTE(review): the mount cell derives its mount point from the last component
# of the bucket prefix ("/mnt/raw"), while this path reads from
# "/dbfs/mnt/AD_classification" — confirm that this mount exists.
train = pd.read_parquet(
    "/dbfs/mnt/AD_classification/train-00000-of-00001-c08a401c53fe5312.parquet"
)
train.head()

Convert data to readable format

In [0]:
# Decode every raw `image` entry with the project helper `dict_to_image` and
# store the result in `img_arr`, then drop the raw column.
# The guard keeps the cell re-runnable: the original in-place drop raised a
# KeyError on a second run because `image` had already been removed.
if 'image' in train.columns:
    train['img_arr'] = train['image'].apply(dict_to_image)
    train = train.drop(columns='image')
train.head()

Load and convert test data

In [0]:
# Load the test split. The frame is re-read on every run, so this cell is
# idempotent. (The original also called `test.head()` here, mid-cell, where
# its result was discarded and never displayed.)
test = pd.read_parquet("/dbfs/mnt/AD_classification/test-00000-of-00001-44110b9df98c5585.parquet")

# Also convert to readable format: decode the raw `image` entries with
# `dict_to_image`, then drop the raw column without mutating in place.
test['img_arr'] = test['image'].apply(dict_to_image)
test = test.drop(columns='image')
test.head()

Examine some sample images to check data quality

In [0]:
# Integer class label of every training image (also reused by later cells).
train_lab_idx = np.asarray(train['label'])

# Grid of random examples: one column per class, `n_examples` rows.
n_examples = 4
f, ax = plt.subplots(n_examples, len(Lab))
for lab in range(len(Lab)):
    # Positions of the images belonging to THIS class. The original compared
    # against the constant 1, so every column showed class-1 images under
    # four different titles.
    class_idx = np.flatnonzero(train_lab_idx == lab)

    for ex in range(n_examples):
        ax[ex, lab].axis('off')
        if ex == 0:
            ax[ex, lab].set_title(Lab[lab])
        if class_idx.size == 0:
            # Nothing to draw for a class with no samples.
            continue

        # randint's upper bound is exclusive, so passing the class size covers
        # every sample (the original `len - 1` skipped the last one and raised
        # for a class with a single sample).
        current_idx = class_idx[np.random.randint(class_idx.size)]
        ax[ex, lab].imshow(train.iloc[current_idx].img_arr, cmap="gray")

Clearly, images show different slices within the brain, which may be a major confound...

Explore distribution of dataset classes

In [0]:
# One bar colour per class, in label order.
colors = ['#aec7e8', '#ffbb78', '#98df8a', '#ff9896']

# Side-by-side bar charts of the per-class image counts: training on the left,
# testing on the right.
f, ax = plt.subplots(1, 2, figsize=(15, 5))
for panel, split, heading in zip(ax, (train, test), ('Training', 'Testing')):
    unique, counts = np.unique(np.asarray(split['label']), return_counts=True)
    panel.bar(unique, counts, color=colors)
    panel.set_xticks(unique)
    panel.set_xticklabels(Lab, rotation=45)
    panel.set_title(heading)
    panel.set_xlabel('Class')
    panel.set_ylabel('# of images')

We can see that there is an obvious imbalance across classes in both the training and testing sets, although each class has been split proportionally between the two. Let's attempt to balance the training set so that the model sees equal numbers of each class. Simply duplicating minority-class images (naive resampling) would encourage overfitting, so we instead use the SMOTE method here to synthetically generate new samples based on the data that is available.

In [0]:
# Extract features and labels
X_train = np.array([img.flatten() for img in train['img_arr']])
y_train = train['label']

# Apply SMOTE
smote = SMOTE(random_state=42)
train_smote, train_smote_lab = smote.fit_resample(X_train.reshape(-1, 128*128), y_train)
train_smote = train_smote.reshape(-1, 128, 128)

# Create a new DataFrame with the resampled data
train_smote = pd.DataFrame({'label': train_smote_lab, 'img_arr': [img.tolist() for img in train_smote]})
train_smote_lab = train_smote['label']

# Plot the distribution of the different classes
colors = ['#aec7e8', '#ffbb78', '#98df8a', '#ff9896']

f, ax = plt.subplots(1, 1, figsize=(7, 5))
unique, counts = np.unique(train_smote_lab, return_counts=True)
ax.bar(unique, counts, color=colors)
ax.set_xticks(unique)
ax.set_xticklabels(Lab, rotation=45)
ax.set_title('Resampled Training')
ax.set_xlabel('Class')
ax.set_ylabel('# of images')

Now we see that the training set is balanced across classes. Let's inspect some of the new data for quality.

In [0]:
# Rows at positions >= len(train) are treated as the SMOTE-generated samples.
# This assumes the resampler returns the original samples first, followed by
# the synthetic ones — TODO confirm against the imbalanced-learn docs.
# The original hard-coded the offset as 5121, which ties the cell to one
# dataset size and looks off by one if the training set holds 5120 images.
n_original = len(train)
synthetic_labels = np.asarray(train_smote_lab)[n_original:]

# For each class: index (into train_smote) of its first synthetic sample, or
# None when the class received no synthetic samples.
first_indices = {}
for num in range(len(Lab)):
    hits = np.flatnonzero(synthetic_labels == num)
    first_indices[num] = n_original + int(hits[0]) if hits.size else None

print(first_indices)

# Visualize the images from the first_indices values
f, ax = plt.subplots(1, len(Lab), figsize=(20, 5))
for i, label in enumerate(np.unique(train_smote_lab)):
    if first_indices[label] is not None:
        first_image = np.array(train_smote.iloc[first_indices[label]]['img_arr']).reshape(128, 128)
        ax[i].imshow(first_image, cmap='gray')
        ax[i].set_title(f"{Lab[label]}: SMOTE")
    ax[i].axis('off')

Rather than balancing classes with SMOTE, let's augment the existing data to expand and balance the data set.

In [0]:
# Stack the training images into a (N, 128, 128, 1) array (a trailing channel
# axis is added for ImageDataGenerator) and pull out the labels.
train_tmp = np.array([img for img in train['img_arr']])
train_tmp = train_tmp.reshape(-1, 128, 128, 1)

train_lab_tmp = train['label'].values

# One augmenting generator is enough: its settings do not depend on the class,
# so it is created once, not on every loop iteration.
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Every class is augmented up to the size of the largest class.
# NOTE(review): the original stop condition compared the *cumulative* number
# of generated images with the size of the whole training set, so only the
# first class was expanded and the result was not balanced. The per-class
# target below is an interpretation of "balanced" — adjust if a larger
# expansion was intended.
class_values, class_counts = np.unique(train_lab_tmp, return_counts=True)
target_per_class = int(class_counts.max())

# Create a balanced dataset using ImageDataGenerator
train_balanced = []
train_lab_balanced = []
for class_label in class_values:
    class_indices = np.where(train_lab_tmp == class_label)[0]
    class_images = train_tmp[class_indices]
    class_labels = train_lab_tmp[class_indices]

    # flow() yields augmented batches indefinitely; take batches until this
    # class reaches the target, trimming the last batch so classes end up equal.
    n_generated = 0
    for X_batch, y_batch in datagen.flow(class_images, class_labels,
                                         batch_size=len(class_images), seed=42):
        n_take = min(len(X_batch), target_per_class - n_generated)
        train_balanced.extend(X_batch[:n_take])
        train_lab_balanced.extend(y_batch[:n_take])
        n_generated += n_take
        if n_generated >= target_per_class:
            break

# Convert balanced data and labels to numpy arrays
train_balanced = np.array(train_balanced)
train_lab_balanced = np.array(train_lab_balanced)

# Create a new DataFrame with the balanced data (images stored as nested lists)
train_balanced = pd.DataFrame({'label': train_lab_balanced, 'img_arr': [img.tolist() for img in train_balanced]})

In [0]:
# Plot the distribution of the different classes in the style and color scheme as above
# Plot the distribution of the different classes in the style and color scheme as above
colors = ['#aec7e8', '#ffbb78', '#98df8a', '#ff9896']

# The original referenced `balanced_labels` / `balanced_data`, which are not
# defined anywhere in this notebook (NameError); the augmented data lives in
# the `train_balanced` DataFrame (columns `label`, `img_arr`).
unique, counts = np.unique(train_balanced['label'], return_counts=True)

# One panel for the bar chart plus one image panel per class. The original
# drew every image onto the same axis, so only the last class stayed visible.
f, ax = plt.subplots(1, 1 + len(unique), figsize=(22, 5))

# Left subplot: Class distribution
# NOTE(review): this now plots the augmented set (the original plotted
# `train`, repeating the earlier figure) — confirm this is the intended view.
ax[0].bar(unique, counts, color=colors)
ax[0].set_xticks(unique)
ax[0].set_xticklabels(Lab, rotation=45)
ax[0].set_title('Class Distribution')
ax[0].set_xlabel('Class')
ax[0].set_ylabel('# of images')

# Remaining subplots: first image created by the data generator for each class
for panel, label in zip(ax[1:], unique):
    class_rows = train_balanced.loc[train_balanced['label'] == label, 'img_arr']
    first_image = np.asarray(class_rows.iloc[0]).reshape(128, 128)
    panel.imshow(first_image, cmap='gray')
    panel.set_title(f"{Lab[int(label)]}: Generated")
    panel.axis('off')

PCA

In [0]:
# Number of principal components to keep.
N_COMPONENTS = 100

# Flatten every training image into one row -> (n_images, n_pixels) matrix.
train_data = np.stack([np.asarray(img).reshape(-1) for img in train['img_arr']])

# decompose with PCA and look at various metrics/info
# A fixed random_state keeps the fit reproducible if a randomized solver is
# selected for this input size.
pca = PCA(n_components=N_COMPONENTS, random_state=42)
pca.fit(train_data)

# Percentage of the TOTAL variance explained by each component. The original
# divided by the sum over the kept components only, which forces the curve to
# add up to 100% regardless of how much variance those components capture.
plt.plot(np.arange(1, N_COMPONENTS + 1), pca.explained_variance_ratio_ * 100, 'b')
plt.title('PCA')
plt.xlabel('Component #')
plt.ylabel('Variance Explained')

In [0]:
# Component vectors scaled by the square root of their explained variance.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
# Project the flattened images onto the first two scaled components.
# NOTE(review): train_data is not mean-centred here, unlike pca.transform —
# confirm that the offset does not matter for this plot.
load_train = train_data @ loadings[:, :2]

# plot first two PCs, coloured by class label
s = plt.scatter(
    load_train[:, 0],
    load_train[:, 1],
    c=train_lab_idx*2,
    cmap='tab10',
    alpha=0.75,
)
handles, labels = s.legend_elements()
legend = plt.legend(handles=handles, labels=Lab, title='Diagnosis', loc='upper right')
# NOTE(review): the axes are switched off here, which presumably also hides
# the axis labels set below — confirm this is intended.
plt.axis('off')
plt.title('PCA projections')
plt.xlabel('PC 1')
plt.ylabel('PC 2')


In [0]:
# Prepare the data to be saved
data_to_save = {
    'train_data_balanced': train_data,
    'train_labels_balanced': train_lab_idx,
    'pca': pca
}

# Define the file path
file_path = '/Workspace/Users/bjedelma@gmail.com/Alzheimers-MRI-Classification/data/preprocessed/train_data_preprocessed.pkl'

# Save the data using pickle
with open(file_path, 'wb') as file:
    pickle.dump(data_to_save, file)

# Upload to S3 bucket
s3 = boto3.client('s3')
bucket_name = 'databricks-workspace-stack-brad-personal-bucket'
s3_file_path = 'AD_MRI_classification/raw/train_data_preprocessed.pkl'

s3.upload_file(file_path, bucket_name, s3_file_path)

In [0]:
import types
import setuptools

# List all libraries used in this notebook, by their distribution (pip) names.
# `pickle` is part of the standard library and has no installed distribution,
# so it is not listed.
libraries_used = [
    'numpy',
    'pandas',
    'matplotlib',
    'seaborn',
    'opencv-python',
    'python-magic',
    'imbalanced-learn',
    'scikit-learn',
    'boto3',
    'tensorflow',
]

# Look up the installed version of each library. The original read
# `setuptools.working_set`, which does not exist (the working set belongs to
# `pkg_resources`), so the cell raised AttributeError.
libraries_versions = {}
for lib in libraries_used:
    try:
        libraries_versions[lib] = importlib.metadata.version(lib)
    except importlib.metadata.PackageNotFoundError:
        # Not installed under this distribution name: leave it out, as the
        # original filter did.
        continue

# Last expression of the cell: rendered as the cell output.
libraries_versions