In [None]:
# Mount Google Drive at /content/drive so the dataset can be stored there;
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
!pip install kaggle --upgrade

In [None]:
import os
# Point the Kaggle client at /content/ as its configuration directory
# (KAGGLE_CONFIG_DIR is inherited by the `!kaggle` shell commands below).
# NOTE(review): the following cell also copies kaggle.json into ~/.kaggle and
# chmods that copy; presumably only one of the two locations is needed — confirm.
os.environ['KAGGLE_CONFIG_DIR'] = "/content/"

In [None]:
# Install the API token: kaggle.json is expected in the current working directory.
# It is copied into ~/.kaggle and restricted to owner read/write (mode 600).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive into the project folder on Google Drive (-p sets the target directory).
!kaggle competitions download -c rsna-2024-lumbar-spine-degenerative-classification -p "/content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification/"

In [None]:
# Extract the downloaded archive in place, into the same Drive folder (-d sets the extraction directory).
!unzip /content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification/rsna-2024-lumbar-spine-degenerative-classification.zip -d "/content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification/"

In [None]:
!pip install pydicom

In [None]:
!pip install pylibjpeg

In [None]:
!pip install pylibjpeg-libjpeg

In [None]:
!pip install pylibjpeg pylibjpeg-libjpeg pylibjpeg-openjpeg

# Start with data manipulation and preparation for training

In [None]:
import numpy as np
import pandas as pd
import os
import pydicom
import cv2
import concurrent.futures
from tqdm import tqdm

Load the datasets

In [None]:
# Load the competition CSV files into DataFrames.
# The dataset folder is named once so every path is built the same way
# (the previous hand-written paths contained stray double slashes).
DATA_DIR = "/content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification"

train_labels = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_coordinates = pd.read_csv(os.path.join(DATA_DIR, "train_label_coordinates.csv"))
series_descriptions = pd.read_csv(os.path.join(DATA_DIR, "train_series_descriptions.csv"))

In [None]:
# Report the number of rows in each loaded table.
print("Length of train_labels:", train_labels.shape[0])
print("Length of train_coordinates:", train_coordinates.shape[0])
print("Length of series_descriptions:", series_descriptions.shape[0])

Load the images from their DICOM files and collect them in a DataFrame

In [None]:
def load_images(image_path, img_size=(224, 224)):
    """Read one DICOM file and return it as a 3-channel uint8 image.

    The pixel data is min-max scaled to the 0-255 range, converted to uint8,
    resized to ``img_size`` and replicated along a new last axis so the result
    has three identical channels.

    Args:
        image_path: Path of the DICOM file to read.
        img_size: Target size passed to ``cv2.resize``.

    Returns:
        The processed image array, or ``None`` if anything fails while reading
        or converting the file (the error is printed, not raised).
    """
    try:
        dicom = pydicom.dcmread(image_path)
        pixels = dicom.pixel_array.astype(np.float32)

        lowest = np.min(pixels)
        value_range = np.max(pixels) - lowest
        if value_range > 0:
            pixels = (pixels - lowest) / value_range * 255
        else:
            # Constant image: scaling would divide by zero and produce NaNs,
            # whose conversion to uint8 is undefined. Use an all-black image.
            pixels = np.zeros_like(pixels)

        img = pixels.astype(np.uint8)
        img = cv2.resize(img, img_size)  # Resize image for the model
        img = np.expand_dims(img, axis=-1)  # Add a trailing channel axis
        img = np.repeat(img, 3, axis=-1)  # Replicate the single channel three times
        return img
    except Exception as e:
        # Best effort: a broken file must not abort the whole loading run.
        print(f"Error loading image: {image_path}, error: {e}")
        return None

def process_image(row):
    """Load the image referenced by one coordinate row.

    Args:
        row: Mapping with the keys 'study_id', 'series_id' and 'instance_number'.

    Returns:
        A dict with those three identifiers plus 'image' (the value returned by
        load_images). If loading raises, 'image' is None and an 'error' entry
        holds the exception text.
    """
    record = {
        "study_id": row['study_id'],
        "series_id": row['series_id'],
        "instance_number": row['instance_number'],
    }
    # Images are stored as train_images/<study_id>/<series_id>/<instance_number>.dcm
    dicom_path = (
        "/content/drive/MyDrive/ColabNotebooks/"
        f"RSNA2024LumbarSpineDegenerativeClassification/train_images/{record['study_id']}/"
        f"{record['series_id']}/{record['instance_number']}.dcm"
    )
    try:
        record["image"] = load_images(dicom_path)
    except Exception as exc:
        record["image"] = None
        record["error"] = str(exc)
    return record

def load_images_faster(df, max_workers=10):
    """Load the image for every row of ``df`` using a pool of threads.

    Args:
        df: DataFrame whose rows are passed to process_image.
        max_workers: Number of worker threads in the pool.

    Returns:
        A DataFrame built from the process_image results, in the same order
        as the rows of ``df``.
    """
    # Plain dict records are cheaper to hand to the workers than DataFrame rows
    records = df.to_dict(orient="records")

    pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        # map() yields results in submission order; tqdm shows the progress
        progress = tqdm(pool.map(process_image, records), total=len(records))
        loaded = [item for item in progress]

    return pd.DataFrame(loaded)

# Load the image for every row of train_coordinates; processed_df has one row
# per input row, in the same order, with image=None where loading failed.
processed_df = load_images_faster(train_coordinates)


In [None]:
# Reshape the wide label table (one column per condition/level) into long form:
# one row per (study_id, condition) pair with its severity grade.
column_names = train_labels.columns.drop('study_id')
labels_long = train_labels.melt(
    id_vars='study_id',
    value_vars=list(column_names),
    var_name='condition',
    value_name='severity',
)

# Build the matching key for every annotated slice.
# Assumes train_label_coordinates.csv has `condition` (e.g. "Spinal Canal Stenosis")
# and `level` (e.g. "L1/L2") columns whose lower-cased, underscore-joined form equals
# the train.csv column names (e.g. "spinal_canal_stenosis_l1_l2") — TODO confirm
# against the CSV headers.
coordinate_condition = (
    train_coordinates['condition'].str.lower().str.replace(' ', '_', regex=False)
    + '_'
    + train_coordinates['level'].str.lower().str.replace('/', '_', regex=False)
)

# Left-join the labels onto train_coordinates by key instead of copying series_id
# over by row position: the two tables have unrelated row orders, so a positional
# copy pairs labels with the wrong series/images. A left join keeps the row order
# and row count of train_coordinates, so row i of labels_new describes row i of
# train_coordinates. validate= guards against duplicated label keys.
labels_new = (
    train_coordinates[['study_id', 'series_id']]
    .assign(condition=coordinate_condition)
    .merge(labels_long, on=['study_id', 'condition'], how='left', validate='many_to_one')
    .loc[:, ['study_id', 'condition', 'severity', 'series_id']]
)
assert len(labels_new) == len(train_coordinates), "label join changed the number of rows"

In [None]:
# Attach each loaded image and its instance number to the label rows.
# assign() returns a new frame, so labels_new is no longer modified through an
# alias and this cell can be re-run safely. Values are matched by index label,
# so labels_new and processed_df must share the same row order.
merged_dataset = labels_new.assign(
    image=processed_df['image'],
    instance_number=processed_df['instance_number'],
)

In [None]:
import matplotlib.pyplot as plt
# Preview the image stored in the second row. The column is selected by name:
# integer keys on a label-indexed Series (the old `iloc[1][4]`) are deprecated
# in pandas and break if the column order changes.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(merged_dataset['image'].iloc[1])
ax.set_title("merged_dataset row 1")
plt.show()

In [None]:
# Number of missing values in each column
merged_dataset.isna().sum()

In [None]:
# Row count before rows with missing values are removed
merged_dataset.shape[0]

In [None]:
# Keep only complete rows: any row with a missing value in any column
# (including an image that failed to load and is None) is dropped.
final_dataset = merged_dataset.dropna(axis=0, how='any')

In [None]:
# Row count after rows with missing values were removed
final_dataset.shape[0]

In [None]:
image_column = final_dataset['image']
if len(image_column) == 0:
    raise ValueError("final_dataset contains no images to save")

# Create a directory for saving the memmap file
output_dir = "/content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification"
os.makedirs(output_dir, exist_ok=True)

# Path to the memmap file.
# NOTE: np.memmap writes raw bytes without an .npy header, so despite its
# extension this file must be reopened with np.memmap (same dtype and shape),
# not with np.load.
memmap_path = os.path.join(output_dir, "images.npy")

# (number of images,) + shape of one image; all images must share that shape,
# otherwise the row assignment below raises.
array_shape = (len(image_column),) + tuple(image_column.iloc[0].shape)

# Create a memmap file to write the images to
memmap = np.memmap(memmap_path, dtype='float32', mode='w+', shape=array_shape)

# Copy one image at a time instead of np.stack-ing the whole column first:
# stacking builds a second full in-memory copy of every image.
for position, image in enumerate(image_column):
    memmap[position] = image

# Flush changes to the file
memmap.flush()

# Keep the previous variable name available, now as the disk-backed array
images_array = memmap

print(f"Array saved to memmap file: {memmap_path}")
print(f"Reopen with dtype='float32' and shape={array_shape}")

In [None]:
# Save the metadata only. The 'image' column holds NumPy arrays, which to_csv
# would write as abbreviated text ("[[[0 0 0] ... ]]") that cannot be read back.
# The pixel data is written separately to images.npy; rows are saved in the
# order of final_dataset.
final_dataset.drop(columns='image').to_csv("/content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification/final_dataset.csv", index=False)

In [None]:
def sample_and_duplicate(df, class_column, sample_size=10000, random_state=42):
  """Build a class-balanced frame with exactly `sample_size` rows per class.

  Classes with at least `sample_size` rows are down-sampled without
  replacement. Smaller classes keep all their rows and are topped up with
  randomly chosen duplicates. Rows whose class label is missing are ignored.

  Args:
    df: Pandas DataFrame containing image metadata.
    class_column: Name of the column containing class labels.
    sample_size: Desired number of rows per class.
    random_state: Seed used for both down-sampling and duplication, so the
      result is reproducible. Pass None for a different draw on every call.

  Returns:
    Pandas DataFrame with a fresh 0..n-1 index, classes in order of first
    appearance. Empty (same columns as `df`) when there is no labelled row.
  """
  # A NaN label never compares equal to itself, so it would select zero rows
  # and there would be nothing to duplicate from; skip missing labels.
  class_labels = df[class_column].dropna().unique()
  balanced_parts = []

  for class_label in class_labels:
    class_data = df[df[class_column] == class_label]
    num_images = len(class_data)

    if num_images >= sample_size:
      # Enough rows: draw a subset without replacement
      part = class_data.sample(n=sample_size, replace=False, random_state=random_state)
    else:
      # Too few rows: keep them all and add seeded random duplicates
      duplicates = class_data.sample(
          n=sample_size - num_images, replace=True, random_state=random_state
      )
      part = pd.concat([class_data, duplicates])

    balanced_parts.append(part)

  if not balanced_parts:
    return df.iloc[0:0].reset_index(drop=True)

  # Concatenate once at the end instead of growing a frame inside the loop
  return pd.concat(balanced_parts, ignore_index=True)

# Balance the dataset on the 'severity' column: 10000 rows per severity class.
balanced_dataset = sample_and_duplicate(final_dataset, class_column='severity', sample_size=10000)

In [None]:
balanced_image_column = balanced_dataset['image']
if len(balanced_image_column) == 0:
    raise ValueError("balanced_dataset contains no images to save")

# Create a directory for saving the memmap file
output_dir = "/content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification"
os.makedirs(output_dir, exist_ok=True)

# Path to the memmap file.
# NOTE: np.memmap writes raw bytes without an .npy header, so despite its
# extension this file must be reopened with np.memmap (same dtype and shape),
# not with np.load.
memmap_path = os.path.join(output_dir, "balanced_images.npy")

# (number of images,) + shape of one image; all images must share that shape,
# otherwise the row assignment below raises.
array_shape = (len(balanced_image_column),) + tuple(balanced_image_column.iloc[0].shape)

# Create a memmap file to write the images to
memmap = np.memmap(memmap_path, dtype='float32', mode='w+', shape=array_shape)

# Copy one image at a time instead of np.stack-ing the whole column first:
# stacking builds a second full in-memory copy of every image.
for position, image in enumerate(balanced_image_column):
    memmap[position] = image

# Flush changes to the file
memmap.flush()

# Keep the previous variable name available, now as the disk-backed array
balanced_images = memmap

print(f"Array saved to memmap file: {memmap_path}")
print(f"Reopen with dtype='float32' and shape={array_shape}")


In [None]:
# Save the metadata only. The 'image' column holds NumPy arrays, which to_csv
# would write as abbreviated text ("[[[0 0 0] ... ]]") that cannot be read back.
# The pixel data is written separately to balanced_images.npy; rows are saved
# in the order of balanced_dataset.
balanced_dataset.drop(columns='image').to_csv("/content/drive/MyDrive/ColabNotebooks/RSNA2024LumbarSpineDegenerativeClassification/balanced_dataset.csv", index=False)