## EDA Workbook
Exploratory data analysis for the RSNA 2024 Lumbar Spine Degenerative Classification task.

## Creating a Config
Standardizing notebook changes with a dedicated CFG object

In [None]:
from pathlib import Path

In [None]:
class CFG:
    """Notebook-wide configuration; change values here rather than inside individual cells."""
    # Root directory that the cells below read the CSV files and train_images/ from.
    DATA_DIR = Path('data/')
    # Alternative root for running against the Kaggle competition input; swap with the line above.
    #DATA_DIR = Path('/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/')

## Exploring Available Data
Courtesy of Abhinav Suri
<https://www.kaggle.com/code/abhinavsuri/anatomy-image-visualization-overview-rsna-raids>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import pydicom
import numpy as np
import os
import glob
from tqdm import tqdm
import warnings

In [None]:
# Label table: later cells read its `study_id` column and the diagnosis columns
# whose names contain 'foraminal', 'subarticular' or 'canal'.
train = pd.read_csv(CFG.DATA_DIR / 'train.csv')

In [None]:
# Row count of train.csv.
print("Total Cases: ", len(train))

In [None]:
# Show the column names; the plotting cell below selects columns by substring match on them.
train.columns

In [None]:
# One stacked bar chart per condition family: for every label column of the family,
# how many rows of train.csv carry each grade.
figure, axis = plt.subplots(1, 3, figsize=(20, 5))
for idx, d in enumerate(['foraminal', 'subarticular', 'canal']):
    # Label columns are selected by substring match on the column name.
    diagnosis = [column for column in train.columns if d in column]
    dff = train[diagnosis]
    # Series.value_counts is the supported replacement for the deprecated top-level
    # pd.value_counts, so the FutureWarning no longer needs to be silenced.
    # Grades missing from a column come back as NaN and are counted as 0.
    value_counts = dff.apply(pd.Series.value_counts).fillna(0).T
    value_counts.plot(kind='bar', stacked=True, ax=axis[idx])
    axis[idx].set_title(f'{d} distribution')
    axis[idx].set_ylabel('count')

**TODO:** evaluate whether the classes need rebalancing, based on the distributions above.

### Grab metadata for each scan.
For each scan let's create an object with the following structure:

    meta_obj = {
        StudyInstanceUID: {
            'folder_path': ...,  # path to the study folder
            'SeriesInstanceUIDs': [ Array of the SeriesInstanceUIDs ],
            'SeriesDescriptions': [ Array of the Series Descriptions ]
        }, ...
    }

In [None]:
# Collect the entries of train_images/ (one per study), leaving out any name that
# contains '.DS' (presumably macOS .DS_Store artefacts).
part_1 = [
    entry
    for entry in os.listdir(CFG.DATA_DIR / 'train_images')
    if '.DS' not in entry
]

In [None]:
# Series metadata table; its `study_id`, `series_id` and `series_description` columns
# are used further down to attach a description to each series folder.
df_meta_f = pd.read_csv(CFG.DATA_DIR / 'train_series_descriptions.csv')

In [None]:
# Pair each study folder name with its path under train_images/.
p1 = [(study, CFG.DATA_DIR / f"train_images/{study}") for study in part_1]
# Seed one record per study; the series list starts empty and is filled in afterwards.
meta_obj = dict(
    (study, {'folder_path': folder, 'SeriesInstanceUIDs': []})
    for study, folder in p1
)

In [None]:
# Each sub-folder of a study folder is recorded as one series; names containing '.DS' are skipped.
for m in meta_obj:
    series_names = os.listdir(meta_obj[m]['folder_path'])
    meta_obj[m]['SeriesInstanceUIDs'] = [
        name for name in series_names if '.DS' not in name
    ]

In [None]:
# Attach a series description to every series of every study.
# The (study_id, series_id) -> description lookup is built once instead of filtering
# the whole DataFrame per series; drop_duplicates keeps the first matching row, which
# is the row the previous `.iloc[0]` selection returned.
series_description_lookup = (
    df_meta_f
    .drop_duplicates(subset=['study_id', 'series_id'])
    .set_index(['study_id', 'series_id'])['series_description']
    .to_dict()
)
for k in tqdm(meta_obj):
    # Rebuild the list on every run so that re-executing the cell does not append
    # duplicates, and so that studies without any series still get the key.
    descriptions = []
    for s in meta_obj[k]['SeriesInstanceUIDs']:
        try:
            description = series_description_lookup[(int(k), int(s))]
        except (KeyError, ValueError):
            # No metadata row (KeyError) or a non-numeric folder name (ValueError).
            # A None placeholder keeps SeriesDescriptions index-aligned with
            # SeriesInstanceUIDs, which later cells rely on.
            print("Failed on", s, k)
            description = None
        descriptions.append(description)
    meta_obj[k]['SeriesDescriptions'] = descriptions

In [None]:
# Inspect the record stored under the second key of meta_obj (index 1).
meta_obj[list(meta_obj.keys())[1]]

### Single study (patient) example

In [None]:
# Use the second row of train.csv (position 1) as the worked example.
patient = train.iloc[1]

In [None]:
# meta_obj is keyed by folder name (a string), hence the str() around study_id.
ptobj = meta_obj[str(patient['study_id'])]

In [None]:
# Show the folder path, series ids and series descriptions collected for this study.
print(ptobj)

In [None]:
# Load every DICOM file of the example study, grouped by series, into the format:
#
# im_list_dcm = {
#     '{SeriesInstanceUID}': {
#         'images': [
#             {'SOPInstanceUID': ...,
#              'dicom': PyDicom object
#             },
#             ...,
#         ],
#         'description': # SeriesDescription
#     },
#     ...
# }
im_list_dcm = {}
for idx, i in enumerate(ptobj['SeriesInstanceUIDs']):
    im_list_dcm[i] = {'images': [], 'description': ptobj['SeriesDescriptions'][idx]}
    series_dir = Path(ptobj['folder_path']) / i
    images = glob.glob(str(series_dir / '*.dcm'))
    # File names are parsed with Path(...).stem rather than split('/'), so the numeric
    # part is extracted correctly whatever path separator the OS uses.
    # Sorting on the integer value keeps e.g. 10.dcm after 9.dcm.
    for j in sorted(images, key=lambda x: int(Path(x).stem)):
        im_list_dcm[i]['images'].append({
            'SOPInstanceUID': Path(j).stem,
            'dicom': pydicom.dcmread(j) })

In [None]:
# Function to display images
def display_images(images, title, max_images_per_row=4):
    """Draw a sequence of images on a grid of subplots under one figure title.

    Args:
        images: Sequence of arrays accepted by ``Axes.imshow``; each is drawn
            with the gray colormap.
        title: Text used as the figure-level title.
        max_images_per_row: Number of columns in the subplot grid.

    Returns:
        None. The figure is created through ``plt`` and left for the notebook
        to render. Nothing is created when ``images`` is empty.
    """
    num_images = len(images)
    if num_images == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        return

    # Ceiling division: rows needed to fit all images.
    num_rows = (num_images + max_images_per_row - 1) // max_images_per_row

    # squeeze=False always returns a 2-D array of Axes, so a single-row (or
    # single-cell) grid is handled exactly like a multi-row one after flattening.
    fig, axes = plt.subplots(
        num_rows,
        max_images_per_row,
        figsize=(5, 1.5 * num_rows),
        squeeze=False,
    )
    axes = axes.flatten()

    # Plot each image in reading order.
    for ax, image in zip(axes, images):
        ax.imshow(image, cmap='gray')

    # Hide the axis decorations everywhere, including unused trailing cells.
    for ax in axes:
        ax.axis('off')
    fig.suptitle(title, fontsize=16)

    plt.tight_layout()

In [None]:
# One figure per series of the example study, titled with the series description.
for i in im_list_dcm:
    series = im_list_dcm[i]
    pixel_arrays = [entry['dicom'].pixel_array for entry in series['images']]
    display_images(pixel_arrays, series['description'])

### Coordinates of pathologies

In [None]:
# Label coordinates table; the cells below read its `study_id`, `series_id`,
# `instance_number`, `condition`, `level`, `x` and `y` columns.
df_coor = pd.read_csv(CFG.DATA_DIR / 'train_label_coordinates.csv')

In [None]:
# Preview the first rows of the coordinates table.
df_coor.head()

In [None]:
def display_coor_on_img(c, i, title):
    """Show one DICOM image with a red circle drawn around a labelled coordinate.

    Args:
        c: Row/mapping providing the 'x' and 'y' pixel coordinates of the label.
        i: Dict whose 'dicom' entry exposes ``pixel_array``.
        title: Text shown above the image.

    Returns:
        None. The image is rendered with ``plt.show()``.
    """
    center_coordinates = (int(c['x']), int(c['y']))
    radius = 10
    color = (255, 0, 0)  # Red, in the RGB channel order of the image built below
    thickness = 2
    IMG = i['dicom'].pixel_array
    # Rescale the raw intensities to the 0-255 range of an 8-bit image.
    IMG_normalized = cv2.normalize(IMG, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)

    # A 2-D (single-channel) image cannot hold a coloured marker, so it is expanded
    # to three channels first. The result is already RGB, which is what matplotlib
    # expects, so no BGR->RGB swap is needed afterwards.
    if IMG_normalized.ndim == 2:
        IMG_rgb = cv2.cvtColor(IMG_normalized, cv2.COLOR_GRAY2RGB)
    else:
        # NOTE(review): multi-channel pixel data is assumed to be RGB already - confirm.
        IMG_rgb = IMG_normalized.copy()

    IMG_with_circle = cv2.circle(IMG_rgb, center_coordinates, radius, color, thickness)

    # Display the image
    plt.imshow(IMG_with_circle)
    plt.axis('off')  # Turn off axis numbers and ticks
    plt.title(title)
    plt.show()

In [None]:
# Keep only the coordinate rows that belong to the example study.
coor_entries = df_coor[df_coor['study_id'] == int(patient['study_id'])]

In [None]:
print("Only showing severe cases for this patient")
for idc, c in coor_entries.iterrows():
    # The grade is looked up once per coordinate row rather than once per image.
    # The label name is '<condition>_<level>' lower-cased, with spaces and '/'
    # replaced by '_'.
    try:
        severity_column = f"{c['condition'].lower().replace(' ', '_')}_{c['level'].lower().replace('/', '_')}"
        patient_severity = patient[severity_column]
    except (KeyError, AttributeError):
        # KeyError: no such label for this patient; AttributeError: condition/level
        # is not a string (e.g. a missing value).
        patient_severity = "unknown severity"

    # Only severe findings are displayed, so skip the image scan for everything else.
    if patient_severity != 'Severe':
        continue

    for i in im_list_dcm[str(c['series_id'])]['images']:
        # The stored id comes from the DICOM file name; match it to the labelled instance.
        if int(i['SOPInstanceUID']) == int(c['instance_number']):
            title = f"{i['SOPInstanceUID']} \n{c['level']}, {c['condition']}: {patient_severity} \n{c['x']}, {c['y']}"
            display_coor_on_img(c, i, title)

#### Thanks Abhinav!
---

## Roadmap

- [ ] Decide on model structure
- [ ] Decide on Convolution Dim
- [ ] Decide on kernel