In [3]:
import warnings
warnings.filterwarnings('ignore')
import tqdm
import os
import gc

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from scipy.stats import probplot, mode, linregress

import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from IPython.display import HTML

import gdcm
import pydicom
import cv2

ModuleNotFoundError: No module named 'gdcm'

In [None]:
df_train = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')
df_test = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

print(f'Training Set Shape = {df_train.shape} - Patients = {df_train["Patient"].nunique()}')
print(f'Training Set Memory Usage = {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Set Shape = {df_test.shape} - Patients = {df_test["Patient"].nunique()}')
print(f'Test Set Memory Usage = {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')

In [None]:
df_test

In [None]:
df_submission = pd.read_csv( '../input/osic-pulmonary-fibrosis-progression/sample_submission.csv' )
df_submission

In [None]:
def laplace_log_likelihood(y_true, y_pred, sigma):
    sigma_clipped = np.maximum(sigma, 70)
    delta_clipped = np.minimum(np.abs(y_true - y_pred), 1000)
    score = - np.sqrt(2) * delta_clipped / sigma_clipped - np.log(np.sqrt(2) * sigma_clipped)
    return np.mean(score)

laplace_log_likelihood(df_train['FVC'], df_train['FVC'], 70)

In [None]:
file_path = '../input/osic-pulmonary-fibrosis-progression/train/ID00228637202259965313869/1.dcm'
dicom_file = pydicom.dcmread(file_path)

print(f'Patient: ID00228637202259965313869 Image: 1.dcm Dataset\n{"-" * 55}\n\n{dicom_file}')

In [None]:
print(f'Accessing Patient Name with DICOM Keyword (PatientName): {dicom_file.PatientName}')
print(f'Accessing Patient Name with Tag Number ((0x10, 0x10)): {dicom_file[(0x10, 0x10)].value}')

In [None]:
dicom_file.dir()

In [None]:
def load_scan(patient_name):

    patient_directory = [pydicom.dcmread(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/{s}') for s in os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')]
    patient_directory.sort(key=lambda s: float(s.ImagePositionPatient[2]))
    patient_slices = np.zeros((len(patient_directory), patient_directory[0].Rows, patient_directory[0].Columns))

    for i, s in enumerate(patient_directory):
        patient_slices[i] = s.pixel_array

    return patient_slices

patient = 'ID00228637202259965313869'
patient_slices = load_scan(patient)
print(f'Patient {patient} CT scan is loaded - Volume Shape: {patient_slices.shape}')

In [None]:
def get_metadata(patient_name):

    patient_directory = [pydicom.dcmread(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}/{s}') for s in os.listdir(f'../input/osic-pulmonary-fibrosis-progression/train/{patient_name}')]

    try:
        patient_directory.sort(key=lambda s: float(s.ImagePositionPatient[2]))
        slice_positions = np.round([s.ImagePositionPatient[2] for s in patient_directory], 4)
        non_duplicate_idx = np.unique([np.where(slice_position == slice_positions)[0][0] for slice_position in slice_positions])
    except AttributeError:
        patient_directory.sort(key=lambda s: int(s.InstanceNumber))
        instance_numbers = np.array([int(s.InstanceNumber) for s in patient_directory])
        non_duplicate_idx = np.unique([np.where(instance_number == instance_numbers)[0][0] for instance_number in instance_numbers])

    patient_directory = list(np.array(patient_directory)[non_duplicate_idx])

    rows = patient_directory[0].Rows
    columns = patient_directory[0].Columns
    slices = len(patient_directory)

    pixel_spacings = np.zeros((len(patient_directory), 2))
    slice_positions = np.zeros((len(patient_directory)))
    slice_thicknesses = []

    for i, s in enumerate(patient_directory):
        slice_thicknesses.append(s.SliceThickness)
        try:
            pixel_spacings[i, :] = np.array(s.PixelSpacing)
        except AttributeError:
            pixel_spacings[i, :] = np.nan

        try:
            slice_positions[i] = s.ImagePositionPatient[2]
        except AttributeError:
            pass

    df_train.loc[df_train['Patient'] == patient_name, 'Rows'] = rows
    df_train.loc[df_train['Patient'] == patient_name, 'Columns'] = columns
    df_train.loc[df_train['Patient'] == patient_name, 'Slices'] = slices
    df_train.loc[df_train['Patient'] == patient_name, 'PixelSpacingX'] = list(np.round(pixel_spacings.mean(axis=0), 3))[0]
    df_train.loc[df_train['Patient'] == patient_name, 'PixelSpacingY'] = list(np.round(pixel_spacings.mean(axis=0), 3))[1]
    df_train.loc[df_train['Patient'] == patient_name, 'SliceSpacing'] = mode(np.abs(np.diff(np.round(slice_positions, 3))))[0][0]
    df_train.loc[df_train['Patient'] == patient_name, 'SliceThickness'] = mode(slice_thicknesses)[0][0]


for patient in tqdm.tqdm(df_train['Patient'].unique()):
    get_metadata(patient)

df_train['Rows'] = df_train['Rows'].astype(np.uint16)
df_train['Columns'] = df_train['Columns'].astype(np.uint16)
df_train['Slices'] = df_train['Slices'].astype(np.uint16)
df_train['SliceShape'] = df_train['Rows'].astype(str) + 'x' + df_train['Columns'].astype(str)
df_train['SliceThickness'] = df_train['SliceThickness'].astype(np.float32)
df_train['SliceSpacing'] = df_train['SliceSpacing'].astype(np.float32)

In [None]:
print(f'Slice Counts between 400 and 410\n{"-" * 32}\n\n', df_train.groupby('Patient').first().query('400 < Slices < 410')[['Rows', 'Columns', 'Slices', 'PixelSpacingX', 'PixelSpacingY', 'SliceSpacing', 'SliceThickness']], '\n')
df_train.loc[df_train['Patient'] == 'ID00132637202222178761324', 'SliceSpacing'] = 0.7
print(f'Slice Counts between 45 and 55\n{"-" * 30}\n\n', df_train.groupby('Patient').first().query('45 < Slices < 55')[['Rows', 'Columns', 'Slices', 'PixelSpacingX', 'PixelSpacingY', 'SliceSpacing', 'SliceThickness']])
df_train.loc[df_train['Patient'] == 'ID00128637202219474716089', 'SliceSpacing'] = 5.0

In [None]:
def crop_slice(s):

    """
    Crop frames from slices

    Parameters
    ----------
    s : numpy array, shape = (Rows, Columns)
    numpy array of slices with frame

    Returns
    -------
    s_cropped : numpy array, shape = (Rows - All Zero Rows, Columns - All Zero Columns)
    numpy array after the all zero rows and columns are dropped
    """

    s_cropped = s[~np.all(s == 0, axis=1)]
    s_cropped = s_cropped[:, ~np.all(s_cropped == 0, axis=0)]
    return s_cropped

In [None]:
df_train['VoxelVolume'] = df_train['PixelSpacingX'] * df_train['PixelSpacingY'] * df_train['SliceSpacing']
df_train['VoxelCount'] = df_train['Rows'].astype(int) * df_train['Columns'].astype(int) * df_train['Slices'].astype(int)

df_train.drop(columns=['SliceShape', 'PixelSpacingX', 'PixelSpacingY', 'SliceSpacing', 'Rows', 'Columns', 'Slices', 'SliceThickness'], inplace=True)

df_train.groupby('Patient').first()[['VoxelVolume', 'VoxelCount']].head(10)