In [None]:
import pandas as pd
import numpy as np
from fastcore.all import Path
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Plot styling applied to every seaborn/matplotlib figure drawn below.
sns.set_style('whitegrid')

# Dataset locations, relative to the notebook's working directory.
IN = Path('../input')
RAW = IN / 'rsna-2023-abdominal-trauma-detection'

In [None]:
# Tip: `[x.stem + x.suffix for x in RAW.ls()]` lists the file names available under RAW.

# Tables that carry a `patient_id` column used by the checks below.
train = pd.read_csv(RAW / 'train.csv')
sample_submission = pd.read_csv(RAW / 'sample_submission.csv')

# Series metadata and image-level labels.
train_series_meta = pd.read_csv(RAW / 'train_series_meta.csv')
test_series_meta = pd.read_csv(RAW / 'test_series_meta.csv')
image_level_labels = pd.read_csv(RAW / 'image_level_labels.csv')

# DICOM tag tables are stored as parquet rather than CSV.
train_dicom_tags = pd.read_parquet(RAW / 'train_dicom_tags.parquet')
test_dicom_tags = pd.read_parquet(RAW / 'test_dicom_tags.parquet')

In [None]:
# Each patient_id must appear at most once in both the labels and the sample submission.
for frame in (train, sample_submission):
    most_frequent_count = frame.patient_id.value_counts().max()
    assert most_frequent_count == 1
print('only one patient per submission')

In [None]:
# Target column names, grouped by target type (one group per line).
TARGETS = [
    'bowel_healthy', 'bowel_injury',
    'extravasation_healthy', 'extravasation_injury',
    'kidney_healthy', 'kidney_low', 'kidney_high',
    'liver_healthy', 'liver_low', 'liver_high',
    'spleen_healthy', 'spleen_low', 'spleen_high',
]

In [None]:
# Every submission column except the first one is a target.
target_cols = sample_submission.columns[1:]
print(f'***** There are {len(target_cols)} binary targets\n', '\n'.join(TARGETS))

# Count targets per type, where the type is the prefix before the first underscore.
target_type_counts = (
    target_cols.str.split('_').str[0]
    .value_counts()
    .rename('counts')
    .to_frame()
    .reset_index(names='target_type')
)
display(target_type_counts)

In [None]:
# Label columns that flag an injury of some kind (everything except the *_healthy columns).
injury_cols = [
    'bowel_injury', 'extravasation_injury',
    'kidney_low', 'kidney_high',
    'liver_low', 'liver_high',
    'spleen_low', 'spleen_high',
]

print('asserting `any_injury` is 1 with an injury of any type')
has_any_injury = train[injury_cols].any(axis=1)
assert (has_any_injury == train['any_injury']).all()

print('asserting targets for each type sum to 1')
label_groups = [
    ['bowel_healthy', 'bowel_injury'],
    ['extravasation_healthy', 'extravasation_injury'],
    ['kidney_healthy', 'kidney_low', 'kidney_high'],
    ['liver_healthy', 'liver_low', 'liver_high'],
    ['spleen_healthy', 'spleen_low', 'spleen_high'],
]
for group in label_groups:
    # skipna=False keeps the semantics of plain `+`: a missing label fails the check.
    assert train[group].sum(axis=1, skipna=False).eq(1).all()

In [None]:
import sklearn.metrics

In [None]:
# Two toy samples: show that `sample_weight` yields the weighted mean of per-sample losses.
y_true = [[1, 0], [0, 1]]
y_pred = [[.6, .4], [.5, .5]]

sample_weighted_mean = sklearn.metrics.log_loss(y_true, y_pred, sample_weight=[10, 1])

# Per-sample losses, computed one row at a time.
a = sklearn.metrics.log_loss(y_true[:1], y_pred[:1])
b = sklearn.metrics.log_loss(y_true[1:], y_pred[1:])

print(sample_weighted_mean)
print(a, b)
print((a * 10 + b) / 11, 'same as the sample weighted mean')

In [None]:
# One row per patient in `train`, so the row count is the patient count.
n_train_patients = len(train)
print('There are', n_train_patients, 'patients in the training set.')
print('We are told there will be about 1100 patients in the test set.')

## Train images 

In [None]:
# Folder holding the training images; built from RAW so the dataset root is defined in one place.
train_image_path = RAW / 'train_images'

In [None]:
# Names of the entries directly under the train image folder, converted to integers.
image_folder_names = [entry.stem for entry in train_image_path.ls()]
train_image_series = pd.Series(image_folder_names).astype(int)

In [None]:
# The image folder ids and the labelled patient ids must be exactly the same set.
folder_ids = set(train_image_series)
labelled_patient_ids = set(train.patient_id)
assert folder_ids == labelled_patient_ids
print('Each patient has a single corresponding folder with their dicoms')

## Train/test meta series info
* The `incomplete_organ` column is only available in the train data.

In [None]:
# Preview the first rows of the train and test series metadata, in that order.
for series_meta in (train_series_meta, test_series_meta):
    display(series_meta.head())

In [None]:
# Number of rows (scans) each patient has in the series metadata.
scans_per_patient = train_series_meta.patient_id.value_counts()

# Compute the headline number from the data instead of hard-coding it, so it cannot go stale.
n_two_scan_patients = int((scans_per_patient == 2).sum())
print(f'There are {n_two_scan_patients} patients that have two scans')

# Distribution: how many patients have 1, 2, ... scans.
scans_per_patient.value_counts()

In [None]:
# Number of series rows per patient, named so it joins in as an `n_scans` column.
scan_counts = train_series_meta.groupby('patient_id').patient_id.count().rename('n_scans')

# Idempotent on re-run: only move `patient_id` into the index if it is still a column,
# and drop any `n_scans` left by a previous run before joining (join raises on overlapping columns).
if 'patient_id' in train.columns:
    train = train.set_index('patient_id')
train = train.drop(columns='n_scans', errors='ignore').join(scan_counts)
train

In [None]:
# Mean of every column of `train`, computed separately for each value of `n_scans`.
train.groupby('n_scans').mean()

In [None]:
# Average number of scans for each value of `bowel_injury`.
train.groupby('bowel_injury')['n_scans'].mean()

In [None]:
# Columns of `train` whose name contains 'healthy'.
healthy_cols = [col for col in train.columns if 'healthy' in col]
healthy_cols

In [None]:
# Per patient: the row-wise sum of the 'healthy' columns; then how often each total occurs.
healthy_total_per_patient = train[healthy_cols].sum(axis=1)
healthy_total_per_patient.value_counts()

## Dicom tags

In [None]:
# Columns present in only one of the two DICOM tag tables (empty list when they match).
# A symmetric difference is used instead of zipping the sorted column lists: zip stops at the
# shorter list and misaligns every pair after the first extra column.
sorted(set(train_dicom_tags.columns) ^ set(test_dicom_tags.columns))

In [None]:
# Preview both DICOM tag tables and confirm they share the same set of columns.
display(test_dicom_tags.head(3))
display(train_dicom_tags.head(3))
assert sorted(test_dicom_tags.columns) == sorted(train_dicom_tags.columns)

# DataFrame.info() prints its summary and returns None, so it is called directly:
# wrapping it in display() would render an extra "None" after the summary.
train_dicom_tags.info()

In [None]:
# Identifier-style columns of the train DICOM tag table.
dicom_id_cols = ['SOPInstanceUID', 'SeriesNumber', 'PatientID', 'InstanceNumber']
train_dicom_tags[dicom_id_cols]

In [None]:
# Show up to 100 columns when rendering wide DataFrames.
pd.options.display.max_columns = 100


In [None]:
# Select the DICOM tag rows for patient "10004" before displaying them, so this cell runs on a
# fresh kernel instead of relying on `p10004` having been assigned by a later cell.
p10004 = train_dicom_tags.query('PatientID == "10004"')
p10004

In [None]:
# DICOM tag rows for patient "10004" (PatientID is compared as a string).
p10004 = train_dicom_tags.query('PatientID == "10004"')

# Instance numbers as integers, shown in ascending order.
instance_numbers = p10004['InstanceNumber'].astype(int)
instance_numbers.sort_values()

In [None]:
# Boolean mask: True where an InstanceNumber value repeats an earlier row for this patient.
p10004['InstanceNumber'].duplicated()

In [None]:
# Integer file stems found in patient 10004's '51033' folder, in ascending order.
series_dir = train_image_path / "10004" / '51033'
inst_nums = sorted(int(dicom_file.stem) for dicom_file in series_dir.ls())

In [None]:
# Smallest and largest instance number (the list is sorted ascending).
inst_nums[0], inst_nums[-1]

In [None]:
# All series ids from the train series metadata, in ascending order.
train_series_meta['series_id'].sort_values()

In [None]:
# Metadata rows whose series_id equals 6.
train_series_meta.query('series_id == 6')

In [None]:
# Render the train DICOM tag table (pandas truncates long frames when displaying them).
train_dicom_tags

## Image level labels

In [None]:
# Number of image-level label rows for each injury name.
image_level_labels['injury_name'].value_counts()

In [None]:
# First three rows of the image-level labels table.
image_level_labels.head(3)

## Segmentations

In [None]:
# Path of the 'segmentations' entry under the raw dataset folder.
seg = RAW/'segmentations'

In [None]:
# Integer id parsed from the name of every entry in the segmentations folder.
seg_paths = [int(x.stem) for x in seg.ls()]
seg_ids = set(seg_paths)

# Only the last expression of a cell is rendered, so each overlap is displayed explicitly
# rather than left as a bare expression whose result is discarded.
# NOTE(review): unclear from here whether segmentation names are patient ids or series ids —
# confirm against the dataset description before interpreting these overlaps.
display(set(train_series_meta.patient_id) & seg_ids)
display(seg_ids & set(train_image_series))