In [None]:
import numpy as np
import pandas as pd
import pandas.api.types
import sklearn.metrics

import matplotlib.pyplot as plt


class ParticipantVisibleError(Exception):
    """Raised by the metric code in this file when its inputs fail validation."""


def normalize_probabilities_to_one(df: pd.DataFrame, group_columns: list) -> pd.DataFrame:
    """Rescale `group_columns` so that every row of that group sums to 1.

    Examples (per row): 0.75, 0.75 -> 0.5, 0.5 and 0.1, 0.1 -> 0.5, 0.5.

    The columns are overwritten on `df` itself; the same object is returned.

    Raises:
        ParticipantVisibleError: if the smallest row total over
            `group_columns` is exactly zero.
    """
    totals = df[group_columns].sum(axis=1)
    smallest_total = totals.min()
    if smallest_total == 0:
        raise ParticipantVisibleError('All rows must contain at least one non-zero prediction')

    # Divide each column of the group by the per-row total.
    for name in group_columns:
        df[name] /= totals
    return df


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str='patient_id') -> float:
    '''
    Mean of the sample-weighted log losses of every label group plus a
    derived any_injury group.

    Outline:
    1. For each label group (bowel, extravasation, kidney, liver, spleen):
        - rescale the group's columns so each row sums to 1 (solution and submission),
        - compute the log loss weighted by the solution's `<category>_weight` column.
    2. Build an any_injury label/prediction as the row-wise max of 1 - p(healthy)
       over all label groups.
    3. Compute its log loss weighted by the solution's `any_injury_weight` column.
    4. Return the plain average of all the losses.

    The id column is removed from both frames and the remaining values are
    compared positionally (via `.values`), so the rows of `solution` and
    `submission` must already be in the same order.

    Raises:
        ParticipantVisibleError: non-numeric, non-finite or negative values,
            a missing submission column, or a row whose group total is zero.
    '''
    # Operate on private copies: columns are dropped and renormalized below.
    solution = solution.copy()
    submission = submission.copy()
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    # ---- Input validation -------------------------------------------------
    submission_values = submission.values
    if not pandas.api.types.is_numeric_dtype(submission_values):
        raise ParticipantVisibleError('All submission values must be numeric')
    if not np.isfinite(submission_values).all():
        raise ParticipantVisibleError('All submission values must be finite')

    if solution.min().min() < 0:
        raise ParticipantVisibleError('All labels must be at least zero')
    if submission.min().min() < 0:
        raise ParticipantVisibleError('All predictions must be at least zero')

    # ---- Per-group losses -------------------------------------------------
    binary_targets = ['bowel', 'extravasation']
    triple_level_targets = ['kidney', 'liver', 'spleen']

    # category -> probability columns; binary groups first, then three-level
    # groups (dict insertion order fixes the processing order).
    column_groups = {}
    for category in binary_targets:
        column_groups[category] = [f'{category}_healthy', f'{category}_injury']
    for category in triple_level_targets:
        column_groups[category] = [f'{category}_healthy', f'{category}_low', f'{category}_high']

    label_group_losses = []
    for category, col_group in column_groups.items():
        solution = normalize_probabilities_to_one(solution, col_group)

        # Report the first absent column of this group, if any.
        col = next((name for name in col_group if name not in submission.columns), None)
        if col is not None:
            raise ParticipantVisibleError(f'Missing submission column {col}')
        submission = normalize_probabilities_to_one(submission, col_group)

        group_loss = sklearn.metrics.log_loss(
            y_true=solution[col_group].values,
            y_pred=submission[col_group].values,
            sample_weight=solution[f'{category}_weight'].values
        )
        label_group_losses.append(group_loss)

    # ---- Derived any_injury loss -----------------------------------------
    # The healthy column is the first entry of every group.
    healthy_cols = [columns[0] for columns in column_groups.values()]
    any_injury_labels = (1 - solution[healthy_cols]).max(axis=1)
    any_injury_predictions = (1 - submission[healthy_cols]).max(axis=1)
    label_group_losses.append(
        sklearn.metrics.log_loss(
            y_true=any_injury_labels.values,
            y_pred=any_injury_predictions.values,
            sample_weight=solution['any_injury_weight'].values
        )
    )

    return np.mean(label_group_losses)

In [None]:
def make_solution(train):
    """Return a copy of `train` with the sample-weight columns that `score` reads.

    Added columns (in this order):
      * `bowel_weight`          - 1 where `bowel_healthy` is truthy, else 2
      * `extravasation_weight`  - 1 where `extravasation_healthy` is truthy, else 6
      * `kidney_weight`, `liver_weight`, `spleen_weight`
                                - 1 where `<organ>_healthy` is truthy,
                                  else 2 where `<organ>_low` is truthy, else 4
      * `any_injury_weight`     - 6 where `any_injury` is truthy, else 1

    `train` itself is not modified.
    """
    solution = train.copy()

    # Weight given to a non-healthy row of each binary target.
    binary_injury_weights = {'bowel': 2, 'extravasation': 6}
    triple_level_targets = ['kidney', 'liver', 'spleen']

    for category, injury_weight in binary_injury_weights.items():
        healthy = train[f'{category}_healthy']
        solution[f'{category}_weight'] = np.where(healthy, 1, injury_weight)

    for category in triple_level_targets:
        healthy = train[f'{category}_healthy']
        low = train[f'{category}_low']
        # healthy -> 1, low -> 2, anything else (high) -> 4
        solution[f'{category}_weight'] = np.where(healthy, 1, np.where(low, 2, 4))

    solution['any_injury_weight'] = np.where(train['any_injury'], 6, 1)
    return solution

In [None]:
# Read the training labels and the sample submission, then attach the
# sample-weight columns to a copy of the labels.
train, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/rsna-2023-abdominal-trauma-detection/train.csv',
        '/kaggle/input/rsna-2023-abdominal-trauma-detection/sample_submission.csv',
    )
)
solution = make_solution(train)

In [None]:
# Sanity check: use the training labels themselves as the submission, i.e.
# predictions that equal the ground truth.
score(solution=solution, submission=train, row_id_column_name='patient_id')

In [None]:
# Scan constant predictions for the bowel target: for each candidate injury
# probability p, every row predicts [1 - p, p] and the weighted log loss is
# recorded. The minimiser is compared with the plain mean of the injury column.
category = 'bowel'
x = train[[f'{category}_healthy', f'{category}_injury']]
# Mean of the second (injury) column. `.iloc` makes the lookup explicitly
# positional; a bare integer key on this string-labelled Series relies on
# the deprecated positional fallback of Series.__getitem__.
mean_x = x.mean().iloc[1]

# Calculate log loss scores for different constant predictions
const_pred = np.linspace(0, 1, 1000)
scores = [sklearn.metrics.log_loss(x, 
                                   np.ones_like(x) * [1 - pred, pred], 
                                   sample_weight=solution[f'{category}_weight']
                                  ) for pred in const_pred]

# Find the index of the minimum score
min_score_idx = np.argmin(scores)
min_score = scores[min_score_idx]
min_score_pred = const_pred[min_score_idx]

# Plot the log loss scores; gray line = column mean, red line = best constant
plt.plot(const_pred, scores)
plt.axvline(x=mean_x, color='gray', linestyle='dashed', label=f'Mean x = {mean_x:.3f}')
plt.axvline(x=min_score_pred, color='red', linestyle='dashed', label=f'Min Score = {min_score:.3f}')
plt.xlabel('Constant Prediction')
plt.ylabel('Log Loss Score')
plt.title('Log Loss Scores for Constant Predictions')
plt.legend()
plt.show()

print('Mean x:', mean_x, 1 - mean_x)
print('Minimum Log Loss Score:', min_score)
print('Corresponding Prediction:', min_score_pred, 1 - min_score_pred)

In [None]:
%%time
# Best constant prediction per column, filled in by the grid searches.
best_constant = {}
for category in ['bowel', 'extravasation']:
    print('*' * 10, category, '*' * 10)
    x = train[[f'{category}_healthy', f'{category}_injury']]
    # Mean of the second (injury) column. `.iloc` makes the lookup explicitly
    # positional; a bare integer key on this string-labelled Series relies on
    # the deprecated positional fallback of Series.__getitem__.
    mean_x = x.mean().iloc[1]

    # Calculate log loss scores for different constant predictions.
    # The search range runs from the column mean up to 7x the column mean.
    const_pred = np.linspace(mean_x, mean_x * 7, 1000)
    scores = [sklearn.metrics.log_loss(x, 
                                       np.ones_like(x) * [1 - pred, pred], 
                                       sample_weight=solution[f'{category}_weight']
                                      ) for pred in const_pred]
    # Find the index of the minimum score
    min_score_idx = np.argmin(scores)
    min_score = scores[min_score_idx]
    min_score_pred = const_pred[min_score_idx]

    # Store the winning pair so both columns sum to 1.
    best_constant[category + '_healthy'] = 1 - min_score_pred
    best_constant[category + '_injury'] = min_score_pred

    print('Mean x:', mean_x, 1 - mean_x)
    print('Minimum Log Loss Score:', min_score)
    print('Corresponding Prediction:', min_score_pred, 1 - min_score_pred)

In [None]:
%%time
# 25 x 25 grid search over constant (low, high) probabilities for each
# three-level target; healthy takes the remainder. Results are written into
# the existing `best_constant` dict, which this cell does not create.
for category in ['kidney', 'liver', 'spleen']:
    print('*' * 10, category, '*' * 10)
    x = train[[f'{category}_healthy', f'{category}_low', f'{category}_high']]
    # Column means, computed once. `.iloc` makes the lookups explicitly
    # positional; bare integer keys on this string-labelled Series rely on
    # the deprecated positional fallback of Series.__getitem__.
    column_means = x.mean()
    mean_x_low = column_means.iloc[1]
    mean_x_high = column_means.iloc[2]

    # Calculate log loss scores for different constant predictions.
    # low is searched in [mean, 3 * mean], high in [mean, 5 * mean].
    const_pred = np.linspace(mean_x_low, mean_x_low * 3, 25)
    const_pred_2 = np.linspace(mean_x_high, mean_x_high * 5, 25)
    preds = []
    scores = []
    for pred in const_pred: 
        for pred_2 in const_pred_2:
            # [healthy, low, high]; healthy is whatever probability is left
            p = [1 - pred - pred_2, pred, pred_2]
            preds.append(p)
            scores.append(sklearn.metrics.log_loss(x, 
                                       np.ones_like(x) * p, 
                                       sample_weight=solution[f'{category}_weight']))
    assert len(preds) == len(scores)
    # Find the index of the minimum score
    min_score_idx = np.argmin(scores)
    min_score = scores[min_score_idx]
    min_score_pred = preds[min_score_idx]

    best_constant[category + '_healthy'] = min_score_pred[0]
    best_constant[category + '_low'] = min_score_pred[1]
    best_constant[category + '_high'] = min_score_pred[2]

    print('mean_x_low:', mean_x_low)
    print('mean_x_high:', mean_x_high)
    print('Minimum Log Loss Score:', min_score)
    print('Corresponding Prediction:', min_score_pred)

In [None]:
# Build a frame shaped like `train` in which every column except the first
# and the last holds its best constant prediction.
t = train.copy()
cols = t.columns[1:-1]
for col, constant in zip(cols, cols.map(best_constant)):
    t[col] = constant
t

In [None]:
# Metric value when every row predicts the constants stored in `t`.
score(solution=solution, submission=t)

### Maybe the any injury score is not being optimized enough

In [None]:
# Scan constant predictions for the any_injury label: each candidate
# probability is predicted for every row and the weighted log loss recorded.
category = 'any_injury'
x = train[category]
mean_x = x.mean()

# Calculate log loss scores for different constant predictions
const_pred = np.linspace(0, 1, 1000)
scores = []
for pred in const_pred:
    scores.append(
        sklearn.metrics.log_loss(
            x,
            np.ones_like(x) * pred,
            sample_weight=solution[f'{category}_weight'],
        )
    )

# Locate the best-scoring candidate
min_score_idx = np.argmin(scores)
min_score = scores[min_score_idx]
min_score_pred = const_pred[min_score_idx]
any_injury_optimal = min_score_pred

# Plot the curve; gray line = label mean, red line = best constant
plt.plot(const_pred, scores)
for position, colour, text in (
    (mean_x, 'gray', f'Mean x = {mean_x:.3f}'),
    (min_score_pred, 'red', f'Min Score = {min_score:.3f}'),
):
    plt.axvline(x=position, color=colour, linestyle='dashed', label=text)
plt.xlabel('Constant Prediction')
plt.ylabel('Log Loss Score')
plt.title('Log Loss Scores for Constant Predictions')
plt.legend()
plt.show()

print('Mean x:', mean_x, 1 - mean_x)
print('Minimum Log Loss Score:', min_score)
print('Corresponding Prediction:', min_score_pred, 1 - min_score_pred)

In [None]:
# Re-tune the constant extravasation_injury probability against the full
# metric (`score`, which also includes the derived any_injury loss), keeping
# every other column of `t` fixed. Candidates run from the current constant
# up to .692.
preds, scores = [], []
for x in np.linspace(t['extravasation_injury'][0], .692, 100):
    preds.append(x)
    tt = t.copy()
    tt['extravasation_injury'], tt['extravasation_healthy'] = x, 1 - x
    scores.append(score(solution, tt))

min_score_idx = np.argmin(scores)
min_score, min_score_pred = scores[min_score_idx], preds[min_score_idx]
optimal_extravasation_injury = min_score_pred

# Plot the log loss scores.
# NOTE(review): `mean_x` is not assigned in this cell; the gray line uses
# whatever value an earlier cell left behind - confirm this is intended.
plt.plot(preds, scores)
for position, colour, text in (
    (mean_x, 'gray', f''),
    (min_score_pred, 'red', f'Min Score = {min_score:.3f}'),
):
    plt.axvline(x=position, color=colour, linestyle='dashed', label=text)
plt.xlabel('Constant Prediction')
plt.ylabel('Log Loss Score')
plt.title('Log Loss Scores for Constant Predictions')
plt.legend()
plt.show()

print('Minimum Log Loss Score:', min_score)
print('Corresponding Prediction:', min_score_pred)


In [None]:
# Try raising the constant spleen_high probability by an offset (taking the
# mass from spleen_healthy) and evaluate each offset with the full metric.
base_spleen_low = t['spleen_low'][0]
base_spleen_high = t['spleen_high'][0]

preds, scores = [], []
# Largest offset tried: the gap between the best any_injury constant and the
# current spleen low + high constants.
max_delta = any_injury_optimal - base_spleen_low - base_spleen_high
for x in np.linspace(.01, max_delta, 100):
    preds.append(x)
    tt = t.copy()
    tt['spleen_high'] = tt['spleen_high'] + x
    tt['spleen_healthy'] = 1 - tt['spleen_low'] - tt['spleen_high']
    scores.append(score(solution, tt))

min_score_idx = np.argmin(scores)
min_score = scores[min_score_idx]
# Convert the best offset back into an absolute spleen_high probability.
min_score_pred = preds[min_score_idx] + base_spleen_high

# Plot the log loss scores against the absolute spleen_high probability
plt.plot([delta + base_spleen_high for delta in preds], scores)
plt.axvline(x=min_score_pred, color='red', linestyle='dashed', label=f'Min Score = {min_score:.3f}')
plt.xlabel('Constant Prediction')
plt.ylabel('Log Loss Score')
plt.title('Log Loss Scores for Constant Predictions')
plt.legend()
plt.show()

print('Minimum Log Loss Score:', min_score)
print('Corresponding Prediction:', min_score_pred)

In [None]:
# Show the per-column constants collected by the grid-search cells.
display(best_constant)

In [None]:
# 1.779473 / (0.936447 + 1.779473) # checking .66 LB notebook's true extravasation
# ratio

In [None]:
# Fill every column after the first with its best constant, then override
# the extravasation pair with the value re-tuned against the full metric.
prediction_cols = submission.columns[1:]
for col, constant in zip(prediction_cols, prediction_cols.map(best_constant)):
    submission[col] = constant
submission['extravasation_injury'] = optimal_extravasation_injury
submission['extravasation_healthy'] = 1 - optimal_extravasation_injury

display(submission, best_constant)
submission.to_csv('submission.csv', index=False)