## Generate the intrapatient analysis csv from the longitudinal test csv 

In [None]:
import pandas as pd

def generate_consecutive_samples(row):
    """
    Expand one patient row into every consecutive sub-sequence of its scans.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) providing
            ``pat_id``, ``scandate`` (scan dates joined with ``-``) and
            ``label``.

    Returns:
        A list of dicts with keys ``pat_id``, ``scandate`` and ``label``,
        one per consecutive window of scan dates. A window keeps the row's
        label only when it ends on the final scan of the full sequence;
        every other window is labelled 0.
    """
    # str() keeps a single-date value that was parsed as a number splittable.
    scandates = str(row['scandate']).split('-')
    label = row['label']
    pat_id = row['pat_id']
    n_scans = len(scandates)
    samples = []

    for start in range(n_scans):
        for end in range(start + 1, n_scans + 1):
            samples.append({
                'pat_id': pat_id,
                'scandate': '-'.join(scandates[start:end]),
                # Decide by position, not by substring search on the joined
                # string: a substring test mislabels a window whenever the
                # last date token also appears inside an earlier token.
                'label': label if end == n_scans else 0,
            })

    return samples

# Read the longitudinal test set (one row per patient sequence)
file_path = '/csvs/longitudinal_test.csv'
df = pd.read_csv(file_path)

# Expand each row into all of its consecutive scan windows
oversampled_rows = []
for _, row in df.iterrows():
    oversampled_rows.extend(generate_consecutive_samples(row))
consecutive_oversampled_data = pd.DataFrame(oversampled_rows)

# Persist the expanded samples without the dataframe index
output_file_path = '/analysis_csvs/intrapatient_analysis_test.csv'
consecutive_oversampled_data.to_csv(output_file_path, index=False)


## Merge the model output csv and intrapatient analysis csv

In [None]:
# Model outputs and the intrapatient samples they are combined with.
# NOTE(review): rows are paired purely by position, i.e. row i of
# infer_test.csv is assumed to describe row i of
# intrapatient_analysis_test.csv — confirm against the inference script.
validation_results_df = pd.read_csv("/csvs/infer_test.csv")
scandate_data_df = pd.read_csv("/analysis_csvs/intrapatient_analysis_test.csv")

if len(validation_results_df) != len(scandate_data_df):
    raise ValueError(
        "Row count mismatch: infer_test.csv has "
        f"{len(validation_results_df)} rows but intrapatient_analysis_test.csv "
        f"has {len(scandate_data_df)}; the files cannot be paired by position."
    )

# Bring over only the columns the model output does not already carry, so the
# combined frame never ends up with duplicate column labels.
extra_columns = [
    col for col in scandate_data_df.columns
    if col not in validation_results_df.columns
]
concatenated_df = pd.concat(
    [scandate_data_df[extra_columns], validation_results_df], axis=1
)

# The positional concat already supplies 'scandate'. A further merge on
# 'pat_id' would suffix the overlapping column (scandate_x / scandate_y) and
# pair every sample of a patient with every other sample of that patient.
validation_results_with_scandate = concatenated_df.copy()

# Adding a new column 'num_scans' to count the number of scandates
validation_results_with_scandate['num_scans'] = (
    validation_results_with_scandate['scandate']
    .apply(lambda x: len(str(x).split('-')))
)
validation_results_with_scandate.to_csv(
    "/analysis_csvs/intrapatient_analysis.csv", index=False
)

## Plot the intrapatient analysis results 

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots
plt.style.use(['nature'])
from confidenceinterval import roc_auc_score as auc_ci, accuracy_score as acc_ci, f1_score as f1_ci
from sklearn.metrics import (
    roc_auc_score, balanced_accuracy_score, average_precision_score,
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

plt.rcParams.update({
        'figure.dpi': '400' ,# Default DPI value
})

## UTIL FUNCTIONS 
def calculate_f1_scores(df):
    '''
    Return the mean of the per-patient weighted F1 scores.

    The dataframe is grouped by ``pat_id``; for each patient the weighted F1
    between the ``GT`` and ``ModelPredictions`` columns is computed, and the
    scores are averaged over patients.
    '''
    per_patient_scores = [
        f1_score(rows['GT'], rows['ModelPredictions'], average="weighted")
        for _, rows in df.groupby('pat_id')
    ]
    return np.mean(per_patient_scores)


def calculate_f1_scores_with_ci(df, n_bootstraps=100, ci=95):
    '''
    Mean per-patient macro F1 with a percentile bootstrap interval.

    Rows of ``df`` (not patients) are resampled with replacement
    ``n_bootstraps`` times; each resample is grouped by ``pat_id`` and the
    per-patient macro F1 scores are averaged. The interval bounds are the
    ``(100 - ci) / 2`` and ``100 - (100 - ci) / 2`` percentiles of those
    bootstrap means.

    Returns a tuple ``(mean_f1, lower_bound, upper_bound)``.
    '''
    def _mean_patient_f1(frame):
        # Average of the macro F1 computed separately for every patient.
        return np.mean([
            f1_score(rows['GT'], rows['ModelPredictions'], average="macro")
            for _, rows in frame.groupby('pat_id')
        ])

    point_estimate = _mean_patient_f1(df)

    n_rows = len(df)
    bootstrap_means = [
        _mean_patient_f1(df.sample(n=n_rows, replace=True))
        for _ in range(n_bootstraps)
    ]

    tail = (100 - ci) / 2
    lower_bound = np.percentile(bootstrap_means, tail)
    upper_bound = np.percentile(bootstrap_means, 100 - tail)

    return (point_estimate, lower_bound, upper_bound)


def get_metircs_data(df, max_scans=12):
    """
    Collect F1 metrics for each number of scans.

    For every scan count from 1 to ``max_scans`` the rows with that
    ``num_scans`` value are selected. A scan count is reported only when its
    rows contain more than one distinct ``GT`` value.

    Args:
        df: Dataframe with ``pat_id``, ``num_scans``, ``GT`` and
            ``ModelPredictions`` columns.
        max_scans: Largest scan count to evaluate (inclusive). Defaults to
            12, the previously hard-coded limit.

    Returns:
        Dataframe indexed by ``n_scans`` with columns ``f1_score``,
        ``f1_lower_ci``, ``f1_upper_ci``, ``n_cases``, ``n_true_prediction``,
        ``num_positive_GT`` and ``Tot(Pos)`` (the string
        ``"<n_cases>\\n(<num_positive_GT>)"``).
    """
    columns = [
        'f1_score',
        'f1_lower_ci', 'f1_upper_ci',
        'n_cases', 'n_scans', 'n_true_prediction',
        'num_positive_GT',
    ]
    records = []
    for num in range(1, max_scans + 1):
        sample_df = df[df["num_scans"] == num]
        # Skip scan counts whose ground truth holds a single value (or no rows).
        if sample_df["GT"].nunique() <= 1:
            continue

        f1_mean, lower_bound, upper_bound = calculate_f1_scores_with_ci(sample_df)
        records.append({
            'f1_score': f1_mean,
            'f1_lower_ci': lower_bound,
            'f1_upper_ci': upper_bound,
            'n_cases': len(sample_df),
            'n_scans': num,
            'n_true_prediction': int((sample_df["GT"] == sample_df["ModelPredictions"]).sum()),
            'num_positive_GT': int((sample_df["GT"] == 1).sum()),
        })

    # Passing `columns` keeps the schema stable even when nothing qualified.
    metrics_df = pd.DataFrame(records, columns=columns).set_index("n_scans")
    metrics_df["Tot(Pos)"] = [
        f"{total}\n({true})"
        for total, true in zip(metrics_df['n_cases'], metrics_df['num_positive_GT'])
    ]

    return metrics_df

def make_f1_plot(metrics_df):
    """
    Plot the F1 score against the number of scans with error bars.

    ``metrics_df`` supplies the x positions through its index and the
    ``f1_score``, ``f1_lower_ci`` and ``f1_upper_ci`` columns for the points
    and their asymmetric error bars. Returns the matplotlib figure.
    """
    f1 = metrics_df['f1_score']
    # Distances below and above each point, in the order errorbar expects.
    yerr = [f1 - metrics_df['f1_lower_ci'], metrics_df['f1_upper_ci'] - f1]

    marker_color = "#B12A90"
    errorbar_style = dict(
        fmt='o',
        color=marker_color,
        ecolor='lightgray',
        elinewidth=3,
        capsize=5,
        linestyle='-',
        linewidth=2,
        markersize=5,
        markeredgecolor=marker_color,
        markerfacecolor=marker_color,
        markeredgewidth=2,
        label='F1 Score',
    )

    fig, ax = plt.subplots(figsize=(4, 3))
    ax.errorbar(metrics_df.index, f1, yerr=yerr, **errorbar_style)

    # Only the x-axis is labelled; no y-axis label or title is set here.
    ax.set_xlabel('Number of Scans')
    ax.set_xticks(metrics_df.index)
    ax.set_xticklabels(metrics_df.index)

    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    return fig



In [None]:
# Load the merged intrapatient results, aggregate the metrics per number of
# scans, and draw the F1 plot (the returned figure is discarded).
test_intrapatient = pd.read_csv("/analysis_csvs/intrapatient_analysis.csv")
intrapatient_metrics = get_metircs_data(df=test_intrapatient)
_ = make_f1_plot(metrics_df=intrapatient_metrics)