In [None]:
import sys
sys.path.append('/autofs/homes/005/fd881/repos/MedImaging-ModelDriftMonitoring/src')

In [None]:
import pandas as pd
from model_drift import mgb_locations
import matplotlib.pyplot as plt
import os
from sklearn.metrics import roc_auc_score
from datetime import timedelta
from tqdm.auto import tqdm
import numpy as np
from joblib import Parallel, delayed
from model_drift.data import mgb_data
import seaborn as sns

In [None]:
# Function to generate windows
def generate_windows(start_date, end_date, window_size_days=14, stride_days=1):
    """Yield ``(window_start, window_end)`` pairs sliding across a date range.

    Windows are ``window_size_days`` long and advance by ``stride_days``. A window
    is produced only while its end does not exceed ``end_date``.

    Args:
        start_date: Start of the first window (any object supporting ``+ timedelta``).
        end_date: Upper bound for every window end.
        window_size_days: Length of each window in days.
        stride_days: Number of days the window start advances per step.

    Yields:
        Tuples ``(window_start, window_end)`` with
        ``window_end == window_start + window_size_days``.

    Raises:
        ValueError: If ``stride_days`` is not positive; the window would never
            advance and the generator would loop forever.
    """
    if stride_days <= 0:
        raise ValueError(f"stride_days must be positive, got {stride_days}")

    # Build the two offsets once instead of on every iteration.
    window_length = timedelta(days=window_size_days)
    stride = timedelta(days=stride_days)

    current_start = start_date
    while current_start + window_length <= end_date:
        yield current_start, current_start + window_length
        current_start += stride

In [None]:
# Root directory under which this notebook writes its CSVs and figures.
output_dir = '/autofs/cluster/qtim/projects/xray_drift/analysis/'

# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the label table and reduce StudyDate to plain `datetime.date` objects,
# which is what the sliding-window comparisons further down operate on.
labels_df = pd.read_csv(mgb_locations.labels_csv)
labels_df['StudyDate'] = pd.to_datetime(labels_df['StudyDate']).dt.date

In [None]:
# The last 13 columns of the label table are used as the finding/label columns.
diseases = list(labels_df.columns[-13:])

# Label Counts over Time 

In [None]:
results_counts = {}  # disease -> frame of positive-label counts per sliding window

for disease in tqdm(diseases, total=len(diseases)):
    # Index the label column by study date so windows can be cut with boolean masks.
    disease_df = labels_df[['StudyDate', disease]].set_index('StudyDate')
    first_day, last_day = disease_df.index.min(), disease_df.index.max()

    counts_data = []  # (window start, number of positive labels in the window)
    for window_start, window_end in generate_windows(first_day, last_day):
        in_window = (disease_df.index >= window_start) & (disease_df.index < window_end)
        counts_data.append((window_start, disease_df.loc[in_window, disease].sum()))

    # Key the frame by window start and name the value column after the disease
    # so that all per-disease frames can be concatenated side by side.
    results_counts[disease] = (
        pd.DataFrame(counts_data, columns=['WindowStart', 'Counts'])
        .set_index('WindowStart')
        .rename(columns={'Counts': disease})
    )

# One column per disease, one row per window start.
counts_df = pd.concat(results_counts.values(), axis=1)

In [None]:
#counts_df.to_csv(os.path.join(output_dir, 'raw_counts.csv'), index=True)

In [None]:
# Grid geometry: two panels per row, as many rows as needed.
n_panels = len(counts_df.columns)
n_cols = 2
n_rows = int(np.ceil(n_panels / n_cols))

plt.figure(figsize=(n_cols * 6, n_rows * 4))

# One panel per finding, filled left-to-right, top-to-bottom.
for panel, disease in enumerate(counts_df.columns, start=1):
    ax = plt.subplot(n_rows, n_cols, panel)
    ax.plot(counts_df.index, counts_df[disease])
    ax.set(title=disease, xlabel='Window Start Date', ylabel='Counts in 14 Day Window')
    plt.xticks(rotation=45)

plt.tight_layout()
#plt.savefig(os.path.join(output_dir, 'raw_counts_findings.png'))
plt.show()

# Prevalence over Time

In [None]:
results_prevalence = {}  # disease -> frame of label prevalence per sliding window

for disease in tqdm(diseases, total=len(diseases)):
    # Index the label column by study date so windows can be cut with boolean masks.
    disease_df = labels_df[['StudyDate', disease]].set_index('StudyDate')
    first_day, last_day = disease_df.index.min(), disease_df.index.max()

    proportions_data = []  # (window start, fraction of positive labels in the window)
    for window_start, window_end in generate_windows(first_day, last_day):
        in_window = (disease_df.index >= window_start) & (disease_df.index < window_end)
        window_data = disease_df[in_window]
        n_total = window_data.shape[0]
        # Windows without any study are reported as prevalence 0.
        if n_total > 0:
            proportion_positive = window_data[disease].sum() / n_total
        else:
            proportion_positive = 0
        proportions_data.append((window_start, proportion_positive))

    # Key the frame by window start and name the value column after the disease
    # so that all per-disease frames can be concatenated side by side.
    results_prevalence[disease] = (
        pd.DataFrame(proportions_data, columns=['WindowStart', 'Prevalence'])
        .set_index('WindowStart')
        .rename(columns={'Prevalence': disease})
    )

# One column per disease, one row per window start.
prevalence_df = pd.concat(results_prevalence.values(), axis=1)

In [None]:
#prevalence_df.to_csv(os.path.join(output_dir, 'prevalence.csv'), index=True)

In [None]:
# Grid geometry: two panels per row, as many rows as needed.
n_panels = len(prevalence_df.columns)
n_cols = 2
n_rows = int(np.ceil(n_panels / n_cols))

plt.figure(figsize=(n_cols * 6, n_rows * 4))

# One panel per finding, filled left-to-right, top-to-bottom.
for panel, disease in enumerate(prevalence_df.columns, start=1):
    ax = plt.subplot(n_rows, n_cols, panel)
    ax.plot(prevalence_df.index, prevalence_df[disease])
    ax.set(title=disease, xlabel='Window Start Date', ylabel='Prevalence')
    plt.xticks(rotation=45)

plt.tight_layout()
#plt.savefig(os.path.join(output_dir, 'prevalence_findings.png'))
plt.show()

In [None]:

def compute_prevalence_for_batch(batch_args):
    """Compute label prevalence for a batch of windows.

    Args:
        batch_args: Iterable of ``(window_start, window_end, disease_df, disease)``
            tuples. ``disease_df`` is indexed by study date and ``disease`` is the
            name of the label column to average.

    Returns:
        List of ``(window_start, prevalence)`` tuples, one per window that could
        be processed. Windows containing no rows get prevalence 0. Entries that
        fail are reported on stdout and skipped (best effort).
    """
    proportions_data = []
    for args in batch_args:
        # Pre-bind the names used by the error message: if unpacking `args`
        # itself fails they would otherwise be unbound (or stale from the
        # previous iteration) inside the except block.
        window_start = window_end = None
        try:
            window_start, window_end, disease_df, disease = args
            in_window = (disease_df.index >= window_start) & (disease_df.index < window_end)
            window_data = disease_df.loc[in_window]
            positive_cases = window_data[disease].sum()
            total_cases = window_data.shape[0]
            proportion_positive = positive_cases / total_cases if total_cases > 0 else 0
            proportions_data.append((window_start, proportion_positive))
        except Exception as e:
            print(f"Error processing window {window_start} - {window_end}: {e}")
    return proportions_data

def batch_arguments(arguments, batch_size):
    """Chunk ``arguments`` into consecutive slices of at most ``batch_size`` items."""
    batches = []
    for offset in range(0, len(arguments), batch_size):
        batches.append(arguments[offset:offset + batch_size])
    return batches

def calculate_prevalence_for_category(df, category, diseases, batch_size=10, n_jobs=4, allowed_values=None):
    """Compute sliding-window label prevalence per (category value, disease) pair.

    Windows are 30 days long and advance one day at a time.

    Args:
        df: Frame with a ``StudyDate`` column, the grouping column ``category``
            and one ``label.<disease>`` column per entry of ``diseases``.
        category: Name of the column whose values define the subgroups.
        diseases: Disease names without the ``label.`` prefix.
        batch_size: Number of windows handed to a worker per task.
        n_jobs: Number of joblib workers.
        allowed_values: Optional collection of category values to process; all
            other values are skipped. When omitted, the notebook-level
            ``top_5_poc`` list is used, as before.

    Returns:
        Dict mapping ``"<category value>_label.<disease>"`` to a DataFrame with
        the columns ``WindowStart`` and that same key.

    Raises:
        KeyError: If a ``label.<disease>`` column is missing from ``df``.
    """
    results_prevalence = {}
    label_columns = ['label.' + d for d in diseases]
    # Backward-compatible fallback on the notebook global when no explicit filter is given.
    allowed = top_5_poc if allowed_values is None else allowed_values

    for cat_value in tqdm(df[category].dropna().unique(), desc=f"Processing {category}"):
        if cat_value not in allowed:
            print(f"Skipping {cat_value}")
            continue
        print(f"Processing {cat_value}")

        # The subgroup rows and their windows do not depend on the disease, so
        # they are prepared once per category value instead of once per disease.
        cat_df = df.loc[df[category] == cat_value, ['StudyDate'] + label_columns].copy()
        cat_df['StudyDate'] = pd.to_datetime(cat_df['StudyDate'], errors='coerce')
        cat_df = cat_df.dropna(subset=['StudyDate']).set_index('StudyDate')

        start_date = cat_df.index.min()
        end_date = cat_df.index.max()
        windows = list(generate_windows(start_date, end_date, stride_days=1, window_size_days=30))

        for disease in tqdm(label_columns, desc="Processing diseases", leave=False):
            # Ship only the column a worker actually reads; the frame is
            # serialized once per window tuple.
            disease_df = cat_df[[disease]]
            window_args = [(window_start, window_end, disease_df, disease)
                           for window_start, window_end in windows]
            batched_args = batch_arguments(window_args, batch_size=batch_size)

            with Parallel(n_jobs=n_jobs) as parallel:
                all_proportions_data = parallel(
                    delayed(compute_prevalence_for_batch)(batch) for batch in batched_args
                )

            # Flatten the per-batch lists returned by the workers.
            proportions_data = [item for sublist in all_proportions_data for item in sublist]
            key = f'{cat_value}_{disease}'
            results_prevalence[key] = pd.DataFrame(proportions_data, columns=['WindowStart', key])

    return results_prevalence

def plot_prevalence(performance_df, save_path=None, show=True):
    """Plot prevalence curves, one panel per disease and one line per subgroup.

    Args:
        performance_df: Frame indexed by window start whose columns are named
            ``"<group>_<disease>"``.
        save_path: If given, the figure is written to this path.
        show: Show the figure when True, otherwise close it.
    """
    # Group values may contain underscores themselves (e.g. "Varian_4343R"),
    # so the column name is split only once, from the right. This assumes the
    # disease part contains no underscore.
    column_parts = [col.rsplit('_', 1) for col in performance_df.columns]
    diseases = set(parts[-1] for parts in column_parts)
    manufacturers = set(parts[0] for parts in column_parts)

    n_diseases = len(diseases)
    cols = 2
    rows = int(np.ceil(n_diseases / cols))

    plt.figure(figsize=(cols * 6, rows * 4))

    for i, disease in enumerate(sorted(diseases), start=1):
        ax = plt.subplot(rows, cols, i)
        for manufacturer in sorted(manufacturers):
            # These two groups are deliberately left out of the plots.
            if manufacturer in ['SIEMENS', 'GE MEDICAL SYSTEMS']:
                continue
            col_name = f'{manufacturer}_{disease}'
            if col_name in performance_df.columns:
                avg_performance = performance_df[col_name].mean()
                label = f"{manufacturer} (avg: {avg_performance:.2f})"
                ax.plot(performance_df.index, performance_df[col_name], label=label)
        ax.set_title(disease)
        ax.set_xlabel('Window Start Date')
        # The window length is decided by the caller, so it is not stated here
        # (the previous "14 Day" text did not match the 30-day windows in use).
        ax.set_ylabel('Prevalence')
        ax.legend()
        plt.xticks(rotation=45)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
    if show:
        plt.show()
    else:
        plt.close()


# Performance Over Time

In [None]:
# Findings predicted by the model. The ORDER matters: position i of each
# `activation` / `label` vector in preds.jsonl is mapped to label_cols[i].
# (An earlier 8-label list was assigned here and immediately overwritten; it
# has been removed so that only the list actually in use is visible.)
label_cols = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Lung Opacity',
    'Pleural Other',
    'Pleural Effusion',
    'Pneumonia',
    'Pneumothorax',
    'Support Devices',
]
#label_cols = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Lung Opacity', 'Pleural Other', 'Pleural Effusion', 'Pneumonia', 'Pneumothorax', 'Support Devices', 'Enlarged Cardiomediastinum', ]



In [None]:
# read jsonl file from pred_folder in df
# Only the last assignment was ever effective; the earlier runs are kept for reference:
#   /autofs/cluster/qtim/projects/xray_drift/inferences/mgb_data_from_chexpert_retrain_frontal_only_lr1e-4_frozen_step25_qwen_labels_allclasses
#   /autofs/cluster/qtim/projects/xray_drift/inferences/classification_imagenet_nogamma/
pred_folder = '/autofs/cluster/qtim/projects/xray_drift/inferences/classification_final_allpoc_inference/'

df_preds = pd.read_json(os.path.join(pred_folder, 'preds.jsonl'), lines=True)

# Expand the per-row `activation` and `label` vectors into one column per finding.
# Position i of each vector is named after label_cols[i]; pandas raises if the
# vector length does not match len(label_cols).
activation_frame = pd.DataFrame(
    df_preds['activation'].values.tolist(),
    columns=[f"activation.{c}" for c in label_cols],
    index=df_preds.index,
)
label_frame = pd.DataFrame(
    df_preds['label'].values.tolist(),
    columns=[f"label.{c}" for c in label_cols],
    index=df_preds.index,
)
df_preds = pd.concat([df_preds, activation_frame, label_frame], axis=1)

In [None]:
def make_index(row: pd.Series):
    """Build the join key ``<PatientID>_<AccessionNumber>_<SOPInstanceUID>`` for one row."""
    return f"{row.PatientID}_{row.AccessionNumber}_{row.SOPInstanceUID}"

# df_dicom only has anonymized dates, so the study dates are pulled from the labels table.
study_dates = labels_df[['StudyInstanceUID', 'StudyDate']].copy()

# Swap the inventory's own StudyDate for the one from the labels table
# (inner join on StudyInstanceUID).
df_dicom = (
    pd.read_csv(mgb_locations.dicom_inventory_csv)
    .drop(columns=["StudyDate"])
    .merge(study_dates, on="StudyInstanceUID")
)
df_dicom["index"] = df_dicom.apply(make_index, axis=1)


In [None]:
# Attach the DICOM metadata (including the study date) to every prediction row.
df_preds = df_preds.merge(df_dicom, on="index")

# Crosswalk: anonymized accession number -> original accession number.
crosswalk = pd.read_csv(mgb_locations.crosswalk_csv, dtype={"ANON_AccNumber": int})
crosswalk = crosswalk[["ANON_AccNumber", "ORIG_AccNumber"]]

# Report-level metadata, keyed by the original accession number.
report_columns = [
    "Accession Number",
    "Point of Care",
    "Patient Sex",
    "Patient Age",
    "Is Stat",
    "Exam Code",
]
reports = pd.read_csv(mgb_locations.reports_csv, dtype=str)[report_columns].copy()

# Both joins are left joins validated as many-to-one, so they cannot multiply
# prediction rows.
df_preds = (
    df_preds
    .merge(
        crosswalk,
        how="left",
        left_on="AccessionNumber",
        right_on="ANON_AccNumber",
        validate="many_to_one",
    )
    .merge(
        reports,
        how="left",
        left_on="ORIG_AccNumber",
        right_on="Accession Number",
        validate="many_to_one",
    )
)

# Keep StudyDate as plain `datetime.date` objects.
df_preds['StudyDate'] = pd.to_datetime(df_preds['StudyDate']).dt.date

In [None]:
# Exclude laterals (ViewPosition == 'LL'); rows with a missing ViewPosition are kept.
df_preds = df_preds.loc[df_preds['ViewPosition'].ne('LL')]

# Optional date filter, currently disabled:
# select only rows where study date is after 2019/10/21
#df_preds = df_preds[df_preds['StudyDate'] > pd.to_datetime('2019-10-21').date()]

df_preds['ViewPosition'].value_counts()

In [None]:
results_performance = {}  # disease -> frame of AUROC per 30-day sliding window

for disease in tqdm(label_cols, total=len(label_cols)):
    disease_label = f'label.{disease}'
    disease_score = f'activation.{disease}'
    # Keep only the columns needed for this disease, indexed by study date.
    disease_df = df_preds[['StudyDate', disease_label, disease_score]].copy()
    disease_df.set_index('StudyDate', inplace=True)

    # Overall date range covered by the predictions.
    start_date = disease_df.index.min()
    end_date = disease_df.index.max()

    scores_data = []  # (window start, AUROC)

    for window_start, window_end in generate_windows(start_date, end_date, window_size_days=30):
        window_data = disease_df[(disease_df.index >= window_start) & (disease_df.index < window_end)]
        try:
            score = roc_auc_score(window_data[disease_label], window_data[disease_score])
        except ValueError:
            # roc_auc_score rejects windows it cannot score (e.g. only one
            # class present, or no rows). Record NaN instead of aborting the
            # whole cell, as compute_scores_for_batch does.
            score = np.nan
        scores_data.append((window_start, score))

    # A dedicated temp name avoids clobbering the notebook-level `scores_df`.
    auc_frame = pd.DataFrame(scores_data, columns=['WindowStart', f'{disease}'])
    results_performance[disease] = auc_frame.set_index('WindowStart')

# One column per disease, one row per window start.
performance_df = pd.concat(results_performance.values(), axis=1)

In [None]:

def compute_scores_for_batch(batch_args):
    """Compute the AUROC of every window in a batch.

    Each entry of ``batch_args`` is a tuple
    ``(window_start, window_end, disease_df, disease_label, disease_score)``.
    Returns a list of ``(window_start, auroc)`` tuples; the AUROC is NaN when
    ``roc_auc_score`` raises ``ValueError`` for the window.
    """
    results = []
    for lower, upper, frame, label_col, score_col in batch_args:
        in_window = (frame.index >= lower) & (frame.index < upper)
        subset = frame.loc[in_window]
        try:
            auc = roc_auc_score(subset[label_col], subset[score_col])
        except ValueError:
            auc = np.nan
        results.append((lower, auc))
    return results

def batch_arguments(arguments, batch_size):
    """Split ``arguments`` into consecutive chunks of at most ``batch_size`` items."""
    chunk_starts = range(0, len(arguments), batch_size)
    return [arguments[start:start + batch_size] for start in chunk_starts]

def calculate_performance_for_category(df, category, label_cols, batch_size=10, n_jobs=-1, allowed_values=None):
    """Compute sliding-window AUROC per (category value, disease) pair.

    Windows are 30 days long and advance one day at a time.

    Args:
        df: Frame with a ``StudyDate`` column, the grouping column ``category``
            and, per disease, the columns ``label.<disease>`` and
            ``activation.<disease>``.
        category: Name of the column whose values define the subgroups.
        label_cols: Disease names without prefix.
        batch_size: Number of windows handed to a worker per task.
        n_jobs: Number of joblib workers. This argument is now honored; it was
            previously ignored in favor of a hard-coded 4.
        allowed_values: Optional collection of category values to process; all
            other values are skipped. When omitted, the notebook-level
            ``top_5_poc`` list is used, as before.

    Returns:
        Dict mapping ``"<category value>_<disease>"`` to a DataFrame with the
        columns ``WindowStart`` and that same key.

    Raises:
        KeyError: If a required label or activation column is missing from ``df``.
    """
    results_performance = {}
    # Backward-compatible fallback on the notebook global when no explicit filter is given.
    allowed = top_5_poc if allowed_values is None else allowed_values

    needed_columns = ['StudyDate']
    for disease in label_cols:
        needed_columns += [f'label.{disease}', f'activation.{disease}']

    for cat_value in tqdm(df[category].dropna().unique(), desc=f"Processing {category}"):
        if cat_value not in allowed:
            print(f"Skipping {cat_value}")
            continue
        print(f"Processing {cat_value}")

        # The subgroup rows and their windows do not depend on the disease, so
        # they are prepared once per category value instead of once per disease.
        cat_df = df.loc[df[category] == cat_value, needed_columns].copy()
        cat_df['StudyDate'] = pd.to_datetime(cat_df['StudyDate'], errors='coerce')
        cat_df = cat_df.dropna(subset=['StudyDate']).set_index('StudyDate')

        start_date = cat_df.index.min()
        end_date = cat_df.index.max()
        windows = list(generate_windows(start_date, end_date, stride_days=1, window_size_days=30))

        for disease in tqdm(label_cols, desc="Processing diseases", leave=False):
            disease_label = f'label.{disease}'
            disease_score = f'activation.{disease}'
            # Ship only the two columns a worker actually reads.
            disease_df = cat_df[[disease_label, disease_score]]
            window_args = [(window_start, window_end, disease_df, disease_label, disease_score)
                           for window_start, window_end in windows]
            batched_args = batch_arguments(window_args, batch_size=batch_size)

            with Parallel(n_jobs=n_jobs) as parallel:
                all_scores_data = parallel(
                    delayed(compute_scores_for_batch)(batch) for batch in batched_args
                )

            # Flatten the per-batch lists returned by the workers.
            scores_data = [item for sublist in all_scores_data for item in sublist]
            key = f'{cat_value}_{disease}'
            results_performance[key] = pd.DataFrame(scores_data, columns=['WindowStart', key])

    return results_performance

def plot_performance(performance_df, save_path=None, show=True):
    """Plot AUROC curves, one panel per disease and one line per subgroup.

    Args:
        performance_df: Frame indexed by window start whose columns are named
            ``"<group>_<disease>"``.
        save_path: If given, the figure is written to this path.
        show: Show the figure when True, otherwise close it.
    """
    # Group values may contain underscores themselves (e.g. "Varian_4343R"),
    # so the column name is split only once, from the right. This replaces the
    # hard-coded group list that was needed because split('_')[0] truncated
    # such names. It assumes the disease part contains no underscore.
    column_parts = [col.rsplit('_', 1) for col in performance_df.columns]
    diseases = set(parts[-1] for parts in column_parts)
    manufacturers = set(parts[0] for parts in column_parts)

    n_diseases = len(diseases)
    cols = 2
    rows = int(np.ceil(n_diseases / cols))

    plt.figure(figsize=(cols * 6, rows * 4))

    for i, disease in enumerate(sorted(diseases), start=1):
        ax = plt.subplot(rows, cols, i)
        for manufacturer in sorted(manufacturers):
            # These two groups are deliberately left out of the plots.
            if manufacturer in ['SIEMENS', 'GE MEDICAL SYSTEMS']:
                continue
            col_name = f'{manufacturer}_{disease}'
            if col_name in performance_df.columns:
                avg_performance = performance_df[col_name].mean()
                label = f"{manufacturer} (avg: {avg_performance:.2f})"
                ax.plot(performance_df.index, performance_df[col_name], label=label)
        ax.set_title(disease)
        ax.set_xlabel('Window Start Date')
        ax.set_ylabel('AUROC')
        ax.legend()
        plt.xticks(rotation=45)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path)
    if show:
        plt.show()
    else:
        plt.close()

In [None]:
#df_preds['Point of Care'].value_counts()
#
## group all the point of Cares that contain OP into one category
#df_preds['Point of Care'] = df_preds['Point of Care'].apply(lambda x: 'MGH IMG XR OPX' if 'OP' in x else x)
#
#df_preds['Point of Care'].value_counts()


In [None]:

# Keep the 2 most frequent ManufacturerModelName values. Despite its name,
# `top_5_poc` holds model names (not points of care) and only two of them.
# calculate_prevalence_for_category / calculate_performance_for_category read
# this global to decide which category values to process.
top_5_poc = df_preds['ManufacturerModelName'].value_counts().index[:2].tolist()
top_5_poc

In [None]:
# Number of prediction rows per ManufacturerModelName (the ranking `top_5_poc` is taken from).
df_preds['ManufacturerModelName'].value_counts()

In [None]:
# Grouping columns to analyse. Other candidates that have been used here:
# 'Manufacturer', 'ViewPosition', 'PhotometricInterpretation', 'BitsAllocated'
categories = ['ManufacturerModelName']

output_dir_groups = os.path.join(output_dir, 'subgroup_analysis_nolaterals')
os.makedirs(output_dir_groups, exist_ok=True)

for category in categories:

    # --- AUROC per subgroup: compute, save the table, save the figure ---
    print(f'Processing Performance for : {category}')
    results_performance = calculate_performance_for_category(df_preds, category, label_cols)
    performance_df = pd.concat(
        [frame.set_index('WindowStart') for frame in results_performance.values()],
        axis=1,
    )

    csv_filename = f'{output_dir_groups}/{category}_performance.csv'
    performance_df.to_csv(csv_filename)
    print(f'Saved performance data to "{csv_filename}"')

    plot_filename = f'{output_dir_groups}/{category}_performance_plot.png'
    plot_performance(performance_df, save_path=plot_filename, show=False)
    print(f'Saved performance plot to "{plot_filename}"')

    # --- Prevalence per subgroup: compute, save the table, save the figure ---
    print(f'Processing Prevalence for : {category}')
    results_prevalence = calculate_prevalence_for_category(df_preds, category, label_cols)
    prevalence_df = pd.concat(
        [frame.set_index('WindowStart') for frame in results_prevalence.values()],
        axis=1,
    )

    csv_filename = f'{output_dir_groups}/{category}_prevalence.csv'
    prevalence_df.to_csv(csv_filename)
    print(f'Saved prevalence data to "{csv_filename}"')

    plot_filename = f'{output_dir_groups}/{category}_prevalence_plot.png'
    plot_prevalence(prevalence_df, save_path=plot_filename, show=False)
    print(f'Saved prevalence plot to "{plot_filename}"')

In [None]:
# Same subgroup output folder as in the category loop; derived again here.
output_dir_groups = os.path.join(output_dir, 'subgroup_analysis_nolaterals')

#performance_df.to_csv(os.path.join(output_dir_groups, 'performance.csv'), index=True)

In [None]:
# Grid geometry: two panels per row, as many rows as needed.
n_panels = len(performance_df.columns)
n_cols = 2
n_rows = int(np.ceil(n_panels / n_cols))

plt.figure(figsize=(n_cols * 6, n_rows * 4))

# One panel per column of performance_df, filled left-to-right, top-to-bottom.
for panel, disease in enumerate(performance_df.columns, start=1):
    ax = plt.subplot(n_rows, n_cols, panel)
    ax.plot(performance_df.index, performance_df[disease])
    ax.set(title=disease, xlabel='Window Start Date', ylabel='AUROC')
    plt.xticks(rotation=45)

plt.tight_layout()
#plt.savefig(os.path.join(output_dir_groups, 'performance_findings.png'))
plt.show()

# Scores over Time

In [None]:
# Mean model score per 14-day window, computed separately for studies with a
# negative label (0) and with a positive label (1).
results_scores = {}      # disease -> mean score on negative-label studies
results_scores_pos = {}  # disease -> mean score on positive-label studies

for disease in tqdm(label_cols, total=len(label_cols)):
    disease_label = f'label.{disease}'
    disease_score = f'activation.{disease}'
    # Keep only the columns needed for this disease, indexed by study date.
    disease_df = df_preds[['StudyDate', disease_label, disease_score]].copy()
    disease_df.set_index('StudyDate', inplace=True)

    # Overall date range covered by the predictions.
    start_date = disease_df.index.min()
    end_date = disease_df.index.max()

    negative_rows = []  # (window start, mean score where label == 0)
    positive_rows = []  # (window start, mean score where label == 1)

    for window_start, window_end in generate_windows(start_date, end_date):
        window_data = disease_df[(disease_df.index >= window_start) & (disease_df.index < window_end)]
        window_labels = window_data[disease_label]
        # .mean() yields NaN for a window without rows of the given label.
        negative_rows.append((window_start, window_data.loc[window_labels == 0, disease_score].mean()))
        positive_rows.append((window_start, window_data.loc[window_labels == 1, disease_score].mean()))

    results_scores[disease] = (
        pd.DataFrame(negative_rows, columns=['WindowStart', f'{disease}']).set_index('WindowStart')
    )
    results_scores_pos[disease] = (
        pd.DataFrame(positive_rows, columns=['WindowStart', f'{disease}']).set_index('WindowStart')
    )

# One column per disease, one row per window start.
scores_df = pd.concat(results_scores.values(), axis=1)          # negative-label means
scores_df_pos = pd.concat(results_scores_pos.values(), axis=1)  # positive-label means

In [None]:
# Grid geometry: two panels per row, as many rows as needed.
n_panels = len(scores_df.columns)
n_cols = 2
n_rows = int(np.ceil(n_panels / n_cols))

plt.figure(figsize=(n_cols * 6, n_rows * 4))

# Each panel overlays the mean score of negative-label studies (scores_df) and
# of positive-label studies (scores_df_pos, which must exist before this cell runs).
for panel, disease in enumerate(scores_df.columns, start=1):
    ax = plt.subplot(n_rows, n_cols, panel)
    ax.plot(scores_df.index, scores_df[disease], label='Negative Label Score')
    ax.plot(scores_df_pos.index, scores_df_pos[disease], label='Positive Label Score')

    ax.set(title=disease, xlabel='Window Start Date', ylabel='Mean Score')
    plt.xticks(rotation=45)
    ax.legend()

plt.tight_layout()
#plt.savefig(os.path.join(output_dir, 'performance_findings.png'))
plt.show()

In [None]:
# Score Density estimation


In [None]:
results_scores_raw = {}  # disease -> frame holding the raw score array of each window

for disease in tqdm(label_cols, total=len(label_cols)):
    disease_label = f'label.{disease}'
    disease_score = f'activation.{disease}'
    # Keep only the columns needed for this disease, indexed by study date.
    disease_df = df_preds[['StudyDate', disease_label, disease_score]].copy()
    disease_df.set_index('StudyDate', inplace=True)

    # Overall date range covered by the predictions.
    start_date = disease_df.index.min()
    end_date = disease_df.index.max()

    scores_data = []  # (window start, numpy array of all scores in the window)

    # 14-day windows (the default) starting every 30 days.
    for window_start, window_end in generate_windows(start_date, end_date, stride_days=30):
        window_data = disease_df[(disease_df.index >= window_start) & (disease_df.index < window_end)]
        scores_data.append((window_start, window_data[disease_score].to_numpy()))

    # A dedicated temp name keeps the notebook-level `scores_df` (mean scores
    # per window) intact; it was previously overwritten here.
    raw_frame = pd.DataFrame(scores_data, columns=['WindowStart', f'{disease}'])
    results_scores_raw[disease] = raw_frame.set_index('WindowStart')

# One column per disease; every cell holds the array of scores of that window.
scores_raw_df = pd.concat(results_scores_raw.values(), axis=1)

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Assuming scores_raw_df exists and is your DataFrame

num_columns = 6
num_diseases = len(scores_raw_df.columns)
num_rows = int(np.ceil(num_diseases / num_columns))

fig = make_subplots(
    rows=num_rows,
    cols=num_columns,
    shared_xaxes=True,
    shared_yaxes=True,
    subplot_titles=scores_raw_df.columns.to_list(),
)

for i, disease in enumerate(scores_raw_df.columns, start=1):
    # Map the 1-based panel number onto the 1-based (row, col) grid position.
    zero_based_row, zero_based_col = divmod(i - 1, num_columns)
    row_position = zero_based_row + 1
    col_position = zero_based_col + 1

    # Long format: one (score, window start) pair per individual score.
    pairs = [
        (score, window_start)
        for window_start, row in scores_raw_df.iterrows()
        for score in row[disease]
    ]
    combined_df = pd.DataFrame({
        'Score': [score for score, _ in pairs],
        'Date': pd.to_datetime([window_start for _, window_start in pairs]),
    })

    # One KDE line per window.
    for date, group in combined_df.groupby('Date'):
        if len(group['Score']) <= 1:  # KDE requires at least 2 data points
            continue
        kde = gaussian_kde(group['Score'])
        score_range = np.linspace(min(group['Score']), max(group['Score']), 100)
        trace = go.Scatter(
            x=score_range,
            y=kde(score_range),
            mode='lines',
            name=date.strftime('%Y-%m-%d'),
            legendgroup=str(date),
            showlegend=(i == 1),  # legend entries only from the first panel
        )
        fig.add_trace(trace, row=row_position, col=col_position)


# Customize layout
fig.update_layout(height=400*num_rows, width=400*num_columns, title_text="KDE of Disease Scores Over Time")

fig.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Grid geometry: three panels per row, as many rows as needed.
num_diseases = len(scores_raw_df.columns)
num_cols = 3
num_rows = int(np.ceil(num_diseases / num_cols))

# One figure holding every panel; the axes grid is flattened for simple pairing
# with the disease columns.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(5*num_cols, 5*num_rows))
axes = axes.flatten()

for ax, disease in zip(axes, scores_raw_df.columns):

    # Long format: one (score, window start) pair per individual score.
    pairs = [
        (score, window_start)
        for window_start, row in scores_raw_df.iterrows()
        for score in row[disease]
    ]
    combined_df = pd.DataFrame({
        'Score': [score for score, _ in pairs],
        'Date': pd.to_datetime([window_start for _, window_start in pairs]),
    })

    # One density curve per window, all drawn on this disease's axis.
    for date, group in combined_df.groupby('Date'):
        sns.kdeplot(group['Score'], ax=ax, label=date.strftime('%Y-%m-%d'))

    # Panels carry only a title; axis labels and the legend are shared at figure level.
    ax.set(title=f'{disease}', xlabel='', ylabel='')
    panel_legend = ax.get_legend()
    if panel_legend:
        panel_legend.remove()

# Common X and Y labels for the whole figure.
fig.text(0.5, 0.04, 'Score', ha='center', va='center')
fig.text(0.04, 0.5, 'Density', ha='center', va='center', rotation='vertical')

# Single legend at the bottom, built from the first panel's curves.
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='lower center', ncol=3)

# Leave room for the legend and avoid overlapping panels.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.show()

# Resampling

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Split the predictions at the COVID cut-off date.
covid_cutoff = pd.to_datetime('2020-04-01').date()
df_precovid = df_preds[df_preds['StudyDate'] < covid_cutoff].copy()
# >= (not >) so that studies dated exactly on the cut-off are not silently
# dropped from both cohorts.
df_postcovid = df_preds[df_preds['StudyDate'] >= covid_cutoff].copy()


In [None]:
# List the available columns (used to pick the stratification keys below).
df_preds.columns

In [None]:
# Columns used to stratify the pre-COVID resampling. They are defined in this
# cell too because it runs before the stratification cell; without this the
# loop fails with a NameError on a fresh kernel. Keep both lists in sync.
key_columns = ['PhotometricInterpretation',
               'BitsStored', 'ViewPosition']

# Value distribution of each stratification column in the post-COVID cohort.
for column in key_columns:
    print(column)
    print(df_postcovid[column].value_counts())

In [None]:
key_columns = ['PhotometricInterpretation',
               'BitsStored', 'ViewPosition']

# Composite stratification key: the key columns' values joined with "_",
# added to both cohorts.
for cohort in (df_precovid, df_postcovid):
    cohort['composite_key'] = cohort[key_columns].apply(
        lambda row: '_'.join(row.values.astype(str)), axis=1
    )

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)

# StratifiedShuffleSplit expects arrays: the index serves as a dummy feature
# matrix and the composite key as the stratification label.
X = df_precovid.index.to_numpy().reshape(-1, 1)
y = df_precovid['composite_key'].to_numpy()

# n_splits=1, so the single split is taken directly; its "test" half (50%) is
# the stratified pre-COVID sample.
train_idx, test_idx = next(sss.split(X, y))
df_precovid_sampled = df_precovid.iloc[test_idx]


In [None]:
# Stack the stratified pre-COVID sample on top of the full post-COVID cohort.
df_combined = pd.concat([df_precovid_sampled, df_postcovid])


In [None]:
results_performance = {}  # disease -> frame of AUROC per 14-day sliding window (resampled data)

for disease in tqdm(label_cols, total=len(label_cols)):
    disease_label = f'label.{disease}'
    disease_score = f'activation.{disease}'
    # Keep only the columns needed for this disease, indexed by study date.
    disease_df = df_combined[['StudyDate', disease_label, disease_score]].copy()
    disease_df.set_index('StudyDate', inplace=True)

    # Overall date range covered by the combined cohorts.
    start_date = disease_df.index.min()
    end_date = disease_df.index.max()

    scores_data = []  # (window start, AUROC)

    for window_start, window_end in generate_windows(start_date, end_date):
        window_data = disease_df[(disease_df.index >= window_start) & (disease_df.index < window_end)]
        try:
            score = roc_auc_score(window_data[disease_label], window_data[disease_score])
        except ValueError:
            # roc_auc_score rejects windows it cannot score (e.g. only one
            # class present, or no rows). Record NaN instead of aborting the
            # whole cell, as compute_scores_for_batch does.
            score = np.nan
        scores_data.append((window_start, score))

    # A dedicated temp name avoids clobbering the notebook-level `scores_df`.
    auc_frame = pd.DataFrame(scores_data, columns=['WindowStart', f'{disease}'])
    results_performance[disease] = auc_frame.set_index('WindowStart')

# One column per disease, one row per window start.
performance_df_sampled = pd.concat(results_performance.values(), axis=1)

In [None]:
# Grid geometry: two panels per row, as many rows as needed.
n_panels = len(performance_df_sampled.columns)
n_cols = 2
n_rows = int(np.ceil(n_panels / n_cols))

plt.figure(figsize=(n_cols * 6, n_rows * 4))

# One panel per finding, filled left-to-right, top-to-bottom.
for panel, disease in enumerate(performance_df_sampled.columns, start=1):
    ax = plt.subplot(n_rows, n_cols, panel)
    ax.plot(performance_df_sampled.index, performance_df_sampled[disease])
    ax.set(title=disease, xlabel='Window Start Date', ylabel='AUROC')
    plt.xticks(rotation=45)

plt.tight_layout()
#plt.savefig(os.path.join(output_dir, 'performance_findings.png'))
plt.show()

# Correlation 

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt

# Cramér's V function
def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Builds the contingency table of ``x`` against ``y`` with ``pd.crosstab``,
    takes the chi-squared statistic from ``chi2_contingency`` and applies the
    bias correction to phi² and to the table dimensions before normalising.

    Parameters
    ----------
    x, y : array-like (e.g. pandas Series)
        Categorical observations, paired element by element.

    Returns
    -------
    float
        The association strength, or ``nan`` when it is undefined: no or a
        single observation, either variable taking fewer than two distinct
        values, or a table whose corrected dimensions leave a non-positive
        denominator (e.g. every observation is its own category).
    """
    contingency = pd.crosstab(x, y)
    n = contingency.to_numpy().sum()
    r, k = contingency.shape

    # A constant variable or n <= 1 makes the statistic 0/0; return NaN
    # explicitly instead of relying on a division-by-zero warning.
    if n <= 1 or min(r, k) < 2:
        return float('nan')

    # NOTE(review): chi2_contingency applies Yates' continuity correction to
    # 2x2 tables by default, which lowers V for pairs of binary columns;
    # pass correction=False if that is not intended.
    chi2 = chi2_contingency(contingency)[0]
    phi2 = chi2 / n
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return float('nan')
    return np.sqrt(phi2corr / denominator)

# Pairwise Cramér's V between the categorical metadata columns of df_preds.
categorical_columns = [
    'Manufacturer', 'ViewPosition', 'PhotometricInterpretation',
    'BitsAllocated', 'Point of Care', 'Is Stat',
]

# Square result matrix; rows and columns are both labelled by the column names.
cramers_v_matrix = pd.DataFrame(index=categorical_columns, columns=categorical_columns, dtype=float)

# Fill every (row, column) pair, diagonal included.
for row_name in categorical_columns:
    row_values = df_preds[row_name]
    for col_name in categorical_columns:
        cramers_v_matrix.loc[row_name, col_name] = cramers_v(row_values, df_preds[col_name])

# Annotated heatmap of the matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cramers_v_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True)
plt.title("Cramér's V Correlation Matrix")
plt.show()


# Cut-Off Tuning

In [None]:
# Reference period: studies strictly after the training end date and strictly
# before the validation end date.
after_train = df_preds['StudyDate'] > mgb_data.TRAIN_DATE_END.date()
before_val_end = df_preds['StudyDate'] < mgb_data.VAL_DATE_END.date()
df_ref = df_preds[after_train & before_val_end]

from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

# For each finding, sweep cut-offs 0.00, 0.01, ..., 0.99 on the reference
# period and keep the first cut-off that reaches the highest F1.
cut_offs = {}

for disease in label_cols:
    y_true = df_ref[f'label.{disease}']
    y_score = df_ref[f'activation.{disease}']

    best_f1_score = -1
    best_cutoff = 0

    for cutoff in np.arange(0, 1, 0.01):
        current_f1_score = f1_score(y_true, y_score > cutoff)
        if current_f1_score > best_f1_score:
            best_f1_score, best_cutoff = current_f1_score, cutoff

    cut_offs[disease] = {'Max F1 Score': best_f1_score, 'Optimal Cutoff': best_cutoff}

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, accuracy_score, f1_score, confusion_matrix

# ROC curve per finding on the reference period, plus the threshold that
# maximises Youden's J (TPR - FPR), stored as that finding's operating point.
operating_points = {}
for disease in label_cols:
    fpr, tpr, thresholds = roc_curve(df_ref[f'label.{disease}'], df_ref[f'activation.{disease}'])
    roc_auc = auc(fpr, tpr)

    # Youden index for every candidate threshold; keep the arg-max.
    youden_index = tpr - fpr
    best_index = np.argmax(youden_index)
    best_cutoff = thresholds[best_index]

    operating_points[disease] = best_cutoff

    plt.plot(fpr, tpr, label=f'{disease} AUC = {roc_auc:.2f}')

# Chance diagonal: identical for every finding, so draw it once after the loop.
plt.plot([0, 1], [0, 1], 'k--')

plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Receiver operating characteristic')
plt.legend(loc='lower right')

plt.show()

# Threshold metrics on the reference period at each finding's Youden cut-off.
for disease in label_cols:
    y_true = df_ref[f'label.{disease}']
    cutoff = operating_points[disease]
    preds = df_ref[f'activation.{disease}'] > cutoff

    accuracy = accuracy_score(y_true, preds)
    f1 = f1_score(y_true, preds)

    # For binary targets the flattened confusion matrix reads tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y_true, preds).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)

    print(f'Disease: {disease}, Cut-Off: {cutoff}, Accuracy: {accuracy:.3f}, F1-score: {f1:.3f}, Sensitivity: {sensitivity:.3f}, Specificity: {specificity:.3f}')
    

In [None]:
# Rolling 30-day accuracy / F1 / AUROC per finding, with predictions
# binarised at the Youden operating points.
results_performance = {}  # finding -> per-window metrics frame (WindowStart kept as a column)

# Pooled labels, binarised predictions and raw scores over all findings and windows.
# NOTE(review): nothing in this cell consumes these lists. With the default
# 1-day stride the 30-day windows overlap, so each study is appended once per
# window it falls into. Confirm before using them for micro-averages.
micro_labels = []
micro_preds = []
micro_scores = []

for disease in tqdm(label_cols, total=len(label_cols)):
    disease_label = f'label.{disease}'
    disease_score = f'activation.{disease}'
    disease_df = df_preds[['StudyDate', disease_label, disease_score]].set_index('StudyDate')

    start_date = disease_df.index.min()
    end_date = disease_df.index.max()

    scores_data = []  # (window_start, accuracy, f1, auroc) tuples for this finding

    optimal_cutoff = operating_points[disease]  # Youden cut-off for this finding

    for window_start, window_end in generate_windows(start_date, end_date, window_size_days=30):
        window_data = disease_df[(disease_df.index >= window_start) & (disease_df.index < window_end)]
        window_labels = window_data[disease_label]
        window_scores = window_data[disease_score]
        binarized_preds = (window_scores > optimal_cutoff).astype(int)

        micro_labels.extend(window_labels.tolist())
        micro_preds.extend(binarized_preds.tolist())
        micro_scores.extend(window_scores.tolist())

        accuracy = accuracy_score(window_labels, binarized_preds)
        f1 = f1_score(window_labels, binarized_preds, zero_division=1)
        # AUROC is undefined when the window holds a single class; store NaN.
        auroc = roc_auc_score(window_labels, window_scores) if window_labels.nunique() > 1 else float('nan')

        scores_data.append((window_start, accuracy, f1, auroc))

    scores_df = pd.DataFrame(scores_data, columns=['WindowStart', f'{disease}_Accuracy', f'{disease}_F1', f'{disease}_AUROC'])
    results_performance[disease] = scores_df

# Build the wide frame from THIS cell's results. Without this step the macro
# columns below would be written onto whatever `performance_df` an earlier
# cell left behind, or fail on a fresh kernel.
performance_df = pd.concat(
    [scores.set_index('WindowStart') for scores in results_performance.values()],
    axis=1,
)

# Macro averages: unweighted mean over findings per window (NaNs are skipped).
# All three are computed before being attached, so re-running the cell gives
# the same result.
macro_columns = {
    f'Macro_{metric}': performance_df.filter(like=f'_{metric}').mean(axis=1)
    for metric in ('Accuracy', 'F1', 'AUROC')
}
performance_df = performance_df.assign(**macro_columns)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.dates as mdates


# Shared names kept because other cells reference them. The subplots below
# build their own instances (see the loop).
date_format = mdates.DateFormatter('%Y-%m')
month_locator = mdates.MonthLocator(interval=3)

# The date filter is currently disabled, so all windows are plotted and only
# the x-limits below restrict the visible range.
filtered_df = performance_df  # [performance_df.index >= pd.Timestamp('2019-10-01')]

n_diseases = len(label_cols)
cols = 2
# Two extra grid slots are reserved after the findings. Only the last one
# (macro averages) is drawn; slot n_diseases + 1 stays empty.
rows = int(np.ceil((n_diseases + 2) / cols))

x_limits = (pd.to_datetime('2019-10-01').date(), pd.to_datetime('2021-07-01').date())
metric_styles = [('Accuracy', 'blue'), ('F1', 'green'), ('AUROC', 'red')]

# (subplot position, column prefix, panel title, legend labels)
panels = [
    (position, disease, f'Metrics for {disease}', ('Accuracy', 'F1 Score', 'AUROC'))
    for position, disease in enumerate(label_cols, start=1)
]
panels.append(
    (n_diseases + 2, 'Macro', 'Macro Average Metrics', ('Macro Accuracy', 'Macro F1', 'Macro AUROC'))
)

plt.figure(figsize=(cols * 6, rows * 4))

for position, prefix, title, legend_labels in panels:
    ax = plt.subplot(rows, cols, position)
    for (metric, color), legend_label in zip(metric_styles, legend_labels):
        ax.plot(filtered_df.index, filtered_df[f'{prefix}_{metric}'], label=legend_label, color=color)

    ax.set_title(title)
    ax.set_xlabel('Window Start Date')
    ax.set_ylabel('Metric Values')
    ax.set_xlim(*x_limits)
    # A Locator/Formatter instance is bound to the Axis it is installed on,
    # so every subplot gets its own pair instead of sharing one instance.
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    plt.xticks(rotation=45)
    ax.legend()

plt.tight_layout()
# Uncomment the next line to save the figure if needed
# plt.savefig(os.path.join(output_dir_groups, 'performance_findings.png'))
plt.show()


In [None]:
# model calibration
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.linear_model import LogisticRegression



# Calibrate the 'Support Devices' activation with a one-feature logistic
# regression fitted on the reference period (df_ref).
X_ref = df_ref[['activation.Support Devices']].values  # double brackets -> 2-D array
y_ref = df_ref['label.Support Devices'].values

X_logistic = X_ref.reshape(-1, 1)  # no-op for an already 2-D single-column array
y_logistic = y_ref

# Fit logistic regression for calibration
log_reg = LogisticRegression().fit(X_logistic, y_logistic)

# Apply the calibration to every study; column 1 of predict_proba is the
# probability of the positive class.
calibrated_probs_log = log_reg.predict_proba(df_preds['activation.Support Devices'].values.reshape(-1, 1))[:, 1]

# Store the result immediately: the next cell displays this column, and in
# this section it is otherwise only assigned one cell after that display.
df_preds['calibrated_probs.Support Devices'] = calibrated_probs_log

In [None]:
# First 20 rows: raw activation, calibrated probability and ground-truth label.
df_preds.loc[:, ['activation.Support Devices', 'calibrated_probs.Support Devices', 'label.Support Devices']].head(20)

In [None]:
df_preds['calibrated_probs.Support Devices'] = calibrated_probs_log

# calculate performance
results_performance = {}  # reset, as before; not used further in this cell

y_true = df_preds['label.Support Devices']

# Binarise both score variants at the same fixed 0.3 threshold.
binarized_preds = (df_preds['activation.Support Devices'] > 0.3).astype(int)
binarized_preds_cal = (df_preds['calibrated_probs.Support Devices'] > 0.3).astype(int)

# Accuracy comes from the thresholded predictions. Previously the AUROC
# values below were printed under the "Accuracy" label and the binarised
# predictions were never used. accuracy_score is imported by earlier cells.
accuracy_raw = accuracy_score(y_true, binarized_preds)
accuracy_cal = accuracy_score(y_true, binarized_preds_cal)

# AUROC depends only on the ranking of the scores.
score = roc_auc_score(y_true, df_preds['activation.Support Devices'])
score_cal = roc_auc_score(y_true, df_preds['calibrated_probs.Support Devices'])

print(f'Accuracy before calibration: {accuracy_raw:.2f}')
print(f'Accuracy after calibration: {accuracy_cal:.2f}')
print(f'AUROC before calibration: {score:.2f}')
print(f'AUROC after calibration: {score_cal:.2f}')

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Rolling 14-day accuracy for one finding, scored on its calibrated probability.
disease = 'Support Devices'
disease_label = f'label.{disease}'
disease_score = f'calibrated_probs.{disease}'

# Threshold taken from the F1 sweep stored in cut_offs.
# NOTE(review): that sweep ran on the raw activation.* scores, whereas this
# cell thresholds calibrated_probs.* — confirm that reusing the cut-off on
# the calibrated scale is intended.
optimal_cutoff = cut_offs[disease]['Optimal Cutoff']

# Date-indexed, chronologically sorted frame with just the label and score.
disease_df = (
    df_preds[['StudyDate', disease_label, disease_score]]
    .assign(StudyDate=lambda frame: pd.to_datetime(frame['StudyDate']))
    .set_index('StudyDate')
    .sort_index()
)

start_date = disease_df.index.min()
end_date = disease_df.index.max()

# Accuracy of the thresholded score inside each window
scores_data = []
for window_start, window_end in generate_windows(start_date, end_date, window_size_days=14):
    in_window = (disease_df.index >= window_start) & (disease_df.index < window_end)
    window_data = disease_df[in_window]
    binarized_preds = (window_data[disease_score] > optimal_cutoff).astype(int)
    scores_data.append((window_start, accuracy_score(window_data[disease_label], binarized_preds)))

scores_df = pd.DataFrame(scores_data, columns=['WindowStart', 'Accuracy'])

# Accuracy against window start date
plt.figure(figsize=(10, 6))
plt.plot(scores_df['WindowStart'], scores_df['Accuracy'], linestyle='-')
plt.title(f'Accuracy Over Time for {disease}')
plt.xlabel('Window Start Date')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()


# Further Digging into Metadata

Here we look at the largest site, MGH IMG XR ER MG WHT1: about 60,000 images (roughly two-thirds of our dataset) come from this site.

In [None]:
# New column that collapses every point of care whose name contains 'OP'
# into the single label 'MGH IMG XR OPX'; all other values pass through.
# The isinstance guard keeps missing or non-string entries unchanged instead
# of raising TypeError on `'OP' in <float NaN>`.
df_preds['Point of Care_combined'] = df_preds['Point of Care'].apply(
    lambda poc: 'MGH IMG XR OPX' if isinstance(poc, str) and 'OP' in poc else poc
)


In [None]:
# Restrict the analysis to a single point of care.
site = 'MGH IMG XR ER MG WHT1'
site_disp = site.replace(' ', '~')  # display form used inside the $\mathbf{...}$ plot titles

# Number of studies per point of care, for context
print(df_preds['Point of Care'].value_counts())

# Rows of the selected site, indexed by study date as a datetime.
df_er = (
    df_preds[df_preds['Point of Care'] == site]
    .assign(StudyDate=lambda frame: pd.to_datetime(frame['StudyDate']))
    .set_index('StudyDate')
)


In [None]:
# Average number of exams per day at the selected site, in sliding 30-day windows.
exam_counts = {}

# Overall date range of the site's studies
start_date = df_er.index.min()
end_date = df_er.index.max()

window_size_days = 30
for window_start, window_end in generate_windows(start_date, end_date, window_size_days=window_size_days):
    window_data = df_er[(df_er.index >= window_start) & (df_er.index < window_end)]
    # Rows in the window divided by the window length gives exams per day.
    count = window_data.shape[0] / window_size_days
    exam_counts[window_start] = {'ExamCount': count}

# One row per window start, single column 'ExamCount'
exam_counts_df = pd.DataFrame(exam_counts).T
exam_counts_df.fillna(0, inplace=True)

plt.figure(figsize=(10, 6))

plt.plot(exam_counts_df.index, exam_counts_df['ExamCount'])

# Raw f-string: '\m' is not a valid Python escape sequence, so the backslash
# of \mathbf has to be written literally.
plt.title(rf'Exams per day at $\mathbf{{{site_disp}}}$')
plt.xlabel('Window Start Date')
plt.ylabel('Count per Day')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()


In [None]:
# List the columns available for the selected site's studies.
df_er.columns

In [None]:
# Histogram of 'Patient Age' at the selected site, using 10 bins.
# The tick rotation is requested before the histogram is drawn; the statement
# order is left untouched because pyplot acts on whatever axes is current.
plt.xticks(rotation=45)
plt.hist(df_er['Patient Age'], bins=10)


In [None]:
parameter = 'ViewPosition'

# Overall frequency of each value of `parameter` at this site
print(df_er[parameter].value_counts())

# Overall date range of the site's studies
start_date = df_er.index.min()
end_date = df_er.index.max()

# For each sliding 30-day window: occurrences of every value, divided by 30
# to give a per-day rate.
proportion_data = {}
for window_start, window_end in generate_windows(start_date, end_date, window_size_days=30):
    in_window = (df_er.index >= window_start) & (df_er.index < window_end)
    value_counts = df_er.loc[in_window, parameter].value_counts(normalize=False)
    proportion_data[window_start] = value_counts / 30

# Rows are window starts, columns are the values of `parameter`; a value
# absent from a window becomes 0. The frame keeps the name `machine_df`
# whichever column `parameter` points at.
machine_df = pd.DataFrame(proportion_data).T.fillna(0)


In [None]:
# Originally used to spot sparsely populated columns (e.g. the Fluorospot).
# NOTE(review): the cell that builds machine_df already fills missing values
# with 0, so every column now reports the full number of windows here.

# Non-NaN entries per column
print(machine_df.count())

#machine_df.drop(columns=['Fluorospot Compact FD'], inplace=True, errors='ignore')

In [None]:
# One line per column of machine_df: windowed count per day at the selected site.
plt.figure(figsize=(10, 6))

for machine in machine_df.columns:
    plt.plot(machine_df.index, machine_df[machine], label=machine)

# Raw f-string: '\m' is not a valid Python escape sequence, so the backslash
# of \mathbf has to be written literally.
plt.title(rf'{parameter} over Time at $\mathbf{{{site_disp}}}$')
plt.xlabel('Window Start Date')
plt.ylabel('Count per Day')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()

# plt.savefig(os.path.join(output_dir, 'performance_findings.png'))

plt.show()


In [None]:
# Stacked area chart: share of each column of machine_df per window.
plt.figure(figsize=(10, 6))

# Chain several seaborn palettes so that many categories get distinct colours.
palettes = ['deep', 'bright', 'pastel', 'muted', 'dark', 'colorblind']
colors = []
for pal in palettes:
    colors.extend(sns.color_palette(pal))

# Divide every row by its row total so each window sums to 1.
machine_df_prop = machine_df.div(machine_df.sum(axis=1), axis=0)

plt.stackplot(machine_df_prop.index, *machine_df_prop.T.values, labels=machine_df_prop.columns, colors=colors)
plt.xlim(machine_df_prop.index.min(), machine_df_prop.index.max())
plt.ylim(0, 1)

# Raw f-string: '\m' is not a valid Python escape sequence, so the backslash
# of \mathbf has to be written literally.
plt.title(rf'{parameter} over Time at $\mathbf{{{site_disp}}}$')
plt.xlabel('Window Start Date')
plt.ylabel('Proportion')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()

# plt.savefig(os.path.join(output_dir, 'performance_findings_stacked.png'))

plt.show()

In [None]:
# Per detector model at this site: distribution of view positions, then of
# photometric interpretations.
# The model names come straight from df_er. machine_df.columns hold the values
# of whatever `parameter` was last used to build machine_df ('ViewPosition'
# in the cell above), which need not be machine names.
machine_names = df_er['ManufacturerModelName'].dropna().unique()

for machine in machine_names:
    print(f'\nMachine: {machine}')
    print(df_er[df_er['ManufacturerModelName'] == machine].ViewPosition.value_counts())

for machine in machine_names:
    print(f'\nMachine: {machine}')
    print(df_er[df_er['ManufacturerModelName'] == machine]['PhotometricInterpretation'].value_counts())


In [None]:
parameter = 'PhotometricInterpretation'

# machine name -> frame of per-day counts of each `parameter` value
# (rows are window starts, columns are parameter values).
machine_bits = {}

# Take the detector models from df_er itself. machine_df.columns reflect the
# `parameter` used when machine_df was built, which is not necessarily
# 'ManufacturerModelName'.
for machine in df_er['ManufacturerModelName'].dropna().unique():

    print(f'\nMachine: {machine}')
    machine_df_sel = df_er[df_er['ManufacturerModelName'] == machine]

    proportion_data_view = {}

    # start_date / end_date are the site-wide range set by an earlier cell, so
    # every machine is evaluated on the same window grid.
    for window_start, window_end in generate_windows(start_date, end_date, window_size_days=30):
        window_data = machine_df_sel[(machine_df_sel.index >= window_start) & (machine_df_sel.index < window_end)]

        # Occurrences in the 30-day window divided by 30 gives a per-day rate.
        proportions = window_data[parameter].value_counts()
        proportion_data_view[window_start] = proportions / 30

    machine_bits[machine] = pd.DataFrame(proportion_data_view).T


In [None]:
# One stacked-proportion chart per machine in machine_bits.

# The colour list is the same for every figure, so build it once: several
# seaborn palettes chained to cover many categories.
palettes = ['deep', 'bright', 'pastel', 'muted', 'dark', 'colorblind']
colors = []
for pal in palettes:
    colors.extend(sns.color_palette(pal))

for machine, df in machine_bits.items():
    df.fillna(0, inplace=True)  # a value absent from a window counts as 0
    plt.figure(figsize=(10, 6))

    # Divide every row by its row total so each window sums to 1.
    df_prop = df.div(df.sum(axis=1), axis=0)

    plt.stackplot(df_prop.index, *df_prop.T.values, labels=df_prop.columns, colors=colors)
    plt.xlim(df.index.min(), df.index.max())
    plt.ylim(0, 1)
    # Escape underscores for the math-mode title. Raw strings are used because
    # '\_' and '\m' are not valid Python escape sequences.
    machine = machine.replace('_', r'\_')
    plt.title(rf'{parameter} over Time for $\mathbf{{{machine}}}$ at $\mathbf{{{site_disp}}}$')
    plt.xlabel('Window Start Date')
    plt.ylabel('Proportion')
    plt.xticks(rotation=45)
    plt.legend(loc='lower left')
    plt.tight_layout()

    # plt.savefig(os.path.join(output_dir, 'performance_findings_stacked.png'))

    plt.show()

In [None]:
# One line chart per machine in machine_bits, limited to the five values
# with the largest total count.
for machine, df in machine_bits.items():
    df.fillna(0, inplace=True)  # a value absent from a window counts as 0
    plt.figure(figsize=(10, 6))

    # select only top 5
    cols_use = df.sum().nlargest(5).keys().tolist()
    df = df[cols_use]
    plt.plot(df.index, df)

    # Escape underscores for the math-mode title. Raw strings are used because
    # '\_' and '\m' are not valid Python escape sequences.
    machine = machine.replace('_', r'\_')
    plt.title(rf'{parameter} over Time for $\mathbf{{{machine}}}$ at $\mathbf{{{site_disp}}}$')
    plt.xlabel('Window Start Date')
    plt.ylabel('Count per day')
    plt.xticks(rotation=45)
    plt.legend(cols_use)
    plt.tight_layout()

    # plt.savefig(os.path.join(output_dir, 'performance_findings_stacked.png'))

    plt.show()

In [None]:
## Performance for that machine
from sklearn.metrics import accuracy_score

# Site studies restricted to MONOCHROME2 images.
df_er_machine = df_er[df_er['PhotometricInterpretation'] == 'MONOCHROME2'].copy()

# Frame the rolling AUROC is computed on. Previously the subset above was
# built and then ignored, because the loop read from df_er directly. Point
# this at df_er to evaluate the whole site instead.
eval_df = df_er_machine

results_performance = {}  # finding -> per-window AUROC frame indexed by WindowStart

for disease in tqdm(label_cols, total=len(label_cols)):
    disease_label = f'label.{disease}'
    disease_score = f'activation.{disease}'
    # df_er (and therefore the subset) is already indexed by StudyDate.
    disease_df = eval_df[[disease_label, disease_score]].copy()

    # Overall date range of the evaluated studies
    start_date = disease_df.index.min()
    end_date = disease_df.index.max()

    scores_data = []  # (window_start, AUROC) tuples for this finding

    for window_start, window_end in generate_windows(start_date, end_date, window_size_days=60):
        window_data = disease_df[(disease_df.index >= window_start) & (disease_df.index < window_end)]

        # Windows roc_auc_score rejects with ValueError are recorded as NaN.
        try:
            score = roc_auc_score(window_data[disease_label], window_data[disease_score])
        except ValueError:
            score = np.nan
        scores_data.append((window_start, score))

    results_performance[disease] = pd.DataFrame(
        scores_data, columns=['WindowStart', f'{disease}']
    ).set_index('WindowStart')

# One column per finding, aligned on WindowStart.
performance_df = pd.concat(results_performance.values(), axis=1)

In [None]:
# Grid of AUROC-over-time panels, one per column of performance_df, six per row.
n_diseases = len(performance_df.columns)
cols = 6
rows = int(np.ceil(n_diseases / cols))

plt.figure(figsize=(cols * 6, rows * 4))

for i, disease in enumerate(performance_df.columns, start=1):
    ax = plt.subplot(rows, cols, i)
    ax.plot(performance_df.index, performance_df[disease])
    # Raw f-string: '\m' is not a valid Python escape sequence.
    ax.set_title(rf'{disease} at $\mathbf{{{site_disp}}}$')
    ax.set_xlabel('Window Start Date')
    ax.set_ylabel('AUROC')
    plt.xticks(rotation=45)

plt.tight_layout()
#plt.savefig(os.path.join(output_dir_groups, 'performance_findings.png'))
plt.show()

In [None]:
# Site studies acquired on one detector model.
df_er_machine = df_er[df_er['ManufacturerModelName'] == 'Pixium_4343E_CSI'].copy()

# Frame the prevalence is computed on. Previously the subset above was built
# and then ignored, because the loop read from df_er directly. Point this at
# df_er to evaluate the whole site instead.
eval_df = df_er_machine

results_prevalence = {}  # finding -> per-window prevalence frame indexed by WindowStart

for disease in tqdm(label_cols, total=len(label_cols)):
    disease_label = f'label.{disease}'
    # Label series only; its index is the StudyDate index of df_er.
    disease_df = eval_df[disease_label].copy()
    start_date = disease_df.index.min()
    end_date = disease_df.index.max()
    proportions_data = []  # (window_start, prevalence) tuples for this finding

    for window_start, window_end in generate_windows(start_date, end_date, window_size_days=30):
        window_data = disease_df[(disease_df.index >= window_start) & (disease_df.index < window_end)]
        positive_cases = window_data.sum()
        total_cases = window_data.shape[0]
        # Empty windows are reported as prevalence 0.
        proportion_positive = positive_cases / total_cases if total_cases > 0 else 0
        proportions_data.append((window_start, proportion_positive))

    results_prevalence[disease] = pd.DataFrame(
        proportions_data, columns=['WindowStart', disease]
    ).set_index('WindowStart')

# One column per finding, aligned on WindowStart.
prevalence_df = pd.concat(results_prevalence.values(), axis=1)

In [None]:
# List the columns of the full predictions frame.
df_preds.columns

In [None]:
# Grid of prevalence-over-time panels, one per finding, six per row.
n_diseases = len(prevalence_df.columns)
cols = 6
rows = int(np.ceil(n_diseases / cols))

plt.figure(figsize=(cols * 6, rows * 4))

for i, disease in enumerate(prevalence_df.columns, start=1):
    ax = plt.subplot(rows, cols, i)
    ax.plot(prevalence_df.index, prevalence_df[disease])
    # Raw f-string: '\m' is not a valid Python escape sequence.
    ax.set_title(rf'{disease} at $\mathbf{{{site_disp}}}$')
    ax.set_xlabel('Window Start Date')
    ax.set_ylabel('Prevalence in Window')
    plt.xticks(rotation=45)

plt.tight_layout()
#plt.savefig(os.path.join(output_dir_groups, 'performance_findings.png'))
plt.show()

In [None]:
# Distinct non-missing ViewPosition values at the selected site, shown as the cell output.
categories = df_er['ViewPosition'].dropna().unique().tolist()
categories

In [None]:
# Rolling 60-day AUROC for 'Support Devices', computed separately for every
# detector model present at the selected site.
categories = df_er['ManufacturerModelName'].dropna().unique().tolist()

disease_label = 'label.Support Devices'
disease_score = 'activation.Support Devices'

results_performance = {}  # machine name -> per-window AUROC frame indexed by WindowStart

for cat_value in categories:
    # Label and score columns for this machine only (index is StudyDate).
    is_category = df_er['ManufacturerModelName'] == cat_value
    disease_df = df_er.loc[is_category, [disease_label, disease_score]].copy()

    # Date range covered by this machine's studies
    start_date = disease_df.index.min()
    end_date = disease_df.index.max()

    scores_data = []  # (window_start, AUROC) tuples for this machine

    for window_start, window_end in generate_windows(start_date, end_date, window_size_days=60):
        in_window = (disease_df.index >= window_start) & (disease_df.index < window_end)
        window_data = disease_df[in_window]

        # Windows roc_auc_score rejects with ValueError are recorded as NaN.
        try:
            score = roc_auc_score(window_data[disease_label], window_data[disease_score])
        except ValueError:
            score = np.nan
        scores_data.append((window_start, score))

    results_performance[cat_value] = pd.DataFrame(
        scores_data, columns=['WindowStart', f'{cat_value}']
    ).set_index('WindowStart')

# One column per machine, aligned on WindowStart.
performance_df = pd.concat(results_performance.values(), axis=1)

In [None]:
# Grid of AUROC-over-time panels, one per column of performance_df, six per row.
n_diseases = len(performance_df.columns)
cols = 6
rows = int(np.ceil(n_diseases / cols))

x_limits = (pd.to_datetime('2019-10-01').date(), pd.to_datetime('2021-07-01').date())

plt.figure(figsize=(cols * 6, rows * 4))

for i, disease in enumerate(performance_df.columns, start=1):
    ax = plt.subplot(rows, cols, i)
    ax.plot(performance_df.index, performance_df[disease])
    # Raw f-string: '\m' is not a valid Python escape sequence.
    ax.set_title(rf'{disease} at $\mathbf{{{site_disp}}}$')
    ax.set_xlabel('Window Start Date')
    ax.set_ylabel('AUROC')
    ax.set_xlim(*x_limits)
    plt.xticks(rotation=45)
    # A Locator/Formatter instance is bound to the Axis it is installed on,
    # so every subplot gets its own pair instead of sharing one instance.
    # `mdates` is imported by an earlier cell.
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=3))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

plt.tight_layout()
#plt.savefig(os.path.join(output_dir_groups, 'performance_findings.png'))
plt.show()

In [None]:
import matplotlib.dates as mdates

# Date-axis helpers: 'YYYY-MM' tick labels and a month locator with interval 3.
# NOTE(review): the same two names are already assigned in an earlier cell and
# are used by the subplot-grid cell just above — confirm this cell's position.
date_format = mdates.DateFormatter('%Y-%m')
month_locator = mdates.MonthLocator(interval=3)

In [None]:
# All columns of performance_df (per-window AUROC) overlaid in one figure.
plt.figure(figsize=(10, 6))

for machine in performance_df.columns:
    plt.plot(performance_df.index, performance_df[machine], label=machine)

# Raw f-string: '\m' is not a valid Python escape sequence.
plt.title(rf'Performance over Time at $\mathbf{{{site_disp}}}$')
plt.xlabel('Window Start Date')
# The plotted values are AUROC scores, not exam counts.
plt.ylabel('AUROC')
plt.xticks(rotation=45)
plt.xlim(pd.to_datetime('2019-10-01').date(), pd.to_datetime('2021-07-01').date())

plt.legend()
plt.tight_layout()

# plt.savefig(os.path.join(output_dir, 'performance_findings.png'))

plt.show()
