In [None]:
import json
import pickle

import numpy as np

def load_obj_pickle(name):
    """Load and return the object stored in the pickle file ``<name>.pkl``.

    ``name`` is the file path without its ``.pkl`` extension.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
def load_obj_json(name):
    """Parse and return the contents of the JSON file ``<name>.json``.

    ``name`` is the file path without its ``.json`` extension.
    """
    json_path = name + '.json'
    with open(json_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
import pandas as pd
market_data = pd.read_csv('../Data/standardized_data_new.csv')

In [None]:
len(list(market_data.columns))

In [None]:
list(market_data.columns)

In [None]:
len(market_data)

In [None]:
market_data["pastrating"].value_counts()

In [None]:
# Distinct "pastrating" values in ascending order (sorted() already returns a list).
ratings = sorted(market_data["pastrating"].unique())

In [None]:
ratings

In [None]:
# Map every distinct rating to its position in the sorted `ratings` list.
ratings_to_idx = dict(zip(ratings, range(len(ratings))))

In [None]:
ratings_to_idx

In [None]:
# Replace each "pastrating" value with its integer index from ratings_to_idx.
# NOTE: the column is overwritten in place, so re-running this cell after the first run
# looks up already-encoded values in ratings_to_idx (a KeyError unless they happen to be keys).
market_data["pastrating"] = market_data["pastrating"].apply(lambda x: ratings_to_idx[x])

In [None]:
market_data.head()

In [None]:
market_data["pastrating"].value_counts()

In [None]:
# Load the precomputed artifacts from pickle files (paths are relative to the working directory).
# NOTE: pickle.load can execute arbitrary code — only load files that come from a trusted source.
tmfg_graphs = load_obj_pickle('../Data/tmfg_graphs_new_4')
tic_to_idx = load_obj_pickle('../Data/tic_to_index')
synth_tics = load_obj_pickle('../Data/synth_tics')

In [None]:
import torch
from torch_geometric.data import Data

def graph_items_to_data_seq_w_mask(graphs, unseen_node_dict, seq_len=4):
    """Build sliding-window samples from an ordered sequence of ``(date, graph)`` pairs.

    Each sample bundles the node features, edge indices and edge weights of
    ``seq_len`` consecutive graphs and takes its labels (``'pastrating'``) from
    the graph that immediately follows the window.

    Args:
        graphs: Indexable sequence of ``(date, graph)`` pairs. Each graph exposes
            ``.nodes`` / ``.edges``, per-node attribute dicts (including
            ``'pastrating'``) and a ``'weight'`` attribute per edge.
        unseen_node_dict: Mapping from node id to a collection of dates on which
            the node counts as "unseen".
        seq_len: Number of consecutive graphs per input window.

    Returns:
        A list with one ``Data`` object per window. ``x``, ``edge_index`` and
        ``edge_attr`` are lists holding one tensor per time step; ``mask`` is
        True for nodes whose ``'pastrating'`` differs from -1.0 at every step;
        ``unseen_mask`` is True for nodes flagged unseen at any step.
    """
    samples = []
    n_windows = len(graphs) - seq_len

    for start in range(n_windows):
        xs, edge_indices, edge_weights = [], [], []
        real_masks, unseen_masks = [], []

        for step in range(seq_len):
            date, graph = graphs[start + step]
            node_ids = list(graph.nodes)
            edges = list(graph.edges)

            # One feature row per node: all attribute values, in dict order.
            attr_rows = [list(graph.nodes[k].values()) for k in node_ids]
            xs.append(torch.tensor(attr_rows, dtype=torch.float32))
            edge_indices.append(torch.tensor(edges).t().contiguous())
            edge_weights.append(
                torch.tensor([graph.edges[e]["weight"] for e in edges], dtype=torch.float32)
            )

            # True where 'pastrating' is not the -1.0 placeholder at this step.
            real_masks.append(
                torch.tensor([graph.nodes[k]['pastrating'] != -1.0 for k in node_ids], dtype=torch.bool)
            )

            # True where the node is listed in unseen_node_dict for this step's date.
            unseen_masks.append(
                torch.tensor(
                    [k in unseen_node_dict and date in unseen_node_dict[k] for k in node_ids],
                    dtype=torch.bool,
                )
            )

        # Labels come from the graph right after the window.
        _, target_graph = graphs[start + seq_len]
        y = torch.tensor([target_graph.nodes[k]["pastrating"] for k in target_graph.nodes], dtype=torch.long)

        mask = torch.stack(real_masks).all(dim=0)
        unseen_mask = torch.stack(unseen_masks).any(dim=0)

        samples.append(
            Data(x=xs, edge_index=edge_indices, edge_attr=edge_weights, y=y, mask=mask, unseen_mask=unseen_mask)
        )

    return samples

In [None]:
import torch
from torch_geometric.data import Data

def parse_quarter(quarter):
    """Split a quarter label such as ``'2018Q3'`` into ``(year, quarter)`` integers.

    Raises:
        ValueError: if the label does not contain exactly one ``'Q'`` or either
            part is not an integer.
    """
    # Debug prints removed: this runs once per graph per window and flooded the cell output.
    year, q = quarter.split('Q')
    return int(year), int(q)

def quarter_sequence(year, quarter, length=4):
    """Return the quarter labels ending at ``year``/``quarter``, oldest first.

    The result holds ``length + 1`` labels of the form ``'<year>Q<quarter>'``:
    the ``length`` quarters before the given one, followed by the given one.
    """
    def _label(steps_back):
        # Step back from the given quarter, borrowing a year each time we pass Q1.
        y, q = year, quarter - steps_back
        while q <= 0:
            q, y = q + 4, y - 1
        return f"{y}Q{q}"

    return [_label(steps_back) for steps_back in reversed(range(length + 1))]

def graph_items_to_data_seq_w_mask2(graphs, unseen_node_dict, seq_len=4):
    """Build sliding-window samples, masking nodes by membership in ``unseen_node_dict``.

    Each sample bundles the node features, edge indices and edge weights of
    ``seq_len`` consecutive graphs and takes its labels (``'pastrating'``) from
    the graph that immediately follows the window.

    Args:
        graphs: Indexable sequence of ``(date, graph)`` pairs, where ``date`` is a
            quarter label accepted by ``parse_quarter`` (e.g. ``'2018Q3'``).
        unseen_node_dict: Mapping from node id to the quarter labels recorded for it.
        seq_len: Number of consecutive graphs per input window.

    Returns:
        A list with one ``Data`` object per window. ``x``, ``edge_index`` and
        ``edge_attr`` are lists holding one tensor per time step. ``mask`` is
        True for nodes that are not keys of ``unseen_node_dict``. ``unseen_mask``
        is True for nodes whose recorded quarters contain, for at least one
        step of the window, that step's quarter and the four quarters before it.
    """
    data_list = []

    for i in range(len(graphs) - seq_len):
        features_list = []
        edge_index_list = []
        edge_attr_list = []
        mask_list = []
        unseen_mask_list = []

        for j in range(seq_len):
            date, graph = graphs[i + j]
            year, quarter = parse_quarter(date)
            # Built once per time step instead of once per node.
            # NOTE(review): quarter_sequence uses its default length here, independent of seq_len — confirm intended.
            required_quarters = set(quarter_sequence(year, quarter))

            nodes = list(graph.nodes)
            edges = list(graph.edges)

            node_features = torch.tensor([list(graph.nodes[k].values()) for k in nodes], dtype=torch.float32)
            edge_index = torch.tensor(edges).t().contiguous()
            edge_attr = torch.tensor([graph.edges[edge]["weight"] for edge in edges], dtype=torch.float32)
            features_list.append(node_features)
            edge_index_list.append(edge_index)
            edge_attr_list.append(edge_attr)

            # Nodes that never appear in unseen_node_dict are the ones used for the regular loss/metrics.
            mask = torch.tensor([k not in unseen_node_dict for k in nodes], dtype=torch.bool)
            mask_list.append(mask)

            # A node is "unseen" at this step when all required quarters are recorded for it.
            unseen_mask = torch.tensor(
                [k in unseen_node_dict and required_quarters.issubset(unseen_node_dict[k]) for k in nodes],
                dtype=torch.bool,
            )
            unseen_mask_list.append(unseen_mask)

        # Labels come from the graph right after the window.
        _, graph = graphs[i + seq_len]
        ratings = [graph.nodes[k]["pastrating"] for k in graph.nodes]
        y = torch.tensor(ratings, dtype=torch.long)

        combined_mask = torch.stack(mask_list).all(dim=0)
        unseen_mask = torch.stack(unseen_mask_list).any(dim=0)  # True if flagged at any time step

        data = Data(x=features_list, edge_index=edge_index_list, edge_attr=edge_attr_list, y=y, mask=combined_mask, unseen_mask=unseen_mask)
        data_list.append(data)

    return data_list


In [None]:
from copy import deepcopy

graphs = deepcopy(tmfg_graphs)

In [None]:
# Sort the (date, graph) pairs by their dictionary key so the windows built later follow key order.
graph_items = sorted(graphs.items())

In [None]:
from collections import defaultdict

# For every graph dated 2018 or later, record the dates on which a node whose
# ticker is in synth_tics carries a 'pastrating' other than -1.
unseen_nodes = defaultdict(list)

for date, graph in graph_items:
    year = int(date[:4])
    if year < 2018:
        continue
    for node in graph.nodes:
        attrs = graph.nodes[node]
        if attrs["tic"] not in synth_tics:
            continue
        if attrs['pastrating'] != -1:
            unseen_nodes[node].append(date)

In [None]:
unseen_nodes

In [None]:
len(unseen_nodes)

In [None]:
def parse_quarter_cons(quarter):
    """Convert a label such as ``'2018Q3'`` to a linear quarter index (``year * 4 + quarter - 1``)."""
    year, q = quarter.split('Q')
    return int(year) * 4 + int(q) - 1

def is_consecutive(quarters, min_run=5):
    """Keep only the quarters that belong to a run of consecutive quarters.

    Args:
        quarters: Iterable of quarter labels (``'<year>Q<n>'``); duplicates and
            ordering do not matter.
        min_run: Minimum number of back-to-back quarters a run needs in order
            to be kept (defaults to 5, the previously hard-coded value).

    Returns:
        The labels of all quarters in qualifying runs, in chronological order.
        An empty list when no run qualifies or when ``quarters`` is empty.
    """
    indices = sorted({parse_quarter_cons(q) for q in quarters})
    if not indices:
        # Previously indices[0] raised IndexError on empty input.
        return []

    kept = []
    run = [indices[0]]
    for prev, cur in zip(indices, indices[1:]):
        if cur == prev + 1:
            run.append(cur)
            continue
        # Gap found: flush the finished run if it is long enough, then start a new one.
        if len(run) >= min_run:
            kept.extend(run)
        run = [cur]

    if len(run) >= min_run:
        kept.extend(run)

    # Convert the linear indices back to '<year>Q<n>' labels.
    return ['{}Q{}'.format(idx // 4, idx % 4 + 1) for idx in kept]

# Per node, keep only the quarters that is_consecutive() retains; nodes with nothing retained are dropped.
filtered_quarters = ((key, is_consecutive(values)) for key, values in unseen_nodes.items())
result = defaultdict(list, {key: kept for key, kept in filtered_quarters if kept})

result

In [None]:
result == unseen_nodes

In [None]:
# Delete keys tic, year, quarter from the nodes
# pop(..., None) instead of del keeps this cell re-runnable: a second run finds the
# keys already gone and is a no-op instead of raising KeyError.
for date, graph in graph_items:
    for node in graph.nodes:
        node_attrs = graph.nodes[node]
        for key in ("tic", "year", "quarter"):
            node_attrs.pop(key, None)

In [None]:
data_list = graph_items_to_data_seq_w_mask2(graph_items, unseen_nodes, 4)

In [None]:
len(graph_items)

In [None]:
len(data_list)

In [None]:
# Split data_list by position: the first 28 windows for training, the next 4 for validation,
# and everything from index 32 onwards for testing.
# NOTE(review): 28 and 32 are hard-coded; they presumably depend on len(data_list) — confirm when the data changes.
train_items_w_ratings, val_items_w_ratings, test_items_w_ratings = data_list[:28], data_list[28:32], data_list[32:]

In [None]:
len(train_items_w_ratings), len(val_items_w_ratings), len(test_items_w_ratings)

In [None]:
from collections import Counter

def compute_class_weights(train_items):
    """Compute balanced class weights from the masked labels of ``train_items``.

    For every item, the labels ``y`` selected by ``mask`` are counted. Each class
    receives the weight ``total / (n_classes * count)``, where ``n_classes`` is
    the number of distinct labels that were actually observed.

    Returns:
        Dict mapping each observed label to its weight.
    """
    label_counts = Counter()
    for item in train_items:
        keep = item.mask.cpu().numpy()
        label_counts.update(item.y.cpu().numpy()[keep])

    n_total = sum(label_counts.values())
    n_classes = len(label_counts)
    return {label: n_total / (n_classes * n) for label, n in label_counts.items()}

In [None]:
# NOTE(review): the weights are computed from every window except the last one, which
# includes the validation and test slices of data_list (indices 28 and up) — confirm this
# is intended rather than using train_items_w_ratings only.
class_weights = compute_class_weights(data_list[:len(data_list)-1])

In [None]:
class_weights

In [None]:
# Turn the {label: weight} dict into a list ordered by label.
# NOTE(review): this rebinds class_weights from a dict to a list, so the cell cannot be re-run on its own.
# Position i only corresponds to class i if every class id 0..n-1 occurs in the counted labels — confirm.
class_weights = [class_weights[rating] for rating in sorted(class_weights)]
class_weights

In [None]:
class_weights = torch.tensor(class_weights, dtype=torch.float32).cuda()
class_weights

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from torcheval.metrics.functional import multiclass_auprc
from torcheval.metrics import MulticlassAUPRC
from rgnns import GConvLSTMModel2, GConvGRUModel2


def bootstrap_preds_multiclass(probs, true_labels, num_classes=8, num_boot=10000):
    """Bootstrap macro AUC, AUPRC and macro F1 over resampled predictions.

    Args:
        probs: Array of per-class scores, one row per sample.
        true_labels: Array of integer class labels aligned with ``probs``.
        num_classes: Number of classes (columns of ``probs``).
        num_boot: Number of bootstrap resamples.

    Returns:
        Three arrays of length ``num_boot`` holding the AUC, AUPRC and F1 of each
        resample. A resample whose metric computation raises is reported on
        stdout and recorded as 0 for all three metrics, so failed resamples pull
        the mean and percentiles towards 0.
    """
    boot_means_auc = np.zeros(num_boot)
    boot_means_auprc = np.zeros(num_boot)
    boot_means_f1 = np.zeros(num_boot)

    n_samples = len(probs)
    class_ids = np.arange(num_classes)

    # A local generator seeded with 0 is intended to draw the same indices as
    # np.random.seed(0) + np.random.choice did, without resetting the global NumPy
    # RNG for the rest of the notebook.
    rng = np.random.RandomState(0)
    for i in range(num_boot):
        # Resample row indices with replacement.
        indices = rng.choice(n_samples, size=n_samples, replace=True)
        resampled_labels = true_labels[indices]
        resampled_probs = probs[indices]

        # One-vs-rest indicator matrix with a column for every class id.
        resampled_labels_binarized = label_binarize(resampled_labels, classes=class_ids)

        try:
            # AUC calculation
            auc_score = roc_auc_score(resampled_labels_binarized, resampled_probs, average='macro', multi_class='ovr')
            boot_means_auc[i] = auc_score

            # AUPRC calculation
            boot_means_auprc[i] = multiclass_auprc(torch.tensor(resampled_probs), torch.tensor(resampled_labels), num_classes=num_classes)

            # F1 calculation
            preds_resampled = resampled_probs.argmax(axis=1)
            _, _, f1_score, _ = precision_recall_fscore_support(resampled_labels, preds_resampled, average='macro', zero_division=0)
            boot_means_f1[i] = f1_score

        except Exception as e:
            # Best effort: a failing resample zeroes all three metrics for this iteration.
            print(f"Error in bootstrap iteration {i}: {e}")
            boot_means_auc[i] = 0
            boot_means_auprc[i] = 0
            boot_means_f1[i] = 0

    return boot_means_auc, boot_means_auprc, boot_means_f1

def train(model, loader, criterion, optimizer, h0_n, h0_g):
    """Run one optimisation pass over ``loader``.

    Args:
        model: One of the supported model types; the forward call is chosen by
            the exact type of ``model``.
        loader: Sized iterable of batches exposing ``y``, ``mask``,
            ``edge_index`` and ``edge_attr``.
        criterion: Loss applied to the outputs and labels selected by ``batch.mask``.
        optimizer: Optimizer stepped once per batch.
        h0_n, h0_g: Recurrent states passed to (and updated by) the model.

    Returns:
        ``(avg_loss, accuracy, h0_n, h0_g)`` where ``avg_loss`` is the mean loss
        per batch, ``accuracy`` is computed over the masked nodes of all
        batches, and the states are those left after the last batch.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_labelled = 0

    for batch in loader:
        optimizer.zero_grad()
        batch = batch.to(model.device)

        model_cls = type(model)
        if model_cls is RGNN_RNN:
            out, h0_n, h0_g = model(batch, h0_n, h0_g)
            # Detach the graph state so the next batch does not backpropagate into this one.
            if type(model.GNN) is GConvLSTMModel:
                h0_g = [tuple(state.detach() for state in layer) for layer in h0_g]
            else:
                h0_g = [state.detach() for state in h0_g]
        elif model_cls is GConvLSTMModel2:
            out, h0_g = model(batch.edge_index, batch.edge_attr, h0_g)
            h0_g = [tuple(state.detach() for state in layer) for layer in h0_g]
        elif model_cls is GConvGRUModel2:
            out, h0_g = model(batch.edge_index, batch.edge_attr, h0_g)
            h0_g = [state.detach() for state in h0_g]
        else:  # RNN
            out, h0_n = model(batch, h0_n, False)

        # Only masked nodes contribute to the loss and to the accuracy.
        masked_logits = out[batch.mask]
        masked_targets = batch.y[batch.mask]

        loss = criterion(masked_logits, masked_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        loss_sum += loss.item()
        n_correct += (masked_logits.argmax(dim=1) == masked_targets).sum().item()
        n_labelled += masked_targets.shape[0]

    return loss_sum / len(loader), n_correct / n_labelled, h0_n, h0_g

def evaluate(model, loader, criterion, h0_n, h0_g, inferencing=False, num_classes=8, num_boot=10000):
    """Evaluate ``model`` on ``loader`` and compute classification metrics.

    Args:
        model: One of the supported model types; the forward call is chosen by
            the exact type of ``model``.
        loader: Sized iterable of batches exposing ``y``, ``mask``,
            ``unseen_mask``, ``edge_index`` and ``edge_attr``.
        criterion: Loss applied to the outputs and labels selected by ``batch.mask``.
        h0_n, h0_g: Recurrent states fed to the model; the updated states are
            used across batches but not returned.
        inferencing: When True, also compute metrics on the nodes selected by
            ``batch.unseen_mask`` and bootstrap distributions for both node sets.
        num_classes: Number of classes passed to the AUPRC metric and the bootstrap.
        num_boot: Number of bootstrap resamples when ``inferencing`` is True.

    Returns:
        A 19-tuple, in this order: ``avg_loss, accuracy, precision, recall, f1,
        auc, auprc, auc_boot_means, auprc_boot_means, f1_boot_means,
        accuracy_unseen, precision_unseen, recall_unseen, f1_unseen, auc_unseen,
        auprc_unseen, auc_boot_means_unseen, auprc_boot_means_unseen,
        f1_boot_means_unseen``. AUC/AUPRC fall back to 0 when the metric raises
        ``ValueError``. The unseen-node entries are 0 (scalars) and
        ``np.zeros(1)`` (bootstrap arrays) when ``inferencing`` is False or no
        node was selected by ``unseen_mask``.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    predictions = []
    true_labels = []
    probs = []
    unseen_predictions = []
    unseen_true_labels = []
    unseen_probs = []
    unseen_correct_predictions = 0
    unseen_total_samples = 0

    with torch.no_grad():
        for batch in loader:
            batch = batch.to(model.device)

            if type(model) is RGNN_RNN:
                out, h0_n, h0_g = model(batch, h0_n, h0_g)
                if type(model.GNN) is GConvLSTMModel:
                    h0_g = [tuple(h.detach() for h in layer) for layer in h0_g]
                else:
                    h0_g = [h.detach() for h in h0_g]
            elif type(model) is GConvLSTMModel2:
                out, h0_g = model(batch.edge_index, batch.edge_attr, h0_g)
                h0_g = [tuple(h.detach() for h in layer) for layer in h0_g]
            elif type(model) is GConvGRUModel2:
                out, h0_g = model(batch.edge_index, batch.edge_attr, h0_g)
                h0_g = [h.detach() for h in h0_g]
            else:  # RNN
                out, h0_n = model(batch, h0_n, False)

            masked_out = out[batch.mask]
            y_true = batch.y[batch.mask]

            loss = criterion(masked_out, y_true)
            total_loss += loss.item()

            y_pred = masked_out.argmax(dim=1)
            correct_predictions += (y_pred == y_true).sum().item()
            total_samples += y_true.shape[0]

            predictions.append(y_pred.cpu().numpy())
            true_labels.append(y_true.cpu().numpy())
            # Raw model outputs are used as scores for the ranking metrics.
            probs.append(masked_out.cpu().numpy())

            # Inference for unseen nodes
            if inferencing:
                unseen_out = out[batch.unseen_mask]
                y_pred_unseen = unseen_out.argmax(dim=1)
                y_true_unseen = batch.y[batch.unseen_mask]
                unseen_correct_predictions += (y_pred_unseen == y_true_unseen).sum().item()
                unseen_total_samples += batch.unseen_mask.sum().item()
                unseen_predictions.append(y_pred_unseen.cpu().numpy())
                unseen_true_labels.append(y_true_unseen.cpu().numpy())
                unseen_probs.append(unseen_out.cpu().numpy())

    # Flatten true labels and predictions
    true_labels_flat = np.concatenate(true_labels)
    probs_flat = np.concatenate(probs)
    predictions_flat = np.concatenate(predictions)

    # Binarize true labels for multiclass classification.
    # NOTE(review): the columns follow the classes present in the labels; when a class is
    # absent the shape no longer matches probs_flat and the AUC below falls back to 0.
    true_labels_binarized = label_binarize(true_labels_flat, classes=np.unique(true_labels_flat))

    # Calculate AUC score
    try:
        auc = roc_auc_score(true_labels_binarized, probs_flat, average='macro', multi_class='ovr')
    except ValueError:
        auc = 0

    # Calculate AUPRC score
    try:
        auprc = multiclass_auprc(torch.tensor(probs_flat), torch.tensor(true_labels_flat), num_classes=num_classes)
    except ValueError:
        auprc = 0

    if inferencing:
        auc_boot_means, auprc_boot_means, f1_boot_means = bootstrap_preds_multiclass(probs_flat, true_labels_flat, num_classes=num_classes, num_boot=num_boot)
    else:
        auc_boot_means = auprc_boot_means = f1_boot_means = np.zeros(1)

    # Guard against an empty unseen set: previously this divided by zero.
    if inferencing and unseen_total_samples > 0:
        unseen_predictions_flat = np.concatenate(unseen_predictions)
        unseen_true_labels_flat = np.concatenate(unseen_true_labels)
        unseen_probs_flat = np.concatenate(unseen_probs)
        accuracy_unseen = unseen_correct_predictions / unseen_total_samples
        precision_unseen, recall_unseen, f1_unseen, _ = precision_recall_fscore_support(unseen_true_labels_flat, unseen_predictions_flat, average='macro', zero_division=0)

        true_labels_binarized_unseen = label_binarize(unseen_true_labels_flat, classes=np.unique(unseen_true_labels_flat))
        # Same ValueError fallback as for the seen nodes, so an undefined metric on the
        # unseen subset does not abort the whole evaluation.
        try:
            auc_unseen = roc_auc_score(true_labels_binarized_unseen, unseen_probs_flat, average='macro', multi_class='ovr')
        except ValueError:
            auc_unseen = 0
        try:
            auprc_unseen = multiclass_auprc(torch.tensor(unseen_probs_flat), torch.tensor(unseen_true_labels_flat), num_classes=num_classes)
        except ValueError:
            auprc_unseen = 0

        auc_boot_means_unseen, auprc_boot_means_unseen, f1_boot_means_unseen = bootstrap_preds_multiclass(unseen_probs_flat, unseen_true_labels_flat, num_classes=num_classes, num_boot=num_boot)
    else:
        auc_boot_means_unseen = auprc_boot_means_unseen = f1_boot_means_unseen = np.zeros(1)
        accuracy_unseen = precision_unseen = recall_unseen = f1_unseen = auc_unseen = auprc_unseen = 0

    avg_loss = total_loss / len(loader)
    accuracy = correct_predictions / total_samples
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels_flat, predictions_flat, average='macro', zero_division=0)

    return avg_loss, accuracy, precision, recall, f1, auc, auprc, auc_boot_means, auprc_boot_means, f1_boot_means, accuracy_unseen, precision_unseen, recall_unseen, f1_unseen, auc_unseen, auprc_unseen, auc_boot_means_unseen, auprc_boot_means_unseen, f1_boot_means_unseen

In [None]:
torch.cuda.is_available()

In [None]:
from models import RGNN_RNN, RNN
from rnns import LSTMModel, GRUModel, TransformerModel
from rgnns import GConvGRUModel, GConvLSTMModel, GConvGRUModel2, GConvLSTMModel2
import torch.nn as nn

# Model hyper-parameters.
# NOTE(review): in_channels presumably has to match the number of per-node features in Data.x — confirm.
in_channels = 201
rnn_hidden_channels = 16
gnn_hidden_channels = 64
num_classes = 8
num_heads = 4
num_gnn_layers = 2
num_rnn_layers = 8
edge_dim = 1
# Node count is read from the first time step of the first training window.
num_nodes = train_items_w_ratings[0].x[0].shape[0]

# Seed PyTorch before the model is constructed.
torch.manual_seed(0)

# Multi-modal RGNN-RNN model (moved to the GPU with .cuda(), so a CUDA device is required)
model = RGNN_RNN(num_features=in_channels, rnn_hidden_dim=rnn_hidden_channels, gnn_hidden_dim=gnn_hidden_channels,  num_classes=num_classes, num_gnn_layers=num_gnn_layers, num_rnn_layers=num_rnn_layers, edge_dim=edge_dim, num_heads=num_heads, num_nodes=num_nodes, rgnn_model=GConvLSTMModel, rnn_model=LSTMModel).cuda()

# Alternative models: uncomment exactly one of the lines below (and comment out the one above) to switch.
# Uni-modal RNN model
# model = RNN(num_features=in_channels, hidden_dim=rnn_hidden_channels, num_classes=num_classes, num_rnn_layers=num_rnn_layers, num_nodes=num_nodes, rnn_model=GRUModel).cuda()

# Uni-modal RGNN models
# model = GConvLSTMModel2(input_dim=in_channels, hidden_dim=gnn_hidden_channels, output_dim=num_classes, n_layers=num_gnn_layers, n_nodes=num_nodes).cuda()

# model = GConvGRUModel2(input_dim=in_channels, hidden_dim=gnn_hidden_channels, output_dim=num_classes, n_layers=num_gnn_layers, n_nodes=num_nodes).cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Class-weighted cross-entropy; class_weights must already be a tensor at this point.
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Multiply the learning rate by 0.1 after 10 epochs without improvement of the monitored value, down to 1e-6.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=10, factor=0.1, min_lr=1e-6)

In [None]:
# Training loop: one pass over the training windows per epoch, followed by a validation pass.
epochs = 100

# Per-epoch history.
# NOTE(review): val_unseen_aucs, val_unseen_aucprcs and attention_scores are never filled in this cell.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
aucs = []
aucprcs = []
val_unseen_aucs = []
val_unseen_aucprcs = []
attention_scores = []
# Recurrent states start empty and are carried over from one epoch to the next.
h0_n, h0_g = None, None

for epoch in range(epochs):
    train_loss, train_accuracy, h0_n, h0_g = train(model, train_items_w_ratings, criterion, optimizer, h0_n, h0_g)
    train_losses.append(train_loss)
    train_accuracies.append(train_accuracy) 
    print(f'Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, LR: {optimizer.param_groups[0]["lr"]:.6f}')

    # Validation starts from the states left by training; evaluate() does not return updated
    # states, so the next epoch resumes from the post-training states. Bootstrap outputs are discarded.
    val_loss, val_accuracy, precision, recall, f1, auc, auprc, _, _, _, _, _, _, _, _, _, _, _, _, = evaluate(model, val_items_w_ratings, criterion, h0_n, h0_g, False)
    # The scheduler monitors the validation loss.
    scheduler.step(val_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)
    aucs.append(auc)
    aucprcs.append(auprc)
    print(f'Epoch {epoch + 1}/{epochs} - Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, AUPRC: {auprc:.4f}')

# Testing: evaluate with inferencing=True (unseen-node metrics and bootstrap) on all test
# windows except the last one, starting from the states left by the final training epoch.
test_loss, test_accuracy, precision, recall, f1, auc, auprc, auc_boot_means, auprc_boot_means, f1_boot_means, accuracy_unseen, precision_unseen, recall_unseen, f1_unseen, auc_unseen, auprc_unseen, auc_boot_means_unseen, auprc_boot_means_unseen, f1_boot_means_unseen = evaluate(model, test_items_w_ratings[:-1], criterion, h0_n, h0_g, True)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, AUC: {auc:.4f}, AUPRC: {auprc:.4f}, Accuracy Unseen: {accuracy_unseen:.4f}, Precision Unseen: {precision_unseen:.4f}, Recall Unseen: {recall_unseen:.4f}, F1 Unseen: {f1_unseen:.4f}, AUC Unseen: {auc_unseen:.4f}, AUPRC Unseen: {auprc_unseen:.4f}')
print(f"Mean AUC: {auc_boot_means.mean():.4f}, Mean AUPRC: {auprc_boot_means.mean():.4f}, Mean F1: {f1_boot_means.mean():.4f}, Mean AUC Unseen: {auc_boot_means_unseen.mean():.4f}, Mean AUPRC Unseen: {auprc_boot_means_unseen.mean():.4f}, Mean F1 Unseen: {f1_boot_means_unseen.mean():.4f}")

In [None]:
def _bootstrap_summary(boot_means):
    """Summarise a bootstrap distribution with its 95% percentile interval.

    Returns ``(lower, upper, mean, mean - lower, upper - mean)`` where ``lower``
    and ``upper`` are the 2.5th and 97.5th percentiles.
    """
    lower = np.percentile(boot_means, 2.5)
    upper = np.percentile(boot_means, 97.5)
    mean = boot_means.mean()
    return lower, upper, mean, mean - lower, upper - mean

# 95% confidence intervals for AUC, AUPRC and F1
auc_lower, auc_upper, auc_mean, auc_mean_dist_lower, auc_mean_dist_upper = _bootstrap_summary(auc_boot_means)
auprc_lower, auprc_upper, auprc_mean, auprc_mean_dist_lower, auprc_mean_dist_upper = _bootstrap_summary(auprc_boot_means)
f1_lower, f1_upper, f1_mean, f1_mean_dist_lower, f1_mean_dist_upper = _bootstrap_summary(f1_boot_means)

# The same three metrics restricted to unseen nodes
auc_unseen_lower, auc_unseen_upper, auc_unseen_mean, auc_unseen_mean_dist_lower, auc_unseen_mean_dist_upper = _bootstrap_summary(auc_boot_means_unseen)
auprc_unseen_lower, auprc_unseen_upper, auprc_unseen_mean, auprc_unseen_mean_dist_lower, auprc_unseen_mean_dist_upper = _bootstrap_summary(auprc_boot_means_unseen)
f1_unseen_lower, f1_unseen_upper, f1_unseen_mean, f1_unseen_mean_dist_lower, f1_unseen_mean_dist_upper = _bootstrap_summary(f1_boot_means_unseen)

# Each line reports: mean (distance down to the lower bound, distance up to the upper bound)
print(f"AUC: {auc_mean:.4f} ({auc_mean_dist_lower:.4f}, {auc_mean_dist_upper:.4f})")
print(f"AUPRC: {auprc_mean:.4f} ({auprc_mean_dist_lower:.4f}, {auprc_mean_dist_upper:.4f})")
print(f"F1: {f1_mean:.4f} ({f1_mean_dist_lower:.4f}, {f1_mean_dist_upper:.4f})")
print(f"AUC Unseen: {auc_unseen_mean:.4f} ({auc_unseen_mean_dist_lower:.4f}, {auc_unseen_mean_dist_upper:.4f})")
print(f"AUPRC Unseen: {auprc_unseen_mean:.4f} ({auprc_unseen_mean_dist_lower:.4f}, {auprc_unseen_mean_dist_upper:.4f})")
print(f"F1 Unseen: {f1_unseen_mean:.4f} ({f1_unseen_mean_dist_lower:.4f}, {f1_unseen_mean_dist_upper:.4f})")