### Fusion Strategies Comparison

This experiment compares three multimodal fusion strategies:
  1. Early Fusion (concatenate embeddings)
  2. Late Fusion (ensemble predictions)
  3. Intermediate Fusion (cross-attention)

Goal: Determine which fusion strategy works best for stance classification.

Research Questions:
  - Does allowing modalities to interact improve performance?
  - Is the added complexity of cross-attention worth it?
  - Can late fusion ensemble provide benefits without retraining?

In [1]:
#Libraries
import os
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix, classification_report
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel,get_linear_schedule_with_warmup
from torchvision import models, transforms
from PIL import Image
import warnings
warnings.filterwarnings('ignore')


# Reproducibility: seed NumPy and PyTorch with the same fixed value
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device once; the CUDA generator is seeded only when a GPU is present
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if device.type == "cuda":
    torch.cuda.manual_seed(SEED)

print(f"Seed:  {SEED}")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Seed:  42
Using device: cpu


In [3]:
# Paths (relative to the notebook location)
DATA_PATH = "../../../data/"
IMG_PATH = "../../../data/images"
OUTPUT_DIR = "../../../results/multimodal/baseline_multimodal/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One CSV per split, all stored directly under DATA_PATH
train_path, dev_path, test_path = (
    os.path.join(DATA_PATH, f"{split}.csv") for split in ("train", "dev", "test")
)

# Read the three splits
df_train = pd.read_csv(train_path)
df_dev = pd.read_csv(dev_path)
df_test = pd.read_csv(test_path)

# String labels -> integer ids
stance_2id = dict(oppose=0, support=1)
pers_2id = dict(no=0, yes=1)

# Add the integer label columns to every split (values missing from the maps become NaN)
for df in (df_train, df_dev, df_test):
    df["label"] = df["stance"].map(stance_2id)
    df["persuasiveness_label"] = df["persuasiveness"].map(pers_2id)

# Class balance of the training split
train_stance = df_train['label']
train_pers = df_train['persuasiveness_label']

print(f"\n Train label distribution:")
print(f"\n Stance: \n Oppose: {(train_stance == 0).sum()}\n Support: {(train_stance == 1).sum()}")
print(f"\n\n  Persuasiveness \n No: {(train_pers == 0).sum()}\n Yes: {(train_pers == 1).sum()}")


df_train.head()


 Train label distribution:

 Stance: 
 Oppose: 1095
 Support: 719


  Persuasiveness 
 No: 1285
 Yes: 529


Unnamed: 0,tweet_id,tweet_url,tweet_text,stance,persuasiveness,split,label,persuasiveness_label
0,1148501065308004357,https://t.co/VQP1FHaWAg,Let's McGyver some Sanity in America!\n\nYou a...,support,no,train,1,0
1,1103872992537276417,https://t.co/zsyXYSeBkp,A child deserves a chance at life. A child des...,oppose,no,train,0,0
2,1151528583623585794,https://t.co/qSWvDX5MnM,"Dear prolifers: girls as young as 10, 11, 12 a...",support,no,train,1,0
3,1100166844026109953,https://t.co/hxH8tFIHUu,The many States will attempt to amend their co...,support,no,train,1,0
4,1021830413550067713,https://t.co/5whvEEtoQR,"Every #abortion is wrong, no matter what metho...",oppose,yes,train,0,1


In [4]:
#Models
# Hugging Face checkpoint identifier; used below for both the tokenizer and the text encoder.
TEXT_MODEL_NAME = "microsoft/deberta-v3-base"
# Passed to the fusion models as `vision_model_name`. The model classes in this notebook
# build a torchvision ResNet-50 directly, so this value currently acts as a label only.
VISION_MODEL_NAME = "resnet50"

In [5]:
 # Training hyperparameters
# Optimisation settings
BATCH_SIZE, NUM_EPOCHS = 16, 15
LEARNING_RATE, WEIGHT_DECAY = 2e-5, 1e-4
WARMUP_RATIO = 0.1  # fraction of the total training steps spent in linear warm-up

# Early stopping: epochs without dev-F1 improvement tolerated before training stops
PATIENCE = 5

# Image preprocessing: target side length and the per-channel normalisation statistics
IMG_SIZE = 384
IMG_MEAN, IMG_STD = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]

# Text preprocessing: every tweet is padded/truncated to this many tokens
MAX_TEXT_LENGTH = 105

# DataLoader settings
NUM_WORKERS = 1
PIN_MEMORY = torch.cuda.is_available()
os.makedirs(OUTPUT_DIR, exist_ok=True)

###  Multimodal Dataset
We create a MultimodalDataset that will return:
- tokenized text (input_ids, attention_mask)
- image tensor (transforms applied)
- label (stance)

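Images that are missing or cannot be decoded are handled safely: they are replaced with a blank (black) image, so the sample is still returned with its text and label.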

In [6]:
# Deterministic image preprocessing (no augmentation for the baseline):
# resize, center-crop to an IMG_SIZE x IMG_SIZE square, convert to a tensor, then normalise per channel.
_to_square = [transforms.Resize(IMG_SIZE), transforms.CenterCrop(IMG_SIZE)]
_to_normalized_tensor = [transforms.ToTensor(), transforms.Normalize(mean=IMG_MEAN, std=IMG_STD)]
image_transforms = transforms.Compose(_to_square + _to_normalized_tensor)

class MultimodalDataset(Dataset):
    """
    Dataset that pairs each tweet's image with its tokenized text and stance label.

    Each item is a dict with the keys ``image``, ``input_ids``, ``attention_mask``,
    ``label``, ``tweet_id`` and ``text`` (see ``__getitem__``).

    Args:
        dataframe: Table with at least the columns ``tweet_id``, ``tweet_text`` and ``label``.
        img_dir: Directory containing one ``<tweet_id>.jpg`` file per row.
        tokenizer: Callable tokenizer invoked with Hugging Face style keyword arguments.
        image_transform: Callable applied to the RGB PIL image.
        max_length: Length every text is padded/truncated to.
    """

    # Size of the black placeholder image used when the real image cannot be loaded.
    _BLANK_IMAGE_SIZE = (224, 224)

    def __init__(
        self,
        dataframe: pd.DataFrame,
        img_dir: str,
        tokenizer,
        image_transform,
        max_length: int = 128
    ):
        # Positional index 0..n-1 so that `iloc[idx]` and DataLoader indices line up.
        self.df = dataframe.reset_index(drop=True)
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.image_transform = image_transform
        self.max_length = max_length

        print(f"  Dataset created: {len(self)} samples")

    def __len__(self):
        return len(self.df)

    def _load_image(self, img_path):
        """Return the transformed image at ``img_path``, or a transformed blank image on failure."""
        try:
            # The context manager closes the underlying file as soon as the pixels
            # have been decoded, instead of leaving the handle to the garbage collector.
            with Image.open(img_path) as img:
                return self.image_transform(img.convert('RGB'))
        except Exception:
            # Deliberate best effort: a missing or corrupted image must not abort training.
            print(f"Warning: could not load image {img_path}. Using blank image instead.")
            blank = Image.new("RGB", self._BLANK_IMAGE_SIZE, color=(0, 0, 0))
            return self.image_transform(blank)

    def __getitem__(self, idx):
        """
        Build the sample at position ``idx``.

        Returns:
            dict with the transformed image, the token ids and attention mask,
            the label as a ``torch.long`` scalar tensor, and the tweet id and raw text as strings.

        Raises:
            ValueError: if the row has no label (e.g. the stance string was not mapped).
        """
        row = self.df.iloc[idx]

        # Load Image
        img_path = os.path.join(self.img_dir, str(row['tweet_id']) + ".jpg")
        image = self._load_image(img_path)

        # Load Text and Tokenize
        text = str(row['tweet_text'])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt')

        # Our Label: fail loudly on a missing label rather than casting NaN to an
        # arbitrary integer that only surfaces later as an obscure loss error.
        label = row['label']
        if pd.isna(label):
            raise ValueError(f"Missing label for tweet_id {row['tweet_id']}")

        return {
            'image': image,
            # The tokenizer returns a leading batch dimension of 1; drop it.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(int(label), dtype=torch.long),
            'tweet_id': str(row['tweet_id']),
            'text': text}

In [7]:
# Load the tokenizer that belongs to the text encoder checkpoint (TEXT_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
print(f"Tokenizer loaded: {TEXT_MODEL_NAME}")

Tokenizer loaded: microsoft/deberta-v3-base


In [9]:
# Create datasets
train_dataset = MultimodalDataset(df_train, IMG_PATH, tokenizer, image_transforms, MAX_TEXT_LENGTH)
dev_dataset = MultimodalDataset(df_dev, IMG_PATH, tokenizer, image_transforms, MAX_TEXT_LENGTH)
test_dataset = MultimodalDataset(df_test, IMG_PATH, tokenizer, image_transforms, MAX_TEXT_LENGTH)


# Create dataloaders
# - pin_memory follows PIN_MEMORY (True only when CUDA is available) instead of a hard-coded True,
#   which is pointless and emits a warning on CPU-only machines.
# - Only the training split is shuffled; dev/test keep the dataframe order so that predictions are
#   reproducible and can be aligned with the rows for error analysis.
_loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
dev_loader = DataLoader(dev_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

  Dataset created: 1814 samples
  Dataset created: 200 samples
  Dataset created: 300 samples


### 1. Early Fusion

In [11]:
class EarlyFusionModel(nn.Module):
    """
    Early Fusion: concatenate the text and image embeddings, then classify.

    Args:
        text_model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        vision_model_name: Accepted for interface symmetry; not used, the vision
            backbone is always a torchvision ResNet-50.
        num_classes: Number of output logits.
        freeze_encoders: If True, encoder parameters get ``requires_grad=False`` and the
            encoders are kept in eval mode even while the model is in train mode.
        dropout: Dropout probability in the classifier head (halved before the last layer).
    """

    def __init__(self, text_model_name, vision_model_name, num_classes=2,
                 freeze_encoders=True, dropout=0.3):
        super().__init__()

        # Remembered so that train() can keep frozen encoders in eval mode.
        self.freeze_encoders = freeze_encoders

        # Text encoder
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_dim = self.text_encoder.config.hidden_size

        # Vision encoder: ResNet-50 without its final fully connected layer
        # NOTE(review): `pretrained=True` is the legacy torchvision argument; newer
        # releases prefer `weights=...`. Kept as-is for compatibility.
        self.vision_encoder = models.resnet50(pretrained=True)
        self.vision_encoder = nn.Sequential(*list(self.vision_encoder.children())[:-1])
        self.vision_dim = 2048

        # Freeze encoders
        if freeze_encoders:
            for param in self.text_encoder.parameters():
                param.requires_grad = False
            for param in self.vision_encoder.parameters():
                param.requires_grad = False

        # Fusion
        self.fusion_dim = self.text_dim + self.vision_dim

        # Classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.fusion_dim, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(dropout/2),
            nn.Linear(128, num_classes)
        )

        print(f"   Early Fusion initialized: {self.fusion_dim}D → {num_classes}")

    def train(self, mode: bool = True):
        """
        Switch train/eval mode, keeping frozen encoders in eval mode.

        `requires_grad=False` alone does not stop dropout inside the text encoder or
        the BatchNorm running-statistics updates inside the ResNet; both only stop in
        eval mode. Without this override, `model.train()` would let the "frozen"
        encoders change behaviour during training.
        """
        super().train(mode)
        if self.freeze_encoders:
            self.text_encoder.eval()
            self.vision_encoder.eval()
        return self

    def forward(self, input_ids, attention_mask, images):
        """Return logits of shape [batch, num_classes] for a batch of (text, image) pairs."""
        # Text embedding: hidden state of the first token
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = text_out.last_hidden_state[:, 0, :]  # [batch, text_dim]

        # Image embedding: pooled ResNet features with the two trailing size-1 dims removed
        img_feat = self.vision_encoder(images).squeeze(-1).squeeze(-1)  # [batch, vision_dim]

        # Concatenate
        fused = torch.cat([text_emb, img_feat], dim=1)  # [batch, text_dim + vision_dim]

        # Classify
        logits = self.classifier(fused)

        return logits


### 2. Late Fusion

In [12]:


class LateFusionModel(nn.Module):
    """
    Late Fusion: one classifier head per modality, combined at the logit level.

    The fused output is ``α * text_logits + (1 - α) * image_logits``. Both heads are
    trained jointly through the loss on the fused logits.

    Two approaches:
      1. Fixed weight (``learn_fusion_weight=False``): α is the constant 0.5.
      2. Learned weight (``learn_fusion_weight=True``): α = sigmoid(w) where w is a
         trainable scalar. w is initialised to 0.5, so the initial α is
         sigmoid(0.5) ≈ 0.62, not 0.5.

    Args:
        text_model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        vision_model_name: Accepted for interface symmetry; not used, the vision
            backbone is always a torchvision ResNet-50.
        num_classes: Number of output logits per head.
        freeze_encoders: If True, encoder parameters get ``requires_grad=False`` and the
            encoders are kept in eval mode even while the model is in train mode.
        dropout: Dropout probability inside both classifier heads.
        learn_fusion_weight: Whether α is learned (see above).
    """

    def __init__(self, text_model_name, vision_model_name, num_classes=2,
                 freeze_encoders=True, dropout=0.3, learn_fusion_weight=True):
        super().__init__()

        # Remembered so that train() can keep frozen encoders in eval mode.
        self.freeze_encoders = freeze_encoders

        # Text branch
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_dim = self.text_encoder.config.hidden_size
        self.text_classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.text_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )

        # Image branch: ResNet-50 without its final fully connected layer
        self.vision_encoder = models.resnet50(pretrained=True)
        self.vision_encoder = nn.Sequential(*list(self.vision_encoder.children())[:-1])
        self.vision_dim = 2048
        self.vision_classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.vision_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes)
        )

        # Freeze encoders
        if freeze_encoders:
            for param in self.text_encoder.parameters():
                param.requires_grad = False
            for param in self.vision_encoder.parameters():
                param.requires_grad = False

        # Fusion weight
        self.learn_fusion_weight = learn_fusion_weight
        if learn_fusion_weight:
            # Learnable raw weight; α = sigmoid(raw) ∈ (0, 1)
            self.fusion_weight = nn.Parameter(torch.tensor(0.5))
        else:
            # Fixed weight: stored directly as α
            self.register_buffer('fusion_weight', torch.tensor(0.5))

        print(f"    Late Fusion initialized")
        print(f"     - Learn fusion weight: {learn_fusion_weight}")

    def train(self, mode: bool = True):
        """
        Switch train/eval mode, keeping frozen encoders in eval mode.

        `requires_grad=False` alone does not stop dropout inside the text encoder or
        the BatchNorm running-statistics updates inside the ResNet; both only stop in
        eval mode.
        """
        super().train(mode)
        if self.freeze_encoders:
            self.text_encoder.eval()
            self.vision_encoder.eval()
        return self

    def _alpha(self):
        """Return the effective text weight α as a tensor.

        Single source of truth for forward() and get_fusion_weight(): the sigmoid is
        applied only to the learnable raw weight, while the fixed buffer already holds α.
        """
        if self.learn_fusion_weight:
            return torch.sigmoid(self.fusion_weight)
        return self.fusion_weight

    def forward(self, input_ids, attention_mask, images):
        """Return fused logits of shape [batch, num_classes]."""
        # Text logits
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = text_out.last_hidden_state[:, 0, :]
        text_logits = self.text_classifier(text_emb)  # [batch, num_classes]

        # Image logits
        img_feat = self.vision_encoder(images).squeeze(-1).squeeze(-1)
        img_logits = self.vision_classifier(img_feat)  # [batch, num_classes]

        # Weighted fusion
        alpha = self._alpha()
        fused_logits = alpha * text_logits + (1 - alpha) * img_logits

        return fused_logits

    def get_fusion_weight(self):
        """Return the effective text weight α actually used by forward(), as a float."""
        return self._alpha().item()

### 3. Intermediate Fusion (Cross-Attention)

In [None]:
class CrossAttentionFusion(nn.Module):
    """
    Cross-attention module for multimodal fusion.

    Projects the text and image embeddings to a common dimension and lets each
    modality attend to the other. The same attention, normalisation and feed-forward
    layers are shared by both directions.

    NOTE(review): each modality is a single token here, so the attention softmax runs
    over exactly one key. The attended output therefore depends only on the other
    modality's value; the query contributes through the residual connection alone.
    Confirm this is the intended design.

    Args:
        text_dim: Size of the incoming text embedding.
        image_dim: Size of the incoming image embedding.
        hidden_dim: Common dimension both modalities are projected to.
        num_heads: Number of attention heads (must divide ``hidden_dim``).
        dropout: Dropout probability in the attention and the feed-forward block.
    """

    def __init__(self, text_dim, image_dim, hidden_dim=512, num_heads=8, dropout=0.1):
        super().__init__()

        # Project to same dimension
        self.text_proj = nn.Linear(text_dim, hidden_dim)
        self.image_proj = nn.Linear(image_dim, hidden_dim)

        # Multi-head attention
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        # Normalization
        self.norm = nn.LayerNorm(hidden_dim)

        # Feedforward
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim)
        )

        self.norm2 = nn.LayerNorm(hidden_dim)

    def _attend(self, query, context):
        """Let ``query`` attend to ``context`` (both [batch, 1, hidden_dim]); return [batch, hidden_dim].

        Attention with a residual connection and LayerNorm, followed by the
        feed-forward block with its own residual connection and LayerNorm.
        """
        attended, _ = self.cross_attention(query=query, key=context, value=context)
        attended = self.norm(attended.squeeze(1) + query.squeeze(1))
        attended = attended + self.ffn(attended)
        return self.norm2(attended)

    def forward(self, text_emb, image_emb):
        """
        Args:
            text_emb: [batch, text_dim]
            image_emb: [batch, image_dim]

        Returns:
            attended_text: [batch, hidden_dim]
            attended_image: [batch, hidden_dim]
        """
        # Project to same dimension and add a sequence axis of length 1
        text_proj = self.text_proj(text_emb).unsqueeze(1)  # [batch, 1, hidden_dim]
        image_proj = self.image_proj(image_emb).unsqueeze(1)  # [batch, 1, hidden_dim]

        # Text attends to Image, then Image attends to Text (same order as before,
        # so dropout consumes random numbers in the same sequence)
        attended_text = self._attend(text_proj, image_proj)
        attended_image = self._attend(image_proj, text_proj)

        return attended_text, attended_image

class IntermediateFusionModel(nn.Module):
    """
    Intermediate Fusion: cross-attention between the text and image embeddings.

    Args:
        text_model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        vision_model_name: Accepted for interface symmetry; not used, the vision
            backbone is always a torchvision ResNet-50.
        num_classes: Number of output logits.
        freeze_encoders: If True, encoder parameters get ``requires_grad=False`` and the
            encoders are kept in eval mode even while the model is in train mode.
        dropout: Dropout probability for the fusion module and the classifier head.
        num_heads: Number of attention heads in the fusion module.
    """

    def __init__(self, text_model_name, vision_model_name, num_classes=2,
                 freeze_encoders=True, dropout=0.3, num_heads=8):
        super().__init__()

        # Remembered so that train() can keep frozen encoders in eval mode.
        self.freeze_encoders = freeze_encoders

        # Text encoder
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        self.text_dim = self.text_encoder.config.hidden_size

        # Vision encoder: ResNet-50 without its final fully connected layer
        self.vision_encoder = models.resnet50(pretrained=True)
        self.vision_encoder = nn.Sequential(*list(self.vision_encoder.children())[:-1])
        self.vision_dim = 2048

        # Freeze encoders
        if freeze_encoders:
            for param in self.text_encoder.parameters():
                param.requires_grad = False
            for param in self.vision_encoder.parameters():
                param.requires_grad = False

        # Cross-attention fusion
        self.fusion_dim = 512
        self.cross_attention = CrossAttentionFusion(
            text_dim=self.text_dim,
            image_dim=self.vision_dim,
            hidden_dim=self.fusion_dim,
            num_heads=num_heads,
            dropout=dropout
        )

        # Classifier (takes concatenated attended features)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.fusion_dim * 2, 256),  # *2 because we concat text+image
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout/2),
            nn.Linear(128, num_classes)
        )

        print(f"    Intermediate Fusion initialized")
        print(f"     - Attention heads: {num_heads}")
        print(f"     - Hidden dim: {self.fusion_dim}")

    def train(self, mode: bool = True):
        """
        Switch train/eval mode, keeping frozen encoders in eval mode.

        `requires_grad=False` alone does not stop dropout inside the text encoder or
        the BatchNorm running-statistics updates inside the ResNet; both only stop in
        eval mode. The fusion module and classifier still follow ``mode``.
        """
        super().train(mode)
        if self.freeze_encoders:
            self.text_encoder.eval()
            self.vision_encoder.eval()
        return self

    def forward(self, input_ids, attention_mask, images):
        """Return logits of shape [batch, num_classes] for a batch of (text, image) pairs."""
        # Text embedding: hidden state of the first token
        text_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_emb = text_out.last_hidden_state[:, 0, :]  # [batch, text_dim]

        # Image embedding: pooled ResNet features with the two trailing size-1 dims removed
        img_feat = self.vision_encoder(images).squeeze(-1).squeeze(-1)  # [batch, vision_dim]

        # Cross-attention fusion
        attended_text, attended_image = self.cross_attention(text_emb, img_feat)

        # Concatenate attended features
        fused = torch.cat([attended_text, attended_image], dim=1)  # [batch, 2 * fusion_dim]

        # Classify
        logits = self.classifier(fused)

        return logits

#### Training Function

In [14]:

def train_model(
    model,
    train_loader,
    dev_loader,
    model_name="model",
    num_epochs=15,
    learning_rate=2e-5,
    weight_decay=1e-4,
    patience=5,
    device=device):
    """
    Train ``model`` with AdamW and a linear warm-up schedule, early-stopping on dev F1.

    After every epoch the model is evaluated on ``dev_loader`` (accuracy and weighted
    F1). The weights of the best epoch are kept in memory and restored before returning.

    Args:
        model: Module called as ``model(input_ids, attention_mask, images)`` returning logits.
        train_loader: Loader yielding dicts with ``input_ids``, ``attention_mask``,
            ``image`` and ``label``.
        dev_loader: Loader with the same batch format, used for model selection.
        model_name: Label for the run; currently not used inside the function.
        num_epochs: Maximum number of epochs.
        learning_rate: AdamW learning rate.
        weight_decay: AdamW weight decay.
        patience: Number of consecutive epochs without dev-F1 improvement before stopping.
        device: Device the model and batches are moved to.

    Returns:
        (model, history) where ``history`` holds per-epoch ``train_loss``, ``dev_f1``
        and ``dev_acc`` lists.
    """

    model = model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Scheduler: warm-up length is a fraction (module-level WARMUP_RATIO) of all optimizer steps
    num_training_steps = len(train_loader) * num_epochs
    num_warmup_steps = int(num_training_steps * WARMUP_RATIO)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)

    # History
    history = {'train_loss': [], 'dev_f1': [], 'dev_acc': []}

    # Early stopping
    best_f1 = 0.0
    best_model_state = None
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        print("-" * 40)

        # Training
        model.train()
        train_loss = 0.0

        for batch in tqdm(train_loader, desc="Training", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask, images)
            loss = criterion(logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Weight by batch size so the epoch loss is a per-sample mean
            train_loss += loss.item() * input_ids.size(0)

        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        dev_preds = []
        dev_labels = []

        with torch.no_grad():
            for batch in tqdm(dev_loader, desc="Validation", leave=False):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                images = batch['image'].to(device)
                labels = batch['label']

                logits = model(input_ids, attention_mask, images)
                preds = torch.argmax(logits, dim=1)

                dev_preds.extend(preds.cpu().numpy())
                dev_labels.extend(labels.numpy())

        dev_acc = accuracy_score(dev_labels, dev_preds)
        dev_f1 = f1_score(dev_labels, dev_preds, average='weighted')

        history['train_loss'].append(train_loss)
        history['dev_f1'].append(dev_f1)
        history['dev_acc'].append(dev_acc)

        print(f"Train Loss: {train_loss:.4f} | Dev Acc: {dev_acc:.4f} | Dev F1: {dev_f1:.4f}")

        # Check for fusion weight (Late Fusion)
        if hasattr(model, 'get_fusion_weight'):
            alpha = model.get_fusion_weight()
            print(f"Fusion weight α: {alpha:.4f} (text={alpha:.2f}, image={1-alpha:.2f})")

        # The first epoch always becomes the initial checkpoint, even with a dev F1 of 0.0,
        # so there is always a state to restore after the loop.
        if best_model_state is None or dev_f1 > best_f1:
            best_f1 = dev_f1
            # Snapshot on CPU so the copy does not occupy accelerator memory
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            epochs_without_improvement = 0
            print(f"  New best F1: {best_f1:.4f}")
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print(f"\n  Early stopping at epoch {epoch + 1}")
                break

    # Load best (nothing to restore when no epoch ran, e.g. num_epochs=0)
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    model = model.to(device)

    print(f"\n  Training complete! Best Dev F1: {best_f1:.4f}")

    return model, history

def evaluate_model(model, dataloader, device=device):
    """Run ``model`` over ``dataloader`` without gradients and summarise the predictions.

    Returns a dict with ``accuracy``, weighted ``f1``, and the ``y_true`` / ``y_pred``
    arrays in the order the loader produced them.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            # Inputs go to the compute device; labels stay on the CPU for the metrics
            logits = model(
                batch['input_ids'].to(device),
                batch['attention_mask'].to(device),
                batch['image'].to(device),
            )
            y_pred.extend(logits.argmax(dim=1).cpu().numpy())
            y_true.extend(batch['label'].numpy())

    accuracy = accuracy_score(y_true, y_pred)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': weighted_f1,
        'y_true': np.array(y_true),
        'y_pred': np.array(y_pred)
    }

In [None]:
# Training Early Fusion
# `results` collects the test metrics of every fusion strategy, keyed by model name.
results = {}

print("MODEL 1: EARLY FUSION")
early_model = EarlyFusionModel(
    text_model_name=TEXT_MODEL_NAME,
    vision_model_name=VISION_MODEL_NAME,
    num_classes=2,
    freeze_encoders=True)

# WEIGHT_DECAY is passed explicitly so that the config-cell constant actually controls
# the run (it was previously defined but never used).
early_model, early_history = train_model(
    early_model, train_loader, dev_loader,
    model_name="Early Fusion",
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    patience=PATIENCE)

early_test_results = evaluate_model(early_model, test_loader)
results['Early Fusion'] = early_test_results

torch.save(early_model.state_dict(), os.path.join(OUTPUT_DIR, 'early_fusion_best.pth'))
print(f"\n  Early Fusion Test F1: {early_test_results['f1']:.4f}")


In [None]:
# Training Late Fusion
print("MODEL 2: LATE FUSION")

late_model = LateFusionModel(
    text_model_name=TEXT_MODEL_NAME,
    vision_model_name=VISION_MODEL_NAME,
    num_classes=2,
    freeze_encoders=True,
    learn_fusion_weight=True)

# WEIGHT_DECAY is passed explicitly so that the config-cell constant actually controls
# the run (it was previously defined but never used).
late_model, late_history = train_model(
    late_model, train_loader, dev_loader,
    model_name="Late Fusion",
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    patience=PATIENCE)

late_test_results = evaluate_model(late_model, test_loader)
results['Late Fusion'] = late_test_results

# Report the final fusion weight (α is the share given to the text logits)
final_alpha = late_model.get_fusion_weight()
print(f"\n  Learned fusion weight: α={final_alpha:.4f}")
print(f"   → Text contribution: {final_alpha*100:.1f}%")
print(f"   → Image contribution: {(1-final_alpha)*100:.1f}%")

torch.save(late_model.state_dict(), os.path.join(OUTPUT_DIR, 'late_fusion_best.pth'))
print(f"\n📊 Late Fusion Test F1: {late_test_results['f1']:.4f}")

In [None]:
# Intermediate Fusion
print("MODEL 3: INTERMEDIATE FUSION (CROSS-ATTENTION)")

intermediate_model = IntermediateFusionModel(
    text_model_name=TEXT_MODEL_NAME,
    vision_model_name=VISION_MODEL_NAME,
    num_classes=2,
    freeze_encoders=True,
    num_heads=8)

# WEIGHT_DECAY is passed explicitly so that the config-cell constant actually controls
# the run (it was previously defined but never used).
intermediate_model, intermediate_history = train_model(
    intermediate_model, train_loader, dev_loader,
    model_name="Intermediate Fusion",
    num_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    patience=PATIENCE)

intermediate_test_results = evaluate_model(intermediate_model, test_loader)
results['Intermediate Fusion'] = intermediate_test_results

torch.save(intermediate_model.state_dict(), os.path.join(OUTPUT_DIR, 'intermediate_fusion_best.pth'))
print(f"\n  Intermediate Fusion Test F1: {intermediate_test_results['f1']:.4f}")