In [1]:
# Enhanced Installation with latest versions
!pip install transformers accelerate -q
!pip install optuna scikit-learn torchmetrics -q

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, RobustScaler
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import os
import warnings
warnings.filterwarnings('ignore')

# Enhanced device setup with mixed precision support
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üöÄ Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    # Enable cudnn benchmarking for faster training
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

üöÄ Using device: cuda
GPU: Tesla T4
Memory: 15.64 GB


In [2]:
# Detect environment
IS_KAGGLE = os.path.exists('/kaggle/input')
DATA_PATH = "/kaggle/input/datasets/jasindavid/shifteddataset/label_shifted_fin_causality_dataset.csv" if IS_KAGGLE else r"D:\NLP_ResearchPaper_work\final_financial_causality_dataset.csv"

print(f"üìç Running on {'Kaggle' if IS_KAGGLE else 'Local'}")

# Load with enhanced parsing
df = pd.read_csv(DATA_PATH)
df["date"] = pd.to_datetime(df["date"])

# Enhanced temporal split (80/20 with buffer to prevent data leakage)
split_date = df["date"].quantile(0.8)
train_df = df[df["date"] <= split_date].copy()
test_df = df[df["date"] > split_date].copy()

# Shuffle training data to prevent temporal bias
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Train: {len(train_df)} | Test: {len(test_df)}")
print(f"Date range - Train: {train_df['date'].min()} to {train_df['date'].max()}")
print(f"Date range - Test: {test_df['date'].min()} to {test_df['date'].max()}")

# Check class distribution
print("\nClass Distribution:")
print(train_df["causal_label"].value_counts(normalize=True))

üìç Running on Kaggle
Train: 29442 | Test: 7230
Date range - Train: 2000-11-27 00:00:00 to 2018-04-26 00:00:00
Date range - Test: 2018-04-27 00:00:00 to 2025-04-30 00:00:00

Class Distribution:
causal_label
1    0.78551
0    0.21449
Name: proportion, dtype: float64


In [3]:
# Calculate class weights using effective number sampling (better for imbalanced data)
train_counts = train_df["causal_label"].value_counts().sort_index()
total_samples = len(train_df)

# Effective number weighting (reduces impact of majority class more aggressively)
beta = 0.9999
effective_num = 1.0 - np.power(beta, train_counts.values)
class_weights = (1.0 - beta) / effective_num
class_weights = class_weights / class_weights.sum() * len(train_counts)

print("\nüéØ Effective Number Class Weights:")
for label, weight in zip(train_counts.index, class_weights):
    print(f"Class {label}: {weight:.4f} (n={train_counts[label]})")

weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# Create weighted sampler for balanced batches
sample_weights = train_df["causal_label"].map(lambda x: class_weights[x]).values
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(train_df), replacement=True)


üéØ Effective Number Class Weights:
Class 0: 1.3161 (n=6315)
Class 1: 0.6839 (n=23127)


In [4]:
MODEL_NAME = "ProsusAI/finbert"

# Enhanced tokenizer with financial domain specifics
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Load base model with gradient checkpointing for memory efficiency
base_model = AutoModel.from_pretrained(MODEL_NAME)
base_model.gradient_checkpointing_enable()  # Saves memory, allows larger batch sizes

print(f"‚úÖ Loaded {MODEL_NAME}")
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Max position embeddings: {base_model.config.max_position_embeddings}")

# Test tokenization on sample
sample_text = train_df["clean_text"].iloc[0]
tokens = tokenizer(sample_text, truncation=True, max_length=256)
print(f"\nSample tokenization length: {len(tokens['input_ids'])}")

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: ProsusAI/finbert
Key                          | Status     |  | 
-----------------------------+------------+--+-
classifier.weight            | UNEXPECTED |  | 
bert.embeddings.position_ids | UNEXPECTED |  | 
classifier.bias              | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


‚úÖ Loaded ProsusAI/finbert
Vocab size: 30522
Max position embeddings: 512

Sample tokenization length: 256


In [5]:
class AdvancedTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256, augment=False):
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        # Simple text augmentation for minority class (random word deletion simulation via truncation)
        if self.augment and self.labels[idx] == 1 and np.random.random() > 0.5:
            words = text.split()
            if len(words) > 10:
                # Randomly truncate to simulate augmentation
                keep_ratio = np.random.uniform(0.8, 1.0)
                text = ' '.join(words[:int(len(words) * keep_ratio)])
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }

class AdvancedMultimodalDataset(Dataset):
    def __init__(self, texts, numerical, labels, tokenizer, max_len=256, augment=False):
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else texts
        self.numerical = numerical
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        
        # Augmentation for minority class
        if self.augment and self.labels[idx] == 1 and np.random.random() > 0.5:
            words = text.split()
            if len(words) > 10:
                keep_ratio = np.random.uniform(0.8, 1.0)
                text = ' '.join(words[:int(len(words) * keep_ratio)])
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "numerical": torch.tensor(self.numerical[idx], dtype=torch.float),
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [6]:
class AttentionPooling(nn.Module):
    """Learned attention pooling for better sequence representation"""
    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1)
        )
    
    def forward(self, hidden_states, attention_mask):
        # hidden_states: [batch, seq_len, hidden]
        scores = self.attention(hidden_states).squeeze(-1)  # [batch, seq_len]
        scores = scores.masked_fill(~attention_mask.bool(), float('-inf'))
        weights = torch.softmax(scores, dim=1).unsqueeze(-1)  # [batch, seq_len, 1]
        pooled = (hidden_states * weights).sum(dim=1)  # [batch, hidden]
        return pooled

class EnhancedFinBERTClassifier(nn.Module):
    def __init__(self, base_model, dropout_rate=0.3):
        super().__init__()
        self.bert = base_model
        self.hidden_size = base_model.config.hidden_size
        
        # Multi-head attention pooling instead of just CLS token
        self.attention_pool = AttentionPooling(self.hidden_size)
        
        # Layer normalization for stability
        self.layer_norm = nn.LayerNorm(self.hidden_size)
        
        # Deeper classifier with residual connections
        self.classifier = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size // 2),
            nn.LayerNorm(self.hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(self.hidden_size // 2, self.hidden_size // 4),
            nn.LayerNorm(self.hidden_size // 4),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(self.hidden_size // 4, 2)
        )
        
        # Initialize weights
        self._init_weights()
    
    def _init_weights(self):
        for module in self.classifier:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # Use attention pooling over all tokens
        pooled = self.attention_pool(outputs.last_hidden_state, attention_mask)
        pooled = self.layer_norm(pooled)
        
        return self.classifier(pooled)

class EnhancedMultimodalFinBERT(nn.Module):
    def __init__(self, base_model, num_numerical=3, dropout_rate=0.3):
        super().__init__()
        self.bert = base_model
        self.hidden_size = base_model.config.hidden_size
        
        # Attention pooling
        self.attention_pool = AttentionPooling(self.hidden_size)
        self.layer_norm = nn.LayerNorm(self.hidden_size)
        
        # Enhanced numerical processing with deeper layers
        self.num_processor = nn.Sequential(
            nn.Linear(num_numerical, 64),
            nn.LayerNorm(64),
            nn.GELU(),
            nn.Dropout(dropout_rate / 2),
            nn.Linear(64, 32),
            nn.LayerNorm(32),
            nn.GELU()
        )
        
        # Fusion layer with gating mechanism
        self.fusion_gate = nn.Sequential(
            nn.Linear(self.hidden_size + 32, self.hidden_size + 32),
            nn.Sigmoid()
        )
        
        # Classifier
        combined_size = self.hidden_size + 32
        self.classifier = nn.Sequential(
            nn.Linear(combined_size, combined_size // 2),
            nn.LayerNorm(combined_size // 2),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(combined_size // 2, combined_size // 4),
            nn.LayerNorm(combined_size // 4),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(combined_size // 4, 2)
        )
        
        self._init_weights()
    
    def _init_weights(self):
        for module in [self.num_processor, self.classifier]:
            for layer in module:
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
                    nn.init.zeros_(layer.bias)
    
    def forward(self, input_ids, attention_mask, numerical):
        # Text branch
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.attention_pool(outputs.last_hidden_state, attention_mask)
        text_features = self.layer_norm(text_features)
        
        # Numerical branch
        num_features = self.num_processor(numerical)
        
        # Fusion with gating
        combined = torch.cat((text_features, num_features), dim=1)
        gate = self.fusion_gate(combined)
        combined = combined * gate  # Gated fusion
        
        return self.classifier(combined)

In [7]:
# Text-only datasets with augmentation for training
train_text_ds = AdvancedTextDataset(
    train_df["clean_text"], 
    train_df["causal_label"], 
    tokenizer, 
    max_len=256,
    augment=True  # Enable augmentation for minority class
)
test_text_ds = AdvancedTextDataset(
    test_df["clean_text"], 
    test_df["causal_label"], 
    tokenizer, 
    max_len=256,
    augment=False
)

# Use weighted sampler for balanced training
train_text_loader = DataLoader(
    train_text_ds, 
    batch_size=32,  # Increased batch size due to gradient checkpointing
    sampler=sampler,  # Use weighted sampler instead of shuffle
    num_workers=2,
    pin_memory=True
)
test_text_loader = DataLoader(
    test_text_ds, 
    batch_size=64, 
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# Enhanced numerical scaling using RobustScaler (better for financial outliers)
scaler = RobustScaler()
num_train = scaler.fit_transform(train_df[["return_t1", "return_t5", "volatility_5"]])
num_test = scaler.transform(test_df[["return_t1", "return_t5", "volatility_5"]])

# Multimodal datasets
train_mm_ds = AdvancedMultimodalDataset(
    train_df["clean_text"], 
    num_train, 
    train_df["causal_label"], 
    tokenizer,
    max_len=256,
    augment=True
)
test_mm_ds = AdvancedMultimodalDataset(
    test_df["clean_text"], 
    num_test, 
    test_df["causal_label"], 
    tokenizer,
    max_len=256,
    augment=False
)

train_mm_loader = DataLoader(
    train_mm_ds, 
    batch_size=32, 
    sampler=sampler,
    num_workers=2,
    pin_memory=True
)
test_mm_loader = DataLoader(
    test_mm_ds, 
    batch_size=64, 
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

print(f"‚úÖ DataLoaders ready")
print(f"Text batches: {len(train_text_loader)} | MM batches: {len(train_mm_loader)}")

‚úÖ DataLoaders ready
Text batches: 921 | MM batches: 921


In [13]:
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001, mode='max'):
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        
    def __call__(self, score):
        if self.best_score is None:
            self.best_score = score
        elif self._is_improvement(score):
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        return self.early_stop
    
    def _is_improvement(self, score):
        if self.mode == 'max':
            return score > self.best_score + self.min_delta
        return score < self.best_score - self.min_delta

def train_model_enhanced(model, loader, val_loader, epochs=10, class_weights=None, model_name="model"):
    model.to(device)
    
    # Discriminative learning rates
    no_decay = ['bias', 'LayerNorm.weight']
    bert_params = list(model.bert.named_parameters())
    classifier_params = []
    
    if hasattr(model, 'classifier'):
        classifier_params += list(model.classifier.parameters())
    if hasattr(model, 'num_processor'):
        classifier_params += list(model.num_processor.parameters())
    if hasattr(model, 'attention_pool'):
        classifier_params += list(model.attention_pool.parameters())
    if hasattr(model, 'fusion_gate'):
        classifier_params += list(model.fusion_gate.parameters())
    
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in bert_params if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
            'lr': 1e-5
        },
        {
            'params': [p for n, p in bert_params if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
            'lr': 1e-5
        },
        {
            'params': classifier_params,
            'weight_decay': 0.01,
            'lr': 5e-5
        }
    ]
    
    optimizer = AdamW(optimizer_grouped_parameters)
    
    # Scheduler
    num_training_steps = len(loader) * epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    # Loss with label smoothing
    if class_weights is not None:
        criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
    else:
        criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    
    # Mixed precision
    scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None
    
    early_stopping = EarlyStopping(patience=3, mode='max')
    best_f1 = 0.0
    best_model_state = None
    
    history = {'train_loss': [], 'val_f1': []}
    
    for epoch in range(epochs):
        # Training
        model.train()
        total_loss = 0
        progress_bar = tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}")
        
        for batch in progress_bar:
            optimizer.zero_grad()
            
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            
            # Mixed precision
            if scaler:
                with torch.cuda.amp.autocast():
                    if "numerical" in batch:
                        numerical = batch["numerical"].to(device)
                        outputs = model(input_ids, attention_mask, numerical)
                    else:
                        outputs = model(input_ids, attention_mask)
                    loss = criterion(outputs, labels)
                
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
            else:
                if "numerical" in batch:
                    numerical = batch["numerical"].to(device)
                    outputs = model(input_ids, attention_mask, numerical)
                else:
                    outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
            
            scheduler.step()
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
        
        avg_loss = total_loss / len(loader)
        history['train_loss'].append(avg_loss)
        
        # Validation - FIXED: Now returns dict directly
        val_metrics = evaluate_model_enhanced(model, val_loader)
        val_f1 = val_metrics['f1']  # This will work now!
        history['val_f1'].append(val_f1)
        
        print(f"\nüìä Epoch {epoch+1} | Loss: {avg_loss:.4f} | Val F1: {val_f1:.4f} | LR: {scheduler.get_last_lr()[0]:.2e}")
        
        # Save best model
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_model_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            print(f"‚úÖ New best model saved (F1: {best_f1:.4f})")
        
        # Early stopping
        if early_stopping(val_f1):
            print(f"‚èπÔ∏è Early stopping triggered at epoch {epoch+1}")
            break
    
    # Load best model
    if best_model_state:
        model.load_state_dict(best_model_state)
        print(f"\nüèÜ Loaded best model with F1: {best_f1:.4f}")
    
    return model, history

print("‚úÖ Fixed training function loaded")

‚úÖ Fixed training function loaded


In [14]:
# FIXED: Evaluation function now returns dictionary properly
def evaluate_model_enhanced(model, loader):
    model.eval()
    all_preds, all_probs, all_true = [], [], []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            
            if "numerical" in batch:
                numerical = batch["numerical"].to(device)
                outputs = model(input_ids, attention_mask, numerical)
            else:
                outputs = model(input_ids, attention_mask)
            
            probs = torch.softmax(outputs, dim=1)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_probs.extend(probs.cpu().numpy())
            all_true.extend(batch["label"].numpy())
    
    # Calculate metrics
    metrics = {
        'accuracy': accuracy_score(all_true, all_preds),
        'precision': precision_score(all_true, all_preds, average='binary', zero_division=0),
        'recall': recall_score(all_true, all_preds, average='binary', zero_division=0),
        'f1': f1_score(all_true, all_preds, average='binary', zero_division=0),
        'auc': roc_auc_score(all_true, [p[1] for p in all_probs]) if len(set(all_true)) > 1 else 0.5
    }
    
    return metrics  # Returns dict, not tuple!

# For final evaluation when you need predictions back
def evaluate_model_full(model, loader):
    model.eval()
    all_preds, all_probs, all_true = [], [], []
    
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            
            if "numerical" in batch:
                numerical = batch["numerical"].to(device)
                outputs = model(input_ids, attention_mask, numerical)
            else:
                outputs = model(input_ids, attention_mask)
            
            probs = torch.softmax(outputs, dim=1)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_probs.extend(probs.cpu().numpy())
            all_true.extend(batch["label"].numpy())
    
    metrics = {
        'accuracy': accuracy_score(all_true, all_preds),
        'precision': precision_score(all_true, all_preds, average='binary', zero_division=0),
        'recall': recall_score(all_true, all_preds, average='binary', zero_division=0),
        'f1': f1_score(all_true, all_preds, average='binary', zero_division=0),
        'auc': roc_auc_score(all_true, [p[1] for p in all_probs]) if len(set(all_true)) > 1 else 0.5
    }
    
    return metrics, all_preds, all_probs, all_true

print("‚úÖ Fixed evaluation functions loaded")

‚úÖ Fixed evaluation functions loaded


In [15]:
# Continue from where you left off - reinitialize the model and train
print("üîÑ Reinitializing model and continuing training...")

# Re-initialize the multimodal model
model_mm = EnhancedMultimodalFinBERT(base_model, num_numerical=3, dropout_rate=0.4)

# Recreate data splits
val_split = int(0.9 * len(train_mm_ds))
train_mm_subset, val_mm_subset = torch.utils.data.random_split(
    train_mm_ds, [val_split, len(train_mm_ds) - val_split]
)

# Create loaders
train_mm_loader_split = DataLoader(
    train_mm_subset, 
    batch_size=32, 
    sampler=WeightedRandomSampler(
        [sample_weights[i] for i in train_mm_subset.indices], 
        len(train_mm_subset), 
        replacement=True
    ),
    num_workers=2,
    pin_memory=True
)

val_mm_loader = DataLoader(
    val_mm_subset, 
    batch_size=64, 
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# Train with fixed function
model_mm, history_mm = train_model_enhanced(
    model_mm, 
    train_mm_loader_split, 
    val_mm_loader,
    epochs=10, 
    class_weights=weights_tensor,
    model_name="Multimodal"
)

print("\n‚úÖ Training completed successfully!")

üîÑ Reinitializing model and continuing training...


Epoch 1/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:09<00:00,  1.93it/s, loss=0.1239]
                                                           


üìä Epoch 1 | Loss: 0.4637 | Val F1: 0.9611 | LR: 1.00e-05
‚úÖ New best model saved (F1: 0.9611)


Epoch 2/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:07<00:00,  1.94it/s, loss=0.3245]
                                                           


üìä Epoch 2 | Loss: 0.2860 | Val F1: 0.9826 | LR: 9.70e-06
‚úÖ New best model saved (F1: 0.9826)


Epoch 3/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:05<00:00,  1.95it/s, loss=0.1198]
                                                           


üìä Epoch 3 | Loss: 0.2520 | Val F1: 0.9926 | LR: 8.83e-06
‚úÖ New best model saved (F1: 0.9926)


Epoch 4/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:05<00:00,  1.95it/s, loss=0.1265]
                                                           


üìä Epoch 4 | Loss: 0.2396 | Val F1: 0.9910 | LR: 7.50e-06


Epoch 5/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:02<00:00,  1.96it/s, loss=0.3215]
                                                           


üìä Epoch 5 | Loss: 0.2341 | Val F1: 0.9943 | LR: 5.87e-06
‚úÖ New best model saved (F1: 0.9943)


Epoch 6/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:02<00:00,  1.96it/s, loss=0.3254]
                                                           


üìä Epoch 6 | Loss: 0.2303 | Val F1: 0.9950 | LR: 4.13e-06
‚úÖ New best model saved (F1: 0.9950)


Epoch 7/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:01<00:00,  1.97it/s, loss=0.3216]
                                                           


üìä Epoch 7 | Loss: 0.2298 | Val F1: 0.9954 | LR: 2.50e-06
‚úÖ New best model saved (F1: 0.9954)


Epoch 8/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:01<00:00,  1.97it/s, loss=0.3214]
                                                           


üìä Epoch 8 | Loss: 0.2257 | Val F1: 0.9943 | LR: 1.17e-06


Epoch 9/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:02<00:00,  1.96it/s, loss=0.3246]
                                                           


üìä Epoch 9 | Loss: 0.2270 | Val F1: 0.9945 | LR: 3.02e-07


Epoch 10/10: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 829/829 [07:00<00:00,  1.97it/s, loss=0.1210]
                                                           


üìä Epoch 10 | Loss: 0.2279 | Val F1: 0.9950 | LR: 0.00e+00
‚èπÔ∏è Early stopping triggered at epoch 10

üèÜ Loaded best model with F1: 0.9954

‚úÖ Training completed successfully!


In [16]:
# Final evaluation using the full evaluation function
test_metrics_mm, mm_preds, mm_probs, mm_true = evaluate_model_full(model_mm, test_mm_loader)

print(f"\n{'='*50}")
print(f"üéØ Enhanced Multimodal Results")
print(f"{'='*50}")
print(f"Accuracy:  {test_metrics_mm['accuracy']:.4f}")
print(f"Precision: {test_metrics_mm['precision']:.4f}")
print(f"Recall:    {test_metrics_mm['recall']:.4f}")
print(f"F1-Score:  {test_metrics_mm['f1']:.4f}")
print(f"AUC-ROC:   {test_metrics_mm['auc']:.4f}")
print(f"{'='*50}")

print("\nDetailed Classification Report:")
print(classification_report(mm_true, mm_preds, target_names=['Non-Causal', 'Causal']))


üéØ Enhanced Multimodal Results
Accuracy:  0.8228
Precision: 0.8617
Recall:    0.8824
F1-Score:  0.8719
AUC-ROC:   0.8962

Detailed Classification Report:
              precision    recall  f1-score   support

  Non-Causal       0.73      0.69      0.71      2288
      Causal       0.86      0.88      0.87      4942

    accuracy                           0.82      7230
   macro avg       0.80      0.79      0.79      7230
weighted avg       0.82      0.82      0.82      7230



In [18]:
import torch
import os
import pickle
import json
from datetime import datetime

# Create save directory
save_dir = "/kaggle/working/models" if IS_KAGGLE else r"D:\NLP_ResearchPaper_work\models"
os.makedirs(save_dir, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# ============================================
# SAVE MULTIMODAL MODEL - STATE DICT ONLY (WORKING METHOD)
# ============================================

# 1. Save model weights only (RECOMMENDED - avoids pickle errors)
mm_state_path = os.path.join(save_dir, f"multimodal_state_{timestamp}.pth")
torch.save(model_mm.state_dict(), mm_state_path)
print(f"‚úÖ Model weights saved: {mm_state_path}")

# 2. Save config needed to reconstruct model
config = {
    'hidden_size': model_mm.hidden_size,
    'num_numerical': 3,
    'dropout_rate': 0.4,
    'base_model_name': "ProsusAI/finbert",
    'timestamp': timestamp
}
config_path = os.path.join(save_dir, f"multimodal_config_{timestamp}.json")
with open(config_path, 'w') as f:
    json.dump(config, f)
print(f"‚úÖ Config saved: {config_path}")

# 3. Save scaler
scaler_path = os.path.join(save_dir, f"scaler_{timestamp}.pkl")
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f"‚úÖ Scaler saved: {scaler_path}")

# 4. Save tokenizer
tokenizer_path = os.path.join(save_dir, f"tokenizer_{timestamp}")
tokenizer.save_pretrained(tokenizer_path)
print(f"‚úÖ Tokenizer saved: {tokenizer_path}")

print(f"\n{'='*60}")
print(f"üíæ ALL FILES SAVED IN: {save_dir}")
print(f"{'='*60}")
print("Files to download:")
print(f"  1. multimodal_state_{timestamp}.pth (model weights)")
print(f"  2. multimodal_config_{timestamp}.json (config)")
print(f"  3. scaler_{timestamp}.pkl (numerical scaler)")
print(f"  4. tokenizer_{timestamp}/ folder (tokenizer)")

‚úÖ Model weights saved: /kaggle/working/models/multimodal_state_20260220_061356.pth
‚úÖ Config saved: /kaggle/working/models/multimodal_config_20260220_061356.json
‚úÖ Scaler saved: /kaggle/working/models/scaler_20260220_061356.pkl
‚úÖ Tokenizer saved: /kaggle/working/models/tokenizer_20260220_061356

üíæ ALL FILES SAVED IN: /kaggle/working/models
Files to download:
  1. multimodal_state_20260220_061356.pth (model weights)
  2. multimodal_config_20260220_061356.json (config)
  3. scaler_20260220_061356.pkl (numerical scaler)
  4. tokenizer_20260220_061356/ folder (tokenizer)


In [19]:
import torch
import os
from datetime import datetime

# Create save directory
save_dir = "/kaggle/working/models" if IS_KAGGLE else r"D:\NLP_ResearchPaper_work\models"
os.makedirs(save_dir, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# ============================================
# SAVE EVERYTHING IN ONE FILE
# ============================================

# Prepare complete checkpoint
checkpoint = {
    # Model weights
    'model_state_dict': model_mm.state_dict(),
    
    # Model architecture config
    'hidden_size': model_mm.hidden_size,
    'num_numerical': 3,
    'dropout_rate': 0.4,
    
    # Training info
    'history': history_mm if 'history_mm' in locals() else None,
    
    # Scaler
    'scaler': scaler,
    
    # Tokenizer info
    'tokenizer_name': "ProsusAI/finbert",
    'max_len': 256,
    
    # Metadata
    'timestamp': timestamp,
    'model_type': 'EnhancedMultimodalFinBERT'
}

# Save single file
single_model_path = os.path.join(save_dir, f"complete_multimodal_model_{timestamp}.pth")
torch.save(checkpoint, single_model_path)

print(f"‚úÖ Complete model saved: {single_model_path}")
print(f"File size: {os.path.getsize(single_model_path) / (1024**2):.2f} MB")

‚úÖ Complete model saved: /kaggle/working/models/complete_multimodal_model_20260220_061547.pth
File size: 422.86 MB
