In [3]:
import numpy as np
import json
import os
from tqdm import tqdm

In [24]:
# MAB utils
import requests
import os
import json
# from SPARQLWrapper import SPARQLWrapper, JSON
import re
import argparse
import torch
import numpy as np
import random
import torch.nn.functional as F
import functools
import pandas as pd
import ast  
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

class Environment(object):
    """
    Replay environment over a pre-computed contextual-bandit dataset.

    Every dataset entry is a dict. This class reads the keys 'text' (served as the
    current state), 'dataset_name' (compared against ``skip_dataset``) and, when
    present, 'rewards' (one pre-computed reward per arm). The cursor moves forward by
    one entry on every ``choose_arm`` call and wraps to the start at the end of the data.
    """

    def __init__(self, arms, dataset, args=None, preding=False):
        """
        Args:
            arms (int): Number of arms; stored on the instance.
            dataset (list[dict]): Entries to cycle through. Must not be empty.
            args: Not used by this class; kept so existing callers keep working.
            preding (bool): When True, ``skip_dataset`` filtering is switched off.

        Raises:
            IndexError: If ``dataset`` is empty.
        """
        self.arms = arms
        self.dataset = dataset
        self.preding = preding
        self.index = -1  # _update_state() below advances this to the first entry
        self.alpha_values = [0.0, 0.25, 0.5, 0.75, 1.0]
        self.skip_dataset = []
        self._update_state()

    def _advance_index(self):
        """Move the cursor one entry forward, wrapping to 0 past the last entry."""
        self.index += 1
        if self.index >= len(self.dataset):
            self.index = 0

    def _update_state(self):
        """
        Advance to the next usable entry and publish its 'text' as the current state.

        Raises:
            IndexError: If the dataset is empty.
            RuntimeError: If every entry belongs to a dataset listed in ``skip_dataset``
                (previously this case looped forever).
        """
        n_entries = len(self.dataset)
        if n_entries == 0:
            raise IndexError("Environment dataset is empty; there is no state to serve.")

        self._advance_index()

        if not self.preding:
            # Look at each entry at most once so a fully-skipped dataset cannot hang.
            for _ in range(n_entries):
                if self.dataset[self.index]['dataset_name'] not in self.skip_dataset:
                    break
                self._advance_index()
            else:
                raise RuntimeError(
                    "Every dataset entry is excluded by skip_dataset; no state available."
                )

        self.state = self.dataset[self.index]['text']

    def _index_to_arm(self, index):
        """Translate an arm index (int or one-element ndarray) into its alpha value."""
        if isinstance(index, np.ndarray):
            assert len(index) == 1
            index = index[0]
        return self.alpha_values[int(index)]

    def get_state(self):
        """Return the 'text' of the entry the cursor currently points at."""
        return self.state

    def _get_reward(self, arm):
        """
        Returns the pre-computed reward for the selected arm.
        arm (int): Index 0-4 corresponding to alpha values [0.0, 0.25, 0.5, 0.75, 1.0]

        An entry without a 'rewards' list is treated as all-zero rewards. An
        out-of-range arm prints a warning and yields 0.0 instead of raising.
        """
        query_data = self.dataset[self.index]
        rewards = query_data.get("rewards", [0.0] * len(self.alpha_values))

        # Validate arm index
        if arm < 0 or arm >= len(rewards):
            print(f"Warning: Arm {arm} out of range for rewards list of length {len(rewards)}")
            return 0.0

        return float(rewards[int(arm)])

    def _get_recall(self, arm):
        """Recall lookup is not implemented; always raises NotImplementedError."""
        raise NotImplementedError

    def choose_arm(self, arm):
        """Return the reward of ``arm`` for the current entry, then move to the next entry."""
        reward = self._get_reward(arm)
        self._update_state()
        return reward

    def __len__(self):
        """Number of entries in the underlying dataset."""
        return len(self.dataset)

In [25]:
import numpy as np

class LinUCBAgent:
    """
    Disjoint Linear Upper Confidence Bound (LinUCB) Agent.

    Each arm owns an independent ridge-regression model (matrix A and vector b).
    Contexts may be given as any 1-D array-like (list, tuple or ndarray).

    References:
        Li et al., "A Contextual-Bandit Approach to Personalized News Article Recommendation", WWW 2010.
        (Algorithm 1)
    """
    def __init__(self, n_arms, n_features, alpha=0.1):
        """
        Args:
            n_arms (int): Number of distinct actions (fusion weights).
            n_features (int): Dimension of the context vector.
            alpha (float): Exploration hyperparameter. Higher alpha = more exploration.
        """
        self.n_arms = n_arms
        self.n_features = n_features
        self.alpha = alpha

        # Initialize disjoint matrices for each arm
        # A: Covariance matrix (d x d), initialized to Identity for Ridge Regularization
        # b: Reward-weighted feature vector (d,), initialized to zeros
        self.A = [np.identity(n_features) for _ in range(n_arms)]
        self.b = [np.zeros(n_features) for _ in range(n_arms)]

    def select_arm(self, context_vector):
        """
        Selects an arm based on the Upper Confidence Bound (UCB) of the estimated reward.

        Args:
            context_vector (array-like): Shape (n_features,)

        Returns:
            int: Index of the selected arm. When several arms share the top score the
            lowest index wins (np.argmax semantics); there is no random tie-breaking.
        """
        x = np.asarray(context_vector, dtype=float).ravel()
        scores = np.zeros(self.n_arms)

        for arm in range(self.n_arms):
            # Inverse of the ridge-regression matrix for this arm.
            A_inv = np.linalg.inv(self.A[arm])

            # Estimated coefficients: theta = A^-1 b
            theta = A_inv @ self.b[arm]

            # Uncertainty width: sqrt(x^T A^-1 x). Clamp at zero so round-off that
            # pushes the quadratic form slightly negative cannot produce NaN.
            variance = max(float(x @ A_inv @ x), 0.0)

            # UCB score = prediction + exploration bonus
            scores[arm] = theta @ x + self.alpha * np.sqrt(variance)

        return int(np.argmax(scores))

    def update(self, arm, context_vector, reward):
        """
        Updates the internal matrices A and b for the specific arm that was chosen.

        Args:
            arm (int): The arm index that was selected.
            context_vector (array-like): The feature vector observed.
            reward (float): The actual reward (NDCG) received.
        """
        x = np.asarray(context_vector, dtype=float).ravel()

        # Rank-one update with the outer product of the context (d x d)
        self.A[arm] += np.outer(x, x)

        # Accumulate the reward-weighted context
        self.b[arm] += reward * x

In [32]:
def rbf_kernel(x, y, sigma=1.0):
    """
    Gaussian (RBF) similarity between two points: exp(-||x - y||^2 / (2 * sigma^2)).

    Both inputs are converted to float arrays first, so lists and tuples are accepted.
    """
    difference = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
    distance = np.linalg.norm(difference)
    bandwidth = 2.0 * sigma ** 2
    return np.exp(-distance ** 2 / bandwidth)

class FastKernelUCBAgent:
    """
    Kernelised UCB bandit agent written with NumPy only.

    Every arm keeps its own observed contexts, rewards and the inverse of its
    regularised kernel matrix. The inverse is grown by one row and column per update
    instead of being recomputed from scratch.
    Interface used by train_agent: select_arm(context: np.array) -> int,
    update(arm: int, x_new: np.array, reward: float).
    """
    def __init__(self, n_arms, kernel_fn=rbf_kernel, alpha=1.0, lambda_reg=1.0, device='cpu', sigma=1.0):
        # NOTE: `device` is accepted but never read inside this class.
        self.n_arms = n_arms
        self.kernel_fn = kernel_fn
        self.alpha = alpha
        self.lambda_reg = float(lambda_reg)
        self.sigma = sigma

        # One independent store per arm.
        self.X = [list() for _ in range(n_arms)]   # observed contexts (np.ndarray each)
        self.y = [list() for _ in range(n_arms)]   # observed rewards (float each)
        self.K_inv = [None] * n_arms               # inverse kernel matrix, None until first update

    def _kernel_vector(self, x, X_list):
        """Kernel value between `x` and every stored point in `X_list`, as a 1-D float array."""
        point = np.asarray(x, dtype=float)
        similarities = []
        for stored in X_list:
            similarities.append(self.kernel_fn(point, stored, sigma=self.sigma))
        return np.array(similarities, dtype=float)

    def select_arm(self, context_vector):
        """
        Score each arm as posterior mean + alpha * posterior std and return the best index.

        Arms without any observation score +inf, so they are tried before seen arms.
        """
        query = np.asarray(context_vector, dtype=float)
        scores = np.full((self.n_arms,), -np.inf, dtype=float)

        for arm in range(self.n_arms):
            seen = self.X[arm]
            if len(seen) == 0:
                scores[arm] = np.inf
                continue

            targets = np.array(self.y[arm], dtype=float).flatten()
            gram_inv = self.K_inv[arm]
            sims = self._kernel_vector(query, seen)

            predicted = float(np.dot(sims, np.dot(gram_inv, targets)))
            self_sim = float(self.kernel_fn(query, query, sigma=self.sigma))

            # Floor the variance so round-off cannot put a negative under the sqrt.
            variance = max(self_sim - float(np.dot(sims, np.dot(gram_inv, sims))), 1e-9)

            scores[arm] = predicted + self.alpha * np.sqrt(variance)

        return int(np.argmax(scores))

    def update(self, arm, x_new, reward):
        """
        Record (x_new, reward) for `arm` and extend that arm's inverse kernel matrix.

        The first observation creates a 1x1 inverse. Later observations extend the
        previous inverse with the block-matrix (Schur complement) formula.
        """
        sample = np.asarray(x_new, dtype=float)
        samples = self.X[arm]
        prev_inv = self.K_inv[arm]

        samples.append(sample)
        self.y[arm].append(float(reward))

        n_prev = len(samples) - 1
        if n_prev == 0:
            self_sim = float(self.kernel_fn(sample, sample, sigma=self.sigma))
            self.K_inv[arm] = np.array([[1.0 / (self_sim + self.lambda_reg)]], dtype=float)
            return

        # Similarity of the new point to all earlier points, then to itself.
        sims = self._kernel_vector(sample, samples[:-1])
        self_sim = float(self.kernel_fn(sample, sample, sigma=self.sigma))

        prev_inv = np.asarray(prev_inv, dtype=float)
        projected = np.dot(prev_inv, sims)

        # Schur complement of the enlarged matrix, floored to keep the division stable.
        schur = max(float(self_sim + self.lambda_reg - np.dot(sims, projected)), 1e-9)
        border = -projected / schur

        grown = np.empty((n_prev + 1, n_prev + 1), dtype=float)
        grown[:n_prev, :n_prev] = prev_inv + np.outer(projected, projected) / schur
        grown[:n_prev, n_prev] = border
        grown[n_prev, :n_prev] = border
        grown[n_prev, n_prev] = 1.0 / schur

        self.K_inv[arm] = grown

In [33]:
def train_agent(dataset=None, fusion=None, algo='linucb', total_steps=50000, alpha=1):
    """
    Train a bandit agent by replaying a pre-computed dataset and log every step.

    Reads ``data/bandit_data_train_{dataset}_{fusion}.jsonl`` (one JSON object per line),
    runs ``total_steps`` select/reward/update rounds, and writes one JSON line per step
    to ``data/{algo}_training_history_{dataset}_{fusion}.jsonl``.

    If the loop is interrupted (for example by KeyboardInterrupt), the steps gathered so
    far are written to the same path with a ``.partial`` suffix. A previous complete
    history is therefore never overwritten by a truncated one. The exception is re-raised.

    Args:
        dataset (str): Dataset tag used in the input/output file names.
        fusion (str): Fusion tag used in the input/output file names.
        algo (str): 'linucb' (LinUCBAgent) or 'fku' (FastKernelUCBAgent).
        total_steps (int): Number of training rounds.
        alpha (float): Exploration weight handed to the agent.

    Returns:
        None. Prints an error and returns early when the data file is missing or empty,
        or when ``algo`` is not recognised.
    """
    # Configuration
    DATA_PATH = f"data/bandit_data_train_{dataset}_{fusion}.jsonl"
    OUTPUT_PATH = f"data/{algo}_training_history_{dataset}_{fusion}.jsonl"
    N_ARMS = 5
    N_FEATURES = 5

    print(f"Initializing Environment from {DATA_PATH}...")
    try:
        records = []
        with open(DATA_PATH, 'r') as f:
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines in the JSONL file
                data_entry = json.loads(line)
                # Environment filters on 'dataset_name', so every entry needs one.
                data_entry['dataset_name'] = "bandit_data_train"
                records.append(data_entry)
    except FileNotFoundError:
        print(f"Error: Could not find {DATA_PATH}. Please run Phase 1 (Data Generation) first.")
        return

    if not records:
        print(f"Error: {DATA_PATH} contains no records.")
        return

    train_env = Environment(arms=N_ARMS, dataset=records)

    if algo == 'linucb':
        print(f"Initializing LinUCB Agent (Arms={N_ARMS}, Features={N_FEATURES}, Alpha={alpha})...")
        agent = LinUCBAgent(n_arms=N_ARMS, n_features=N_FEATURES, alpha=alpha)
    elif algo == 'fku':
        print(f"Initializing FastKernelUCB Agent (Arms={N_ARMS}, Alpha={alpha})...")
        agent = FastKernelUCBAgent(n_arms=N_ARMS, alpha=alpha, device='cpu')
    else:
        print(f"Error: Unknown algorithm '{algo}'. Supported: 'linucb', 'fku'.")
        return

    history = []
    cumulative_reward = 0.0
    completed = False

    print("Starting Training Loop...")
    try:
        for step in tqdm(range(total_steps), desc="Training"):
            # 1. Context of the entry the environment currently points at.
            query_data = train_env.dataset[train_env.index]
            features = query_data.get('features')
            if features is None:
                # Fallback kept from the original code: random features when none are stored.
                features = np.random.rand(N_FEATURES)
            context = np.array(features)

            # 2. Bandit decision.
            chosen_arm = agent.select_arm(context)

            # 3. Partial feedback: only the chosen arm's reward is revealed.
            #    This call also advances the environment to the next entry.
            reward = train_env.choose_arm(chosen_arm)

            # 4. Policy update.
            agent.update(chosen_arm, context, reward)

            # 5. Logging. Missing fields get the same defaults Environment uses for
            #    rewards, so one incomplete record cannot abort the whole run.
            cumulative_reward += reward
            rewards = query_data.get('rewards', [0.0] * N_ARMS)
            optimal_arm = int(query_data.get('optimal_arm', -1))
            if 0 <= optimal_arm < len(rewards):
                best_reward = rewards[optimal_arm]
            else:
                best_reward = max(rewards, default=0.0)

            history.append({
                'step': step,
                'query_id': query_data.get('query_id', 'unknown'),
                'chosen_arm': int(chosen_arm),
                'reward': float(reward),
                'optimal_arm': optimal_arm,
                'regret': float(best_reward - reward),
                'cumulative_reward': cumulative_reward
            })
        completed = True
    finally:
        # Persist what we have even when the loop was interrupted.
        save_path = OUTPUT_PATH if completed else OUTPUT_PATH + ".partial"
        if completed or history:
            print(f"Saving training history to {save_path}...")
            with open(save_path, 'w') as f:
                for entry in history:
                    f.write(json.dumps(entry) + '\n')

    print("Training Complete.")

In [34]:
# Alternative configurations (disabled) — LinUCB with each fusion variant:
# train_agent(dataset='msmarco',fusion='minmax',algo='linucb')
# train_agent(dataset='msmarco', fusion='rrf', algo='linucb')
# train_agent(dataset='msmarco', fusion='zscore', algo='linucb')
# Active run: FastKernelUCB ('fku') on the msmarco data with minmax fusion.
# The captured output below shows this run being interrupted partway through.
train_agent(dataset='msmarco',fusion='minmax',algo='fku')


Initializing Environment from data/bandit_data_train_msmarco_minmax.jsonl...
Initializing FastKernelUCB Agent (Arms=5, Alpha=1)...
Starting Training Loop...


Training:  17%|█▋        | 8548/50000 [02:16<11:03, 62.50it/s] 


KeyboardInterrupt: 

In [26]:
def analyze_training_results(history_path=None, data_path=None, n_verify=100):
    """
    Verify training results against original dataset and print summary statistics.

    Args:
        history_path (str | None): JSONL training log (one step per line). Defaults to
            "../MABhybrid/data/linucb_training_history.jsonl".
        data_path (str | None): JSONL bandit dataset keyed by 'query_id'. Defaults to
            "../MABhybrid/data/bandit_data_train.jsonl".
        n_verify (int): How many leading history entries are cross-checked against the
            dataset (reward and optimal-arm checks).

    Returns:
        None. The whole report is printed. An empty history prints the totals and a
        short notice instead of failing on the statistics.
    """
    HISTORY_PATH = "../MABhybrid/data/linucb_training_history.jsonl" if history_path is None else history_path
    DATA_PATH = "../MABhybrid/data/bandit_data_train.jsonl" if data_path is None else data_path

    print("Loading training history and dataset...")
    history = []
    with open(HISTORY_PATH, 'r') as f:
        for line in f:
            if line.strip():  # skip blank lines instead of failing in json.loads
                history.append(json.loads(line))

    dataset = {}
    with open(DATA_PATH, 'r') as f:
        for line in f:
            if line.strip():
                entry = json.loads(line)
                dataset[entry['query_id']] = entry

    print("\n=== Training Analysis ===")
    print(f"Total steps: {len(history)}")
    print(f"Dataset queries: {len(dataset)}")

    if not history:
        print("No training steps logged; nothing to analyze.")
        return

    checked = history[:n_verify]

    # Check 1: Verify rewards match dataset
    mismatches = 0
    for log in checked:
        qid = log['query_id']
        arm = log['chosen_arm']
        logged_reward = log['reward']

        if qid in dataset:
            expected_reward = dataset[qid]['rewards'][arm]
            if abs(logged_reward - expected_reward) > 1e-6:
                print(f"MISMATCH Step {log['step']}: Query {qid}, Arm {arm}")
                print(f"  Logged: {logged_reward}, Expected: {expected_reward}")
                mismatches += 1

    print(f"\nReward mismatches: {mismatches}")

    # Check 2: Verify optimal_arm matches
    optimal_mismatches = 0
    for log in checked:
        qid = log['query_id']
        if qid in dataset:
            logged_optimal = log['optimal_arm']
            expected_optimal = dataset[qid]['optimal_arm']
            if logged_optimal != expected_optimal:
                print(f"OPTIMAL MISMATCH Step {log['step']}: Query {qid}")
                print(f"  Logged: {logged_optimal}, Expected: {expected_optimal}")
                optimal_mismatches += 1

    print(f"Optimal arm mismatches: {optimal_mismatches}")

    # Check 3: Regret calculation
    print("\n=== Regret Statistics ===")
    regrets = [log['regret'] for log in history]
    print(f"Mean regret: {np.mean(regrets):.6f}")
    print(f"Min regret: {np.min(regrets):.6f}")
    print(f"Max regret: {np.max(regrets):.6f}")
    print(f"Total cumulative regret: {sum(regrets):.6f}")

    # Check 4: Arm distribution
    print("\n=== Arm Selection Distribution ===")
    arm_counts = {}
    for log in history:
        arm = log['chosen_arm']
        arm_counts[arm] = arm_counts.get(arm, 0) + 1

    for arm in sorted(arm_counts):
        pct = 100 * arm_counts[arm] / len(history)
        print(f"Arm {arm}: {arm_counts[arm]} times ({pct:.2f}%)")

    # Check 5: Cumulative reward trend
    print("\n=== Reward Trend ===")
    final_cumulative = history[-1]['cumulative_reward']
    print(f"Final cumulative reward: {final_cumulative:.6f}")
    print(f"Average reward per step: {final_cumulative / len(history):.6f}")

    # Check monotonicity: the running total must never decrease between consecutive steps.
    non_monotonic = sum(
        1
        for previous, current in zip(history, history[1:])
        if current['cumulative_reward'] < previous['cumulative_reward']
    )
    print(f"Non-monotonic steps (should be 0): {non_monotonic}")

# Run the verification report; the text printed below this cell is its captured output.
analyze_training_results()

Loading training history and dataset...

=== Training Analysis ===
Total steps: 50000
Dataset queries: 50000

Reward mismatches: 0
Optimal arm mismatches: 0

=== Regret Statistics ===
Mean regret: 0.089998
Min regret: 0.000000
Max regret: 1.000000
Total cumulative regret: 4499.887314

=== Arm Selection Distribution ===
Arm 0: 6101 times (12.20%)
Arm 1: 30889 times (61.78%)
Arm 2: 12044 times (24.09%)
Arm 3: 684 times (1.37%)
Arm 4: 282 times (0.56%)

=== Reward Trend ===
Final cumulative reward: 19520.384885
Average reward per step: 0.390408
Non-monotonic steps (should be 0): 0


In [6]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sys

# Configuration
# Inputs: the LinUCB training log and the bandit dataset it is compared against.
# NOTE(review): train_agent in this notebook writes its log to
# data/{algo}_training_history_{dataset}_{fusion}.jsonl, which does not match
# HISTORY_FILE — confirm these paths point at the run you intend to plot.
HISTORY_FILE = "../MABhybrid/data/linucb_training_history.jsonl"
DATA_FILE = "../MABhybrid/data/bandit_data_train.jsonl"
# Directory that receives the generated figures; created here if it does not exist.
OUTPUT_DIR = "../MABhybrid/fig"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_data(history_file=None, data_file=None, static_arm=2):
    """
    Loads history and enriches it with baseline rewards from ground truth.

    Args:
        history_file (str | None): JSONL training log; defaults to HISTORY_FILE.
        data_file (str | None): JSONL ground-truth dataset; defaults to DATA_FILE.
        static_arm (int): Arm index used for the "always the same arm" baseline
            (index 2 corresponds to alpha=0.5).

    Returns:
        tuple[pd.DataFrame, list[dict]] | None: The history frame with the added columns
        'static_reward', 'random_reward', 'optimal_reward', 'static_regret' and
        'random_regret', together with the ground-truth entries. Returns None (after
        printing an error) when a file is missing or either file holds no records.
    """
    history_file = HISTORY_FILE if history_file is None else history_file
    data_file = DATA_FILE if data_file is None else data_file

    if not os.path.exists(history_file) or not os.path.exists(data_file):
        print("Error: Missing history or data file.")
        return None

    print("Loading training history...")
    with open(history_file, 'r') as f:
        history = [json.loads(line) for line in f if line.strip()]
    df = pd.DataFrame(history)

    print("Loading ground truth data for baselines...")
    with open(data_file, 'r') as f:
        ground_truth = [json.loads(line) for line in f if line.strip()]

    # Without records there is no 'step' column and no entry to index into.
    if df.empty or not ground_truth:
        print("Error: History or data file is empty.")
        return None

    # Calculate Baseline Rewards
    # We assume the training loop iterated through ground_truth sequentially
    n_data = len(ground_truth)

    static_rewards = []
    random_rewards = []
    optimal_rewards = []

    for step in df['step']:
        # Map step to index in the original data file (handling wrap-around if any)
        arm_rewards = ground_truth[int(step) % n_data]['rewards']

        # Static baseline: always the same arm.
        static_rewards.append(arm_rewards[static_arm])

        # Random baseline: expected value is the mean over all arms.
        random_rewards.append(np.mean(arm_rewards))

        # Best achievable reward (for calculating regret).
        optimal_rewards.append(max(arm_rewards))

    df['static_reward'] = static_rewards
    df['random_reward'] = random_rewards
    df['optimal_reward'] = optimal_rewards

    # Calculate Regrets
    df['static_regret'] = df['optimal_reward'] - df['static_reward']
    df['random_regret'] = df['optimal_reward'] - df['random_reward']
    # 'regret' column already exists for the agent in the history file

    return df, ground_truth

def plot_comparative_cumulative_reward(df):
    """
    Draw the running mean reward of the agent, the static baseline and the random
    baseline against the training step, and save the figure as
    'comparative_reward.png' inside OUTPUT_DIR.

    Side effect: the columns 'agent_cum_mean', 'static_cum_mean' and 'random_cum_mean'
    are added to ``df``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Running (expanding-window) mean of each reward series.
    running_mean_sources = (
        ('agent_cum_mean', 'reward'),
        ('static_cum_mean', 'static_reward'),
        ('random_cum_mean', 'random_reward'),
    )
    for target_col, source_col in running_mean_sources:
        df[target_col] = df[source_col].expanding().mean()

    steps = df['step']
    ax.plot(steps, df['agent_cum_mean'], label='LinUCB Agent', color='blue', linewidth=2)
    ax.plot(steps, df['static_cum_mean'], label='Static (α=0.5)', color='green', linestyle='--')
    ax.plot(steps, df['random_cum_mean'], label='Random', color='gray', linestyle=':')

    ax.set_title('Performance Comparison: Cumulative Mean Reward (NDCG@10)')
    ax.set_xlabel('Training Steps')
    ax.set_ylabel('Average Reward (NDCG)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    destination = os.path.join(OUTPUT_DIR, 'comparative_reward.png')
    fig.savefig(destination)
    print(f"Saved Comparative Reward plot to {destination}")
    plt.close(fig)

def plot_comparative_cumulative_regret(df):
    """
    Draw the cumulative regret of the agent, the static baseline and the random
    baseline against the training step (lower is better), and save the figure as
    'comparative_regret.png' inside OUTPUT_DIR.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # (regret column, line styling) for each curve, in drawing order.
    curves = (
        ('regret', dict(label='LinUCB Agent', color='red', linewidth=2)),
        ('static_regret', dict(label='Static (α=0.5)', color='green', linestyle='--')),
        ('random_regret', dict(label='Random', color='gray', linestyle=':')),
    )
    for column, styling in curves:
        ax.plot(df['step'], df[column].cumsum(), **styling)

    ax.set_title('Performance Comparison: Cumulative Regret')
    ax.set_xlabel('Training Steps')
    ax.set_ylabel('Cumulative Regret (Lost NDCG)')
    ax.legend()
    ax.grid(True, alpha=0.3)

    destination = os.path.join(OUTPUT_DIR, 'comparative_regret.png')
    fig.savefig(destination)
    print(f"Saved Comparative Regret plot to {destination}")
    plt.close(fig)

def plot_arm_distribution(df):
    """
    Bar chart of how often each arm appears in df['chosen_arm'], saved as
    'arm_distribution.png' inside OUTPUT_DIR.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    selection_counts = df['chosen_arm'].value_counts().sort_index()
    arms = selection_counts.index.tolist()
    counts = selection_counts.values.tolist()

    # Tick label for each arm index; unknown indices fall back to the index itself.
    alpha_map = {0: '0.0 (Dense)', 1: '0.25', 2: '0.5 (Hybrid)', 3: '0.75', 4: '1.0 (Sparse)'}
    labels = []
    for arm in arms:
        labels.append(alpha_map.get(arm, str(arm)))

    ax.bar(arms, counts, color='purple', alpha=0.7)
    ax.set_xticks(arms)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_title('Agent Choice Distribution')
    ax.set_xlabel('Arm (Alpha Value)')
    ax.set_ylabel('Count')
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'arm_distribution.png'))
    plt.close(fig)

def analyze_feature_importance(history_df, ground_truth_data):
    """
    Rebuild a LinUCB agent by replaying the logged decisions, then print the learned
    per-feature weights for arm 0 and arm 4.

    Args:
        history_df: Training log; the 'step', 'chosen_arm' and 'reward' columns are read.
        ground_truth_data: Dataset entries; the 'features' of entry (step % len) are
            used as the context for each replayed step.
    """
    # LinUCBAgent is taken from the notebook namespace rather than imported from a module.
    print("\n--- Reconstructing Agent for Feature Analysis ---")
    # 5 features: Length, MaxIDF, AvgIDF, QFlag, Bias
    replay_agent = LinUCBAgent(n_arms=5, n_features=5, alpha=0.1)

    dataset_size = len(ground_truth_data)

    # Replay at most the first 50k logged steps to bound the runtime.
    replay_len = min(len(history_df), 50000)

    for logged in history_df.iloc[:replay_len].itertuples(index=False):
        entry = ground_truth_data[int(logged.step) % dataset_size]
        context = np.array(entry['features'])
        replay_agent.update(int(logged.chosen_arm), context, float(logged.reward))

    feature_names = ['Length', 'MaxIDF', 'AvgIDF', 'QFlag', 'Bias']

    # Report the two extreme arms: Dense (arm 0) and Sparse (arm 4).
    print("\n[Interpretability Result] Learned Feature Weights:")

    for arm_idx, arm_name in ((0, "Dense Retrieval"), (4, "Sparse Retrieval")):
        # Ridge-regression solution for this arm: theta = A^-1 b
        weights = np.linalg.inv(replay_agent.A[arm_idx]) @ replay_agent.b[arm_idx]
        print(f"\nArm {arm_idx} ({arm_name}):")
        for feature, weight in zip(feature_names, weights):
            print(f"  {feature:10s}: {weight: .4f}")

def main():
    """
    Load the training log, produce the three comparison plots and print the
    feature-weight analysis. Stops quietly when load_data() returns None.
    """
    loaded = load_data()
    if loaded is None:
        return
    df, ground_truth = loaded

    print(f"Loaded {len(df)} training steps.")

    plotters = (
        plot_comparative_cumulative_reward,
        plot_comparative_cumulative_regret,
        plot_arm_distribution,
    )
    for make_plot in plotters:
        make_plot(df)

    analyze_feature_importance(df, ground_truth)

    print(f"\nAnalysis complete. Check {OUTPUT_DIR} for plots.")


if __name__ == "__main__":
    main()

Loading training history...
Loading ground truth data for baselines...
Loaded 50000 training steps.
Saved Comparative Reward plot to ../MABhybrid/fig/comparative_reward.png
Saved Comparative Regret plot to ../MABhybrid/fig/comparative_regret.png

--- Reconstructing Agent for Feature Analysis ---

[Interpretability Result] Learned Feature Weights:

Arm 0 (Dense Retrieval):
  Length    : -0.0134
  MaxIDF    : -0.0068
  AvgIDF    :  0.0102
  QFlag     :  0.0103
  Bias      :  0.4463

Arm 4 (Sparse Retrieval):
  Length    :  0.0157
  MaxIDF    : -0.0033
  AvgIDF    :  0.0212
  QFlag     : -0.1010
  Bias      : -0.0785

Analysis complete. Check ../MABhybrid/fig for plots.
