In [8]:
import numpy as np
import json
import os
from tqdm import tqdm

In [9]:
# MAB utils
import requests
import os
import json
# from SPARQLWrapper import SPARQLWrapper, JSON
import re
import argparse
import torch
import numpy as np
import random
import torch.nn.functional as F
import functools
import pandas as pd
import ast  
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

class Environment(object):
    """Replay environment that serves pre-computed bandit rewards from a dataset.

    The environment keeps a cursor (``self.index``) into ``dataset`` and cycles
    through it in order, wrapping back to the start when the end is reached.
    Each entry is expected to be a dict-like record with at least a ``'text'``
    field (exposed as the current state) and optionally a ``'rewards'`` list
    holding one pre-computed reward per arm.

    Args:
        arms: Number of arms. Stored on the instance; not otherwise used by
            this class.
        dataset: Indexable, sized collection of records.
        args: Unused; kept for interface compatibility.
        preding: When true, entries are never skipped, regardless of
            ``skip_dataset``.

    Attributes set here:
        alpha_values: The fusion weight associated with each arm index.
        skip_dataset: ``dataset_name`` values whose entries are skipped while
            ``preding`` is false. Empty by default.
    """

    def __init__(self, arms, dataset, args=None, preding=False):
        self.arms = arms
        self.dataset = dataset
        self.preding = preding
        self.index = -1  # advanced to 0 by the first _update_state() call
        self.alpha_values = [0.0, 0.25, 0.5, 0.75, 1.0]
        self.skip_dataset = []
        self._update_state()

    def _update_state(self):
        """Advance the cursor to the next usable entry and refresh ``self.state``.

        Raises:
            RuntimeError: If skipping is active and every entry belongs to a
                name listed in ``skip_dataset`` (previously an infinite loop).
        """
        total = len(self.dataset)

        self.index += 1
        if self.index >= total:
            self.index = 0

        # Only consult 'dataset_name' when skipping can actually happen, so
        # records without that field work when nothing is being skipped.
        if not self.preding and self.skip_dataset:
            # Inspect each entry at most once; a full lap with no usable entry
            # means there is nothing left to serve.
            for _ in range(total):
                name = self.dataset[self.index].get('dataset_name')
                if name not in self.skip_dataset:
                    break
                self.index = (self.index + 1) % total
            else:
                raise RuntimeError(
                    "Every dataset entry is excluded by skip_dataset; "
                    "no state is available."
                )

        self.state = self.dataset[self.index]['text']

    def _index_to_arm(self, index):
        """Map an arm index (int or single-element ndarray) to its alpha value."""
        if isinstance(index, np.ndarray):
            assert index.size == 1
            index = index.item()
        return self.alpha_values[int(index)]

    def get_state(self):
        """Return the ``'text'`` of the entry the cursor currently points at."""
        return self.state

    def _get_reward(self, arm):
        """
        Returns the pre-computed reward for the selected arm.
        arm (int): Index 0-4 corresponding to alpha values [0.0, 0.25, 0.5, 0.75, 1.0]

        Entries without a ``'rewards'`` field are treated as all-zero rewards.
        An out-of-range arm prints a warning and yields 0.0 instead of raising.
        """
        query_data = self.dataset[self.index]
        rewards = query_data.get("rewards", [0.0] * len(self.alpha_values))

        # Validate arm index
        if arm < 0 or arm >= len(rewards):
            print(f"Warning: Arm {arm} out of range for rewards list of length {len(rewards)}")
            return 0.0

        return float(rewards[int(arm)])

    def _get_recall(self, arm):
        """Recall lookup is not implemented; always raises NotImplementedError."""
        raise NotImplementedError("_get_recall is not implemented")

    def choose_arm(self, arm):
        """Return the reward of ``arm`` for the current entry, then advance the cursor."""
        reward = self._get_reward(arm)
        self._update_state()
        return reward

    def __len__(self):
        """Number of entries in the underlying dataset."""
        return len(self.dataset)

In [10]:
import numpy as np

class LinUCBAgent:
    """
    Disjoint Linear Upper Confidence Bound (LinUCB) Agent.

    Each arm keeps its own ridge-regression statistics ``A`` (d x d) and ``b``
    (d,). The score of an arm for context ``x`` is
    ``theta @ x + alpha * sqrt(x @ A^-1 @ x)`` with ``theta = A^-1 @ b``.

    References:
        Li et al., "A Contextual-Bandit Approach to Personalized News Article Recommendation", WWW 2010.
        (Algorithm 1)
    """
    def __init__(self, n_arms, n_features, alpha=0.1):
        """
        Args:
            n_arms (int): Number of distinct actions (fusion weights).
            n_features (int): Dimension of the context vector.
            alpha (float): Exploration hyperparameter. Higher alpha = more exploration.
        """
        self.n_arms = n_arms
        self.n_features = n_features
        self.alpha = alpha

        # Initialize disjoint matrices for each arm
        # A: Covariance matrix (d x d), initialized to Identity for Ridge Regularization
        # b: Reward-weighted feature vector (d,), initialized to zeros
        self.A = [np.identity(n_features) for _ in range(n_arms)]
        self.b = [np.zeros(n_features) for _ in range(n_arms)]

    def _as_context(self, context_vector):
        """Coerce a context to a flat float vector of length ``n_features``.

        Accepts lists, 1-D arrays and column/row vectors.

        Raises:
            ValueError: If the flattened context does not have ``n_features`` entries.
        """
        x = np.asarray(context_vector, dtype=float).reshape(-1)
        if x.shape[0] != self.n_features:
            raise ValueError(
                f"context_vector has {x.shape[0]} features, expected {self.n_features}"
            )
        return x

    def select_arm(self, context_vector):
        """
        Selects an arm based on the Upper Confidence Bound (UCB) of the estimated reward.

        Args:
            context_vector (array-like): ``n_features`` values.

        Returns:
            int: Index of the selected arm.

        Raises:
            ValueError: If the context does not have ``n_features`` entries.
        """
        x = self._as_context(context_vector)
        scores = np.empty(self.n_arms)

        for a in range(self.n_arms):
            # Solve A @ [theta, z] = [b, x] in one call instead of forming A^-1
            # explicitly: column 0 is theta = A^-1 b, column 1 is z = A^-1 x.
            solved = np.linalg.solve(self.A[a], np.column_stack((self.b[a], x)))
            theta = solved[:, 0]

            # x.T A^-1 x is non-negative in exact arithmetic; clamp so that
            # floating-point round-off cannot produce a NaN exploration bonus.
            variance = max(float(x @ solved[:, 1]), 0.0)

            # Prediction + exploration bonus
            scores[a] = theta @ x + self.alpha * np.sqrt(variance)

        # np.argmax returns the first maximal index, so ties are resolved
        # deterministically in favour of the lowest arm index.
        return int(np.argmax(scores))

    def update(self, arm, context_vector, reward):
        """
        Updates the internal matrices A and b for the specific arm that was chosen.

        Args:
            arm (int): The arm index that was selected.
            context_vector (array-like): The feature vector observed.
            reward (float): The actual reward (NDCG) received.

        Raises:
            ValueError: If the context does not have ``n_features`` entries.
        """
        x = self._as_context(context_vector)

        # Rank-one update of the arm's design matrix (d x d)
        self.A[arm] += np.outer(x, x)

        # Accumulate the reward-weighted context
        self.b[arm] += reward * x

In [24]:
def train_agent(data_path="../MABhybrid/data/bandit_data_train.jsonl",
                output_path="../MABhybrid/data/linucb_training_history.jsonl",
                n_arms=5, n_features=5, alpha=1, total_steps=50000):
    """Train a LinUCB agent on pre-computed bandit data and log every step.

    Reads one JSON record per line from ``data_path``, replays the records
    through ``Environment`` (cycling when ``total_steps`` exceeds the number of
    records), and writes one JSON log line per step to ``output_path``.

    Args:
        data_path (str): JSONL file with the training records.
        output_path (str): JSONL file that receives the per-step history.
        n_arms (int): Number of arms given to the environment and the agent.
        n_features (int): Context dimension given to the agent.
        alpha (float): LinUCB exploration parameter.
        total_steps (int): Number of bandit rounds to run.

    Returns:
        None. If ``data_path`` is missing or holds no records, an error is
        printed and nothing is written.
    """
    print(f"Initializing Environment from {data_path}...")
    try:
        dataset = []
        with open(data_path, 'r') as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                data_entry = json.loads(line)
                # Environment may read 'dataset_name' when skipping entries.
                data_entry['dataset_name'] = "bandit_data_train"
                dataset.append(data_entry)
    except FileNotFoundError:
        print(f"Error: Could not find {data_path}. Please run Phase 1 (Data Generation) first.")
        return

    if not dataset:
        # Environment indexes the first record on construction, so an empty
        # dataset would otherwise fail with an IndexError.
        print(f"Error: {data_path} contains no records; nothing to train on.")
        return

    train_env = Environment(arms=n_arms, dataset=dataset)

    print(f"Initializing LinUCB Agent (Arms={n_arms}, Features={n_features}, Alpha={alpha})...")
    agent = LinUCBAgent(n_arms=n_arms, n_features=n_features, alpha=alpha)

    history = []
    cumulative_reward = 0.0

    print("Starting Training Loop...")
    for step in tqdm(range(total_steps), desc="Training"):
        # 1. Get Context
        # Capture the record BEFORE choose_arm(), which advances the cursor.
        query_data = train_env.dataset[train_env.index]
        features = query_data.get('features')
        if features is None:
            # Random fallback context, drawn only when the record has none.
            features = np.random.rand(n_features)
        context = np.array(features)

        # 2. Select Action (Bandit Decision)
        chosen_arm = agent.select_arm(context)

        # 3. Get Reward (Simulate Partial Feedback)
        # Only the reward of the arm actually picked is revealed to the agent.
        reward = train_env.choose_arm(chosen_arm)

        # 4. Update Policy
        agent.update(chosen_arm, context, reward)

        # 5. Logging
        cumulative_reward += reward

        # Regret is measured against the record's optimal arm. When that is
        # missing or out of range (logged as -1), fall back to the best reward
        # in the record instead of raising or silently reading rewards[-1].
        rewards = query_data.get('rewards') or [0.0] * n_arms
        optimal_arm = int(query_data.get('optimal_arm', -1))
        if 0 <= optimal_arm < len(rewards):
            best_reward = rewards[optimal_arm]
        else:
            best_reward = max(rewards)

        log_entry = {
            'step': step,
            'query_id': query_data.get('query_id', 'unknown'),
            'chosen_arm': int(chosen_arm),
            'reward': float(reward),
            'optimal_arm': optimal_arm,
            'regret': float(best_reward - reward),
            'cumulative_reward': cumulative_reward
        }
        history.append(log_entry)

    # Save training history for analysis (Plotting Regret/Arm Distribution)
    print(f"Saving training history to {output_path}...")
    with open(output_path, 'w') as f:
        for entry in history:
            f.write(json.dumps(entry) + '\n')

    print("Training Complete.")

In [25]:
# Run the LinUCB training loop defined above; as a side effect it writes the
# per-step training history to a JSONL file.
train_agent()

Initializing Environment from ../MABhybrid/data/bandit_data_train.jsonl...
Initializing LinUCB Agent (Arms=5, Features=5, Alpha=1)...
Starting Training Loop...


Training: 100%|██████████| 50000/50000 [00:01<00:00, 29441.29it/s]


Saving training history to ../MABhybrid/data/linucb_training_history.jsonl...
Training Complete.


In [26]:
def analyze_training_results(history_path="../MABhybrid/data/linucb_training_history.jsonl",
                             data_path="../MABhybrid/data/bandit_data_train.jsonl",
                             sample_size=100):
    """Verify training results against original dataset.

    Loads the per-step training history and the source dataset, then prints:
    reward and optimal-arm consistency checks on the first ``sample_size``
    steps, regret statistics, the arm selection distribution, and the
    cumulative reward trend.

    Args:
        history_path (str): JSONL file with one log record per training step.
        data_path (str): JSONL file with the original bandit records.
        sample_size (int): Number of leading history records to cross-check
            against the dataset.

    Returns:
        None. All results are printed. An empty history prints a notice and
        returns before any statistics are computed.
    """
    print("Loading training history and dataset...")
    history = []
    with open(history_path, 'r') as f:
        for line in f:
            if line.strip():  # tolerate blank lines
                history.append(json.loads(line))

    # Index the dataset by query id; records sharing an id overwrite each other,
    # so the count printed below is the number of distinct ids.
    dataset = {}
    with open(data_path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            entry = json.loads(line)
            # Same default the training log uses for records without an id.
            dataset[entry.get('query_id', 'unknown')] = entry

    print("\n=== Training Analysis ===")
    print(f"Total steps: {len(history)}")
    print(f"Dataset queries: {len(dataset)}")

    if not history:
        # The statistics below index history[-1] and divide by len(history).
        print("Training history is empty; nothing to analyze.")
        return

    sample = history[:sample_size]

    # Check 1: Verify rewards match dataset
    mismatches = 0
    for log in sample:
        qid = log['query_id']
        arm = log['chosen_arm']
        logged_reward = log['reward']

        if qid in dataset:
            rewards = dataset[qid].get('rewards', [])
            # An out-of-range arm earns 0.0 in the environment; mirror that here.
            expected_reward = rewards[arm] if 0 <= arm < len(rewards) else 0.0
            if abs(logged_reward - expected_reward) > 1e-6:
                print(f"MISMATCH Step {log['step']}: Query {qid}, Arm {arm}")
                print(f"  Logged: {logged_reward}, Expected: {expected_reward}")
                mismatches += 1

    print(f"\nReward mismatches: {mismatches}")

    # Check 2: Verify optimal_arm matches
    optimal_mismatches = 0
    for log in sample:
        qid = log['query_id']
        if qid in dataset:
            logged_optimal = log['optimal_arm']
            # -1 is what the training log records when the field is absent.
            expected_optimal = dataset[qid].get('optimal_arm', -1)
            if logged_optimal != expected_optimal:
                print(f"OPTIMAL MISMATCH Step {log['step']}: Query {qid}")
                print(f"  Logged: {logged_optimal}, Expected: {expected_optimal}")
                optimal_mismatches += 1

    print(f"Optimal arm mismatches: {optimal_mismatches}")

    # Check 3: Regret calculation
    print("\n=== Regret Statistics ===")
    regrets = [log['regret'] for log in history]
    print(f"Mean regret: {np.mean(regrets):.6f}")
    print(f"Min regret: {np.min(regrets):.6f}")
    print(f"Max regret: {np.max(regrets):.6f}")
    print(f"Total cumulative regret: {sum(regrets):.6f}")

    # Check 4: Arm distribution
    print("\n=== Arm Selection Distribution ===")
    arm_counts = {}
    for log in history:
        arm = log['chosen_arm']
        arm_counts[arm] = arm_counts.get(arm, 0) + 1

    for arm in sorted(arm_counts):
        pct = 100 * arm_counts[arm] / len(history)
        print(f"Arm {arm}: {arm_counts[arm]} times ({pct:.2f}%)")

    # Check 5: Cumulative reward trend
    print("\n=== Reward Trend ===")
    final_cumulative = history[-1]['cumulative_reward']
    print(f"Final cumulative reward: {final_cumulative:.6f}")
    print(f"Average reward per step: {final_cumulative / len(history):.6f}")

    # Cumulative reward should never decrease from one step to the next.
    non_monotonic = sum(
        1
        for prev, curr in zip(history, history[1:])
        if curr['cumulative_reward'] < prev['cumulative_reward']
    )
    print(f"Non-monotonic steps (should be 0): {non_monotonic}")

# Cross-check the saved training history against the source dataset and print
# the regret, arm-distribution and reward-trend summaries.
analyze_training_results()

Loading training history and dataset...

=== Training Analysis ===
Total steps: 50000
Dataset queries: 50000

Reward mismatches: 0
Optimal arm mismatches: 0

=== Regret Statistics ===
Mean regret: 0.089998
Min regret: 0.000000
Max regret: 1.000000
Total cumulative regret: 4499.887314

=== Arm Selection Distribution ===
Arm 0: 6101 times (12.20%)
Arm 1: 30889 times (61.78%)
Arm 2: 12044 times (24.09%)
Arm 3: 684 times (1.37%)
Arm 4: 282 times (0.56%)

=== Reward Trend ===
Final cumulative reward: 19520.384885
Average reward per step: 0.390408
Non-monotonic steps (should be 0): 0
